#!/usr/bin/env python
"""Small web access-log analyser (Python 2 script).

Reads ``access.log`` from the current directory, groups the hits of the first
day found in the log by remote address, classifies each visitor as robot or
human using the ``awstats_robots`` patterns, runs every ``./hooks_pre/*.py``
plugin on the collected data and finally prints the per-visitor counters of
the non-robot visitors.
"""

import os
import re
import time
import glob
import imp

from robots import awstats_robots

print('==> Start')

# Per-visitor statistics keyed by remote address.  newHit() additionally
# stores the decoded time of the last hit it saw under the 'last_time' key,
# so not every value in this dict is a visitor record.
current_visit = {}

# Log line layout, written with nginx-style $variables.
log_format = ('$server_name:$server_port $remote_addr - $remote_user [$time_local] '
              '"$request" $status $body_bytes_sent '
              '"$http_referer" "$http_user_agent"')

# Turn the layout into a regular expression: escape every character that is
# neither a word character nor '$', then replace each $variable with a named
# group of the same name.
log_format_extracted = re.sub(r'([^\$\w])', r'\\\g<1>', log_format)
log_format_extracted = re.sub(r'\$(\w+)', r'(?P<\g<1>>.+)', log_format_extracted)

# Splits the request field into three whitespace separated parts.  Only
# 'http_uri' is read by this script; the other two names are descriptive.
http_request_extracted = re.compile(
    r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')

# Example: 09/Nov/2014:06:35:16 +0100
# NOTE: the UTC offset is hard-coded, so lines carrying any other offset make
# time.strptime() raise ValueError.
time_format = '%d/%b/%Y:%H:%M:%S +0100'

log_re = re.compile(log_format_extracted)

# Splits "path?query" into the path and the query string.
uri_re = re.compile(r'(?P<extract_uri>[^\?]*)\?(?P<extract_parameters>.*)')

pages_extensions = ['/', 'html', 'xhtml', 'py', 'pl', 'rb', 'php']
viewed_http_codes = [200]

# Decoded time of the first hit; every later hit is compared against its day.
cur_time = None

print('==> Generating robot dictionary')

awstats_robots = [re.compile(x, re.IGNORECASE) for x in awstats_robots]


def isPage(request):
    """Return True when *request* ends with one of ``pages_extensions``."""
    return request.endswith(tuple(pages_extensions))


def appendHit(hit):
    """Record *hit* in the existing visitor entry for its remote address.

    The hit is appended to the visitor's 'pages' list and its byte count is
    added to 'bandwith'.  Hits with status 302 are stored but not counted.
    Any other hit increments one of the four counters: the 'not_viewed_*'
    pair when the visitor is a robot or the status is not listed in
    ``viewed_http_codes``, the 'viewed_*' pair otherwise; '*_pages' when the
    URI looks like a page (see isPage), '*_hits' otherwise.
    """
    super_hit = current_visit[hit['remote_addr']]
    super_hit['pages'].append(hit)
    # newHit() replaces a '-' field by '', which int() rejects: count it as 0.
    super_hit['bandwith'] += int(hit['body_bytes_sent'] or 0)

    request = hit['extract_request']
    # Prefer the URI without its query string when one was split off.
    if 'extract_uri' in request:
        uri = request['extract_uri']
    else:
        uri = request['http_uri']

    hit['is_page'] = isPage(uri)

    # Don't count redirect status
    if int(hit['status']) == 302:
        return

    if super_hit['robot'] or int(hit['status']) not in viewed_http_codes:
        page_key = 'not_viewed_pages'
        hit_key = 'not_viewed_hits'
    else:
        page_key = 'viewed_pages'
        hit_key = 'viewed_hits'

    if hit['is_page']:
        super_hit[page_key] += 1
    else:
        super_hit[hit_key] += 1


def createGeneric(hit):
    """Create, register and return an empty visitor entry for *hit*.

    Any entry already stored for the same remote address is replaced.
    """
    super_hit = current_visit[hit['remote_addr']] = {}
    super_hit['viewed_pages'] = 0
    super_hit['viewed_hits'] = 0
    super_hit['not_viewed_pages'] = 0
    super_hit['not_viewed_hits'] = 0
    super_hit['bandwith'] = 0
    super_hit['pages'] = []
    return super_hit


def createUser(hit, robot):
    """Create the visitor entry for *hit*, flag it as *robot*, record the hit."""
    super_hit = createGeneric(hit)
    super_hit['robot'] = robot
    appendHit(hit)


def isRobot(hit):
    """Return True when the hit's user agent matches a robot pattern.

    Patterns are applied with ``match``, i.e. anchored at the start of the
    user agent string.
    NOTE(review): whether the patterns in ``robots`` are written for anchored
    matching cannot be told from this file -- confirm, or switch to ``search``.
    """
    user_agent = hit['http_user_agent']
    return any(r.match(user_agent) for r in awstats_robots)


def decode_http_request(hit):
    """Split the request and referer fields of *hit* into their parts.

    Adds ``hit['extract_request']`` (the named groups of
    ``http_request_extracted``, plus 'extract_uri' and 'extract_parameters'
    when the URI carries a query string) and, when the referer carries a
    query string, ``hit['extract_referer']`` with the same two keys.

    Returns False when the hit has no 'request' field or the field cannot be
    parsed, True otherwise.
    """
    if 'request' not in hit:
        return False

    groups = http_request_extracted.match(hit['request'])
    if not groups:
        print("Bad request extraction " + hit['request'])
        return False

    hit['extract_request'] = groups.groupdict()
    uri_groups = uri_re.match(hit['extract_request']['http_uri'])
    if uri_groups:
        hit['extract_request']['extract_uri'] = uri_groups.group('extract_uri')
        hit['extract_request']['extract_parameters'] = \
            uri_groups.group('extract_parameters')

    referer_groups = uri_re.match(hit['http_referer'])
    if referer_groups:
        # The sub-dictionary has to be created before it can be filled;
        # writing into a missing hit['extract_referer'] raised KeyError.
        hit['extract_referer'] = {
            'extract_uri': referer_groups.group('extract_uri'),
            'extract_parameters': referer_groups.group('extract_parameters'),
        }
    return True


def decode_time(hit):
    """Parse ``hit['time_local']`` into ``hit['time_decoded']`` (struct_time).

    Raises ValueError when the field does not match ``time_format``.
    """
    hit['time_decoded'] = time.strptime(hit['time_local'], time_format)


def newHit(hit):
    """Process one parsed log line.

    Returns True when the analysis should go on with the next line and False
    when it should stop, which happens on the first hit whose day of month
    differs from the day of the first hit.
    """
    global cur_time

    if not decode_http_request(hit):
        # Skip the undecodable line but keep analysing.  A bare ``return``
        # here yielded None, which the caller reads as "stop", so one
        # malformed request silently truncated the whole analysis.
        return True

    # '-' is the log's placeholder for an empty field.
    for k, v in list(hit.items()):
        if v == '-':
            hit[k] = ''

    decode_time(hit)

    t = hit['time_decoded']

    current_visit['last_time'] = t

    if cur_time is None:
        cur_time = t
    elif cur_time.tm_mday != t.tm_mday:
        return False

    if hit['remote_addr'] in current_visit:
        appendHit(hit)
    else:
        createUser(hit, isRobot(hit))

    return True


print('==> Analysing log')

with open("access.log") as f:
    for l in f:
        groups = log_re.match(l)

        if groups:
            if not newHit(groups.groupdict()):
                break
        else:
            print("No match " + l)

print('==> Call plugins')

# Hooks run in file-name order; each module must expose hook(current_visit).
plugins = sorted(glob.glob('./hooks_pre/*.py'))
for p in plugins:
    print('\t%s' % (p))
    mod = imp.load_source('hook', p)
    mod.hook(current_visit)

for ip in list(current_visit.keys()):
    # 'last_time' holds a struct_time, not a visitor record; indexing it with
    # 'robot' would raise TypeError.
    if ip == 'last_time':
        continue

    hit = current_visit[ip]
    if hit['robot']:
        continue

    print("%s =>" % (ip))
    for k in hit.keys():
        if k != 'pages':
            print("\t%s : %s" % (k, hit[k]))