#!/usr/bin/env python
"""Analyse a web-server access log and build per-visitor statistics.

The script reads ``access.log`` from the current directory, matches every
line against ``log_format`` (a template made of ``$name`` placeholders) and
groups hits by remote address.  Per-day and per-month statistics are
computed whenever the day or the month of the log entries changes, and
plugin hooks found under ``./hooks/`` are invoked around those computations.

This is a Python 2 script (it relies on ``imp``).  ``print`` is always
called with a single parenthesised argument, which prints exactly the same
text under the Python 2 print statement.
"""

import os
import re
import time
import glob
import imp
import pickle
import gzip

from robots import awstats_robots

print('==> Start')

# Global state shared by every function below.
meta_visit = {}           # {'last_time': struct_time of the last analysed hit}
analyse_started = False   # True once a hit newer than the resume point is seen
current_visits = {}       # {'days_stats': {}, 'month_stats': {}, 'visits': {}}
cache_plugins = {}        # plugin path -> loaded plugin module

log_format = '$server_name:$server_port $remote_addr - $remote_user [$time_local] ' +\
             '"$request" $status $body_bytes_sent ' +\
             '"$http_referer" "$http_user_agent"'

# Turn the template into a regular expression: escape every character that is
# neither a word character nor '$', then replace each $name by a named group.
log_format_extracted = re.sub(r'([^\$\w])', r'\\\g<1>', log_format)
log_format_extracted = re.sub(r'\$(\w+)', r'(?P<\g<1>>.+)', log_format_extracted)

# "<method> <uri> <version>".  Only the 'http_uri' group is read in this
# file; the names of the first and third groups are not referenced here.
http_request_extracted = re.compile(
    r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')

# Example: 09/Nov/2014:06:35:16 +0100
# NOTE(review): the UTC offset is hard-coded; time.strptime() raises
# ValueError for entries logged with any other offset.
time_format = '%d/%b/%Y:%H:%M:%S +0100'

log_re = re.compile(log_format_extracted)

# Split "path?query" into 'extract_uri' and an optional 'extract_parameters'.
# The query part must be an optional *group*; written as a character class
# ("[...]?") it would never capture the parameters.
uri_re = re.compile(r'(?P<extract_uri>[^\?]*)(?:\?(?P<extract_parameters>.*))?')

pages_extensions = ['/', 'html', 'xhtml', 'py', 'pl', 'rb', 'php']
viewed_http_codes = [200]

PRE_HOOK_DIRECTORY = './hooks/pre_analysis/*.py'
POST_HOOK_DIRECTORY = './hooks/post_analysis/*.py'
DB_ROOT = './output/'
META_PATH = DB_ROOT + 'meta.db'
DB_FILENAME = 'iwla.db'

print('==> Generating robot dictionary')

# Pre-compile the imported patterns once (case-insensitive).
awstats_robots = [re.compile(x, re.IGNORECASE) for x in awstats_robots]


def createEmptyVisits():
    """Return a fresh, empty visits database."""
    return {'days_stats': {}, 'month_stats': {}, 'visits': {}}


def createEmptyMeta():
    """Return fresh meta data (no hit analysed yet)."""
    return {'last_time': None}


def getDBFilename(time):
    """Return the database path for the month of *time* (a struct_time).

    The result has the form ``<DB_ROOT><year>/<month>_<DB_FILENAME>``.
    """
    return (DB_ROOT + '%d/%d_%s') % (time.tm_year, time.tm_mon, DB_FILENAME)


def serialize(obj, filename):
    """Pickle *obj* into the gzip file *filename*.

    The parent directory is created when missing.

    NOTE(review): writing is currently disabled by the early ``return``
    below, which the original author marked for removal.  Until it is
    removed nothing is persisted and every run starts from scratch.
    """
    base = os.path.dirname(filename)
    if not os.path.exists(base):
        os.makedirs(base)

    # TODO : remove return
    return

    with open(filename + '.tmp', 'wb+') as f:
        pickle.dump(obj, f)
        f.seek(0)
        with gzip.open(filename, 'w') as fzip:
            fzip.write(f.read())
    os.remove(filename + '.tmp')


def deserialize(filename):
    """Load the object pickled in gzip file *filename*.

    Returns None when the file does not exist.
    """
    if not os.path.exists(filename):
        return None

    # SECURITY: pickle executes arbitrary code while loading; only open
    # database files that were produced by this script.
    with gzip.open(filename, 'r') as f:
        return pickle.load(f)


def callPlugins(path, *args):
    """Call ``hook(*args)`` of every plugin matching the glob *path*.

    Plugins are run in sorted path order and each one is loaded only once
    (see ``cache_plugins``).
    """
    print('==> Call plugins (%s)' % path)
    for p in sorted(glob.glob(path)):
        print('\t%s' % (p))
        mod = cache_plugins.get(p)
        if mod is None:
            # imp.load_source() reuses the module object registered under
            # the given name, so every plugin needs its own name; with a
            # shared name all cached plugins end up being the same module.
            module_name = 'hook_' + re.sub(r'\W', '_', p)
            mod = cache_plugins[p] = imp.load_source(module_name, p)
        mod.hook(*args)


def isPage(request):
    """Return True when *request* ends with one of ``pages_extensions``."""
    return request.endswith(tuple(pages_extensions))


def appendHit(hit):
    """Record *hit* in the visit of its remote address.

    The visitor entry is created on first sight.  Bandwidth, last access
    time and the viewed / not viewed counters of the visitor are updated;
    3xx responses are stored but not counted.
    """
    remote_addr = hit['remote_addr']
    visits = current_visits['visits']

    if remote_addr not in visits:
        # createUser() registers the visitor and calls appendHit() again.
        createUser(hit)
        return

    super_hit = visits[remote_addr]
    super_hit['pages'].append(hit)
    # newHit() replaces '-' by '' before calling us: count that as 0 bytes
    # instead of failing in int('').
    super_hit['bandwith'] += int(hit['body_bytes_sent'] or 0)
    super_hit['last_access'] = meta_visit['last_time']

    request = hit['extract_request']
    if 'extract_uri' in request:
        uri = request['extract_uri']
    else:
        uri = request['http_uri']

    hit['is_page'] = isPage(uri)

    # Don't count 3xx status
    status = int(hit['status'])
    if 300 <= status < 400:
        return

    if super_hit['robot'] or status not in viewed_http_codes:
        page_key = 'not_viewed_pages'
        hit_key = 'not_viewed_hits'
    else:
        page_key = 'viewed_pages'
        hit_key = 'viewed_hits'

    if hit['is_page']:
        super_hit[page_key] += 1
    else:
        super_hit[hit_key] += 1


def createUser(hit):
    """Create the visitor entry for *hit*'s remote address and record *hit*."""
    super_hit = current_visits['visits'][hit['remote_addr']] = {}
    super_hit['viewed_pages'] = 0
    super_hit['viewed_hits'] = 0
    super_hit['not_viewed_pages'] = 0
    super_hit['not_viewed_hits'] = 0
    super_hit['bandwith'] = 0
    super_hit['last_access'] = meta_visit['last_time']
    super_hit['pages'] = []
    super_hit['robot'] = isRobot(hit)
    appendHit(hit)


def isRobot(hit):
    """Return True when the user agent of *hit* matches a robot pattern."""
    # NOTE(review): match() only matches at the start of the user agent.
    # If the imported patterns are bare substrings, search() is probably
    # what is wanted -- confirm against the contents of the robots module.
    user_agent = hit['http_user_agent']
    return any(r.match(user_agent) for r in awstats_robots)


def decodeHTTPRequest(hit):
    """Split the request line and the referer of *hit* into their parts.

    Adds ``hit['extract_request']`` (request groups plus 'extract_uri' and,
    when a query string is present, 'extract_parameters') and
    ``hit['extract_referer']``.

    Returns False when *hit* has no 'request' field or when the request
    line cannot be parsed, True otherwise.
    """
    if 'request' not in hit:
        return False

    groups = http_request_extracted.match(hit['request'])
    if not groups:
        print("Bad request extraction " + hit['request'])
        return False

    request = hit['extract_request'] = groups.groupdict()
    uri_groups = uri_re.match(request['http_uri'])
    if uri_groups:
        d = uri_groups.groupdict()
        request['extract_uri'] = d['extract_uri']
        # The group is None when the URI has no query string.
        if d.get('extract_parameters') is not None:
            request['extract_parameters'] = d['extract_parameters']

    referer_groups = uri_re.match(hit['http_referer'])
    if referer_groups:
        hit['extract_referer'] = referer_groups.groupdict()
    return True


def decodeTime(hit):
    """Parse ``hit['time_local']`` into ``hit['time_decoded']``.

    Raises ValueError when the value does not follow ``time_format``.
    """
    hit['time_decoded'] = time.strptime(hit['time_local'], time_format)


def generateStats(visits):
    """Aggregate the counters of *visits* and return them as a dict.

    Robots only contribute to 'not_viewed_bandwidth'; every other visitor
    is printed and added to the viewed counters.
    """
    stats = {}
    stats['viewed_bandwidth'] = 0
    stats['not_viewed_bandwidth'] = 0
    stats['viewed_pages'] = 0
    stats['viewed_hits'] = 0
    stats['nb_visitors'] = 0

    for k in visits.keys():
        super_hit = visits[k]
        if super_hit['robot']:
            stats['not_viewed_bandwidth'] += super_hit['bandwith']
            continue

        print("[%s] =>\t%d/%d" % (k, super_hit['viewed_pages'], super_hit['viewed_hits']))

        stats['nb_visitors'] += 1
        stats['viewed_bandwidth'] += super_hit['bandwith']
        stats['viewed_pages'] += super_hit['viewed_pages']
        stats['viewed_hits'] += super_hit['viewed_hits']

    return stats


def generateMonthStats():
    """Compute the statistics of the current month and save the database.

    Post-analysis plugins are called with the non-robot visitors only.
    """
    visits = current_visits['visits']

    stats = generateStats(visits)

    cur_time = meta_visit['last_time']
    print("== Stats for %d/%d ==" % (cur_time.tm_year, cur_time.tm_mon))
    print(stats)

    valid_visitors = {k: v for (k, v) in visits.items() if not v['robot']}
    callPlugins(POST_HOOK_DIRECTORY, valid_visitors)

    current_visits['month_stats'] = stats

    path = getDBFilename(cur_time)
    if os.path.exists(path):
        os.remove(path)

    print("==> Serialize to %s" % path)
    serialize(current_visits, path)


def generateDayStats():
    """Compute the statistics of the day of the last analysed hit.

    The result is stored in ``current_visits['days_stats'][<day of month>]``.
    Pre-analysis plugins are called with all visits first.
    """
    visits = current_visits['visits']

    callPlugins(PRE_HOOK_DIRECTORY, visits)

    stats = generateStats(visits)

    cur_time = meta_visit['last_time']
    print("== Stats for %d/%d/%d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday))

    # generateStats() covers the whole month so far, while each stored
    # entry of days_stats is the figure of a single day.  Every earlier day
    # has to be removed to isolate the current one: subtracting only the
    # latest previous day leaves all older days in the result.
    for day, day_stats in current_visits['days_stats'].items():
        if day < cur_time.tm_mday:
            for k in stats.keys():
                stats[k] -= day_stats[k]

    # Visitors of the day: non-robots whose last access is on that day.
    stats['nb_visitors'] = 0
    for k in visits.keys():
        if visits[k]['robot']:
            continue
        if visits[k]['last_access'].tm_mday == cur_time.tm_mday:
            stats['nb_visitors'] += 1

    print(stats)

    current_visits['days_stats'][cur_time.tm_mday] = stats


def newHit(hit):
    """Process one parsed log line (*hit* is the groupdict of ``log_re``).

    Day and month statistics are generated when the date of *hit* leaves
    the day or month of the previously analysed hit.

    Returns True when the caller should go on with the next line (the hit
    was recorded, or skipped because it was analysed by a previous run) and
    False when its request line cannot be decoded, in which case the main
    loop stops.
    """
    global current_visits
    global analyse_started

    decodeTime(hit)

    t = hit['time_decoded']

    cur_time = meta_visit['last_time']

    if cur_time is None:
        current_visits = deserialize(getDBFilename(t)) or createEmptyVisits()
        analyse_started = True
    else:
        if not analyse_started:
            if time.mktime(cur_time) >= time.mktime(t):
                # Not newer than the resume point: skip it.  This must be a
                # true value, a bare return would make the main loop stop
                # at the first old line and nothing would ever be analysed.
                return True
            analyse_started = True
            current_visits = deserialize(getDBFilename(t)) or createEmptyVisits()
        if cur_time.tm_mon != t.tm_mon:
            generateMonthStats()
            current_visits = deserialize(getDBFilename(t)) or createEmptyVisits()
        elif cur_time.tm_mday != t.tm_mday:
            generateDayStats()

    meta_visit['last_time'] = t

    if not decodeHTTPRequest(hit):
        return False

    # '-' stands for an empty field in the log.
    for k in hit.keys():
        if hit[k] == '-':
            hit[k] = ''

    appendHit(hit)

    return True


print('==> Analysing log')

meta_visit = deserialize(META_PATH) or createEmptyMeta()

current_visits = createEmptyVisits()

# The context manager closes the log even if a hit raises.
with open("access.log") as f:
    for l in f:
        groups = log_re.match(l)

        if groups:
            if not newHit(groups.groupdict()):
                break
        else:
            print("No match " + l)

if analyse_started:
    generateDayStats()
    generateMonthStats()
    serialize(meta_visit, META_PATH)
else:
    print('==> Analyse not started : nothing to do')