#!/usr/bin/env python
"""Access-log analyser.

Reads the log file named by ``analyzed_filename`` line by line, groups hits
per remote address, computes per-day and per-month statistics, runs the
configured plugin hooks and builds the display pages.

The syntax used here is valid on both Python 2.7 and Python 3.
"""

import os
import re
import time
import glob
import imp
import pickle
import gzip

from display import *
from default_conf import *
from conf import *


class IWLA(object):
    """Log analyser driving parsing, statistics, plugins and display.

    State kept on the instance:
      * ``meta_infos``       -- dict with ``last_time``, the time of the last
                                hit that was processed (or None).
      * ``current_analysis`` -- dict with ``days_stats``, ``month_stats`` and
                                ``visits`` for the month being analysed.
      * ``cache_plugins``    -- loaded plugin modules keyed by 'root/name'.
      * ``display``          -- pages being built, passed to the display helpers.
      * ``valid_visitors``   -- non-robot visitors, set by _generateMonthStats.
    """

    ANALYSIS_CLASS = 'HTTP'
    API_VERSION = 1

    def __init__(self):
        print('==> Start')

        self.meta_infos = {}
        self.analyse_started = False
        self.current_analysis = {}
        self.cache_plugins = {}
        self.display = {}
        self.valid_visitors = None

        # Turn log_format into a regex: backslash-escape every character that
        # is neither '$' nor a word character, then replace each $name by a
        # named group of the same name.
        self.log_format_extracted = re.sub(r'([^\$\w])', r'\\\g<1>', log_format)
        self.log_format_extracted = re.sub(r'\$(\w+)', r'(?P<\g<1>>.+)', self.log_format_extracted)
        # Request line made of three space-separated tokens. 'http_uri' is the
        # name the rest of this class reads.
        # NOTE(review): 'http_method' and 'http_version' are reconstructed
        # names -- confirm against the plugins that consume extract_request.
        self.http_request_extracted = re.compile(
            r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')
        self.log_re = re.compile(self.log_format_extracted)
        # Everything before the first '?' is the URI, the rest (if any) the
        # parameters. The optional part must be a group, not a character class,
        # for 'extract_parameters' to be captured.
        self.uri_re = re.compile(
            r'(?P<extract_uri>[^\?]*)(?:\?(?P<extract_parameters>.*))?')
        self.plugins = {PRE_HOOK_DIRECTORY: pre_analysis_hooks,
                        POST_HOOK_DIRECTORY: post_analysis_hooks,
                        DISPLAY_HOOK_DIRECTORY: display_hooks}

    def _preloadPlugins(self):
        """Import every configured plugin and cache the ones that are usable.

        A plugin is cached only when its declared class and version range
        match this analyser and its ``load()`` returns a true value.

        Returns False if at least one plugin raised while loading, else True.
        """
        ret = True
        for root in self.plugins:
            for plugin_name in self.plugins[root]:
                p = root + '/' + plugin_name
                fp = None
                try:
                    fp, pathname, description = imp.find_module(plugin_name, [root])
                    mod = imp.load_module(plugin_name, fp, pathname, description)
                    infos = mod.get_plugins_infos()
                    compatible = (
                        infos['class'] == IWLA.ANALYSIS_CLASS and
                        IWLA.API_VERSION >= infos['min_version'] and
                        (infos['max_version'] == -1 or
                         IWLA.API_VERSION <= infos['max_version']))
                    # Cache only after every check passed, so a plugin that
                    # raises half-way is never left behind in the cache.
                    if compatible and mod.load():
                        self.cache_plugins[p] = mod
                except Exception as e:
                    print('Error loading \'%s\' => %s' % (p, e))
                    ret = False
                finally:
                    # imp.find_module hands back an open file the caller
                    # must close (it is None for packages).
                    if fp:
                        fp.close()
        return ret

    def _clearVisits(self):
        """Reset the current analysis to empty stats and return it."""
        self.current_analysis = {
            'days_stats': {},
            'month_stats': {},
            'visits': {}
        }
        self.valid_visitors = None
        return self.current_analysis

    def getDaysStats(self):
        """Return the per-day statistics of the current analysis."""
        return self.current_analysis['days_stats']

    def getMonthStatsStats(self):
        """Return the month statistics of the current analysis."""
        return self.current_analysis['month_stats']

    def getCurrentVisists(self):
        """Return the visits of the current analysis."""
        return self.current_analysis['visits']

    def getValidVisitors(self):
        """Return the visits of the current analysis.

        NOTE(review): this returns every visit, not ``self.valid_visitors``;
        looks unintended given the name -- confirm before changing, because
        ``valid_visitors`` is None until month stats are generated.
        """
        return self.current_analysis['visits']

    def _clearMeta(self):
        """Reset the meta information and return it."""
        self.meta_infos = {
            'last_time': None
        }
        return self.meta_infos

    def _clearDisplay(self):
        """Drop every page built so far and return the empty display."""
        self.display = {}
        return self.display

    def getDBFilename(self, time):
        """Return the database path for the year/month of ``time``.

        ``time`` needs ``tm_year`` and ``tm_mon`` attributes.
        """
        # Format only the suffix so a '%' inside DB_ROOT cannot break it.
        return DB_ROOT + '%d/%d_%s' % (time.tm_year, time.tm_mon, DB_FILENAME)

    def _serialize(self, obj, filename):
        """Create the target directory for ``filename``.

        NOTE: writing is currently disabled by the early return below (see
        the TODO); the gzip-compressed pickle dump after it is not reached.
        """
        base = os.path.dirname(filename)
        # dirname is '' for a bare file name; os.makedirs('') would raise.
        if base and not os.path.exists(base):
            os.makedirs(base)

        # TODO : remove return
        return

        with gzip.open(filename, 'wb') as fzip:
            pickle.dump(obj, fzip)

    def _deserialize(self, filename):
        """Load a gzip-compressed pickle; return None if the file is missing.

        SECURITY: pickle executes arbitrary code on load, so ``filename``
        must only ever point at files written by this program.
        """
        if not os.path.exists(filename):
            return None

        with gzip.open(filename, 'rb') as f:
            return pickle.load(f)

    def _callPlugins(self, root, *args):
        """Call ``hook(*args)`` on each loaded plugin configured under ``root``."""
        print('==> Call plugins (%s)' % root)
        for p in self.plugins[root]:
            # Plugins rejected or failed by _preloadPlugins are not cached.
            mod = self.cache_plugins.get(root + '/' + p)
            if mod is None:
                continue
            print('\t%s' % (p))
            mod.hook(*args)

    def isPage(self, request):
        """Return True if ``request`` ends with one of ``pages_extensions``."""
        return any(request.endswith(e) for e in pages_extensions)

    def _appendHit(self, hit):
        """Record ``hit`` under its remote address and update the counters.

        An unknown remote address is first registered via _createUser, which
        calls back into this method. Hits with a 3xx status are stored but
        not counted as page or hit.
        """
        remote_addr = hit['remote_addr']
        visits = self.current_analysis['visits']

        if remote_addr not in visits:
            self._createUser(hit)
            return

        super_hit = visits[remote_addr]
        super_hit['requests'].append(hit)
        # _newHit replaces '-' by '' before calling us; count that as 0 bytes.
        super_hit['bandwidth'] += int(hit['body_bytes_sent'] or 0)
        super_hit['last_access'] = self.meta_infos['last_time']

        request = hit['extract_request']
        if 'extract_uri' in request:
            uri = request['extract_uri']
        else:
            uri = request['http_uri']

        hit['is_page'] = self.isPage(uri)

        # Don't count 3xx status
        status = int(hit['status'])
        if 300 <= status < 400:
            return

        if super_hit['robot'] or status not in viewed_http_codes:
            page_key = 'not_viewed_pages'
            hit_key = 'not_viewed_hits'
        else:
            page_key = 'viewed_pages'
            hit_key = 'viewed_hits'

        if hit['is_page']:
            super_hit[page_key] += 1
        else:
            super_hit[hit_key] += 1

    def _createUser(self, hit):
        """Create the visitor entry for ``hit['remote_addr']`` and add the hit."""
        super_hit = self.current_analysis['visits'][hit['remote_addr']] = {}
        super_hit['remote_addr'] = hit['remote_addr']
        super_hit['viewed_pages'] = 0
        super_hit['viewed_hits'] = 0
        super_hit['not_viewed_pages'] = 0
        super_hit['not_viewed_hits'] = 0
        super_hit['bandwidth'] = 0
        super_hit['last_access'] = self.meta_infos['last_time']
        super_hit['requests'] = []
        super_hit['robot'] = False
        super_hit['hit_only'] = 0
        self._appendHit(hit)

    def _decodeHTTPRequest(self, hit):
        """Split the request line and the referer of ``hit`` into parts.

        Adds ``extract_request`` (and ``extract_referer`` when a referer
        field is present) to ``hit``.

        Returns False if ``hit`` has no 'request' field or if it cannot be
        parsed, True otherwise.
        """
        if 'request' not in hit:
            return False

        groups = self.http_request_extracted.match(hit['request'])
        if not groups:
            print("Bad request extraction " + hit['request'])
            return False

        hit['extract_request'] = groups.groupdict()
        uri_groups = self.uri_re.match(hit['extract_request']['http_uri'])
        if uri_groups:
            d = uri_groups.groupdict()
            hit['extract_request']['extract_uri'] = d['extract_uri']
            # The group is None when the URI has no '?' part.
            if d.get('extract_parameters') is not None:
                hit['extract_request']['extract_parameters'] = d['extract_parameters']

        # The referer field only exists if log_format declares it.
        referer = hit.get('http_referer')
        if referer is not None:
            referer_groups = self.uri_re.match(referer)
            if referer_groups:
                hit['extract_referer'] = referer_groups.groupdict()
        return True

    def _decodeTime(self, hit):
        """Parse ``hit['time_local']`` with ``time_format`` into ``time_decoded``."""
        hit['time_decoded'] = time.strptime(hit['time_local'], time_format)

    def getDisplayIndex(self):
        """Return the index page of the current month, or None if not built."""
        cur_time = self.meta_infos['last_time']
        filename = '%d/index_%d.html' % (cur_time.tm_year, cur_time.tm_mon)
        return self.display.get(filename, None)

    def _generateDisplayDaysStat(self):
        """Build the 'By day' table (one row per day, then average and total)."""
        cur_time = self.meta_infos['last_time']
        title = 'Stats %d/%d' % (cur_time.tm_mon, cur_time.tm_year)
        filename = '%d/index_%d.html' % (cur_time.tm_year, cur_time.tm_mon)
        page = createPage(self.display, filename, title)

        days = createTable('By day', ['Day', 'Visits', 'Pages', 'Hits', 'Bandwidth', 'Robot Bandwidth'])

        days_stats = self.current_analysis['days_stats']
        keys = sorted(days_stats.keys())
        nb_visits = 0
        for k in keys:
            stats = days_stats[k]
            row = [k, stats['nb_visitors'], stats['viewed_pages'], stats['viewed_hits'],
                   stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
            appendRowToTable(days, [str(v) for v in row])
            nb_visits += stats['nb_visitors']

        stats = self.current_analysis['month_stats']
        month_values = [nb_visits, stats['viewed_pages'], stats['viewed_hits'],
                        stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]

        nb_days = len(keys)
        if nb_days:
            # Floor division keeps the integer result on Python 2 and 3.
            average = [str(int(v // nb_days)) for v in month_values]
        else:
            average = ['0' for _ in month_values]
        appendRowToTable(days, ['Average'] + average)

        appendRowToTable(days, ['Total'] + [str(v) for v in month_values])
        appendBlockToPage(page, days)

    def _generateDisplay(self):
        """Build the day table, run the display hooks and write the pages."""
        self._generateDisplayDaysStat()
        self._callPlugins(DISPLAY_HOOK_DIRECTORY, self.current_analysis, self.display)
        buildPages(DISPLAY_ROOT, self.display)

    def _generateStats(self, visits):
        """Sum bandwidth, pages, hits and visitor count over ``visits``.

        Robots only contribute to ``not_viewed_bandwidth``. Visitors whose
        ``hit_only`` is set are not counted in ``nb_visitors``.
        """
        stats = {
            'viewed_bandwidth': 0,
            'not_viewed_bandwidth': 0,
            'viewed_pages': 0,
            'viewed_hits': 0,
            'nb_visitors': 0,
        }

        for super_hit in visits.values():
            if super_hit['robot']:
                stats['not_viewed_bandwidth'] += super_hit['bandwidth']
                continue

            if not super_hit['hit_only']:
                stats['nb_visitors'] += 1
            stats['viewed_bandwidth'] += super_hit['bandwidth']
            stats['viewed_pages'] += super_hit['viewed_pages']
            stats['viewed_hits'] += super_hit['viewed_hits']

        return stats

    def _generateMonthStats(self):
        """Compute the month statistics, run the post hooks, save and display."""
        self._clearDisplay()

        visits = self.current_analysis['visits']

        stats = self._generateStats(visits)

        cur_time = self.meta_infos['last_time']
        print("== Stats for %d/%d ==" % (cur_time.tm_year, cur_time.tm_mon))
        print(stats)

        self.valid_visitors = {k: v for (k, v) in visits.items() if not v['robot']}
        self._callPlugins(POST_HOOK_DIRECTORY, self.valid_visitors, stats)

        self.current_analysis['month_stats'] = stats

        path = self.getDBFilename(cur_time)
        if os.path.exists(path):
            os.remove(path)

        print("==> Serialize to %s" % path)

        self._serialize(self.current_analysis, path)

        self._generateDisplay()

    def _generateDayStats(self):
        """Run the pre hooks and store the statistics of the day just ended.

        ``visits`` accumulates over the whole month, so the figures of the
        day are the month-to-date totals minus every earlier day already
        stored for this month.
        """
        visits = self.current_analysis['visits']

        self._callPlugins(PRE_HOOK_DIRECTORY, visits)

        stats = self._generateStats(visits)

        cur_time = self.meta_infos['last_time']
        print("== Stats for %d/%d/%d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday))

        days_stats = self.current_analysis['days_stats']
        previous_days = [day for day in days_stats if day < cur_time.tm_mday]
        if previous_days:
            # Stored days are already per-day values: all of them have to be
            # subtracted, not only the most recent one.
            for k in stats:
                stats[k] -= sum(days_stats[day][k] for day in previous_days)
            # Visitors are counted from their last access rather than by
            # difference.
            stats['nb_visitors'] = 0
            for super_hit in visits.values():
                if super_hit['robot']:
                    continue
                if super_hit['last_access'].tm_mday == cur_time.tm_mday:
                    stats['nb_visitors'] += 1
        print(stats)

        days_stats[cur_time.tm_mday] = stats

    def _newHit(self, hit):
        """Process one parsed log line.

        Closes the current day or month when the hit belongs to a new one,
        then records the hit.

        Returns True when the caller should keep reading (hit recorded, or
        skipped because it was already analysed), False when the request of
        the hit could not be decoded.
        """
        self._decodeTime(hit)

        t = hit['time_decoded']

        cur_time = self.meta_infos['last_time']

        if cur_time is None:
            self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
            self.analyse_started = True
        else:
            if not self.analyse_started:
                if time.mktime(cur_time) >= time.mktime(t):
                    # Already analysed in a previous run: skip this line but
                    # keep reading (a falsy value would stop the caller).
                    return True
                self.analyse_started = True
            if (cur_time.tm_year, cur_time.tm_mon) != (t.tm_year, t.tm_mon):
                # Close the last day of the month first, as start() does at
                # the end of the log, so it gets its stats and the pre hooks.
                self._generateDayStats()
                self._generateMonthStats()
                self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
            elif cur_time.tm_mday != t.tm_mday:
                self._generateDayStats()

        self.meta_infos['last_time'] = t

        if not self._decodeHTTPRequest(hit):
            return False

        # '-' stands for an empty field in the log.
        for k in hit.keys():
            if hit[k] == '-':
                hit[k] = ''

        self._appendHit(hit)

        return True

    def start(self):
        """Load plugins and saved state, analyse the log, then output stats."""
        self._preloadPlugins()

        print('==> Analysing log')

        self.meta_infos = self._deserialize(META_PATH) or self._clearMeta()
        if self.meta_infos['last_time']:
            self.current_analysis = self._deserialize(self.getDBFilename(self.meta_infos['last_time'])) or self._clearVisits()
        else:
            self._clearVisits()

        with open(analyzed_filename) as f:
            for l in f:
                # print "line " + l

                groups = self.log_re.match(l)

                if groups:
                    # A request that cannot be decoded stops the analysis.
                    if not self._newHit(groups.groupdict()):
                        break
                else:
                    print("No match for " + l)

        if self.analyse_started:
            self._generateDayStats()
            self._generateMonthStats()
            self._serialize(self.meta_infos, META_PATH)
        else:
            print('==> Analyse not started : nothing to do')
            # Month stats need the time of a previous hit; without one there
            # is nothing to rebuild.
            if self.meta_infos['last_time'] is not None:
                self._generateMonthStats()


iwla = IWLA()
iwla.start()