#!/usr/bin/env python

import os
import shutil
import sys
import re
import time
import pickle
import gzip
import importlib
import argparse
from calendar import monthrange
from datetime import date

import default_conf as conf
import conf as _
# User configuration overrides the default one
conf.__dict__.update(_.__dict__)
del _

from iplugin import *
from display import *

#
# Main class IWLA
# Parse logs, compute statistics, call plugins and produce output
# For now, only HTTP logs are valid
#
# Plugin requirements :
#     None
#
# Conf values needed :
#     analyzed_filename
#     domain_name
#
# Output files :
#     DB_ROOT/meta.db
#     DB_ROOT/year/month/iwla.db
#     OUTPUT_ROOT/index.html
#     OUTPUT_ROOT/year/month/index.html
#
# Statistics creation :
#
# meta =>
#     last_time
#     start_analysis_time
#     stats =>
#         year =>
#             month =>
#                 viewed_bandwidth
#                 not_viewed_bandwidth
#                 viewed_pages
#                 viewed_hits
#                 nb_visitors
#
# month_stats :
#     viewed_bandwidth
#     not_viewed_bandwidth
#     viewed_pages
#     viewed_hits
#     nb_visitors
#
# days_stats :
#     day =>
#         viewed_bandwidth
#         not_viewed_bandwidth
#         viewed_pages
#         viewed_hits
#         nb_visitors
#
# visits :
#     remote_addr =>
#         remote_addr
#         remote_ip
#         viewed_pages
#         viewed_hits
#         not_viewed_pages
#         not_viewed_hits
#         bandwidth
#         last_access
#         requests =>
#             [fields_from_format_log]
#             extract_request =>
#                 extract_uri
#                 extract_parameters*
#             extract_referer* =>
#                 extract_uri
#                 extract_parameters*
#         robot
#         hit_only
#         is_page
#
# valid_visitors:
#     month_stats without robot and hit only visitors (if not conf.count_hit_only_visitors)
#
# Statistics update :
#     None
#
# Statistics deletion :
#     None
#

class IWLA(object):
    """Parse a log, aggregate per-visitor statistics, run the plugins and
    build the output pages.
    """

    ANALYSIS_CLASS = 'HTTP'
    API_VERSION = 1
    IWLA_VERSION = '0.1'

    def __init__(self):
        print('==> Start')

        self.meta_infos = {}
        self.analyse_started = False
        self.current_analysis = {}
        self.cache_plugins = {}
        self.display = DisplayHTMLBuild(self)
        self.valid_visitors = None

        # Turn conf.log_format into a regular expression : escape every
        # character that is neither '$' nor a word character, then replace
        # each $name by a named group.
        self.log_format_extracted = re.sub(r'([^\$\w])', r'\\\g<1>', conf.log_format)
        self.log_format_extracted = re.sub(r'\$(\w+)', r'(?P<\g<1>>.+)', self.log_format_extracted)
        # NOTE(review): group names http_method/http_version are reconstructed;
        # only http_uri is read by this file -- confirm against plugins.
        self.http_request_extracted = re.compile(r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')
        self.log_re = re.compile(self.log_format_extracted)
        self.uri_re = re.compile(r'(?P<extract_uri>[^\?]+)(\?(?P<extract_parameters>.+))?')
        self.plugins = [(conf.PRE_HOOK_DIRECTORY, conf.pre_analysis_hooks),
                        (conf.POST_HOOK_DIRECTORY, conf.post_analysis_hooks),
                        (conf.DISPLAY_HOOK_DIRECTORY, conf.display_hooks)]

    def getVersion(self):
        """Return the IWLA version string."""
        return IWLA.IWLA_VERSION

    def getConfValue(self, key, default=None):
        """Return the configuration value named key, or default if it is not defined."""
        return getattr(conf, key, default)

    def _clearVisits(self):
        """Reset the current (monthly) analysis and return the new empty structure."""
        self.current_analysis = {
            'days_stats': {},
            'month_stats': {},
            'visits': {}
        }
        self.valid_visitors = None
        return self.current_analysis

    def getDaysStats(self):
        """Return the per-day statistics of the current analysis."""
        return self.current_analysis['days_stats']

    def getMonthStats(self):
        """Return the month statistics of the current analysis."""
        return self.current_analysis['month_stats']

    def getCurrentVisists(self):
        """Return the visits (indexed by remote_addr) of the current analysis."""
        return self.current_analysis['visits']

    def getValidVisitors(self):
        """Return the valid visitors computed by _generateMonthStats (None before that)."""
        return self.valid_visitors

    def getDisplay(self):
        """Return the display builder."""
        return self.display

    def getCurTime(self):
        """Return the decoded time of the last processed hit."""
        return self.meta_infos['last_time']

    def getStartAnalysisTime(self):
        """Return the decoded time of the first hit processed during this run."""
        return self.meta_infos['start_analysis_time']

    def isValidForCurrentAnalysis(self, request):
        """Return True if request is strictly more recent than the start of this analysis."""
        cur_time = self.meta_infos['start_analysis_time']
        # Analyse not started
        if not cur_time:
            return False
        return time.mktime(cur_time) < time.mktime(request['time_decoded'])

    def hasBeenViewed(self, request):
        """Return True if the HTTP status of request is one of conf.viewed_http_codes."""
        return int(request['status']) in conf.viewed_http_codes

    def getCurDisplayPath(self, filename):
        """Return the path year/month/filename for the month currently analysed."""
        cur_time = self.meta_infos['last_time']
        return os.path.join(str(cur_time.tm_year), str(cur_time.tm_mon), filename)

    def getResourcesPath(self):
        """Return conf.resources_path."""
        return conf.resources_path

    def getCSSPath(self):
        """Return conf.css_path."""
        return conf.css_path

    def _clearMeta(self):
        """Reset meta information and return the new empty structure."""
        self.meta_infos = {
            'last_time': None,
            'start_analysis_time': None
        }
        return self.meta_infos

    def _clearDisplay(self):
        """Replace the display builder by a fresh one and return it."""
        self.display = DisplayHTMLBuild(self)
        return self.display

    def getDBFilename(self, time):
        """Return the path of the monthly database matching the struct_time time."""
        return os.path.join(conf.DB_ROOT, str(time.tm_year), str(time.tm_mon), conf.DB_FILENAME)

    def _serialize(self, obj, filename):
        """Pickle obj into the gzip file filename, creating parent directories if needed."""
        base = os.path.dirname(filename)
        if base and not os.path.exists(base):
            os.makedirs(base)

        # Pickle straight into the gzip stream : no temporary file is left
        # behind if something goes wrong.
        with gzip.open(filename, 'wb') as fzip:
            pickle.dump(obj, fzip)

    def _deserialize(self, filename):
        """Load and return the object pickled in the gzip file filename.

        Return None if the file does not exist.
        """
        if not os.path.exists(filename):
            return None

        # NOTE: pickle must only be used on trusted files (here, databases
        # written by _serialize).
        with gzip.open(filename, 'rb') as f:
            return pickle.load(f)

    def _callPlugins(self, target_root, *args):
        """Call hook(*args) on every loaded plugin registered under target_root."""
        print('==> Call plugins (%s)' % target_root)
        for (root, plugins) in self.plugins:
            if root != target_root:
                continue
            for p in plugins:
                mod = self.cache_plugins.get(root + '.' + p, None)
                if mod:
                    print('\t%s' % (p))
                    mod.hook(*args)

    def isPage(self, request):
        """Return True if the URI request ends with one of conf.pages_extensions."""
        for e in conf.pages_extensions:
            if request.endswith(e):
                return True
        return False

    def _isValidVisitor(self, visitor):
        """Return True if visitor is not a robot and, unless hit only visitors
        are counted (conf.count_hit_only_visitors), has viewed at least one page.
        """
        if visitor['robot']:
            return False
        return bool(conf.count_hit_only_visitors or visitor['viewed_pages'])

    def _appendHit(self, hit):
        """Attach hit to its visitor (created on first sight) and update counters."""
        remote_addr = hit['remote_addr']

        if not remote_addr:
            return

        if remote_addr not in self.current_analysis['visits']:
            self._createVisitor(hit)

        super_hit = self.current_analysis['visits'][remote_addr]
        super_hit['requests'].append(hit)
        # '-' has been replaced by '' in _newHit : count it as 0 byte
        super_hit['bandwidth'] += int(hit['body_bytes_sent'] or 0)
        super_hit['last_access'] = self.meta_infos['last_time']

        request = hit['extract_request']
        uri = request.get('extract_uri', request['http_uri'])
        hit['is_page'] = self.isPage(uri)

        # Only viewed requests are counted
        if not self.hasBeenViewed(hit):
            return

        if super_hit['robot']:
            page_key = 'not_viewed_pages'
            hit_key = 'not_viewed_hits'
        else:
            page_key = 'viewed_pages'
            hit_key = 'viewed_hits'

        if hit['is_page']:
            super_hit[page_key] += 1
        else:
            super_hit[hit_key] += 1

    def _createVisitor(self, hit):
        """Create an empty visitor entry for hit['remote_addr']."""
        super_hit = self.current_analysis['visits'][hit['remote_addr']] = {}
        super_hit['remote_addr'] = hit['remote_addr']
        super_hit['remote_ip'] = hit['remote_addr']
        super_hit['viewed_pages'] = 0
        super_hit['viewed_hits'] = 0
        super_hit['not_viewed_pages'] = 0
        super_hit['not_viewed_hits'] = 0
        super_hit['bandwidth'] = 0
        super_hit['last_access'] = self.meta_infos['last_time']
        super_hit['requests'] = []
        super_hit['robot'] = False
        super_hit['hit_only'] = 0

    def _decodeHTTPRequest(self, hit):
        """Split hit['request'] (and the referer if any) into their components.

        Fill hit['extract_request'] and, when a referer is present,
        hit['extract_referer']. Return False if the request cannot be decoded.
        """
        if 'request' not in hit:
            return False

        groups = self.http_request_extracted.match(hit['request'])

        if groups:
            hit['extract_request'] = groups.groupdict()
            uri_groups = self.uri_re.match(hit['extract_request']['http_uri'])
            if uri_groups:
                d = uri_groups.groupdict()
                hit['extract_request']['extract_uri'] = d['extract_uri']
                if 'extract_parameters' in d:
                    hit['extract_request']['extract_parameters'] = d['extract_parameters']
        else:
            print("Bad request extraction " + hit['request'])
            return False

        # The log format may not contain any referer
        if hit.get('http_referer'):
            referer_groups = self.uri_re.match(hit['http_referer'])
            if referer_groups:
                hit['extract_referer'] = referer_groups.groupdict()
        return True

    def _decodeTime(self, hit):
        """Decode hit['time_local'] with conf.time_format into hit['time_decoded'] and return it.

        Raises ValueError if the time cannot be decoded.
        """
        try:
            hit['time_decoded'] = time.strptime(hit['time_local'], conf.time_format)
        except ValueError:
            if sys.version_info < (3, 2):
                # Try without UTC value at the end (%z not recognized)
                gmt_offset_str = hit['time_local'][-5:]
                gmt_offset_hours = int(gmt_offset_str[1:3])*60*60
                gmt_offset_minutes = int(gmt_offset_str[3:5])*60
                gmt_offset = gmt_offset_hours + gmt_offset_minutes
                hit['time_decoded'] = time.strptime(hit['time_local'][:-6], conf.time_format[:-3])
                if gmt_offset_str[0] == '+':
                    hit['time_decoded'] = time.localtime(time.mktime(hit['time_decoded'])+gmt_offset)
                else:
                    hit['time_decoded'] = time.localtime(time.mktime(hit['time_decoded'])-gmt_offset)
            else:
                raise
        return hit['time_decoded']

    def getDisplayIndex(self):
        """Return the index page of the month currently analysed."""
        filename = self.getCurDisplayPath('index.html')
        return self.display.getPage(filename)

    def _generateDisplayDaysStats(self):
        """Build the monthly page : one row per day, then average and total rows."""
        cur_time = self.meta_infos['last_time']
        title = 'Stats %d/%d' % (cur_time.tm_mon, cur_time.tm_year)
        filename = self.getCurDisplayPath('index.html')
        print('==> Generate display (%s)' % (filename))
        page = self.display.createPage(title, filename, conf.css_path)

        _, nb_month_days = monthrange(cur_time.tm_year, cur_time.tm_mon)
        days = self.display.createBlock(DisplayHTMLBlockTableWithGraph, 'By day',
                                        ['Day', 'Visitors', 'Pages', 'Hits', 'Bandwidth', 'Not viewed Bandwidth'],
                                        None, nb_month_days, list(range(1, 6)))
        days.setColsCSSClass(['', 'iwla_visitor', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth'])
        nb_visits = 0
        nb_days = 0
        for i in range(1, nb_month_days+1):
            # NOTE(review): '<br/>' is reconstructed (markup was lost) -- confirm
            day = '%d<br/>%s' % (i, time.strftime('%b', cur_time))
            full_day = '%d %s %d' % (i, time.strftime('%b', cur_time), cur_time.tm_year)
            if i in self.current_analysis['days_stats']:
                stats = self.current_analysis['days_stats'][i]
                row = [full_day, stats['nb_visitors'], stats['viewed_pages'], stats['viewed_hits'],
                       stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
                nb_visits += stats['nb_visitors']
                nb_days += 1
            else:
                row = [full_day, 0, 0, 0, 0, 0]
            days.appendRow(row)
            # Rows are 0 indexed while days start at 1
            days.setCellValue(i-1, 4, bytesToStr(row[4]))
            days.setCellValue(i-1, 5, bytesToStr(row[5]))
            days.appendShortTitle(day)
            adate = date(cur_time.tm_year, cur_time.tm_mon, i)
            week_day = adate.weekday()
            if week_day == 5 or week_day == 6:
                days.setRowCSSClass(i-1, 'iwla_weekend')
            if adate == date.today():
                # Read the class of the row that is updated below (i-1, not i)
                css = days.getCellCSSClass(i-1, 0)
                if css:
                    css = '%s %s' % (css, 'iwla_curday')
                else:
                    css = 'iwla_curday'
                days.setCellCSSClass(i-1, 0, css)

        stats = self.current_analysis['month_stats']

        row = [0, nb_visits, stats['viewed_pages'], stats['viewed_hits'],
               stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
        if nb_days:
            average_row = [int(v/nb_days) for v in row]
        else:
            average_row = [0 for v in row]

        average_row[0] = 'Average'
        average_row[4] = bytesToStr(average_row[4])
        average_row[5] = bytesToStr(average_row[5])
        days.appendRow(average_row)

        row[0] = 'Total'
        row[4] = bytesToStr(row[4])
        row[5] = bytesToStr(row[5])
        days.appendRow(row)
        page.appendBlock(days)
        self.display.addPage(page)

    def _generateDisplayMonthStats(self, page, year, month_stats):
        """Append to page the summary table of year (one row per month and a total row).

        month_stats is indexed by month number (1-12).
        """
        cur_time = time.localtime()
        months_name = ['', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'June', 'July', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
        title = 'Summary %d' % (year)
        cols = ['Month', 'Visitors', 'Pages', 'Hits', 'Bandwidth', 'Not viewed Bandwidth', 'Details']
        graph_cols = list(range(1, 6))
        months = self.display.createBlock(DisplayHTMLBlockTableWithGraph, title, cols, None, 12, graph_cols)
        months.setColsCSSClass(['', 'iwla_visitor', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth', ''])
        total = [0] * len(cols)
        for i in range(1, 13):
            # NOTE(review): '<br/>' and the link markup are reconstructed -- confirm
            month = '%s<br/>%d' % (months_name[i], year)
            full_month = '%s %d' % (months_name[i], year)
            if i in month_stats:
                stats = month_stats[i]
                # Same layout as getCurDisplayPath() : year/month/index.html
                link = '<a href="%d/%d/index.html">Details</a>' % (year, i)
                row = [full_month, stats['nb_visitors'], stats['viewed_pages'], stats['viewed_hits'],
                       stats['viewed_bandwidth'], stats['not_viewed_bandwidth'], link]
                for j in graph_cols:
                    total[j] += row[j]
            else:
                row = [full_month, 0, 0, 0, 0, 0, '']
            months.appendRow(row)
            months.setCellValue(i-1, 4, bytesToStr(row[4]))
            months.setCellValue(i-1, 5, bytesToStr(row[5]))
            months.appendShortTitle(month)
            if year == cur_time.tm_year and i == cur_time.tm_mon:
                css = months.getCellCSSClass(i-1, 0)
                if css:
                    css = '%s %s' % (css, 'iwla_curday')
                else:
                    css = 'iwla_curday'
                months.setCellCSSClass(i-1, 0, css)

        total[0] = 'Total'
        total[4] = bytesToStr(total[4])
        total[5] = bytesToStr(total[5])
        months.appendRow(total)
        page.appendBlock(months)

    def _generateDisplayWholeMonthStats(self):
        """Build the main index page : one summary table per year found in meta stats."""
        title = 'Stats for %s' % (conf.domain_name)
        filename = 'index.html'
        print('==> Generate main page (%s)' % (filename))

        page = self.display.createPage(title, filename, conf.css_path)

        # NOTE(review): '<br/>' is reconstructed (markup was lost) -- confirm
        last_update = 'Last update %s<br/>' % (time.strftime('%d %b %Y %H:%M', time.localtime()))
        page.appendBlock(self.display.createBlock(DisplayHTMLRaw, last_update))

        for year in self.meta_infos['stats'].keys():
            self._generateDisplayMonthStats(page, year, self.meta_infos['stats'][year])

        self.display.addPage(page)

    def _generateDisplay(self):
        """Generate all pages, call display plugins and write the result to conf.DISPLAY_ROOT."""
        self._generateDisplayDaysStats()
        self._callPlugins(conf.DISPLAY_HOOK_DIRECTORY)
        self._generateDisplayWholeMonthStats()
        self.display.build(conf.DISPLAY_ROOT)

    def _generateStats(self, visits):
        """Return the statistics (bandwidth, pages, hits, visitors) summed over visits.

        Robots only contribute to not_viewed_bandwidth.
        """
        stats = {}
        stats['viewed_bandwidth'] = 0
        stats['not_viewed_bandwidth'] = 0
        stats['viewed_pages'] = 0
        stats['viewed_hits'] = 0
        stats['nb_visitors'] = 0

        for (k, super_hit) in visits.items():
            if super_hit['robot']:
                stats['not_viewed_bandwidth'] += super_hit['bandwidth']
                continue

            if conf.count_hit_only_visitors or\
               super_hit['viewed_pages']:
                stats['nb_visitors'] += 1
            stats['viewed_bandwidth'] += super_hit['bandwidth']
            stats['viewed_pages'] += super_hit['viewed_pages']
            stats['viewed_hits'] += super_hit['viewed_hits']

        return stats

    def _generateMonthStats(self):
        """Compute month statistics, call post analysis plugins, save the
        monthly database and regenerate the display.
        """
        self._clearDisplay()

        visits = self.current_analysis['visits']

        stats = self._generateStats(visits)
        duplicated_stats = {k: v for (k, v) in stats.items()}

        cur_time = self.meta_infos['last_time']
        print("== Stats for %d/%d ==" % (cur_time.tm_year, cur_time.tm_mon))
        print(stats)

        if 'month_stats' not in self.current_analysis:
            self.current_analysis['month_stats'] = stats
        else:
            for (k, v) in stats.items():
                self.current_analysis['month_stats'][k] = v

        # Valid visitors : no robot and, if hit only visitors are not
        # counted, at least one viewed page (same rule as _generateStats)
        self.valid_visitors = {}
        for (k, v) in visits.items():
            if self._isValidVisitor(v):
                self.valid_visitors[k] = v

        duplicated_stats['visitors'] = stats['visitors'] = len(self.valid_visitors.keys())

        self._callPlugins(conf.POST_HOOK_DIRECTORY)

        path = self.getDBFilename(cur_time)
        if os.path.exists(path):
            os.remove(path)

        print("==> Serialize to %s" % path)
        self._serialize(self.current_analysis, path)

        # Save month stats
        year = cur_time.tm_year
        month = cur_time.tm_mon
        if 'stats' not in self.meta_infos:
            self.meta_infos['stats'] = {}
        if year not in self.meta_infos['stats']:
            self.meta_infos['stats'][year] = {}
        self.meta_infos['stats'][year][month] = duplicated_stats

        self._generateDisplay()

    def _generateDayStats(self):
        """Call pre analysis plugins and store the statistics of the current day."""
        visits = self.current_analysis['visits']

        self._callPlugins(conf.PRE_HOOK_DIRECTORY)

        stats = self._generateStats(visits)

        cur_time = self.meta_infos['last_time']
        print("== Stats for %d/%d/%d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday))

        # Visits accumulate over the whole month, so the values computed
        # above are month-to-date totals : remove what has already been
        # attributed to every previous day to keep only today's share.
        for (day, day_stats) in self.current_analysis['days_stats'].items():
            if day >= cur_time.tm_mday:
                continue
            for k in list(stats):
                stats[k] -= day_stats.get(k, 0)

        # Visitors of the day are those whose last access is today
        stats['nb_visitors'] = 0
        for (k, v) in visits.items():
            if not self._isValidVisitor(v):
                continue
            if v['last_access'].tm_mday == cur_time.tm_mday:
                stats['nb_visitors'] += 1
        print(stats)

        self.current_analysis['days_stats'][cur_time.tm_mday] = stats

    def _newHit(self, hit):
        """Process one parsed log line.

        Return False if the line has been skipped (already analysed during a
        previous run or request not decodable), True otherwise.
        """
        t = self._decodeTime(hit)

        cur_time = self.meta_infos['last_time']

        if cur_time is None:
            self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
            self.analyse_started = True
        else:
            if not self.analyse_started:
                # Skip lines older than the last analysed one
                if time.mktime(t) < time.mktime(cur_time):
                    return False
                else:
                    self.analyse_started = True
            if (cur_time.tm_year, cur_time.tm_mon) != (t.tm_year, t.tm_mon):
                # Close the last day of the finished month before closing
                # the month itself (same sequence as at the end of start())
                self._generateDayStats()
                self._generateMonthStats()
                self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
            elif cur_time.tm_mday != t.tm_mday:
                self._generateDayStats()

        self.meta_infos['last_time'] = t

        if not self.meta_infos['start_analysis_time']:
            self.meta_infos['start_analysis_time'] = t

        if not self._decodeHTTPRequest(hit):
            return False

        # '-' and '*' mean "no value"
        for k in hit.keys():
            if hit[k] == '-' or hit[k] == '*':
                hit[k] = ''

        self._appendHit(hit)

        return True

    def start(self, _file):
        """Analyse every line of _file (any iterable of log lines), then
        generate statistics, databases and display.
        """
        print('==> Load previous database')

        self.meta_infos = self._deserialize(conf.META_PATH) or self._clearMeta()
        if self.meta_infos['last_time']:
            self.current_analysis = self._deserialize(self.getDBFilename(self.meta_infos['last_time'])) or self._clearVisits()
        else:
            self._clearVisits()

        self.meta_infos['start_analysis_time'] = None

        self.cache_plugins = preloadPlugins(self.plugins, self)

        print('==> Analysing log')

        for l in _file:
            groups = self.log_re.match(l)

            if groups:
                # A skipped line (too old or malformed) must not stop the
                # analysis of the following ones
                if not self._newHit(groups.groupdict()):
                    continue
            else:
                print("No match for " + l)

        if self.analyse_started:
            self._generateDayStats()
            self._generateMonthStats()
            del self.meta_infos['start_analysis_time']
            self._serialize(self.meta_infos, conf.META_PATH)
        else:
            print('==> Analyse not started : nothing to do')
            # Display can only be rebuilt from a previous database
            if self.meta_infos['last_time']:
                self._generateMonthStats()

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Intelligent Web Log Analyzer')

    parser.add_argument('-c', '--clean-output', dest='clean_output', action='store_true',
                        default=False,
                        help='Clean output before starting')

    parser.add_argument('-i', '--stdin', dest='stdin', action='store_true',
                        default=False,
                        help='Read data from stdin instead of conf.analyzed_filename')

    args = parser.parse_args()

    if args.clean_output:
        if os.path.exists(conf.DB_ROOT):
            shutil.rmtree(conf.DB_ROOT)
        if os.path.exists(conf.DISPLAY_ROOT):
            shutil.rmtree(conf.DISPLAY_ROOT)

    iwla = IWLA()

    required_conf = ['analyzed_filename', 'domain_name']
    if not validConfRequirements(required_conf, iwla, 'Main Conf'):
        sys.exit(0)

    if args.stdin:
        iwla.start(sys.stdin)
    else:
        if not os.path.exists(conf.analyzed_filename):
            print('No such file \'%s\'' % (conf.analyzed_filename))
            sys.exit(-1)

        with open(conf.analyzed_filename) as f:
            iwla.start(f)