#!/usr/bin/env python

import os
import shutil
import sys
import re
import time
import pickle
import gzip
import importlib
import argparse
import logging
import gettext
from calendar import monthrange
from datetime import date

# User configuration (conf) overrides the defaults (default_conf).
import default_conf as conf
import conf as _
conf.__dict__.update(_.__dict__)
del _

from iplugin import *
from display import *

#
# Main class IWLA
# Parse logs, compute statistics, call plugins and produce output
# For now, only HTTP logs are valid
#
# Plugin requirements :
#     None
#
# Conf values needed :
#     analyzed_filename
#     domain_name
#     locales_path
#     compress_output_files*
#
# Output files :
#     DB_ROOT/meta.db
#     DB_ROOT/year/month/iwla.db
#     OUTPUT_ROOT/index.html
#     OUTPUT_ROOT/year/month/index.html
#
# Statistics creation :
#
# meta :
#     last_time
#     start_analysis_time
#     stats =>
#         year =>
#             month =>
#                 viewed_bandwidth
#                 not_viewed_bandwidth
#                 viewed_pages
#                 viewed_hits
#                 nb_visits
#                 nb_visitors
#
# month_stats :
#     viewed_bandwidth
#     not_viewed_bandwidth
#     viewed_pages
#     viewed_hits
#     nb_visits
#
# days_stats :
#     day =>
#         viewed_bandwidth
#         not_viewed_bandwidth
#         viewed_pages
#         viewed_hits
#         nb_visits
#         nb_visitors
#
# visits :
#     remote_addr =>
#         remote_addr
#         remote_ip
#         viewed_pages
#         viewed_hits
#         not_viewed_pages
#         not_viewed_hits
#         bandwidth
#         last_access
#         requests =>
#             [fields_from_format_log]
#             extract_request =>
#                 extract_uri
#                 extract_parameters*
#             extract_referer* =>
#                 extract_uri
#                 extract_parameters*
#         robot
#         hit_only
#         is_page
#
# valid_visitors:
#     month_stats without robot and hit only visitors (if not conf.count_hit_only_visitors)
#
# Statistics update :
#     None
#
# Statistics deletion :
#     None
#


class IWLA(object):
    """Log analyzer: parses log lines, aggregates per-day and per-month
    statistics, runs the configured plugin hooks and builds the HTML output.

    State is kept in two dictionaries:
      * ``meta_infos``       -- 'last_time', 'start_analysis_time' and the
                                per-year/per-month 'stats' summary;
      * ``current_analysis`` -- 'days_stats', 'month_stats' and 'visits' for
                                the month currently being analysed.
    """

    ANALYSIS_CLASS = 'HTTP'
    API_VERSION = 1
    IWLA_VERSION = '0.1'

    def __init__(self, logLevel):
        """Compile the parsing regexes, configure logging and load the locale.

        logLevel -- numeric logging level handed to logging.basicConfig().
        """
        self.meta_infos = {}
        self.analyse_started = False
        self.current_analysis = {}
        self.cache_plugins = {}
        self.display = DisplayHTMLBuild(self)
        self.valid_visitors = None

        # Turn conf.log_format into a regex: escape every character that is
        # neither '$' nor a word character, then replace each $name with a
        # named group.
        self.log_format_extracted = re.sub(r'([^\$\w])', r'\\\g<1>', conf.log_format)
        self.log_format_extracted = re.sub(r'\$(\w+)', r'(?P<\g<1>>.+)', self.log_format_extracted)
        # 'http_uri' is read by _decodeHTTPRequest()/_appendHit().
        # NOTE(review): the first and third group names (http_method,
        # http_version) are not referenced in this file -- confirm that
        # plugins expect these names.
        self.http_request_extracted = re.compile(r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')
        self.log_re = re.compile(self.log_format_extracted)
        self.uri_re = re.compile(r'(?P<extract_uri>[^\?]+)(\?(?P<extract_parameters>.+))?')
        # NOTE(review): conf.domain_name is inserted unescaped, so it is
        # interpreted as a regular expression ('.' matches any character).
        self.domain_name_re = re.compile(r'.*%s' % conf.domain_name)

        self.plugins = [(conf.PRE_HOOK_DIRECTORY, conf.pre_analysis_hooks),
                        (conf.POST_HOOK_DIRECTORY, conf.post_analysis_hooks),
                        (conf.DISPLAY_HOOK_DIRECTORY, conf.display_hooks)]

        logging.basicConfig(format='%(name)s %(message)s', level=logLevel)
        self.logger = logging.getLogger(self.__class__.__name__)
        self.logger.info('==> Start')

        try:
            t = gettext.translation('iwla', localedir=conf.locales_path,
                                    languages=[conf.locale], codeset='utf8')
            self.logger.info('\tUsing locale %s', conf.locale)
        except IOError:
            t = gettext.NullTranslations()
            self.logger.info('\tUsing default locale en_EN')
        # ugettext only exists on Python 2 translation objects.
        self._ = getattr(t, 'ugettext', t.gettext)

    def getVersion(self):
        """Return the IWLA version string."""
        return IWLA.IWLA_VERSION

    def getConfValue(self, key, default=None):
        """Return the configuration value named key, or default if unset."""
        return conf.__dict__.get(key, default)

    def _clearVisits(self):
        """Reset the current month analysis and return the new empty dict."""
        self.current_analysis = {
            'days_stats': {},
            'month_stats': {},
            'visits': {},
        }
        self.valid_visitors = None
        return self.current_analysis

    def getDaysStats(self):
        """Return the per-day statistics of the current analysis."""
        return self.current_analysis['days_stats']

    def getMonthStats(self):
        """Return the month statistics of the current analysis."""
        return self.current_analysis['month_stats']

    def getCurrentVisists(self):
        """Return the visits dictionary (keyed by remote address)."""
        return self.current_analysis['visits']

    def getValidVisitors(self):
        """Return the visitors kept by _generateMonthStats() (None before)."""
        return self.valid_visitors

    def getDisplay(self):
        """Return the display builder."""
        return self.display

    def getCurTime(self):
        """Return the time of the last accepted hit."""
        return self.meta_infos['last_time']

    def getStartAnalysisTime(self):
        """Return the time of the first hit accepted during this run."""
        return self.meta_infos['start_analysis_time']

    def isValidForCurrentAnalysis(self, request):
        """Return True if request is more recent than the start of this run."""
        cur_time = self.meta_infos['start_analysis_time']
        # Analyse not started
        if not cur_time:
            return False
        return time.mktime(cur_time) < time.mktime(request['time_decoded'])

    def hasBeenViewed(self, request):
        """Return True if the request status is in conf.viewed_http_codes."""
        return int(request['status']) in conf.viewed_http_codes

    def getCurDisplayPath(self, filename):
        """Return 'year/MM/filename' for the month of the last accepted hit."""
        cur_time = self.meta_infos['last_time']
        return os.path.join(str(cur_time.tm_year), '%02d' % (cur_time.tm_mon), filename)

    def getResourcesPath(self):
        """Return conf.resources_path."""
        return conf.resources_path

    def getCSSPath(self):
        """Return conf.css_path."""
        return conf.css_path

    def _clearMeta(self):
        """Reset meta information and return the new dict."""
        self.meta_infos = {
            'last_time': None,
            'start_analysis_time': None,
        }
        return self.meta_infos

    def _clearDisplay(self):
        """Replace the display builder with a fresh one and return it."""
        self.display = DisplayHTMLBuild(self)
        return self.display

    def getDBFilename(self, time):
        """Return the month database path (DB_ROOT/year/MM/DB_FILENAME)."""
        return os.path.join(conf.DB_ROOT, str(time.tm_year), '%02d' % (time.tm_mon), conf.DB_FILENAME)

    def _serialize(self, obj, filename):
        """Pickle obj and store it gzip-compressed in filename.

        The pickle is first written to 'filename.tmp'; that temporary file is
        removed even if pickling or compression fails.
        """
        base = os.path.dirname(filename)
        if base and not os.path.exists(base):
            os.makedirs(base)

        tmp_filename = filename + '.tmp'
        try:
            with open(tmp_filename, 'wb+') as f:
                pickle.dump(obj, f)
                f.seek(0)
                with gzip.open(filename, 'wb') as fzip:
                    fzip.write(f.read())
        finally:
            if os.path.exists(tmp_filename):
                os.remove(tmp_filename)

    def _deserialize(self, filename):
        """Load a gzip-compressed pickle; return None if the file is absent.

        SECURITY: pickle.load() can execute arbitrary code. Only load
        database files written by this program.
        """
        if not os.path.exists(filename):
            return None

        with gzip.open(filename, 'rb') as f:
            return pickle.load(f)

    def _callPlugins(self, target_root, *args):
        """Call hook(*args) on every preloaded plugin of target_root."""
        self.logger.info('==> Call plugins (%s)', target_root)
        for (root, plugins) in self.plugins:
            if root != target_root:
                continue
            for p in plugins:
                mod = self.cache_plugins.get(root + '.' + p, None)
                if mod:
                    self.logger.info('\t%s', p)
                    mod.hook(*args)

    def isPage(self, request):
        """Return True if request ends with one of conf.pages_extensions."""
        return any(request.endswith(e) for e in conf.pages_extensions)

    def _appendHit(self, hit):
        """Attach hit to its visitor (created on first sight) and update the
        visitor's bandwidth, last access and page/hit counters."""
        remote_addr = hit['remote_addr']

        if not remote_addr:
            return

        if remote_addr not in self.current_analysis['visits']:
            self._createVisitor(hit)

        super_hit = self.current_analysis['visits'][remote_addr]
        super_hit['requests'].append(hit)
        super_hit['bandwidth'] += int(hit['body_bytes_sent'])
        super_hit['last_access'] = self.meta_infos['last_time']

        request = hit['extract_request']
        uri = request.get('extract_uri', request['http_uri'])
        hit['is_page'] = self.isPage(uri)

        if super_hit['robot'] or not self.hasBeenViewed(hit):
            page_key = 'not_viewed_pages'
            hit_key = 'not_viewed_hits'
        else:
            page_key = 'viewed_pages'
            hit_key = 'viewed_hits'

        if hit['is_page']:
            super_hit[page_key] += 1
        else:
            super_hit[hit_key] += 1

    def _createVisitor(self, hit):
        """Register an empty visitor entry for hit['remote_addr']."""
        self.current_analysis['visits'][hit['remote_addr']] = {
            'remote_addr': hit['remote_addr'],
            'remote_ip': hit['remote_addr'],
            'viewed_pages': 0,
            'viewed_hits': 0,
            'not_viewed_pages': 0,
            'not_viewed_hits': 0,
            'bandwidth': 0,
            'last_access': self.meta_infos['last_time'],
            'requests': [],
            'robot': False,
            'hit_only': 0,
        }

    def _decodeHTTPRequest(self, hit):
        """Split hit['request'] into hit['extract_request'] and, when a
        referer is present, hit['http_referer'] into hit['extract_referer'].

        Returns False if there is no 'request' field or it cannot be parsed.
        """
        if 'request' not in hit:
            return False

        groups = self.http_request_extracted.match(hit['request'])
        if not groups:
            self.logger.warning("Bad request extraction %s", hit['request'])
            return False

        hit['extract_request'] = groups.groupdict()
        uri_groups = self.uri_re.match(hit['extract_request']['http_uri'])
        if uri_groups:
            d = uri_groups.groupdict()
            hit['extract_request']['extract_uri'] = d['extract_uri']
            # groupdict() always holds the key; the value is None when the
            # URI has no query string.
            if 'extract_parameters' in d:
                hit['extract_request']['extract_parameters'] = d['extract_parameters']

        # The log format may not define $http_referer at all.
        referer = hit.get('http_referer')
        if referer:
            referer_groups = self.uri_re.match(referer)
            if referer_groups:
                hit['extract_referer'] = referer_groups.groupdict()
        return True

    def _decodeTime(self, hit):
        """Parse hit['time_local'] with conf.time_format, store the result in
        hit['time_decoded'] and return it.

        Before Python 3.2 a failed parse is retried without the trailing UTC
        offset (%z not recognized); the offset is then applied by hand.
        """
        try:
            hit['time_decoded'] = time.strptime(hit['time_local'], conf.time_format)
        except ValueError:
            if sys.version_info >= (3, 2):
                raise
            # Try without UTC value at the end (%z not recognized)
            gmt_offset_str = hit['time_local'][-5:]
            gmt_offset_hours = int(gmt_offset_str[1:3]) * 60 * 60
            gmt_offset_minutes = int(gmt_offset_str[3:5]) * 60
            gmt_offset = gmt_offset_hours + gmt_offset_minutes
            hit['time_decoded'] = time.strptime(hit['time_local'][:-6], conf.time_format[:-3])
            if gmt_offset_str[0] == '+':
                hit['time_decoded'] = time.localtime(time.mktime(hit['time_decoded']) + gmt_offset)
            else:
                hit['time_decoded'] = time.localtime(time.mktime(hit['time_decoded']) - gmt_offset)
        return hit['time_decoded']

    def getDisplayIndex(self):
        """Return the index page of the month currently analysed."""
        filename = self.getCurDisplayPath('index.html')
        return self.display.getPage(filename)

    def _generateDisplayDaysStats(self):
        """Build the month page: one row per day, then Average and Total."""
        cur_time = self.meta_infos['last_time']
        title = '%s %d/%02d' % (self._('Statistics'), cur_time.tm_year, cur_time.tm_mon)
        filename = self.getCurDisplayPath('index.html')
        self.logger.info('==> Generate display (%s)', filename)
        page = self.display.createPage(title, filename, conf.css_path)

        _, nb_month_days = monthrange(cur_time.tm_year, cur_time.tm_mon)
        days = self.display.createBlock(
            DisplayHTMLBlockTableWithGraph, self._('By day'),
            [self._('Day'), self._('Visits'), self._('Pages'), self._('Hits'),
             self._('Bandwidth'), self._('Not viewed Bandwidth')],
            None, nb_month_days, range(1, 6))
        days.setColsCSSClass(['', 'iwla_visit', 'iwla_page', 'iwla_hit',
                              'iwla_bandwidth', 'iwla_bandwidth'])
        nb_visits = 0
        nb_days = 0
        days_stats = self.current_analysis['days_stats']
        month_abbr = time.strftime('%b', cur_time)
        today = date.today()
        for i in range(1, nb_month_days + 1):
            # NOTE(review): the separator was lost in the source (a raw line
            # break inside the literal); '<br/>' is assumed -- confirm.
            day = '%d<br/>%s' % (i, month_abbr)
            full_day = '%02d %s %d' % (i, month_abbr, cur_time.tm_year)
            if i in days_stats:
                stats = days_stats[i]
                row = [full_day, stats['nb_visits'], stats['viewed_pages'], stats['viewed_hits'],
                       stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
                nb_visits += stats['nb_visits']
                nb_days += 1
            else:
                row = [full_day, 0, 0, 0, 0, 0]
            days.appendRow(row)
            days.setCellValue(i - 1, 4, bytesToStr(row[4]))
            days.setCellValue(i - 1, 5, bytesToStr(row[5]))
            days.appendShortTitle(day)
            adate = date(cur_time.tm_year, cur_time.tm_mon, i)
            if adate.weekday() in (5, 6):
                days.setRowCSSClass(i - 1, 'iwla_weekend')
            if adate == today:
                css = days.getCellCSSClass(i - 1, 0)
                if css:
                    css = '%s %s' % (css, 'iwla_curday')
                else:
                    css = 'iwla_curday'
                days.setCellCSSClass(i - 1, 0, css)

        stats = self.current_analysis['month_stats']

        row = [0, nb_visits, stats['viewed_pages'], stats['viewed_hits'],
               stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
        # Averages are taken over the days that have statistics. A real list
        # is required because cells are assigned by index below.
        if nb_days:
            average_row = [int(v / nb_days) for v in row]
        else:
            average_row = [0] * len(row)

        average_row[0] = self._('Average')
        average_row[4] = bytesToStr(average_row[4])
        average_row[5] = bytesToStr(average_row[5])
        days.appendRow(average_row)

        row[0] = self._('Total')
        row[4] = bytesToStr(row[4])
        row[5] = bytesToStr(row[5])
        days.appendRow(row)
        page.appendBlock(days)
        self.display.addPage(page)

    def _generateDisplayMonthStats(self, page, year, month_stats):
        """Append to page the summary table of year (one row per month, then
        a Total row) built from month_stats (month number => stats)."""
        cur_time = time.localtime()
        months_name = ['', self._('Jan'), self._('Feb'), self._('Mar'), self._('Apr'),
                       self._('May'), self._('June'), self._('July'), self._('Aug'),
                       self._('Sep'), self._('Oct'), self._('Nov'), self._('Dec')]
        title = '%s %d' % (self._('Summary'), year)
        cols = [self._('Month'), self._('Visitors'), self._('Visits'), self._('Pages'),
                self._('Hits'), self._('Bandwidth'), self._('Not viewed Bandwidth'),
                self._('Details')]
        graph_cols = range(1, 7)
        months = self.display.createBlock(DisplayHTMLBlockTableWithGraph, title, cols,
                                          None, 12, graph_cols)
        months.setColsCSSClass(['', 'iwla_visitor', 'iwla_visit', 'iwla_page', 'iwla_hit',
                                'iwla_bandwidth', 'iwla_bandwidth', ''])
        total = [0] * len(cols)
        for i in range(1, 13):
            # NOTE(review): separator lost in the source; '<br/>' assumed.
            month = '%s<br/>%d' % (months_name[i], year)
            full_month = '%s %d' % (months_name[i], year)
            if i in month_stats:
                stats = month_stats[i]
                # Same layout as getCurDisplayPath(): year/MM/index.html.
                # NOTE(review): the markup was stripped from the source (one
                # placeholder for three arguments); anchor reconstructed.
                link = '<a href="%d/%02d/index.html">%s</a>' % (year, i, self._('Details'))
                row = [full_month, stats['nb_visitors'], stats['nb_visits'],
                       stats['viewed_pages'], stats['viewed_hits'],
                       stats['viewed_bandwidth'], stats['not_viewed_bandwidth'], link]
                for j in graph_cols:
                    total[j] += row[j]
            else:
                row = [full_month, 0, 0, 0, 0, 0, 0, '']
            months.appendRow(row)
            months.setCellValue(i - 1, 5, bytesToStr(row[5]))
            months.setCellValue(i - 1, 6, bytesToStr(row[6]))
            months.appendShortTitle(month)
            if year == cur_time.tm_year and i == cur_time.tm_mon:
                css = months.getCellCSSClass(i - 1, 0)
                if css:
                    css = '%s %s' % (css, 'iwla_curday')
                else:
                    css = 'iwla_curday'
                months.setCellCSSClass(i - 1, 0, css)

        total[0] = self._('Total')
        total[5] = bytesToStr(total[5])
        total[6] = bytesToStr(total[6])
        months.appendRow(total)
        page.appendBlock(months)

    def _generateDisplayWholeMonthStats(self):
        """Build the main index page: one summary table per year, most
        recent year first."""
        title = '%s %s' % (self._('Statistics for'), conf.domain_name)
        filename = 'index.html'
        self.logger.info('==> Generate main page (%s)', filename)

        page = self.display.createPage(title, filename, conf.css_path)

        # '%d' is already zero-padded; the previous '%02d' directive is a
        # platform extension, not a standard strftime directive.
        # NOTE(review): trailing markup lost in the source; '<br/>' assumed.
        last_update = '%s %s<br/>' % (self._('Last update'),
                                      time.strftime('%d %b %Y %H:%M', time.localtime()))
        page.appendBlock(self.display.createBlock(DisplayHTMLRaw, last_update))

        for year in sorted(self.meta_infos['stats'].keys(), reverse=True):
            self._generateDisplayMonthStats(page, year, self.meta_infos['stats'][year])

        self.display.addPage(page)

    def _compressFile(self, build_time, root, filename):
        """Write root/filename.gz if it is missing or if the source file was
        modified at or after build_time (seconds since the epoch)."""
        path = os.path.join(root, filename)
        gz_path = path + '.gz'

        self.logger.debug('Compress %s => %s', path, gz_path)

        # '>=' because build_time is truncated to whole seconds and some
        # filesystems store mtime with coarse granularity.
        if not os.path.exists(gz_path) or os.stat(path).st_mtime >= build_time:
            with open(path, 'rb') as f_in:
                with gzip.open(gz_path, 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)

    def _compressFiles(self, build_time, root):
        """Compress every file under root whose name ends with one of
        conf.compress_output_files (nothing is done if that list is empty)."""
        if not conf.compress_output_files:
            return
        for rootdir, subdirs, files in os.walk(root, followlinks=True):
            for f in files:
                for ext in conf.compress_output_files:
                    if f.endswith(ext):
                        self._compressFile(build_time, rootdir, f)
                        break

    def _generateDisplay(self):
        """Generate all pages, run display hooks, write and compress output."""
        self._generateDisplayDaysStats()
        self._callPlugins(conf.DISPLAY_HOOK_DIRECTORY)
        self._generateDisplayWholeMonthStats()
        # Must be a number comparable with st_mtime: the former struct_time
        # never compared as older than a float on Python 2, so existing .gz
        # files were never refreshed.
        build_time = int(time.time())
        self.display.build(conf.DISPLAY_ROOT)
        self._compressFiles(build_time, conf.DISPLAY_ROOT)

    def _createEmptyStats(self):
        """Return a statistics dict with every counter set to 0."""
        return {
            'viewed_bandwidth': 0,
            'not_viewed_bandwidth': 0,
            'viewed_pages': 0,
            'viewed_hits': 0,
            'nb_visits': 0,
        }

    def _generateMonthStats(self):
        """Sum the day statistics into the month statistics, select valid
        visitors, run post-analysis hooks, save the month database, update
        the meta summary and generate the display."""
        self._clearDisplay()

        visits = self.current_analysis['visits']

        stats = self._createEmptyStats()
        for stat in self.current_analysis['days_stats'].values():
            for k in stats:
                stats[k] += stat[k]

        # Independent copy stored in meta_infos['stats'].
        duplicated_stats = dict(stats)

        cur_time = self.meta_infos['last_time']
        self.logger.info("== Stats for %d/%02d ==", cur_time.tm_year, cur_time.tm_mon)
        self.logger.info(stats)

        if 'month_stats' not in self.current_analysis:
            self.current_analysis['month_stats'] = stats
        else:
            for (k, v) in stats.items():
                self.current_analysis['month_stats'][k] = v

        self.valid_visitors = {}
        for (k, v) in visits.items():
            if v['robot']:
                continue
            if not (conf.count_hit_only_visitors or v['viewed_pages']):
                continue
            self.valid_visitors[k] = v

        duplicated_stats['nb_visitors'] = stats['nb_visitors'] = len(self.valid_visitors)

        self._callPlugins(conf.POST_HOOK_DIRECTORY)

        path = self.getDBFilename(cur_time)
        if os.path.exists(path):
            os.remove(path)

        self.logger.info("==> Serialize to %s", path)
        self._serialize(self.current_analysis, path)

        # Save month stats
        year = cur_time.tm_year
        month = cur_time.tm_mon
        if 'stats' not in self.meta_infos:
            self.meta_infos['stats'] = {}
        if year not in self.meta_infos['stats']:
            self.meta_infos['stats'][year] = {}
        self.meta_infos['stats'][year][month] = duplicated_stats

        self._generateDisplay()

    def _generateDayStats(self):
        """Run pre-analysis hooks, then compute and store the statistics of
        the day of the last accepted hit."""
        visits = self.current_analysis['visits']
        cur_time = self.meta_infos['last_time']

        self._callPlugins(conf.PRE_HOOK_DIRECTORY)

        stats = self._createEmptyStats()

        for super_hit in visits.values():
            if super_hit['last_access'].tm_mday != cur_time.tm_mday:
                continue
            # Reset for every visitor. The flag used to be initialised under
            # a different name ('viewed_page'), so the test below either
            # raised NameError or reused the previous visitor's value.
            viewed_pages = False
            # Requests are appended chronologically: walk backwards and stop
            # at the first one from another day.
            for hit in reversed(super_hit['requests']):
                if hit['time_decoded'].tm_mday != cur_time.tm_mday:
                    break
                if super_hit['robot'] or not self.hasBeenViewed(hit):
                    stats['not_viewed_bandwidth'] += int(hit['body_bytes_sent'])
                    continue
                stats['viewed_bandwidth'] += int(hit['body_bytes_sent'])
                if hit['is_page']:
                    stats['viewed_pages'] += 1
                    viewed_pages = True
                else:
                    stats['viewed_hits'] += 1
            if (conf.count_hit_only_visitors or viewed_pages) and not super_hit['robot']:
                stats['nb_visits'] += 1

        self.logger.info("== Stats for %d/%02d/%02d ==",
                         cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday)
        self.logger.info(stats)

        self.current_analysis['days_stats'][cur_time.tm_mday] = stats

    def _newHit(self, hit):
        """Process one parsed log line (dict of the log format fields).

        Returns True if the hit has been added to the current analysis,
        False if it was ignored (other domain, not newer than the last
        analysed hit, or undecodable request).
        """
        if not self.domain_name_re.match(hit['server_name']):
            return False

        t = self._decodeTime(hit)

        cur_time = self.meta_infos['last_time']

        if cur_time is None:
            self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
            self.analyse_started = True
        else:
            if time.mktime(t) <= time.mktime(cur_time):
                return False
            self.analyse_started = True
            if (cur_time.tm_year, cur_time.tm_mon) != (t.tm_year, t.tm_mon):
                # Close the last day before the month, as start() does at the
                # end of the log; otherwise that day is missing from the
                # month totals.
                self._generateDayStats()
                self._generateMonthStats()
                self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
            elif cur_time.tm_mday != t.tm_mday:
                self._generateDayStats()

        self.meta_infos['last_time'] = t

        if not self.meta_infos['start_analysis_time']:
            self.meta_infos['start_analysis_time'] = t

        if not self._decodeHTTPRequest(hit):
            return False

        # '-' and '*' are treated as empty values.
        for (k, v) in list(hit.items()):
            if v == '-' or v == '*':
                hit[k] = ''

        self._appendHit(hit)

        return True

    def start(self, _file):
        """Analyse every line of _file (an iterable of log lines), then, if
        at least one new hit was accepted, generate statistics and output and
        save the meta database."""
        self.logger.info('==> Load previous database')

        self.meta_infos = self._deserialize(conf.META_PATH) or self._clearMeta()
        if self.meta_infos['last_time']:
            self.logger.info('Last time')
            self.logger.info(self.meta_infos['last_time'])
            self.current_analysis = self._deserialize(
                self.getDBFilename(self.meta_infos['last_time'])) or self._clearVisits()
        else:
            self._clearVisits()

        self.meta_infos['start_analysis_time'] = None

        self.cache_plugins = preloadPlugins(self.plugins, self)

        self.logger.info('==> Analysing log')

        for l in _file:
            groups = self.log_re.match(l)

            if groups:
                self._newHit(groups.groupdict())
            else:
                self.logger.warning("No match for %s", l)

        if self.analyse_started:
            self._generateDayStats()
            self._generateMonthStats()
            # Only meaningful during a run: not stored in the meta database.
            del self.meta_infos['start_analysis_time']
            self._serialize(self.meta_infos, conf.META_PATH)
        else:
            self.logger.info('==> Analyse not started : nothing new')


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Intelligent Web Log Analyzer')

    parser.add_argument('-c', '--clean-output', dest='clean_output', action='store_true',
                        default=False,
                        help='Clean output before starting')

    parser.add_argument('-i', '--stdin', dest='stdin', action='store_true',
                        default=False,
                        help='Read data from stdin instead of conf.analyzed_filename')

    parser.add_argument('-f', '--file', dest='file',
                        help='Analyse this log file')

    parser.add_argument('-d', '--log-level', dest='loglevel',
                        default='INFO', type=str,
                        help='Loglevel in %s, default : %s' % (['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], 'INFO'))

    args = parser.parse_args()

    if args.clean_output:
        if os.path.exists(conf.DB_ROOT):
            shutil.rmtree(conf.DB_ROOT)
        if os.path.exists(conf.DISPLAY_ROOT):
            shutil.rmtree(conf.DISPLAY_ROOT)

    loglevel = getattr(logging, args.loglevel.upper(), None)
    if not isinstance(loglevel, int):
        raise ValueError('Invalid log level: %s' % (args.loglevel))

    iwla = IWLA(loglevel)

    required_conf = ['analyzed_filename', 'domain_name']
    if not validConfRequirements(required_conf, iwla, 'Main Conf'):
        sys.exit(0)

    if args.stdin:
        iwla.start(sys.stdin)
    else:
        filename = args.file or conf.analyzed_filename
        if not os.path.exists(filename):
            print('No such file \'%s\'' % (filename))
            sys.exit(-1)
        with open(filename) as f:
            iwla.start(f)