#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright Grégory Soutadé 2015

# This file is part of iwla

# iwla is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# iwla is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with iwla.  If not, see <http://www.gnu.org/licenses/>.
#

import os
import shutil
import sys
import re
import time
import pickle
import gzip
import importlib
import argparse
import logging
import gettext
from calendar import monthrange
from datetime import date

# User configuration (conf) overrides the defaults (default_conf)
import default_conf as conf
import conf as _
conf.__dict__.update(_.__dict__)
del _

from iplugin import *
from display import *

"""
Main class IWLA
Parse logs, compute statistics, call plugins and produce output
For now, only HTTP logs are valid

Plugin requirements :
    None

Conf values needed :
    analyzed_filename
    domain_name
    locales_path
    compress_output_files*

Output files :
    DB_ROOT/meta.db
    DB_ROOT/year/month/iwla.db
    OUTPUT_ROOT/index.html
    OUTPUT_ROOT/year/month/index.html

Statistics creation :

meta :
    last_time
    start_analysis_time
    stats =>
        year =>
            month =>
                viewed_bandwidth
                not_viewed_bandwidth
                viewed_pages
                viewed_hits
                nb_visits
                nb_visitors

month_stats :
    viewed_bandwidth
    not_viewed_bandwidth
    viewed_pages
    viewed_hits
    nb_visits

days_stats :
    day =>
        viewed_bandwidth
        not_viewed_bandwidth
        viewed_pages
        viewed_hits
        nb_visits
        nb_visitors

visits :
    remote_addr =>
        remote_addr
        remote_ip
        viewed_pages
        viewed_hits
        not_viewed_pages
        not_viewed_hits
        bandwidth
        last_access
        requests =>
            [fields_from_format_log]
            extract_request =>
                extract_uri
                extract_parameters*
            extract_referer* =>
                extract_uri
                extract_parameters*
        robot
        hit_only
        is_page

valid_visitors:
    month_stats without robot and hit only visitors (if not conf.count_hit_only_visitors)

Statistics update :
    None

Statistics deletion :
    None
"""


class IWLA(object):
    """Log analyser: parses log lines, aggregates day/month statistics,
    runs the configured plugins and builds the HTML output."""

    ANALYSIS_CLASS = 'HTTP'
    API_VERSION = 1
    IWLA_VERSION = '0.1'

    def __init__(self, logLevel):
        """Compile the parsing regexps from the configuration, set up
        logging at ``logLevel`` and load the translation catalogue."""
        self.meta_infos = {}
        self.analyse_started = False
        self.current_analysis = {}
        self.cache_plugins = {}
        self.display = DisplayHTMLBuild(self)
        self.valid_visitors = None

        # Turn conf.log_format into a regexp: escape every character that is
        # neither '$' nor a word character, then replace each $name by a
        # named group (?P<name>.+)
        self.log_format_extracted = re.sub(r'([^\$\w])', r'\\\g<1>', conf.log_format)
        self.log_format_extracted = re.sub(r'\$(\w+)', r'(?P<\g<1>>.+)', self.log_format_extracted)
        # NOTE(review): the group names below were missing from the reviewed
        # text; 'http_uri', 'extract_uri' and 'extract_parameters' are the
        # keys read back in _decodeHTTPRequest()/_appendHit(), 'http_method'
        # and 'http_version' are reconstructed -- confirm against plugins.
        self.http_request_extracted = re.compile(r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')
        self.log_re = re.compile(self.log_format_extracted)
        self.uri_re = re.compile(r'(?P<extract_uri>[^\?#]+)(\?(?P<extract_parameters>[^#]+))?(#.*)?')
        self.domain_name_re = re.compile(r'.*%s' % conf.domain_name)
        self.plugins = [(conf.PRE_HOOK_DIRECTORY, conf.pre_analysis_hooks),
                        (conf.POST_HOOK_DIRECTORY, conf.post_analysis_hooks),
                        (conf.DISPLAY_HOOK_DIRECTORY, conf.display_hooks)]

        logging.basicConfig(format='%(name)s %(message)s', level=logLevel)
        self.logger = logging.getLogger(self.__class__.__name__)
        self.logger.info('==> Start')
        try:
            # No 'codeset' argument: it only influences the byte-string
            # lookups, not the text lookup bound to self._ below, and
            # recent Python 3 releases reject it.
            t = gettext.translation('iwla', localedir=conf.locales_path, languages=[conf.locale])
            self.logger.info('\tUsing locale %s' % (conf.locale))
        except IOError:
            t = gettext.NullTranslations()
            self.logger.info('\tUsing default locale en_EN')
        # ugettext only exists on Python 2; on Python 3 gettext returns text
        self._ = getattr(t, 'ugettext', t.gettext)

    def getVersion(self):
        """Return the iwla version string."""
        return IWLA.IWLA_VERSION

    def getConfValue(self, key, default=None):
        """Return configuration value ``key``, or ``default`` if unset."""
        return conf.__dict__.get(key, default)

    def _clearVisits(self):
        """Reset the current month analysis and return the new empty dict."""
        self.current_analysis = {
            'days_stats': {},
            'month_stats': {},
            'visits': {}
        }
        self.valid_visitors = None
        return self.current_analysis

    def getDaysStats(self):
        """Return the per-day statistics of the current analysis."""
        return self.current_analysis['days_stats']

    def getMonthStats(self):
        """Return the month statistics of the current analysis."""
        return self.current_analysis['month_stats']

    def getCurrentVisists(self):
        """Return the visits of the current analysis, keyed by remote_addr."""
        return self.current_analysis['visits']

    # Correctly spelled alias; the historical name is kept for callers
    getCurrentVisits = getCurrentVisists

    def getValidVisitors(self):
        """Return the visitors kept by _generateMonthStats() (None before)."""
        return self.valid_visitors

    def getDisplay(self):
        """Return the display builder."""
        return self.display

    def getCurTime(self):
        """Return the time of the last accepted hit."""
        return self.meta_infos['last_time']

    def getStartAnalysisTime(self):
        """Return the time of the first hit accepted during this run."""
        return self.meta_infos['start_analysis_time']

    def isValidForCurrentAnalysis(self, request):
        """Tell whether ``request`` is strictly newer than the start of the
        current run (False while the analysis has not started)."""
        cur_time = self.meta_infos['start_analysis_time']
        # Analyse not started
        if not cur_time:
            return False
        return (time.mktime(cur_time) < time.mktime(request['time_decoded']))

    def hasBeenViewed(self, request):
        """Tell whether the request status is in conf.viewed_http_codes."""
        return int(request['status']) in conf.viewed_http_codes

    def getCurDisplayPath(self, filename):
        """Return ``year/month/filename`` for the month being analysed."""
        cur_time = self.meta_infos['last_time']
        return os.path.join(str(cur_time.tm_year), '%02d' % (cur_time.tm_mon), filename)

    def getResourcesPath(self):
        """Return conf.resources_path."""
        return conf.resources_path

    def getCSSPath(self):
        """Return conf.css_path."""
        return conf.css_path

    def _clearMeta(self):
        """Reset the meta information and return the new dict."""
        self.meta_infos = {
            'last_time': None,
            'start_analysis_time': None
        }
        return self.meta_infos

    def _clearDisplay(self):
        """Replace the display builder by a fresh one and return it."""
        self.display = DisplayHTMLBuild(self)
        return self.display

    def getDBFilename(self, time):
        """Return the database path for the month of struct_time ``time``."""
        return os.path.join(conf.DB_ROOT, str(time.tm_year), '%02d' % (time.tm_mon), conf.DB_FILENAME)

    def _serialize(self, obj, filename):
        """Pickle ``obj`` into the gzip file ``filename``, creating the
        parent directory if needed."""
        base = os.path.dirname(filename)
        # dirname is '' for a bare file name: nothing to create in that case
        if base and not os.path.exists(base):
            os.makedirs(base)

        # Pickle in memory first so that the archive is written in one call
        # and no temporary file is left behind if pickling fails
        data = pickle.dumps(obj)
        with gzip.open(filename, 'wb') as fzip:
            fzip.write(data)

    def _deserialize(self, filename):
        """Return the object pickled in gzip file ``filename``, or None if
        the file does not exist."""
        if not os.path.exists(filename):
            return None

        # NOTE(review): pickle executes code from the file it loads; the
        # database files must only come from a trusted location.
        with gzip.open(filename, 'rb') as f:
            return pickle.load(f)

    def _callPlugins(self, target_root, *args):
        """Call hook(*args) of every loaded plugin registered under
        ``target_root``."""
        self.logger.info('==> Call plugins (%s)' % (target_root))
        for (root, plugins) in self.plugins:
            if root != target_root:
                continue
            for p in plugins:
                mod = self.cache_plugins.get(root + '.' + p, None)
                if mod:
                    self.logger.info('\t%s' % (p))
                    mod.hook(*args)

    def isPage(self, request):
        """Tell whether URI ``request`` ends with one of
        conf.pages_extensions."""
        return any(request.endswith(e) for e in conf.pages_extensions)

    def _appendHit(self, hit):
        """Attach ``hit`` to its visitor (created on first sight) and update
        the visitor's counters."""
        remote_addr = hit['remote_addr']

        if not remote_addr:
            return

        visits = self.current_analysis['visits']
        if remote_addr not in visits:
            self._createVisitor(hit)

        super_hit = visits[remote_addr]
        super_hit['requests'].append(hit)
        super_hit['bandwidth'] += int(hit['body_bytes_sent'])
        super_hit['last_access'] = self.meta_infos['last_time']

        request = hit['extract_request']
        uri = request.get('extract_uri', request['http_uri'])
        hit['is_page'] = self.isPage(uri)

        if super_hit['robot'] or not self.hasBeenViewed(hit):
            page_key = 'not_viewed_pages'
            hit_key = 'not_viewed_hits'
        else:
            page_key = 'viewed_pages'
            hit_key = 'viewed_hits'

        if hit['is_page']:
            super_hit[page_key] += 1
        else:
            super_hit[hit_key] += 1

    def _createVisitor(self, hit):
        """Register an empty visitor entry for hit['remote_addr']."""
        super_hit = self.current_analysis['visits'][hit['remote_addr']] = {}
        super_hit['remote_addr'] = hit['remote_addr']
        super_hit['remote_ip'] = hit['remote_addr']
        super_hit['viewed_pages'] = 0
        super_hit['viewed_hits'] = 0
        super_hit['not_viewed_pages'] = 0
        super_hit['not_viewed_hits'] = 0
        super_hit['bandwidth'] = 0
        super_hit['last_access'] = self.meta_infos['last_time']
        super_hit['requests'] = []
        super_hit['robot'] = False
        super_hit['hit_only'] = 0

    def _decodeHTTPRequest(self, hit):
        """Split hit['request'] into hit['extract_request'] and, when a
        referer is present, fill hit['extract_referer'].

        Return False if there is no request or it cannot be parsed."""
        if 'request' not in hit:
            return False

        groups = self.http_request_extracted.match(hit['request'])

        if groups:
            hit['extract_request'] = groups.groupdict()
            uri_groups = self.uri_re.match(hit['extract_request']['http_uri'])
            if uri_groups:
                d = uri_groups.groupdict()
                hit['extract_request']['extract_uri'] = d['extract_uri']
                if 'extract_parameters' in d:
                    hit['extract_request']['extract_parameters'] = d['extract_parameters']
        else:
            self.logger.warning("Bad request extraction %s" % (hit['request']))
            return False

        # The log format may not define $http_referer at all
        referer = hit.get('http_referer')
        if referer:
            referer_groups = self.uri_re.match(referer)
            if referer_groups:
                hit['extract_referer'] = referer_groups.groupdict()
        return True

    def _decodeTime(self, hit):
        """Parse hit['time_local'] with conf.time_format, store the result
        in hit['time_decoded'] and return it."""
        try:
            hit['time_decoded'] = time.strptime(hit['time_local'], conf.time_format)
        except ValueError:
            if sys.version_info < (3, 2):
                # Try without UTC value at the end (%z not recognized)
                gmt_offset_str = hit['time_local'][-5:]
                gmt_offset_hours = int(gmt_offset_str[1:3])*60*60
                gmt_offset_minutes = int(gmt_offset_str[3:5])*60
                gmt_offset = gmt_offset_hours + gmt_offset_minutes
                hit['time_decoded'] = time.strptime(hit['time_local'][:-6], conf.time_format[:-3])
                if gmt_offset_str[0] == '+':
                    hit['time_decoded'] = time.localtime(time.mktime(hit['time_decoded'])+gmt_offset)
                else:
                    hit['time_decoded'] = time.localtime(time.mktime(hit['time_decoded'])-gmt_offset)
            else:
                raise
        return hit['time_decoded']

    def getDisplayIndex(self):
        """Return the index page of the month being analysed."""
        filename = self.getCurDisplayPath('index.html')
        return self.display.getPage(filename)

    def _generateDisplayDaysStats(self):
        """Build the month page: one row per day, then average and total."""
        cur_time = self.meta_infos['last_time']
        title = createCurTitle(self, self._('Statistics'))
        filename = self.getCurDisplayPath('index.html')
        self.logger.info('==> Generate display (%s)' % (filename))
        page = self.display.createPage(title, filename, conf.css_path)

        nb_month_days = monthrange(cur_time.tm_year, cur_time.tm_mon)[1]
        days = self.display.createBlock(DisplayHTMLBlockTableWithGraph, self._('By day'),
                                        [self._('Day'), self._('Visits'), self._('Pages'), self._('Hits'),
                                         self._('Bandwidth'), self._('Not viewed Bandwidth')],
                                        None, nb_month_days, list(range(1, 6)))
        days.setColsCSSClass(['', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth'])
        days_stats = self.current_analysis['days_stats']
        month_name = time.strftime('%b', cur_time)
        today = date.today()
        nb_visits = 0
        nb_days = 0
        for i in range(1, nb_month_days+1):
            # NOTE(review): the separator was lost in the reviewed text (a
            # bare line break remained); restored as an HTML line break.
            day = '%d<br/>%s' % (i, month_name)
            full_day = '%02d %s %d' % (i, month_name, cur_time.tm_year)
            if i in days_stats:
                stats = days_stats[i]
                row = [full_day, stats['nb_visits'], stats['viewed_pages'], stats['viewed_hits'],
                       stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
                nb_visits += stats['nb_visits']
                nb_days += 1
            else:
                row = [full_day, 0, 0, 0, 0, 0]
            days.appendRow(row)
            days.setCellValue(i-1, 4, bytesToStr(row[4]))
            days.setCellValue(i-1, 5, bytesToStr(row[5]))
            days.appendShortTitle(day)
            adate = date(cur_time.tm_year, cur_time.tm_mon, i)
            # weekday(): 5 is Saturday, 6 is Sunday
            if adate.weekday() in (5, 6):
                days.setRowCSSClass(i-1, 'iwla_weekend')
            if adate == today:
                css = days.getCellCSSClass(i-1, 0)
                if css:
                    css = '%s %s' % (css, 'iwla_curday')
                else:
                    css = 'iwla_curday'
                days.setCellCSSClass(i-1, 0, css)

        stats = self.current_analysis['month_stats']

        row = [0, nb_visits, stats['viewed_pages'], stats['viewed_hits'],
               stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
        # Averages are computed over the days that actually have statistics
        if nb_days:
            average_row = [int(v / nb_days) for v in row]
        else:
            average_row = [0] * len(row)

        average_row[0] = self._('Average')
        average_row[4] = bytesToStr(average_row[4])
        average_row[5] = bytesToStr(average_row[5])
        days.appendRow(average_row)

        row[0] = self._('Total')
        row[4] = bytesToStr(row[4])
        row[5] = bytesToStr(row[5])
        days.appendRow(row)
        page.appendBlock(days)
        self.display.addPage(page)

    def _generateDisplayMonthStats(self, page, year, month_stats):
        """Append to ``page`` the summary table of ``year``: one row per
        month taken from ``month_stats`` (month number => stats), then a
        total row."""
        cur_time = time.localtime()
        months_name = ['', self._('Jan'), self._('Feb'), self._('Mar'), self._('Apr'), self._('May'),
                       self._('June'), self._('Jul'), self._('Aug'), self._('Sep'), self._('Oct'),
                       self._('Nov'), self._('Dec')]
        title = '%s %d' % (self._('Summary'), year)
        cols = [self._('Month'), self._('Visitors'), self._('Visits'), self._('Pages'), self._('Hits'),
                self._('Bandwidth'), self._('Not viewed Bandwidth'), self._('Details')]
        graph_cols = list(range(1, 7))
        months = self.display.createBlock(DisplayHTMLBlockTableWithGraph, title, cols, None, 12, graph_cols)
        months.setColsCSSClass(['', 'iwla_visitor', 'iwla_visit', 'iwla_page', 'iwla_hit',
                                'iwla_bandwidth', 'iwla_bandwidth', ''])
        total = [0] * len(cols)
        for i in range(1, 13):
            # NOTE(review): separator restored as an HTML line break (only a
            # bare line break remained in the reviewed text).
            month = '%s<br/>%d' % (months_name[i], year)
            full_month = '%s %d' % (months_name[i], year)
            if i in month_stats:
                stats = month_stats[i]
                # NOTE(review): anchor markup reconstructed; the target
                # follows the year/month layout of getCurDisplayPath().
                link = '<a href="%d/%02d/index.html">%s</a>' % (year, i, self._('Details'))
                row = [full_month, stats['nb_visitors'], stats['nb_visits'], stats['viewed_pages'],
                       stats['viewed_hits'], stats['viewed_bandwidth'], stats['not_viewed_bandwidth'], link]
                for j in graph_cols:
                    total[j] += row[j]
            else:
                row = [full_month, 0, 0, 0, 0, 0, 0, '']
            months.appendRow(row)
            months.setCellValue(i-1, 5, bytesToStr(row[5]))
            months.setCellValue(i-1, 6, bytesToStr(row[6]))
            months.appendShortTitle(month)
            if year == cur_time.tm_year and i == cur_time.tm_mon:
                css = months.getCellCSSClass(i-1, 0)
                if css:
                    css = '%s %s' % (css, 'iwla_curday')
                else:
                    css = 'iwla_curday'
                months.setCellCSSClass(i-1, 0, css)

        total[0] = self._('Total')
        total[5] = bytesToStr(total[5])
        total[6] = bytesToStr(total[6])
        total[7] = u''
        months.appendRow(total)
        page.appendBlock(months)

    def _generateDisplayWholeMonthStats(self):
        """Build the main index page: last update time followed by one
        summary table per year, most recent first."""
        title = '%s %s' % (self._('Statistics for'), conf.domain_name)
        filename = 'index.html'

        self.logger.info('==> Generate main page (%s)' % (filename))

        page = self.display.createPage(title, filename, conf.css_path)

        # NOTE(review): trailing line break restored as HTML; any other
        # markup this string may have carried is unknown -- confirm.
        last_update = '%s %s<br />' % (self._('Last update'), time.strftime('%02d %b %Y %H:%M', time.localtime()))
        page.appendBlock(self.display.createBlock(DisplayHTMLRaw, last_update))

        for year in sorted(self.meta_infos['stats'].keys(), reverse=True):
            self._generateDisplayMonthStats(page, year, self.meta_infos['stats'][year])

        self.display.addPage(page)

    def _compressFile(self, build_time, root, filename):
        """Gzip ``root/filename`` next to itself when the archive is missing
        or the file was written since ``build_time``.

        ``build_time`` is a timestamp in seconds; a struct_time is accepted
        and converted."""
        path = os.path.join(root, filename)
        gz_path = path + '.gz'

        self.logger.debug('Compress %s => %s' % (path, gz_path))

        # st_mtime is a number of seconds: compare it with the same kind
        if isinstance(build_time, time.struct_time):
            build_time = time.mktime(build_time)

        # '>=' because both values may share the same (truncated) second
        if not os.path.exists(gz_path) or os.stat(path).st_mtime >= build_time:
            with open(path, 'rb') as f_in, gzip.open(gz_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

    def _compressFiles(self, build_time, root):
        """Compress every file under ``root`` whose name ends with one of
        conf.compress_output_files (nothing is done when it is empty)."""
        if not conf.compress_output_files:
            return
        for rootdir, subdirs, files in os.walk(root, followlinks=True):
            for f in files:
                for ext in conf.compress_output_files:
                    if f.endswith(ext):
                        self._compressFile(build_time, rootdir, f)
                        break

    def _generateDisplay(self):
        """Generate all pages, run display plugins, write the output and
        compress it."""
        self._generateDisplayDaysStats()
        self._callPlugins(conf.DISPLAY_HOOK_DIRECTORY)
        self._generateDisplayWholeMonthStats()
        # Timestamp in whole seconds taken before the build, so that it can
        # be compared with the modification time of the generated files
        build_time = time.mktime(time.localtime())
        self.display.build(conf.DISPLAY_ROOT)
        self._compressFiles(build_time, conf.DISPLAY_ROOT)

    def _createEmptyStats(self):
        """Return a statistics dict with every counter set to 0."""
        stats = {}
        stats['viewed_bandwidth'] = 0
        stats['not_viewed_bandwidth'] = 0
        stats['viewed_pages'] = 0
        stats['viewed_hits'] = 0
        stats['nb_visits'] = 0

        return stats

    def _generateMonthStats(self):
        """Sum the day statistics into the month statistics, select valid
        visitors, run post-analysis plugins, save the month database and
        generate the display."""
        self._clearDisplay()

        visits = self.current_analysis['visits']

        stats = self._createEmptyStats()
        for (day, stat) in self.current_analysis['days_stats'].items():
            for k in stats.keys():
                stats[k] += stat[k]

        # Independent copy stored in the meta information below
        duplicated_stats = dict(stats)

        cur_time = self.meta_infos['last_time']
        self.logger.info("== Stats for %d/%02d ==" % (cur_time.tm_year, cur_time.tm_mon))
        self.logger.info(stats)

        if 'month_stats' not in self.current_analysis:
            self.current_analysis['month_stats'] = stats
        else:
            for (k, v) in stats.items():
                self.current_analysis['month_stats'][k] = v

        # Valid visitors: not a robot and, unless hit only visitors are
        # counted, at least one viewed page
        self.valid_visitors = {}
        for (k, v) in visits.items():
            if v['robot']:
                continue
            if not (conf.count_hit_only_visitors or v['viewed_pages']):
                continue
            self.valid_visitors[k] = v

        duplicated_stats['nb_visitors'] = stats['nb_visitors'] = len(self.valid_visitors)

        self._callPlugins(conf.POST_HOOK_DIRECTORY)

        path = self.getDBFilename(cur_time)
        if os.path.exists(path):
            os.remove(path)

        self.logger.info("==> Serialize to %s" % (path))
        self._serialize(self.current_analysis, path)

        # Save month stats
        year = cur_time.tm_year
        month = cur_time.tm_mon
        if 'stats' not in self.meta_infos:
            self.meta_infos['stats'] = {}
        if year not in self.meta_infos['stats']:
            self.meta_infos['stats'][year] = {}
        self.meta_infos['stats'][year][month] = duplicated_stats

        self._generateDisplay()

    def _generateDayStats(self):
        """Run pre-analysis plugins and compute the statistics of the day
        of the last accepted hit."""
        visits = self.current_analysis['visits']
        cur_time = self.meta_infos['last_time']

        self._callPlugins(conf.PRE_HOOK_DIRECTORY)

        stats = self._createEmptyStats()

        for (k, super_hit) in visits.items():
            if super_hit['last_access'].tm_mday != cur_time.tm_mday:
                continue
            # Reset for every visitor: the flag must not leak from one
            # visitor to the next
            viewed_pages = False
            # Most recent requests first; stop at the first one from
            # another day
            for hit in super_hit['requests'][::-1]:
                if hit['time_decoded'].tm_mday != cur_time.tm_mday:
                    break
                if super_hit['robot'] or not self.hasBeenViewed(hit):
                    stats['not_viewed_bandwidth'] += int(hit['body_bytes_sent'])
                    continue
                stats['viewed_bandwidth'] += int(hit['body_bytes_sent'])
                if hit['is_page']:
                    stats['viewed_pages'] += 1
                    viewed_pages = True
                else:
                    stats['viewed_hits'] += 1
            if (conf.count_hit_only_visitors or viewed_pages) and not super_hit['robot']:
                stats['nb_visits'] += 1

        self.logger.info("== Stats for %d/%02d/%02d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday))
        self.logger.info(stats)

        self.current_analysis['days_stats'][cur_time.tm_mday] = stats

    def _newHit(self, hit):
        """Process one parsed log line.

        Return True if the hit was added to the analysis, False if it was
        rejected (other domain, not newer than the last hit, bad request)."""
        if not self.domain_name_re.match(hit['server_name']):
            return False

        t = self._decodeTime(hit)

        cur_time = self.meta_infos['last_time']

        if cur_time is None:
            self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
            self.analyse_started = True
        else:
            if time.mktime(t) <= time.mktime(cur_time):
                return False
            self.analyse_started = True
            if cur_time.tm_mon != t.tm_mon or cur_time.tm_year != t.tm_year:
                # Close the last day of the finished month first so that
                # its figures are part of the month totals
                self._generateDayStats()
                self._generateMonthStats()
                self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
            elif cur_time.tm_mday != t.tm_mday:
                self._generateDayStats()

        self.meta_infos['last_time'] = t

        if not self.meta_infos['start_analysis_time']:
            self.meta_infos['start_analysis_time'] = t

        if not self._decodeHTTPRequest(hit):
            return False

        # '-' and '*' are placeholders for an empty field
        for k in hit.keys():
            if hit[k] == '-' or hit[k] == '*':
                hit[k] = ''

        self._appendHit(hit)

        return True

    def start(self, _file):
        """Analyse every line of the iterable ``_file`` on top of the
        previously saved state, then save the new state."""
        self.logger.info('==> Load previous database')

        self.meta_infos = self._deserialize(conf.META_PATH) or self._clearMeta()
        if self.meta_infos['last_time']:
            self.logger.info('Last time')
            self.logger.info(self.meta_infos['last_time'])
            self.current_analysis = self._deserialize(self.getDBFilename(self.meta_infos['last_time'])) or self._clearVisits()
        else:
            self._clearVisits()

        self.meta_infos['start_analysis_time'] = None

        self.cache_plugins = preloadPlugins(self.plugins, self)

        self.logger.info('==> Analysing log')

        for l in _file:
            groups = self.log_re.match(l)

            if groups:
                self._newHit(groups.groupdict())
            else:
                self.logger.warning("No match for %s" % (l))

        if self.analyse_started:
            self._generateDayStats()
            self._generateMonthStats()
            # Only meaningful during a run: not saved with the meta data
            del self.meta_infos['start_analysis_time']
            self._serialize(self.meta_infos, conf.META_PATH)
        else:
            self.logger.info('==> Analyse not started : nothing new')


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Intelligent Web Log Analyzer')

    parser.add_argument('-c', '--clean-output', dest='clean_output', action='store_true',
                        default=False,
                        help='Clean output before starting')

    parser.add_argument('-i', '--stdin', dest='stdin', action='store_true',
                        default=False,
                        help='Read data from stdin instead of conf.analyzed_filename')

    parser.add_argument('-f', '--file', dest='file',
                        help='Analyse this log file')

    parser.add_argument('-d', '--log-level', dest='loglevel',
                        default='INFO', type=str,
                        help='Loglevel in %s, default : %s' % (['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], 'INFO'))

    args = parser.parse_args()

    if args.clean_output:
        if os.path.exists(conf.DB_ROOT):
            shutil.rmtree(conf.DB_ROOT)
        if os.path.exists(conf.DISPLAY_ROOT):
            shutil.rmtree(conf.DISPLAY_ROOT)

    loglevel = getattr(logging, args.loglevel.upper(), None)
    if not isinstance(loglevel, int):
        raise ValueError('Invalid log level: %s' % (args.loglevel))

    iwla = IWLA(loglevel)

    required_conf = ['analyzed_filename', 'domain_name']
    if not validConfRequirements(required_conf, iwla, 'Main Conf'):
        sys.exit(0)

    if args.stdin:
        iwla.start(sys.stdin)
    else:
        filename = args.file or conf.analyzed_filename
        if not os.path.exists(filename):
            print('No such file \'%s\'' % (filename))
            sys.exit(-1)
        with open(filename) as f:
            iwla.start(f)