From c3c201fda1ff70981d8f95f0c1651bc7df475598 Mon Sep 17 00:00:00 2001 From: Gregory Soutade Date: Fri, 21 Nov 2014 14:46:12 +0100 Subject: [PATCH] Start using classes --- iwla.py | 645 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 319 insertions(+), 326 deletions(-) diff --git a/iwla.py b/iwla.py index d14695b..daf2b15 100755 --- a/iwla.py +++ b/iwla.py @@ -10,100 +10,95 @@ import gzip from display import * -# Default configuration - -DB_ROOT = './output/' -DISPLAY_ROOT = './output/' - -log_format = '$server_name:$server_port $remote_addr - $remote_user [$time_local] ' +\ - '"$request" $status $body_bytes_sent ' +\ - '"$http_referer" "$http_user_agent"' - -time_format = '%d/%b/%Y:%H:%M:%S +0100' - -pre_analysis_hooks = [] -post_analysis_hooks = [] -display_hooks = [] - +from default_conf import * from conf import * -print '==> Start' +class IWLA(object): -meta_visit = {} -analyse_started = False -current_visits = {} -cache_plugins = {} -display = {} + ANALYSIS_CLASS = 'HTTP' + API_VERSION = 1 -log_format_extracted = re.sub(r'([^\$\w])', r'\\\g<1>', log_format) -log_format_extracted = re.sub(r'\$(\w+)', '(?P<\g<1>>.+)', log_format_extracted) -http_request_extracted = re.compile(r'(?P\S+) (?P\S+) (?P\S+)') + def __init__(self): + print '==> Start' -log_re = re.compile(log_format_extracted) -uri_re = re.compile(r'(?P[^\?]*)[\?(?P.*)]?') -pages_extensions = ['/', 'html', 'xhtml', 'py', 'pl', 'rb', 'php'] -viewed_http_codes = [200] + self.meta_infos = {} + self.analyse_started = False + self.current_analysis = {} + self.cache_plugins = {} + self.display = {} + self.valid_visitors = None -HOOKS_ROOT = './plugins/' -PRE_HOOK_DIRECTORY = HOOKS_ROOT + 'pre_analysis/' -POST_HOOK_DIRECTORY = HOOKS_ROOT + 'post_analysis/' -DISPLAY_HOOK_DIRECTORY = HOOKS_ROOT + 'display/' -META_PATH = DB_ROOT + 'meta.db' -DB_FILENAME = 'iwla.db' + self.log_format_extracted = re.sub(r'([^\$\w])', r'\\\g<1>', log_format) + self.log_format_extracted = re.sub(r'\$(\w+)', '(?P<\g<1>>.+)', self.log_format_extracted) + self.http_request_extracted = re.compile(r'(?P\S+) (?P\S+) (?P\S+)') + self.log_re = re.compile(self.log_format_extracted) + self.uri_re = re.compile(r'(?P[^\?]*)[\?(?P.*)]?') + self.plugins = {PRE_HOOK_DIRECTORY : pre_analysis_hooks, + POST_HOOK_DIRECTORY : post_analysis_hooks, + DISPLAY_HOOK_DIRECTORY : display_hooks} -plugins = {PRE_HOOK_DIRECTORY : pre_analysis_hooks, POST_HOOK_DIRECTORY : post_analysis_hooks, DISPLAY_HOOK_DIRECTORY : display_hooks} + def _preloadPlugins(self): + ret = True + for root in self.plugins.keys(): + for plugin_name in self.plugins[root]: + p = root + '/' + plugin_name + try: + fp, pathname, description = imp.find_module(plugin_name, [root]) + self.cache_plugins[p] = imp.load_module(plugin_name, fp, pathname, description) + mod = self.cache_plugins[p] + infos = mod.get_plugins_infos() + if infos['class'] != IWLA.ANALYSIS_CLASS or \ + IWLA.API_VERSION < infos['min_version'] or\ + (infos['max_version'] != -1 and (IWLA.API_VERSION > infos['max_version'])): + del self.cache_plugins[p] + elif not mod.load(): + del self.cache_plugins[p] + except Exception as e: + print 'Error loading \'%s\' => %s' % (p, e) + ret = False + return ret -ANALYSIS_CLASS = 'HTTP' -API_VERSION = 1 + def _clearVisits(self): + self.current_analysis = { + 'days_stats' : {}, + 'month_stats' : {}, + 'visits' : {} + } + self.valid_visitors = None + return self.current_analysis -def preloadPlugins(): - ret = True - for root in plugins.keys(): - for plugin_name in plugins[root]: - p = root + '/' + plugin_name - try: - fp, pathname, description = imp.find_module(plugin_name, [root]) - cache_plugins[p] = imp.load_module(plugin_name, fp, pathname, description) - #cache_plugins[p] = imp.load_module(p,None,p,("py","r",imp.PKG_DIRECTORY)) - #cache_plugins[p] = imp.load_source(p, p) - mod = cache_plugins[p] - #print dir(mod) - #print "Register %s -> %s" % (p, mod) - infos = mod.get_plugins_infos() - if infos['class'] != ANALYSIS_CLASS or \ - API_VERSION < infos['min_version'] or\ - (infos['max_version'] != -1 and (API_VERSION > infos['max_version'])): - del cache_plugins[p] - elif not mod.load(): - del cache_plugins[p] - except Exception as e: - print 'Error loading \'%s\' => %s' % (p, e) - ret = False - return ret - + def getDaysStats(self): + return self.current_analysis['days_stats'] -def createEmptyVisits(): - visits = {'days_stats' : {}, 'month_stats' : {}, 'visits' : {}} - return visits + def getMonthStatsStats(self): + return self.current_analysis['month_stats'] -def createEmptyMeta(): - meta = {'last_time' : None} - return meta + def getCurrentVisists(self): + return self.current_analysis['visits'] -def createEmptyDisplay(): - display = {} - return display + def getValidVisitors(self): + return self.current_analysis['visits'] -def getDBFilename(time): - return (DB_ROOT + '%d/%d_%s') % (time.tm_year, time.tm_mon, DB_FILENAME) + def _clearMeta(self): + self.meta_infos = { + 'last_time' : None + } + return self.meta_infos -def serialize(obj, filename): - base = os.path.dirname(filename) - if not os.path.exists(base): - os.makedirs(base) + def _clearDisplay(self): + self.display = {} + return self.display - # TODO : remove return - return + def getDBFilename(self, time): + return (DB_ROOT + '%d/%d_%s') % (time.tm_year, time.tm_mon, DB_FILENAME) + + def _serialize(self, obj, filename): + base = os.path.dirname(filename) + if not os.path.exists(base): + os.makedirs(base) + + # TODO : remove return + return with open(filename + '.tmp', 'wb+') as f: pickle.dump(obj, f) @@ -112,300 +107,298 @@ def serialize(obj, filename): fzip.write(f.read()) os.remove(filename + '.tmp') -def deserialize(filename): - if not os.path.exists(filename): + def _deserialize(self, filename): + if not os.path.exists(filename): + return None + + with gzip.open(filename, 'r') as f: + return pickle.load(f) return None - with gzip.open(filename, 'r') as f: - return pickle.load(f) - return None + def _callPlugins(self, root, *args): + print '==> Call plugins (%s)' % root + for p in self.plugins[root]: + print '\t%s' % (p) + mod = self.cache_plugins[root + '/' + p] + mod.hook(*args) -def callPlugins(root, *args): - print '==> Call plugins (%s)' % root - for p in plugins[root]: - print '\t%s' % (p) - mod = cache_plugins[root + '/' + p] - mod.hook(*args) + def isPage(self, request): + for e in pages_extensions: + if request.endswith(e): + return True -def isPage(request): - for e in pages_extensions: - if request.endswith(e): - return True - - return False - -def appendHit(hit): - remote_addr = hit['remote_addr'] - - if not remote_addr in current_visits['visits'].keys(): - createUser(hit) - return - - super_hit = current_visits['visits'][remote_addr] - super_hit['requests'].append(hit) - super_hit['bandwidth'] += int(hit['body_bytes_sent']) - super_hit['last_access'] = meta_visit['last_time'] - - request = hit['extract_request'] - - if 'extract_uri' in request.keys(): - uri = request['extract_uri'] - else: - uri = request['http_uri'] - - hit['is_page'] = isPage(uri) - - # Don't count 3xx status - status = int(hit['status']) - if status >= 300 and status < 400: return - - if super_hit['robot'] or\ - not status in viewed_http_codes: - page_key = 'not_viewed_pages' - hit_key = 'not_viewed_hits' - else: - page_key = 'viewed_pages' - hit_key = 'viewed_hits' - - if hit['is_page']: - super_hit[page_key] += 1 - else: - super_hit[hit_key] += 1 - -def createUser(hit): - super_hit = current_visits['visits'][hit['remote_addr']] = {} - super_hit['remote_addr'] = hit['remote_addr'] - super_hit['viewed_pages'] = 0 - super_hit['viewed_hits'] = 0 - super_hit['not_viewed_pages'] = 0 - super_hit['not_viewed_hits'] = 0 - super_hit['bandwidth'] = 0 - super_hit['last_access'] = meta_visit['last_time'] - super_hit['requests'] = [] - super_hit['robot'] = False - super_hit['hit_only'] = 0 - appendHit(hit) - -def decodeHTTPRequest(hit): - if not 'request' in hit.keys(): return False - - groups = http_request_extracted.match(hit['request']) - - if groups: - hit['extract_request'] = groups.groupdict() - uri_groups = uri_re.match(hit['extract_request']['http_uri']) - if uri_groups: - d = uri_groups.groupdict() - hit['extract_request']['extract_uri'] = d['extract_uri'] - if 'extract_parameters' in d.keys(): - hit['extract_request']['extract_parameters'] = d['extract_parameters'] - else: - print "Bad request extraction " + hit['request'] return False - referer_groups = uri_re.match(hit['http_referer']) - if referer_groups: - referer = hit['extract_referer'] = referer_groups.groupdict() - return True + def _appendHit(self, hit): + remote_addr = hit['remote_addr'] + + if not remote_addr in self.current_analysis['visits'].keys(): + self._createUser(hit) + return + + super_hit = self.current_analysis['visits'][remote_addr] + super_hit['requests'].append(hit) + super_hit['bandwidth'] += int(hit['body_bytes_sent']) + super_hit['last_access'] = self.meta_infos['last_time'] -def decodeTime(hit): - t = hit['time_local'] + request = hit['extract_request'] - hit['time_decoded'] = time.strptime(t, time_format) + if 'extract_uri' in request.keys(): + uri = request['extract_uri'] + else: + uri = request['http_uri'] -def getDisplayIndex(): - cur_time = meta_visit['last_time'] - filename = '%d/index_%d.html' % (cur_time.tm_year, cur_time.tm_mon) + hit['is_page'] = self.isPage(uri) - return display.get(filename, None) + # Don't count 3xx status + status = int(hit['status']) + if status >= 300 and status < 400: return -def generateDisplayDaysStat(): - cur_time = meta_visit['last_time'] - title = 'Stats %d/%d' % (cur_time.tm_mon, cur_time.tm_year) - filename = '%d/index_%d.html' % (cur_time.tm_year, cur_time.tm_mon) - page = createPage(display, filename, title) - - days = createTable('By day', ['Day', 'Visits', 'Pages', 'Hits', 'Bandwidth', 'Robot Bandwidth']) + if super_hit['robot'] or\ + not status in viewed_http_codes: + page_key = 'not_viewed_pages' + hit_key = 'not_viewed_hits' + else: + page_key = 'viewed_pages' + hit_key = 'viewed_hits' - keys = current_visits['days_stats'].keys() - keys.sort() - nb_visits = 0 - for k in keys: - stats = current_visits['days_stats'][k] - row = [k, stats['nb_visitors'], stats['viewed_pages'], stats['viewed_hits'], stats['viewed_bandwidth'], stats['not_viewed_bandwidth']] + if hit['is_page']: + super_hit[page_key] += 1 + else: + super_hit[hit_key] += 1 + + def _createUser(self, hit): + super_hit = self.current_analysis['visits'][hit['remote_addr']] = {} + super_hit['remote_addr'] = hit['remote_addr'] + super_hit['viewed_pages'] = 0 + super_hit['viewed_hits'] = 0 + super_hit['not_viewed_pages'] = 0 + super_hit['not_viewed_hits'] = 0 + super_hit['bandwidth'] = 0 + super_hit['last_access'] = self.meta_infos['last_time'] + super_hit['requests'] = [] + super_hit['robot'] = False + super_hit['hit_only'] = 0 + self._appendHit(hit) + + def _decodeHTTPRequest(self, hit): + if not 'request' in hit.keys(): return False + + groups = self.http_request_extracted.match(hit['request']) + + if groups: + hit['extract_request'] = groups.groupdict() + uri_groups = self.uri_re.match(hit['extract_request']['http_uri']) + if uri_groups: + d = uri_groups.groupdict() + hit['extract_request']['extract_uri'] = d['extract_uri'] + if 'extract_parameters' in d.keys(): + hit['extract_request']['extract_parameters'] = d['extract_parameters'] + else: + print "Bad request extraction " + hit['request'] + return False + + referer_groups = self.uri_re.match(hit['http_referer']) + if referer_groups: + referer = hit['extract_referer'] = referer_groups.groupdict() + return True + + def _decodeTime(self, hit): + hit['time_decoded'] = time.strptime(hit['time_local'], time_format) + + def getDisplayIndex(self): + cur_time = self.meta_infos['last_time'] + filename = '%d/index_%d.html' % (cur_time.tm_year, cur_time.tm_mon) + + return self.display.get(filename, None) + + def _generateDisplayDaysStat(self): + cur_time = self.meta_infos['last_time'] + title = 'Stats %d/%d' % (cur_time.tm_mon, cur_time.tm_year) + filename = '%d/index_%d.html' % (cur_time.tm_year, cur_time.tm_mon) + page = createPage(self.display, filename, title) + + days = createTable('By day', ['Day', 'Visits', 'Pages', 'Hits', 'Bandwidth', 'Robot Bandwidth']) + + keys = self.current_analysis['days_stats'].keys() + keys.sort() + nb_visits = 0 + for k in keys: + stats = self.current_analysis['days_stats'][k] + row = [k, stats['nb_visitors'], stats['viewed_pages'], stats['viewed_hits'], stats['viewed_bandwidth'], stats['not_viewed_bandwidth']] + row = map(lambda(v): str(v), row) + appendRowToTable(days, row) + nb_visits += stats['nb_visitors'] + + stats = self.current_analysis['month_stats'] + + nb_days = len(keys) + row = [0, nb_visits, stats['viewed_pages'], stats['viewed_hits'], stats['viewed_bandwidth'], stats['not_viewed_bandwidth']] + if nb_days: + row = map(lambda(v): str(int(v/nb_days)), row) + else: + row = map(lambda(v): '0', row) + + row[0] = 'Average' + appendRowToTable(days, row) + + row = ['Total', nb_visits, stats['viewed_pages'], stats['viewed_hits'], stats['viewed_bandwidth'], stats['not_viewed_bandwidth']] row = map(lambda(v): str(v), row) appendRowToTable(days, row) - nb_visits += stats['nb_visitors'] + appendBlockToPage(page, days) - stats = current_visits['month_stats'] + def _generateDisplay(self): + self._generateDisplayDaysStat() + self._callPlugins(DISPLAY_HOOK_DIRECTORY, self.current_analysis, self.display) + buildPages(DISPLAY_ROOT, self.display) - nb_days = len(keys) - row = [0, nb_visits, stats['viewed_pages'], stats['viewed_hits'], stats['viewed_bandwidth'], stats['not_viewed_bandwidth']] - if nb_days: - row = map(lambda(v): str(int(v/nb_days)), row) - else: - row = map(lambda(v): '0', row) + def _generateStats(self, visits): + stats = {} + stats['viewed_bandwidth'] = 0 + stats['not_viewed_bandwidth'] = 0 + stats['viewed_pages'] = 0 + stats['viewed_hits'] = 0 + #stats['requests'] = set() + stats['nb_visitors'] = 0 - row[0] = 'Average' - appendRowToTable(days, row) + for k in visits.keys(): + super_hit = visits[k] + if super_hit['robot']: + stats['not_viewed_bandwidth'] += super_hit['bandwidth'] + continue - row = ['Total', nb_visits, stats['viewed_pages'], stats['viewed_hits'], stats['viewed_bandwidth'], stats['not_viewed_bandwidth']] - row = map(lambda(v): str(v), row) - appendRowToTable(days, row) - appendBlockToPage(page, days) + #print "[%s] =>\t%d/%d" % (k, super_hit['viewed_pages'], super_hit['viewed_hits']) -def generateDisplay(): - generateDisplayDaysStat() - callPlugins(DISPLAY_HOOK_DIRECTORY, current_visits, display) - buildPages(DISPLAY_ROOT, display) + if not super_hit['hit_only']: + stats['nb_visitors'] += 1 + stats['viewed_bandwidth'] += super_hit['bandwidth'] + stats['viewed_pages'] += super_hit['viewed_pages'] + stats['viewed_hits'] += super_hit['viewed_hits'] -def generateStats(visits): - stats = {} - stats['viewed_bandwidth'] = 0 - stats['not_viewed_bandwidth'] = 0 - stats['viewed_pages'] = 0 - stats['viewed_hits'] = 0 - #stats['requests'] = set() - stats['nb_visitors'] = 0 + # for p in super_hit['requests']: + # if not p['is_page']: continue + # req = p['extract_request'] + # stats['requests'].add(req['extract_uri']) - for k in visits.keys(): - super_hit = visits[k] - if super_hit['robot']: - stats['not_viewed_bandwidth'] += super_hit['bandwidth'] - continue + return stats - #print "[%s] =>\t%d/%d" % (k, super_hit['viewed_pages'], super_hit['viewed_hits']) - - if not super_hit['hit_only']: - stats['nb_visitors'] += 1 - stats['viewed_bandwidth'] += super_hit['bandwidth'] - stats['viewed_pages'] += super_hit['viewed_pages'] - stats['viewed_hits'] += super_hit['viewed_hits'] + def _generateMonthStats(self): + self._clearDisplay() - # for p in super_hit['requests']: - # if not p['is_page']: continue - # req = p['extract_request'] - # stats['requests'].add(req['extract_uri']) + visits = self.current_analysis['visits'] - return stats - -def generateMonthStats(): - display = createEmptyDisplay() + stats = self._generateStats(visits) - visits = current_visits['visits'] - - stats = generateStats(visits) - - cur_time = meta_visit['last_time'] - print "== Stats for %d/%d ==" % (cur_time.tm_year, cur_time.tm_mon) - print stats + cur_time = self.meta_infos['last_time'] + print "== Stats for %d/%d ==" % (cur_time.tm_year, cur_time.tm_mon) + print stats - valid_visitors = {k: v for (k,v) in visits.items() if not visits[k]['robot']} - callPlugins(POST_HOOK_DIRECTORY, valid_visitors, stats) + self.valid_visitors = {k: v for (k,v) in visits.items() if not visits[k]['robot']} + self._callPlugins(POST_HOOK_DIRECTORY, valid_visitors, stats) - current_visits['month_stats'] = stats + self.current_analysis['month_stats'] = stats - path = getDBFilename(cur_time) - if os.path.exists(path): - os.remove(path) + path = self.getDBFilename(cur_time) + if os.path.exists(path): + os.remove(path) - print "==> Serialize to %s" % path + print "==> Serialize to %s" % path - serialize(current_visits, path) + self._serialize(self.current_analysis, path) - generateDisplay() + self._generateDisplay() -def generateDayStats(): - visits = current_visits['visits'] - - callPlugins(PRE_HOOK_DIRECTORY, visits) + def _generateDayStats(self): + visits = self.current_analysis['visits'] - stats = generateStats(visits) - - cur_time = meta_visit['last_time'] - print "== Stats for %d/%d/%d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday) + self._callPlugins(PRE_HOOK_DIRECTORY, visits) - if cur_time.tm_mday > 1: - last_day = cur_time.tm_mday - 1 - while last_day: - if last_day in current_visits['days_stats'].keys(): - break - last_day -= 1 - if last_day: - for k in stats.keys(): - stats[k] -= current_visits['days_stats'][last_day][k] - stats['nb_visitors'] = 0 - for k in visits.keys(): - if visits[k]['robot']: continue - if visits[k]['last_access'].tm_mday == cur_time.tm_mday: - stats['nb_visitors'] += 1 - print stats + stats = self._generateStats(visits) - current_visits['days_stats'][cur_time.tm_mday] = stats + cur_time = self.meta_infos['last_time'] + print "== Stats for %d/%d/%d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday) -def newHit(hit): - global current_visits - global analyse_started + if cur_time.tm_mday > 1: + last_day = cur_time.tm_mday - 1 + while last_day: + if last_day in self.current_analysis['days_stats'].keys(): + break + last_day -= 1 + if last_day: + for k in stats.keys(): + stats[k] -= self.current_analysis['days_stats'][last_day][k] + stats['nb_visitors'] = 0 + for k in visits.keys(): + if visits[k]['robot']: continue + if visits[k]['last_access'].tm_mday == cur_time.tm_mday: + stats['nb_visitors'] += 1 + print stats - decodeTime(hit) + self.current_analysis['days_stats'][cur_time.tm_mday] = stats - t = hit['time_decoded'] + def _newHit(self, hit): + self._decodeTime(hit) - cur_time = meta_visit['last_time'] + t = hit['time_decoded'] - if cur_time == None: - current_visits = deserialize(getDBFilename(t)) or createEmptyVisits() - analyse_started = True - else: - if not analyse_started: - if time.mktime(cur_time) >= time.mktime(t): - return - else: - analyse_started = True - if cur_time.tm_mon != t.tm_mon: - generateMonthStats() - current_visits = deserialize(getDBFilename(t)) or createEmptyVisits() - elif cur_time.tm_mday != t.tm_mday: - generateDayStats() + cur_time = self.meta_infos['last_time'] - meta_visit['last_time'] = t + if cur_time == None: + self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits() + self.analyse_started = True + else: + if not self.analyse_started: + if time.mktime(cur_time) >= time.mktime(t): + return + else: + self.analyse_started = True + if cur_time.tm_mon != t.tm_mon: + self._generateMonthStats() + self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits() + elif cur_time.tm_mday != t.tm_mday: + self._generateDayStats() - if not decodeHTTPRequest(hit): return False + self.meta_infos['last_time'] = t - for k in hit.keys(): - if hit[k] == '-': hit[k] = '' + if not self._decodeHTTPRequest(hit): return False - appendHit(hit) + for k in hit.keys(): + if hit[k] == '-': hit[k] = '' - return True + self._appendHit(hit) -preloadPlugins() + return True -print '==> Analysing log' + def start(self): + self._preloadPlugins() -meta_visit = deserialize(META_PATH) or createEmptyMeta() -if meta_visit['last_time']: - current_visits = deserialize(getDBFilename(meta_visit['last_time'])) or createEmptyVisits() -else: - current_visits = createEmptyVisits() + print '==> Analysing log' -f = open(analyzed_filename) -for l in f: - # print "line " + l - - groups = log_re.match(l) - - if groups: - if not newHit(groups.groupdict()): - break - else: - print "No match " + l -f.close() + self.meta_infos = self._deserialize(META_PATH) or self._clearMeta() + if self.meta_infos['last_time']: + self.current_analysis = self._deserialize(self.getDBFilename(self.meta_infos['last_time'])) or self._clearVisits() + else: + self._clearVisits() -if analyse_started: - generateDayStats() - generateMonthStats() - serialize(meta_visit, META_PATH) -else: - print '==> Analyse not started : nothing to do' - generateMonthStats() + with open(analyzed_filename) as f: + for l in f: + # print "line " + l + + groups = self.log_re.match(l) + + if groups: + if not self._newHit(groups.groupdict()): + break + else: + print "No match for " + l + + if self.analyse_started: + self._generateDayStats() + self._generateMonthStats() + self._serialize(meta_infos, META_PATH) + else: + print '==> Analyse not started : nothing to do' + self._generateMonthStats() + +iwla = IWLA() +iwla.start()