From 7405cf237acfbc4fa6d4a7d5a5e3f1d6ae05b368 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gr=C3=A9gory=20Soutad=C3=A9?= Date: Tue, 25 Nov 2014 16:22:07 +0100 Subject: [PATCH] Do a more generic plugin : page_to_hit --- conf.py | 3 ++- iplugin.py | 2 ++ iwla.py | 13 +++++----- plugins/pre_analysis/page_to_hit.py | 38 +++++++++++++++++++++++++++++ plugins/pre_analysis/soutade.py | 35 -------------------------- 5 files changed, 48 insertions(+), 43 deletions(-) create mode 100644 plugins/pre_analysis/page_to_hit.py delete mode 100644 plugins/pre_analysis/soutade.py diff --git a/conf.py b/conf.py index e491559..e4c6378 100644 --- a/conf.py +++ b/conf.py @@ -11,11 +11,12 @@ analyzed_filename = 'access.log' DB_ROOT = './output/' DISPLAY_ROOT = './output/' -pre_analysis_hooks = ['soutade', 'robots'] +pre_analysis_hooks = ['page_to_hit', 'robots'] post_analysis_hooks = ['top_visitors', 'reverse_dns'] display_hooks = ['top_visitors'] reverse_dns_timeout = 0.2 +page_to_hit_conf = [r'^.+/logo/$'] # pre_analysis_hooks = ['H002_soutade.py', 'H001_robot.py'] # post_analysis_hooks = ['top_visitors.py'] # display_hooks = ['top_visitors.py'] diff --git a/iplugin.py b/iplugin.py index 91bfdc1..d13e62a 100644 --- a/iplugin.py +++ b/iplugin.py @@ -31,6 +31,8 @@ class IPlugin(object): def preloadPlugins(plugins, iwla): cache_plugins = {} + print "==> Preload plugins" + for root in plugins.keys(): for plugin_filename in plugins[root]: plugin_path = root + '.' + plugin_filename diff --git a/iwla.py b/iwla.py index 054ec1a..62deb66 100755 --- a/iwla.py +++ b/iwla.py @@ -1,11 +1,8 @@ #!/usr/bin/env python -import sys import os import re import time -import glob -import imp import pickle import gzip import importlib @@ -126,7 +123,7 @@ class IWLA(object): remote_addr = hit['remote_addr'] if not remote_addr in self.current_analysis['visits'].keys(): - self._createUser(hit) + self._createVisitor(hit) return super_hit = self.current_analysis['visits'][remote_addr] @@ -160,7 +157,7 @@ class IWLA(object): else: super_hit[hit_key] += 1 - def _createUser(self, hit): + def _createVisitor(self, hit): super_hit = self.current_analysis['visits'][hit['remote_addr']] = {} super_hit['remote_addr'] = hit['remote_addr'] super_hit['viewed_pages'] = 0 @@ -347,7 +344,7 @@ class IWLA(object): else: if not self.analyse_started: if time.mktime(cur_time) >= time.mktime(t): - return + return False else: self.analyse_started = True if cur_time.tm_mon != t.tm_mon: @@ -370,7 +367,7 @@ class IWLA(object): def start(self): self.cache_plugins = preloadPlugins(self.plugins, self) - print '==> Analysing log' + print '==> Analyse previous database' self.meta_infos = self._deserialize(conf.META_PATH) or self._clearMeta() if self.meta_infos['last_time']: @@ -378,6 +375,8 @@ class IWLA(object): else: self._clearVisits() + print '==> Analysing log' + with open(conf.analyzed_filename) as f: for l in f: # print "line " + l diff --git a/plugins/pre_analysis/page_to_hit.py b/plugins/pre_analysis/page_to_hit.py new file mode 100644 index 0000000..8c046b6 --- /dev/null +++ b/plugins/pre_analysis/page_to_hit.py @@ -0,0 +1,38 @@ +import re + +from iwla import IWLA +from iplugin import IPlugin + +# Basic rule to detect robots + +class IWLAPreAnalysisPageToHit(IPlugin): + + def __init__(self, iwla): + super(IWLAPreAnalysisPageToHit, self).__init__(iwla) + self.API_VERSION = 1 + + def load(self): +# Remove logo from indefero + self.regexps = self.iwla.getConfValue('page_to_hit_conf', []) + if not self.regexps: return False + self.regexps = map(lambda(r): re.compile(r), self.regexps) + + return True + + def hook(self, iwla): + hits = iwla.getCurrentVisists() + + for (k, super_hit) in hits.items(): + if super_hit['robot']: continue + + for p in super_hit['requests']: + if not p['is_page']: continue + if int(p['status']) != 200: continue + if p['time_decoded'].tm_mday != super_hit['last_access'].tm_mday: continue + uri = p['extract_request']['extract_uri'] + for r in self.regexps: + if r.match(uri): + p['is_page'] = False + super_hit['viewed_pages'] -= 1 + super_hit['viewed_hits'] += 1 + break diff --git a/plugins/pre_analysis/soutade.py b/plugins/pre_analysis/soutade.py deleted file mode 100644 index 0ec4e69..0000000 --- a/plugins/pre_analysis/soutade.py +++ /dev/null @@ -1,35 +0,0 @@ -import re - -from iwla import IWLA -from iplugin import IPlugin - -# Basic rule to detect robots - -class IWLAPreAnalysisSoutade(IPlugin): - - def __init__(self, iwla): - super(IWLAPreAnalysisSoutade, self).__init__(iwla) - self.API_VERSION = 1 - - def load(self): -# Remove logo from indefero - self.logo_re = re.compile(r'^.+/logo/$') - - return True - - def hook(self, iwla): - hits = iwla.getCurrentVisists() - - for k in hits.keys(): - super_hit = hits[k] - - if super_hit['robot']: continue - - for p in super_hit['requests']: - if not p['is_page']: continue - if int(p['status']) != 200: continue - if p['time_decoded'].tm_mday != super_hit['last_access'].tm_mday: continue - if self.logo_re.match(p['extract_request']['extract_uri']): - p['is_page'] = False - super_hit['viewed_pages'] -= 1 - super_hit['viewed_hits'] += 1