diff --git a/robots.py b/awstats_robots_data.py
similarity index 100%
rename from robots.py
rename to awstats_robots_data.py
diff --git a/conf.py b/conf.py
index c42a7ba..9a0f235 100644
--- a/conf.py
+++ b/conf.py
@@ -10,3 +10,5 @@ analyzed_filename = 'access.log'
 
 DB_ROOT = './output/'
 DISPLAY_ROOT = './output/'
+
+pre_analysis_hooks = ['H002_soutade.py', 'H001_robot.py']
diff --git a/hooks/pre_analysis/H001_soutade.py b/hooks/pre_analysis/H001_soutade.py
deleted file mode 100644
index 6d683be..0000000
--- a/hooks/pre_analysis/H001_soutade.py
+++ /dev/null
@@ -1,22 +0,0 @@
-import re
-
-# Remove logo from indefero
-logo_re = re.compile(r'^.+/logo/$')
-
-# Basic rule to detect robots
-
-def hook(hits):
-    for k in hits.keys():
-        super_hit = hits[k]
-
-        if super_hit['robot']: continue
-
-        for p in super_hit['pages']:
-            if not p['is_page']: continue
-            if int(p['status']) != 200: continue
-
-            if logo_re.match(p['extract_request']['extract_uri']):
-                p['is_page'] = False
-                if super_hit['viewed_pages']:
-                    super_hit['viewed_pages'] -= 1
-                super_hit['viewed_hits'] += 1
diff --git a/hooks/pre_analysis/H002_robot.py b/hooks/pre_analysis/H002_robot.py
deleted file mode 100644
index 8a6e721..0000000
--- a/hooks/pre_analysis/H002_robot.py
+++ /dev/null
@@ -1,42 +0,0 @@
-
-# Basic rule to detect robots
-
-def hook(hits):
-    for k in hits.keys():
-        super_hit = hits[k]
-
-        if super_hit['robot']: continue
-
-        isRobot = False
-        referers = 0
-
-# 1) no pages view --> robot
-        # if not super_hit['viewed_pages']:
-        #     super_hit['robot'] = 1
-        #     continue
-
-# 2) pages without hit --> robot
-        if not super_hit['viewed_hits']:
-            super_hit['robot'] = 1
-            continue
-        elif not super_hit['viewed_pages']:
-# Hit only
-            super_hit['hit_only'] = 1
-
-        for hit in super_hit['pages']:
-# 3) /robots.txt read
-            if hit['extract_request']['http_uri'] == '/robots.txt':
-                isRobot = True
-                break
-
-# 4) Any referer for hits
-            if not hit['is_page'] and hit['http_referer']:
-                referers += 1
-
-        if isRobot:
-            super_hit['robot'] = 1
-            continue
-
-        if super_hit['viewed_hits'] and not referers:
-            super_hit['robot'] = 1
-            continue
diff --git a/iwla.py b/iwla.py
index 8ef4f6a..a60d9ec 100755
--- a/iwla.py
+++ b/iwla.py
@@ -8,8 +8,6 @@ import imp
 import pickle
 import gzip
 
-from robots import awstats_robots;
-
 # Default configuration
 
 DB_ROOT = './output/'
@@ -21,6 +19,10 @@ log_format = '$server_name:$server_port $remote_addr - $remote_user [$time_local
 
 time_format = '%d/%b/%Y:%H:%M:%S +0100'
 
+pre_analysis_hooks = []
+post_analysis_hooks = []
+display_hooks = []
+
 from conf import *
 
 print '==> Start'
@@ -40,16 +42,36 @@ uri_re = re.compile(r'(?P<extract_uri>[^\?]*)[\?(?P<extract_parameters>.*)]?')
 pages_extensions = ['/', 'html', 'xhtml', 'py', 'pl', 'rb', 'php']
 viewed_http_codes = [200]
 
-HOOKS_ROOT = './hooks/'
-PRE_HOOK_DIRECTORY = HOOKS_ROOT + 'pre_analysis/*.py'
-POST_HOOK_DIRECTORY = HOOKS_ROOT + 'post_analysis/*.py'
-DISPLAY_HOOK_DIRECTORY = HOOKS_ROOT + 'display/*.py'
+HOOKS_ROOT = './plugins/'
+PRE_HOOK_DIRECTORY = HOOKS_ROOT + 'pre_analysis/'
+POST_HOOK_DIRECTORY = HOOKS_ROOT + 'post_analysis/'
+DISPLAY_HOOK_DIRECTORY = HOOKS_ROOT + 'display/'
 META_PATH = DB_ROOT + 'meta.db'
 DB_FILENAME = 'iwla.db'
 
-print '==> Generating robot dictionary'
+plugins = {PRE_HOOK_DIRECTORY : pre_analysis_hooks, POST_HOOK_DIRECTORY : post_analysis_hooks, DISPLAY_HOOK_DIRECTORY : display_hooks}
 
-awstats_robots = map(lambda (x) : re.compile(x, re.IGNORECASE), awstats_robots)
+ANALYSIS_CLASS = 'HTTP'
+API_VERSION = 1
+
+def preloadPlugins():
+    for root in plugins.keys():
+        for plugin_name in plugins[root]:
+            p = root + '/' + plugin_name
+            try:
+                mod = cache_plugins[p] = imp.load_source('hook', p)
+                infos = mod.get_plugins_infos()
+                if infos['class'] != ANALYSIS_CLASS or \
+                   API_VERSION < infos['min_version'] or\
+                   (infos['max_version'] != -1 and (API_VERSION > infos['max_version'])):
+                    del cache_plugins[p]
+                elif not mod.load():
+                    del cache_plugins[p]
+            except Exception as e:
+                print 'Error loading \'%s\' => %s' % (p, e)
+                return False
+    return True
+
 
 def createEmptyVisits():
     visits = {'days_stats' : {}, 'month_stats' : {}, 'visits' : {}}
@@ -97,17 +119,11 @@ def deserialize(filename):
             return pickle.load(f)
     return None
 
-def callPlugins(path, *kwargs):
-    print '==> Call plugins (%s)' % path
-    plugins = glob.glob(path)
-    plugins.sort()
-    for p in plugins:
+def callPlugins(root, *kwargs):
+    print '==> Call plugins (%s)' % root
+    for p in plugins[root]:
         print '\t%s' % (p)
-        if not p in cache_plugins:
-            mod = imp.load_source('hook', p)
-            cache_plugins[p] = mod
-        else:
-            mod = cache_plugins[p]
+        mod = cache_plugins[root + '/' + p]
         mod.hook(*kwargs)
 
 def isPage(request):
@@ -164,16 +180,10 @@ def createUser(hit):
     super_hit['bandwith'] = 0;
     super_hit['last_access'] = meta_visit['last_time']
     super_hit['pages'] = [];
-    super_hit['robot'] = isRobot(hit);
+    super_hit['robot'] = False
     super_hit['hit_only'] = 0;
     appendHit(hit)
 
-def isRobot(hit):
-    for r in awstats_robots:
-        if r.match(hit['http_user_agent']):
-            return True
-    return False
-
 def decodeHTTPRequest(hit):
     if not 'request' in hit.keys(): return False
 
@@ -385,6 +395,8 @@ def newHit(hit):
 
     return True
 
+preloadPlugins()
+
 print '==> Analysing log'
 
 meta_visit = deserialize(META_PATH) or createEmptyMeta()
diff --git a/plugins/pre_analysis/H001_robot.py b/plugins/pre_analysis/H001_robot.py
index 9ec45cb..91cd5fc 100644
--- a/plugins/pre_analysis/H001_robot.py
+++ b/plugins/pre_analysis/H001_robot.py
@@ -1,3 +1,23 @@
+import re
+
+from awstats_robots_data import awstats_robots
+
+PLUGIN_CLASS = 'HTTP'
+API_VERSION = 1
+
+def get_plugins_infos():
+    infos = {'class' : PLUGIN_CLASS,
+             'min_version' : API_VERSION,
+             'max_version' : -1}
+    return infos
+
+def load():
+    global awstats_robots
+    print '==> Generating robot dictionary'
+
+    awstats_robots = map(lambda (x) : re.compile(x, re.IGNORECASE), awstats_robots)
+
+    return True
 
 # Basic rule to detect robots
 
@@ -10,6 +30,11 @@ def hook(hits):
         isRobot = False
         referers = 0
 
+        for r in awstats_robots:
+            if r.match(super_hit['pages'][0]['http_user_agent']):
+                super_hit['robot'] = 1
+                continue
+
 # 1) no pages view --> robot
         if not super_hit['viewed_pages']:
             super_hit['robot'] = 1
diff --git a/plugins/pre_analysis/H002_soutade.py b/plugins/pre_analysis/H002_soutade.py
index d6767aa..f546d76 100644
--- a/plugins/pre_analysis/H002_soutade.py
+++ b/plugins/pre_analysis/H002_soutade.py
@@ -3,6 +3,18 @@ import re
 # Remove logo from indefero
 logo_re = re.compile(r'^.+/logo/$')
 
+PLUGIN_CLASS = 'HTTP'
+API_VERSION = 1
+
+def get_plugins_infos():
+    infos = {'class' : PLUGIN_CLASS,
+             'min_version' : API_VERSION,
+             'max_version' : -1}
+    return infos
+
+def load():
+    return True
+
 # Basic rule to detect robots
 
 def hook(hits):
@@ -13,6 +25,7 @@ def hook(hits):
         for p in super_hit['pages']:
             if not p['is_page']: continue
+            if int(p['status']) != 200: continue
 
             if logo_re.match(p['extract_request']['extract_uri']):
                 p['is_page'] = False
                 super_hit['viewed_pages'] -= 1