diff --git a/conf.py b/conf.py
index 7bf2c3b..76981bb 100644
--- a/conf.py
+++ b/conf.py
@@ -11,7 +11,7 @@ analyzed_filename = 'access.log'
 DB_ROOT = './output/'
 DISPLAY_ROOT = './output/'
 
-pre_analysis_hooks = ['H002_soutade', 'H001_robot']
+pre_analysis_hooks = ['soutade', 'robots']
 post_analysis_hooks = ['top_visitors', 'reverse_dns']
 display_hooks = ['top_visitors']
 
diff --git a/iplugin.py b/iplugin.py
new file mode 100644
index 0000000..e2701ae
--- /dev/null
+++ b/iplugin.py
@@ -0,0 +1,91 @@
+import importlib
+import inspect
+import traceback
+
+class IPlugin(object):
+    """Base class of every IWLA plugin.
+
+    Subclasses override hook() and optionally load(); self.requires holds
+    the class names of the plugins that must be registered beforehand.
+    """
+
+    def __init__(self, iwla):
+        self.iwla = iwla
+        self.requires = []
+        self.API_VERSION = 1
+        self.ANALYSIS_CLASS = 'HTTP'
+
+    def isValid(self, analysis_class, api_version):
+        """Tell whether the plugin supports this analysis class and API version."""
+        if analysis_class != self.ANALYSIS_CLASS: return False
+
+        # For now there is only version 1
+        if self.API_VERSION != api_version:
+            return False
+
+        return True
+
+    def getRequirements(self):
+        """Return the class names of the required plugins."""
+        return self.requires
+
+    def load(self):
+        """One-time setup; a false return value prevents registration."""
+        return True
+
+    def hook(self, iwla):
+        """Plugin entry point; does nothing by default."""
+        pass
+
+def preloadPlugins(plugins, iwla):
+    """Import, instantiate and register the configured plugins.
+
+    plugins maps a package path to the module names to import from it.
+    The first IPlugin subclass returned by inspect.getmembers() for each
+    module is instantiated with iwla. A plugin is skipped when it is not
+    valid for iwla.ANALYSIS_CLASS / iwla.API_VERSION, when one of its
+    requirements has not been registered earlier in this call, or when
+    its load() returns a false value.
+
+    Returns a dict mapping '<package>.<module>' to the plugin instance.
+    """
+    cache_plugins = {}
+
+    for root in plugins:
+        for plugin_filename in plugins[root]:
+            plugin_path = root + '.' + plugin_filename
+            try:
+                mod = importlib.import_module(plugin_path)
+                classes = [c for _, c in inspect.getmembers(mod, inspect.isclass)
+                           if issubclass(c, IPlugin) and c is not IPlugin]
+
+                if not classes:
+                    print 'No plugin defined in %s' % (plugin_path)
+                    continue
+
+                plugin = classes[0](iwla)
+
+                if not plugin.isValid(iwla.ANALYSIS_CLASS, iwla.API_VERSION):
+                    continue
+
+                # Check every requirement on its own: a single shared flag
+                # would stay set after the first match and hide any later
+                # requirement that is missing.
+                registered = set(p.__class__.__name__ for p in cache_plugins.values())
+                requirements = plugin.getRequirements() or []
+                if any(r not in registered for r in requirements):
+                    print 'Missing requirements for plugin %s' % (plugin_path)
+                    continue
+
+                if not plugin.load():
+                    print 'Plugin %s load failed' % (plugin_path)
+                    continue
+
+                print '\tRegister %s' % (plugin_path)
+                cache_plugins[plugin_path] = plugin
+            except Exception as e:
+                # Best effort: report the broken plugin and go on with the others
+                print 'Error loading \'%s\' => %s' % (plugin_path, e)
+                traceback.print_exc()
+
+    return cache_plugins
diff --git a/iwla.py b/iwla.py
index b7c4072..5c69a6e 100755
--- a/iwla.py
+++ b/iwla.py
@@ -10,6 +10,7 @@ import pickle
 import gzip
 import importlib
 
+from iplugin import *
 from display import *
 
 from default_conf import *
@@ -40,18 +41,13 @@ class IWLA(object):
                         DISPLAY_HOOK_DIRECTORY : display_hooks}
 
     def _preloadPlugins(self):
+        self.cache_plugins = preloadPlugins(self.plugins, self)
+        return
         ret = True
         for root in self.plugins.keys():
             for plugin_name in self.plugins[root]:
-                #p = root + '/' + plugin_name
                 p = root + '.' 
+ plugin_name
                 try:
-                    # fp, pathname, description = imp.find_module(plugin_name, [root])
-                    # self.cache_plugins[p] = imp.load_module(p, fp, pathname, description)
-                    #p = 'plugins.display.top_visitors'
-                    #sys.path.append(root)
-                    #self.cache_plugins[p] = importlib.import_module(plugin_name, root)
-                    #sys.path.remove(root)
                     self.cache_plugins[p] = importlib.import_module(p)
                     mod = self.cache_plugins[p]
                     infos = mod.get_plugins_infos()
diff --git a/plugins/display/top_visitors.py b/plugins/display/top_visitors.py
index 796af31..93f455a 100644
--- a/plugins/display/top_visitors.py
+++ b/plugins/display/top_visitors.py
@@ -1,37 +1,31 @@
 import time
+
+from iwla import IWLA
+from iplugin import IPlugin
 from display import *
 
-PLUGIN_CLASS = 'HTTP'
-API_VERSION = 1
+class IWLADisplayTopVisitors(IPlugin):
+    """Display plugin: adds a 'Top visitors' table to the display index."""
+
+    def __init__(self, iwla):
+        super(IWLADisplayTopVisitors, self).__init__(iwla)
+        self.API_VERSION = 1
+        # stats['top_visitors'] is filled by the post analysis plugin
+        self.requires = ['IWLAPostAnalysisTopVisitors']
 
-def get_plugins_infos():
-    infos = {
-        'class' : PLUGIN_CLASS,
-        'min_version' : API_VERSION,
-        'max_version' : -1
-    }
-    return infos
+    def hook(self, iwla):
+        """Append one table row per entry of stats['top_visitors']."""
+        stats = iwla.getMonthStats()
 
-def load():
-    return True
-
-def hook(iwla):
-    stats = iwla.getMonthStats()
-
-    top_visitors = stats.get('top_visitors', None)
-    if not top_visitors:
-        print 'Top visitors post analysis plugin not installed'
-        return
-
-    index = iwla.getDisplayIndex()
-    table = DisplayHTMLBlockTable('Top visitors', ['Host', 'Pages', 'Hits', 'Bandwidth', 'Last seen'])
-    for super_hit in top_visitors:
-        row = [
-            super_hit['remote_addr'],
-            super_hit['viewed_pages'],
-            super_hit['viewed_hits'],
-            bytesToStr(super_hit['bandwidth']),
-            time.asctime(super_hit['last_access'])
-        ]
-        table.appendRow(row)
-    index.appendBlock(table)
+        index = iwla.getDisplayIndex()
+        table = DisplayHTMLBlockTable('Top visitors', ['Host', 'Pages', 'Hits', 'Bandwidth', 'Last seen'])
+        for super_hit in stats['top_visitors']:
+            row = [
+                super_hit['remote_addr'],
+                super_hit['viewed_pages'],
+                super_hit['viewed_hits'],
+                bytesToStr(super_hit['bandwidth']),
+                time.asctime(super_hit['last_access'])
+            ]
+            table.appendRow(row)
+        index.appendBlock(table)
diff --git a/plugins/post_analysis/reverse_dns.py b/plugins/post_analysis/reverse_dns.py
index b935efa..10b3903 100644
--- a/plugins/post_analysis/reverse_dns.py
+++ b/plugins/post_analysis/reverse_dns.py
@@ -1,30 +1,30 @@
 import socket
 from iwla import IWLA
+from iplugin import IPlugin
 
-PLUGIN_CLASS = 'HTTP'
-API_VERSION = 1
+class IWLAPostAnalysisReverseDNS(IPlugin):
+    """Post analysis plugin: replaces visitor addresses by their host name."""
+
+    def __init__(self, iwla):
+        super(IWLAPostAnalysisReverseDNS, self).__init__(iwla)
+        self.API_VERSION = 1
 
-def get_plugins_infos():
-    infos = {
-        'class' : PLUGIN_CLASS,
-        'min_version' : API_VERSION,
-        'max_version' : -1
-    }
-    return infos
-
-def load():
-    socket.setdefaulttimeout(0.5)
-    return True
-
-def hook(iwla):
-    hits = iwla.getValidVisitors()
-    for (k, hit) in hits.items():
-        if hit.get('dns_analysed', False): continue
-        try:
-            name, _, _ = socket.gethostbyaddr(k)
-            hit['remote_addr'] = name
-        except:
-            pass
-        finally:
-            hit['dns_analysed'] = True
+    def load(self):
+        # Same process-wide timeout as the former load(); NOTE(review): confirm it is still wanted
+        socket.setdefaulttimeout(0.5)
+        return True
+
+    def hook(self, iwla):
+        """Resolve each valid visitor not analysed yet and flag it as analysed."""
+        hits = iwla.getValidVisitors()
+        for (k, hit) in hits.items():
+            if hit.get('dns_analysed', False): continue
+            try:
+                name, _, _ = socket.gethostbyaddr(k)
+                hit['remote_addr'] = name
+            except socket.error:
+                # Lookup failed: keep the address (herror and gaierror derive from socket.error)
+                pass
+            finally:
+                hit['dns_analysed'] = True
 
diff --git a/plugins/post_analysis/top_visitors.py b/plugins/post_analysis/top_visitors.py
index 345b947..c7de05b 100644
--- a/plugins/post_analysis/top_visitors.py
+++ b/plugins/post_analysis/top_visitors.py
@@ -1,23 +1,15 @@
 from iwla import IWLA
+from iplugin import IPlugin
 
-PLUGIN_CLASS = 'HTTP'
-API_VERSION = 1
+class IWLAPostAnalysisTopVisitors(IPlugin):
+    def __init__(self, iwla):
+        super(IWLAPostAnalysisTopVisitors, self).__init__(iwla)
+        self.API_VERSION = 1
 
-def get_plugins_infos():
-    infos = {
-        'class' : PLUGIN_CLASS,
-        'min_version' : API_VERSION,
-        'max_version' : -1
-    }
-    return infos
-
-def load():
-    return True
-
-def hook(iwla):
-    hits = iwla.getValidVisitors()
-    stats = iwla.getMonthStats()
-    top_bandwidth = [(k,hits[k]['bandwidth']) for (k,v) in hits.items()]
-    top_bandwidth = sorted(top_bandwidth, key=lambda t: t[1], reverse=True)
-    stats['top_visitors'] = [hits[h[0]] for h in top_bandwidth[:10]]
+    def hook(self, iwla):
+        """Store the ten valid visitors with the highest bandwidth in the month stats."""
+        visitors = iwla.getValidVisitors()
+        month_stats = iwla.getMonthStats()
+        ranked = sorted(visitors.values(), key=lambda v: v['bandwidth'], reverse=True)
+        month_stats['top_visitors'] = ranked[:10]
 
diff --git a/plugins/pre_analysis/H001_robot.py b/plugins/pre_analysis/H001_robot.py
deleted file mode 100644
index a299fa5..0000000
--- a/plugins/pre_analysis/H001_robot.py
+++ /dev/null
@@ -1,68 +0,0 @@
-import re
-from iwla import IWLA
-
-from awstats_robots_data import awstats_robots
-
-PLUGIN_CLASS = 'HTTP'
-API_VERSION = 1
-
-def get_plugins_infos():
-    infos = {'class' : PLUGIN_CLASS,
-             'min_version' : API_VERSION,
-             'max_version' : -1}
-    return infos
-
-def load():
-    global awstats_robots
-    print '==> Generating robot dictionary'
-
-    awstats_robots = map(lambda (x) : re.compile(x, re.IGNORECASE), awstats_robots)
-
-    return True
-
-# Basic rule to detect robots
-
-def hook(iwla):
-    hits = iwla.getCurrentVisists()
-    for k in hits.keys():
-        super_hit = hits[k]
-
-        if super_hit['robot']: continue
-
-        isRobot = False
-        referers = 0
-
-        first_page = super_hit['requests'][0]
-        if first_page['time_decoded'].tm_mday == super_hit['last_access'].tm_mday:
-            for r in awstats_robots:
-                if r.match(first_page['http_user_agent']):
-                    super_hit['robot'] = 1
-                    continue
-
-# 1) no pages view --> robot
-        if not super_hit['viewed_pages']:
-            super_hit['robot'] = 1
-            continue
-
-# 2) pages without hit --> robot
-        if not super_hit['viewed_hits']:
-            super_hit['robot'] = 1
-            continue
-
-        for hit in super_hit['requests']:
-# 3) /robots.txt read
-            if hit['extract_request']['http_uri'] == '/robots.txt':
-                isRobot = True
-                break
-
-# 4) Any referer for hits
-            if not hit['is_page'] and hit['http_referer']:
-                referers += 1
-
-        if isRobot:
-            super_hit['robot'] = 1
-            continue
-
-        if super_hit['viewed_hits'] and not referers:
-            super_hit['robot'] = 1
-            continue
diff --git a/plugins/pre_analysis/H002_soutade.py b/plugins/pre_analysis/H002_soutade.py
deleted file mode 100644
index b893715..0000000
--- a/plugins/pre_analysis/H002_soutade.py
+++ /dev/null
@@ -1,38 +0,0 @@
-import re
-from iwla import IWLA
-
-# Remove logo from indefero
-logo_re = re.compile(r'^.+/logo/$')
-
-PLUGIN_CLASS = 'HTTP'
-API_VERSION = 1
-
-def get_plugins_infos():
-    infos = {
-        'class' : PLUGIN_CLASS,
-        'min_version' : API_VERSION,
-        'max_version' : -1
-    }
-    return infos
-
-def load():
-    return True
-
-# Basic rule to detect robots
-
-def hook(iwla):
-    hits = iwla.getCurrentVisists()
-
-    for k in hits.keys():
-        super_hit = hits[k]
-
-        if super_hit['robot']: continue
-
-        for p in super_hit['requests']:
-            if not p['is_page']: continue
-            if int(p['status']) != 200: continue
-            if p['time_decoded'].tm_mday != super_hit['last_access'].tm_mday: continue
-            if logo_re.match(p['extract_request']['extract_uri']):
-                p['is_page'] = False
-                super_hit['viewed_pages'] -= 1
-                super_hit['viewed_hits'] += 1
diff --git a/plugins/pre_analysis/robots.py b/plugins/pre_analysis/robots.py
new file mode 100644
index 0000000..596552e
--- /dev/null
+++ b/plugins/pre_analysis/robots.py
@@ -0,0 +1,64 @@
+import re
+
+from iwla import IWLA
+from iplugin import IPlugin
+
+from awstats_robots_data import awstats_robots
+
+class IWLAPreAnalysisRobots(IPlugin):
+    """Pre analysis plugin: flags the current visits that look like robots."""
+
+    def __init__(self, iwla):
+        super(IWLAPreAnalysisRobots, self).__init__(iwla)
+        self.API_VERSION = 1
+
+    def load(self):
+        """Compile the awstats robot patterns once, ignoring case."""
+        self.awstats_robots = [re.compile(x, re.IGNORECASE) for x in awstats_robots]
+
+        return True
+
+    def hook(self, iwla):
+        """Set super_hit['robot'] to 1 for every visit matching one of the rules below."""
+        hits = iwla.getCurrentVisists()
+        for super_hit in hits.values():
+            if super_hit['robot']: continue
+
+            # 0) user agent of the first request matches a known robot
+            first_page = super_hit['requests'][0]
+            if first_page['time_decoded'].tm_mday == super_hit['last_access'].tm_mday:
+                user_agent = first_page['http_user_agent']
+                if any(r.match(user_agent) for r in self.awstats_robots):
+                    super_hit['robot'] = 1
+                    continue
+
+            # 1) no pages view --> robot
+            if not super_hit['viewed_pages']:
+                super_hit['robot'] = 1
+                continue
+
+            # 2) pages without hit --> robot
+            if not super_hit['viewed_hits']:
+                super_hit['robot'] = 1
+                continue
+
+            isRobot = False
+            referers = 0
+            for hit in super_hit['requests']:
+                # 3) /robots.txt read
+                if hit['extract_request']['http_uri'] == '/robots.txt':
+                    isRobot = True
+                    break
+
+                # 4) Any referer for hits
+                if not hit['is_page'] and hit['http_referer']:
+                    referers += 1
+
+            if isRobot:
+                super_hit['robot'] = 1
+                continue
+
+            # No hit carried a referer --> robot
+            if super_hit['viewed_hits'] and not referers:
+                super_hit['robot'] = 1
+                continue
diff --git a/plugins/pre_analysis/soutade.py b/plugins/pre_analysis/soutade.py
new file mode 100644
index 0000000..0ec4e69
--- /dev/null
+++ b/plugins/pre_analysis/soutade.py
@@ -0,0 +1,33 @@
+import re
+
+from iwla import IWLA
+from iplugin import IPlugin
+
+class IWLAPreAnalysisSoutade(IPlugin):
+    """Pre analysis plugin: counts indefero logo requests as hits, not pages."""
+
+    def __init__(self, iwla):
+        super(IWLAPreAnalysisSoutade, self).__init__(iwla)
+        self.API_VERSION = 1
+
+    def load(self):
+        # Remove logo from indefero
+        self.logo_re = re.compile(r'^.+/logo/$')
+
+        return True
+
+    def hook(self, iwla):
+        """Turn matching page requests of non robot visits into plain hits."""
+        hits = iwla.getCurrentVisists()
+
+        for super_hit in hits.values():
+            if super_hit['robot']: continue
+
+            for p in super_hit['requests']:
+                if not p['is_page']: continue
+                if int(p['status']) != 200: continue
+                if p['time_decoded'].tm_mday != super_hit['last_access'].tm_mday: continue
+                if self.logo_re.match(p['extract_request']['extract_uri']):
+                    p['is_page'] = False
+                    super_hit['viewed_pages'] -= 1
+                    super_hit['viewed_hits'] += 1