diff --git a/conf.py b/conf.py index 6d9af11..2a84620 100644 --- a/conf.py +++ b/conf.py @@ -16,7 +16,7 @@ DB_ROOT = './output/' DISPLAY_ROOT = './output/' pre_analysis_hooks = ['page_to_hit', 'robots'] -post_analysis_hooks = ['top_visitors'] +post_analysis_hooks = ['referers'] # post_analysis_hooks = ['top_visitors', 'reverse_dns'] display_hooks = ['top_visitors', 'all_visits', 'referers'] diff --git a/plugins/display/referers.py b/plugins/display/referers.py index a8bd891..be743f0 100644 --- a/plugins/display/referers.py +++ b/plugins/display/referers.py @@ -1,120 +1,21 @@ import time -import re -import xml.sax.saxutils as saxutils from iwla import IWLA from iplugin import IPlugin from display import * -import awstats_data - class IWLADisplayReferers(IPlugin): def __init__(self, iwla): super(IWLADisplayReferers, self).__init__(iwla) self.API_VERSION = 1 - - def _getSearchEngine(self, hashid): - #print 'Look for %s' % engine - for (k, e) in self.search_engines.items(): - for (h,h_re) in e['hashid']: - if hashid == h: - return k - #print 'Not found %s' % (hashid) - return None - - def load(self): - domain_name = self.iwla.getConfValue('domain_name', '') - - if not domain_name: - print 'domain_name required in conf' - return False - - self.own_domain_re = re.compile(r'.*%s.*' % (domain_name)) - self.search_engines = {} - - for (hashid, name) in awstats_data.search_engines_hashid.items(): - hashid_re = re.compile(r'.*%s.*' % (hashid)) - if not name in self.search_engines.keys(): - self.search_engines[name] = { - 'hashid' : [(hashid, hashid_re)] - } - else: - self.search_engines[name]['hashid'].append((hashid, hashid_re)) - #print 'Hashid %s => %s' % (name, hashid) - - for (name, known_url) in awstats_data.search_engines_knwown_url.items(): - self.search_engines[name]['known_url'] = re.compile(known_url + '(?P<key_phrase>.+)') - - for (engine, not_engine) in awstats_data.not_search_engines_keys.items(): - not_engine_re = re.compile(r'.*%s.*' % (not_engine)) - key = 
self._getSearchEngine(engine) - if key: - self.search_engines[key]['not_search_engine'] = not_engine_re - - #self.html_parser = html.parser.HTMLParser() - - return True - - def _extractKeyPhrase(self, key_phrase_re, parameters, key_phrases): - if not parameters or not key_phrase_re: return - - for p in parameters.split('&'): - groups = key_phrase_re.match(p) - if groups: - key_phrase = groups.groupdict()['key_phrase'] - key_phrase = key_phrase.replace('+', ' ').lower() - key_phrase = saxutils.unescape(key_phrase) - if not key_phrase in key_phrases.keys(): - key_phrases[key_phrase] = 1 - else: - key_phrases[key_phrase] += 1 - break + self.requires = ['IWLAPostAnalysisReferers'] def hook(self): - stats = self.iwla.getCurrentVisists() - referers = {} - robots_referers = {} - search_engine_referers = {} - key_phrases = {} - - for (k, super_hit) in stats.items(): - for r in super_hit['requests']: - if not r['http_referer']: continue - - uri = r['extract_referer']['extract_uri'] - is_search_engine = False - - if self.own_domain_re.match(uri): continue - - for (name, engine) in self.search_engines.items(): - for (hashid, hashid_re) in engine['hashid']: - if not hashid_re.match(uri): continue - - not_engine = engine.get('not_search_engine', None) - # Try not engine - if not_engine and not_engine.match(uri): break - is_search_engine = True - uri = name - - parameters = r['extract_referer'].get('extract_parameters', None) - key_phrase_re = engine.get('known_url', None) - - self._extractKeyPhrase(key_phrase_re, parameters, key_phrases) - break - - if is_search_engine: - dictionary = search_engine_referers - elif super_hit['robot']: - dictionary = robots_referers - # print '%s => %s' % (uri, super_hit['remote_ip']) - else: - dictionary = referers - if r['is_page']: - key = 'pages' - else: - key = 'hits' - if not uri in dictionary: dictionary[uri] = {'pages':0, 'hits':0} - dictionary[uri][key] += 1 + month_stats = self.iwla.getMonthStats() + referers = 
month_stats.get('referers', {}) + robots_referers = month_stats.get('robots_referers', {}) + search_engine_referers = month_stats.get('search_engine_referers', {}) + key_phrases = month_stats.get('key_phrases', {}) top_referers = [(k, referers[k]['pages']) for k in referers.keys()] top_referers = sorted(top_referers, key=lambda t: t[1], reverse=True) diff --git a/plugins/display/top_visitors.py b/plugins/display/top_visitors.py index 0da035b..b60f5ec 100644 --- a/plugins/display/top_visitors.py +++ b/plugins/display/top_visitors.py @@ -8,14 +8,17 @@ class IWLADisplayTopVisitors(IPlugin): def __init__(self, iwla): super(IWLADisplayTopVisitors, self).__init__(iwla) self.API_VERSION = 1 - self.requires = ['IWLAPostAnalysisTopVisitors'] def hook(self): - stats = self.iwla.getMonthStats() + hits = self.iwla.getValidVisitors() + + top_bandwidth = [(k,hits[k]['bandwidth']) for k in hits.keys()] + top_bandwidth = sorted(top_bandwidth, key=lambda t: t[1], reverse=True) + top_visitors = [hits[h[0]] for h in top_bandwidth[:10]] index = self.iwla.getDisplayIndex() table = DisplayHTMLBlockTable('Top visitors', ['Host', 'Pages', 'Hits', 'Bandwidth', 'Last seen']) - for super_hit in stats['top_visitors']: + for super_hit in top_visitors: address = super_hit['remote_addr'] if self.iwla.getConfValue('display_visitor_ip', False) and\ super_hit.get('dns_name_replaced', False): diff --git a/plugins/post_analysis/referers.py b/plugins/post_analysis/referers.py new file mode 100644 index 0000000..c21da87 --- /dev/null +++ b/plugins/post_analysis/referers.py @@ -0,0 +1,121 @@ +import time +import re +import xml.sax.saxutils as saxutils + +from iwla import IWLA +from iplugin import IPlugin + +import awstats_data + +class IWLAPostAnalysisReferers(IPlugin): + def __init__(self, iwla): + super(IWLAPostAnalysisReferers, self).__init__(iwla) + self.API_VERSION = 1 + + def _getSearchEngine(self, hashid): + for (k, e) in self.search_engines.items(): + for (h,h_re) in e['hashid']: + if hashid == 
h: + return k + return None + + def load(self): + domain_name = self.iwla.getConfValue('domain_name', '') + + if not domain_name: + print 'domain_name required in conf' + return False + + self.own_domain_re = re.compile(r'.*%s.*' % (domain_name)) + self.search_engines = {} + + for (hashid, name) in awstats_data.search_engines_hashid.items(): + hashid_re = re.compile(r'.*%s.*' % (hashid)) + if not name in self.search_engines.keys(): + self.search_engines[name] = { + 'hashid' : [(hashid, hashid_re)] + } + else: + self.search_engines[name]['hashid'].append((hashid, hashid_re)) + #print 'Hashid %s => %s' % (name, hashid) + + for (name, known_url) in awstats_data.search_engines_knwown_url.items(): + self.search_engines[name]['known_url'] = re.compile(known_url + '(?P<key_phrase>.+)') + + for (engine, not_engine) in awstats_data.not_search_engines_keys.items(): + not_engine_re = re.compile(r'.*%s.*' % (not_engine)) + key = self._getSearchEngine(engine) + if key: + self.search_engines[key]['not_search_engine'] = not_engine_re + + return True + + def _extractKeyPhrase(self, key_phrase_re, parameters, key_phrases): + if not parameters or not key_phrase_re: return + + for p in parameters.split('&'): + groups = key_phrase_re.match(p) + if groups: + key_phrase = groups.groupdict()['key_phrase'] + key_phrase = key_phrase.replace('+', ' ').lower() + key_phrase = saxutils.unescape(key_phrase) + if not key_phrase in key_phrases.keys(): + key_phrases[key_phrase] = 1 + else: + key_phrases[key_phrase] += 1 + break + + def hook(self): + start_time = self.iwla.getStartAnalysisTime() + start_time = time.mktime(start_time) + stats = self.iwla.getCurrentVisists() + month_stats = self.iwla.getMonthStats() + referers = month_stats.get('referers', {}) + robots_referers = month_stats.get('robots_referers', {}) + search_engine_referers = month_stats.get('search_engine_referers', {}) + key_phrases = month_stats.get('key_phrases', {}) + + for (k, super_hit) in stats.items(): + for r in 
super_hit['requests']: + if time.mktime(r['time_decoded']) < start_time: continue + if not r['http_referer']: continue + + uri = r['extract_referer']['extract_uri'] + is_search_engine = False + + if self.own_domain_re.match(uri): continue + + for (name, engine) in self.search_engines.items(): + for (hashid, hashid_re) in engine['hashid']: + if not hashid_re.match(uri): continue + + not_engine = engine.get('not_search_engine', None) + # Try not engine + if not_engine and not_engine.match(uri): break + is_search_engine = True + uri = name + + parameters = r['extract_referer'].get('extract_parameters', None) + key_phrase_re = engine.get('known_url', None) + + self._extractKeyPhrase(key_phrase_re, parameters, key_phrases) + break + + if is_search_engine: + dictionary = search_engine_referers + elif super_hit['robot']: + dictionary = robots_referers + # print '%s => %s' % (uri, super_hit['remote_ip']) + else: + dictionary = referers + if r['is_page']: + key = 'pages' + else: + key = 'hits' + if not uri in dictionary: dictionary[uri] = {'pages':0, 'hits':0} + dictionary[uri][key] += 1 + + month_stats['referers'] = referers + month_stats['robots_referers'] = robots_referers + month_stats['search_engine_referers'] = search_engine_referers + month_stats['key_phrases'] = key_phrases diff --git a/plugins/post_analysis/top_visitors.py b/plugins/post_analysis/top_visitors.py deleted file mode 100644 index d20d5ca..0000000 --- a/plugins/post_analysis/top_visitors.py +++ /dev/null @@ -1,15 +0,0 @@ -from iwla import IWLA -from iplugin import IPlugin - -class IWLAPostAnalysisTopVisitors(IPlugin): - def __init__(self, iwla): - super(IWLAPostAnalysisTopVisitors, self).__init__(iwla) - self.API_VERSION = 1 - - def hook(self): - hits = self.iwla.getValidVisitors() - stats = self.iwla.getMonthStats() - top_bandwidth = [(k,hits[k]['bandwidth']) for k in hits.keys()] - top_bandwidth = sorted(top_bandwidth, key=lambda t: t[1], reverse=True) - stats['top_visitors'] = [hits[h[0]] for h in 
top_bandwidth[:10]] -