diff --git a/conf.py b/conf.py
index 2a84620..7b205cb 100644
--- a/conf.py
+++ b/conf.py
@@ -16,9 +16,9 @@ DB_ROOT = './output/'
 DISPLAY_ROOT = './output/'
 
 pre_analysis_hooks = ['page_to_hit', 'robots']
-post_analysis_hooks = ['referers']
+post_analysis_hooks = ['referers', 'top_pages']
 # post_analysis_hooks = ['top_visitors', 'reverse_dns']
-display_hooks = ['top_visitors', 'all_visits', 'referers']
+display_hooks = ['top_visitors', 'all_visits', 'referers', 'top_pages']
 
 reverse_dns_timeout = 0.2
 page_to_hit_conf = [r'^.+/logo/$']
diff --git a/iwla.py b/iwla.py
index a11016b..ec8b40c 100755
--- a/iwla.py
+++ b/iwla.py
@@ -140,9 +140,9 @@ class IWLA(object):
         request = hit['extract_request']
 
-        if 'extract_uri' in request.keys():
-            uri = request['extract_uri']
-        else:
-            uri = request['http_uri']
+        # Make sure 'extract_uri' is always set: plugins read it directly.
+        if 'extract_uri' not in request:
+            request['extract_uri'] = request['http_uri']
+        uri = request['extract_uri']
 
         hit['is_page'] = self.isPage(uri)
 
diff --git a/plugins/display/top_pages.py b/plugins/display/top_pages.py
new file mode 100644
index 0000000..8168b1a
--- /dev/null
+++ b/plugins/display/top_pages.py
@@ -0,0 +1,54 @@
+import time
+
+from iwla import IWLA
+from iplugin import IPlugin
+from display import *
+
+class IWLADisplayTopPages(IPlugin):
+    """Display plugin: show the most requested pages of the current month.
+
+    Adds a 'Top Pages' table (10 first entries) to the index and generates
+    a separate page listing every URI found in the 'top_pages' month stats.
+    """
+
+    def __init__(self, iwla):
+        super(IWLADisplayTopPages, self).__init__(iwla)
+        self.API_VERSION = 1
+        # 'top_pages' month stats are filled by the post analysis plugin.
+        self.requires = ['IWLAPostAnalysisTopPages']
+
+    def hook(self):
+        """Build the top pages summary block and the full listing page."""
+        top_pages = self.iwla.getMonthStats()['top_pages']
+
+        # Highest counts first.
+        top_pages = sorted(top_pages.items(), key=lambda t: t[1], reverse=True)
+
+        index = self.iwla.getDisplayIndex()
+
+        # Short version (10 first entries) for the index.
+        table = DisplayHTMLBlockTable('Top Pages', ['URI', 'Entrance'])
+        for (uri, entrance) in top_pages[:10]:
+            table.appendRow([uri, entrance])
+        index.appendBlock(table)
+
+        cur_time = self.iwla.getCurTime()
+        title = time.strftime('Top Pages - %B %Y', cur_time)
+
+        filename = 'top_pages_%d.html' % (cur_time.tm_mon)
+        path = '%d/%s' % (cur_time.tm_year, filename)
+
+        # Full listing on its own page.
+        page = DisplayHTMLPage(title, path)
+        table = DisplayHTMLBlockTable('Top Pages', ['URI', 'Entrance'])
+        for (uri, entrance) in top_pages:
+            table.appendRow([uri, entrance])
+        page.appendBlock(table)
+
+        display = self.iwla.getDisplay()
+        display.addPage(page)
+
+        # Link from the index to the full listing.
+        block = DisplayHTMLRawBlock()
+        block.setRawHTML('<a href="%s">All pages</a>' % (filename))
+        index.appendBlock(block)
diff --git a/plugins/post_analysis/referers.py b/plugins/post_analysis/referers.py
index c21da87..af3b9f5 100644
--- a/plugins/post_analysis/referers.py
+++ b/plugins/post_analysis/referers.py
@@ -70,6 +70,7 @@ class IWLAPostAnalysisReferers(IPlugin):
         start_time = time.mktime(start_time)
         stats = self.iwla.getCurrentVisists()
         month_stats = self.iwla.getMonthStats()
+
         referers = month_stats.get('referers', {})
         robots_referers = month_stats.get('robots_referers', {})
         search_engine_referers = month_stats.get('search_engine_referers', {})
diff --git a/plugins/post_analysis/top_pages.py b/plugins/post_analysis/top_pages.py
new file mode 100644
index 0000000..e4f01e7
--- /dev/null
+++ b/plugins/post_analysis/top_pages.py
@@ -0,0 +1,49 @@
+import time
+import re
+
+from iwla import IWLA
+from iplugin import IPlugin
+
+class IWLAPostAnalysisTopPages(IPlugin):
+    """Post analysis plugin: count page requests per URI for the month.
+
+    Results are stored in the 'top_pages' month stats as {uri: count}.
+    """
+
+    def __init__(self, iwla):
+        super(IWLAPostAnalysisTopPages, self).__init__(iwla)
+        self.API_VERSION = 1
+
+    def load(self):
+        """Compile the pattern used to fold '/index*' URIs into '/'."""
+        self.index_re = re.compile(r'/index.*')
+        return True
+
+    def hook(self):
+        """Add the page requests of the current analysis to 'top_pages'."""
+        start_time = self.iwla.getStartAnalysisTime()
+        start_time = time.mktime(start_time)
+
+        stats = self.iwla.getCurrentVisists()
+        month_stats = self.iwla.getMonthStats()
+
+        top_pages = month_stats.get('top_pages', {})
+
+        for super_hit in stats.values():
+            if super_hit['robot']: continue
+            for r in super_hit['requests']:
+                if not r['is_page']: continue
+
+                # Skip requests older than the start of this analysis.
+                if time.mktime(r['time_decoded']) < start_time: continue
+
+                uri = r['extract_request']['extract_uri']
+                if self.index_re.match(uri):
+                    uri = '/'
+
+                # Prefix with the server name when the request carries one.
+                uri = "%s%s" % (r.get('server_name', ''), uri)
+
+                top_pages[uri] = top_pages.get(uri, 0) + 1
+
+        month_stats['top_pages'] = top_pages