diff --git a/conf.py b/conf.py index 5f957d6..da31821 100644 --- a/conf.py +++ b/conf.py @@ -16,11 +16,12 @@ DB_ROOT = './output/' DISPLAY_ROOT = './output/' pre_analysis_hooks = ['page_to_hit', 'robots'] -post_analysis_hooks = ['referers', 'top_pages', 'top_downloads'] +post_analysis_hooks = ['referers', 'top_pages', 'top_downloads', 'top_hits'] # post_analysis_hooks = ['top_visitors', 'reverse_dns'] -display_hooks = ['top_visitors', 'all_visits', 'referers', 'top_pages', 'top_downloads'] +display_hooks = ['top_visitors', 'all_visits', 'referers', 'top_pages', 'top_downloads', 'top_hits'] reverse_dns_timeout = 0.2 -page_to_hit_conf = [r'^.+/logo[/]?$', r'^.+/category/.+$', r'^.+/tag/.+$', r'^.+/archive/.+$', r'^.+/ljdc[/]?$'] +page_to_hit_conf = [r'^.+/logo[/]?$'] +hit_to_page_conf = [r'^.+/category/.+$', r'^.+/tag/.+$', r'^.+/archive/.+$', r'^.+/ljdc[/]?$'] count_hit_only_visitors = True diff --git a/plugins/display/top_downloads.py b/plugins/display/top_downloads.py index 47fedaf..970e104 100644 --- a/plugins/display/top_downloads.py +++ b/plugins/display/top_downloads.py @@ -29,7 +29,7 @@ class IWLADisplayTopDownloads(IPlugin): path = '%d/%s' % (cur_time.tm_year, filename) page = DisplayHTMLPage(title, path) - table = DisplayHTMLBlockTable('Top Downloads', ['URI', 'Hit']) + table = DisplayHTMLBlockTable('All Downloads', ['URI', 'Hit']) for (uri, entrance) in top_downloads: table.appendRow([uri, entrance]) page.appendBlock(table) diff --git a/plugins/display/top_pages.py b/plugins/display/top_pages.py index 8168b1a..100f97b 100644 --- a/plugins/display/top_pages.py +++ b/plugins/display/top_pages.py @@ -23,7 +23,7 @@ class IWLADisplayTopPages(IPlugin): index.appendBlock(table) cur_time = self.iwla.getCurTime() - title = time.strftime('Top Pages - %B %Y', cur_time) + title = time.strftime('All Pages - %B %Y', cur_time) filename = 'top_pages_%d.html' % (cur_time.tm_mon) path = '%d/%s' % (cur_time.tm_year, filename) diff --git a/plugins/post_analysis/top_pages.py b/plugins/post_analysis/top_pages.py index 106f903..bb71f9d 100644 --- a/plugins/post_analysis/top_pages.py +++ b/plugins/post_analysis/top_pages.py @@ -23,7 +23,9 @@ class IWLAPostAnalysisTopPages(IPlugin): for r in super_hit['requests']: if not r['is_page']: continue - if not self.iwla.isValidForCurrentAnalysis(r): continue + if not self.iwla.isValidForCurrentAnalysis(r) or\ + not self.iwla.hasBeenViewed(r): + continue uri = r['extract_request']['extract_uri'] if self.index_re.match(uri): diff --git a/plugins/pre_analysis/page_to_hit.py b/plugins/pre_analysis/page_to_hit.py index 7f18c91..51102c8 100644 --- a/plugins/pre_analysis/page_to_hit.py +++ b/plugins/pre_analysis/page_to_hit.py @@ -12,10 +12,15 @@ class IWLAPreAnalysisPageToHit(IPlugin): self.API_VERSION = 1 def load(self): -# Remove logo from indefero - self.regexps = self.iwla.getConfValue('page_to_hit_conf', []) - if not self.regexps: return False - self.regexps = map(lambda(r): re.compile(r), self.regexps) + # Page to hit + self.ph_regexps = self.iwla.getConfValue('page_to_hit_conf', []) + if not self.ph_regexps: return False + self.ph_regexps = map(lambda(r): re.compile(r), self.ph_regexps) + + # Hit to page + self.hp_regexps = self.iwla.getConfValue('hit_to_page_conf', []) + if not self.hp_regexps: return False + self.hp_regexps = map(lambda(r): re.compile(r), self.hp_regexps) return True @@ -29,12 +34,24 @@ class IWLAPreAnalysisPageToHit(IPlugin): if not self.iwla.isValidForCurrentAnalysis(request) or\ not self.iwla.hasBeenViewed(request): continue - if not request['is_page']: continue + uri = request['extract_request']['extract_uri'] - for regexp in self.regexps: - if regexp.match(uri): - #print '%s is an hit' % uri - request['is_page'] = False - super_hit['viewed_pages'] -= 1 - super_hit['viewed_hits'] += 1 - break + + if request['is_page']: + # Page to hit + for regexp in self.ph_regexps: + if regexp.match(uri): + #print '%s is a hit' % (uri ) + request['is_page'] = False + super_hit['viewed_pages'] -= 1 + super_hit['viewed_hits'] += 1 + break + else: + # Hit to page + for regexp in self.hp_regexps: + if regexp.match(uri): + #print '%s is a page' % (uri ) + request['is_page'] = True + super_hit['viewed_pages'] += 1 + super_hit['viewed_hits'] -= 1 + break