Add hit_to_page_conf in addition to page_to_hit_conf

This commit is contained in:
Grégory Soutadé 2014-11-27 13:46:58 +01:00
parent 5ccc63c7ae
commit c87ddfb1aa
5 changed files with 38 additions and 18 deletions

View File

@ -16,11 +16,12 @@ DB_ROOT = './output/'
DISPLAY_ROOT = './output/' DISPLAY_ROOT = './output/'
pre_analysis_hooks = ['page_to_hit', 'robots'] pre_analysis_hooks = ['page_to_hit', 'robots']
post_analysis_hooks = ['referers', 'top_pages', 'top_downloads'] post_analysis_hooks = ['referers', 'top_pages', 'top_downloads', 'top_hits']
# post_analysis_hooks = ['top_visitors', 'reverse_dns'] # post_analysis_hooks = ['top_visitors', 'reverse_dns']
display_hooks = ['top_visitors', 'all_visits', 'referers', 'top_pages', 'top_downloads'] display_hooks = ['top_visitors', 'all_visits', 'referers', 'top_pages', 'top_downloads', 'top_hits']
reverse_dns_timeout = 0.2 reverse_dns_timeout = 0.2
page_to_hit_conf = [r'^.+/logo[/]?$', r'^.+/category/.+$', r'^.+/tag/.+$', r'^.+/archive/.+$', r'^.+/ljdc[/]?$'] page_to_hit_conf = [r'^.+/logo[/]?$']
hit_to_page_conf = [r'^.+/category/.+$', r'^.+/tag/.+$', r'^.+/archive/.+$', r'^.+/ljdc[/]?$']
count_hit_only_visitors = True count_hit_only_visitors = True

View File

@ -29,7 +29,7 @@ class IWLADisplayTopDownloads(IPlugin):
path = '%d/%s' % (cur_time.tm_year, filename) path = '%d/%s' % (cur_time.tm_year, filename)
page = DisplayHTMLPage(title, path) page = DisplayHTMLPage(title, path)
table = DisplayHTMLBlockTable('Top Downloads', ['URI', 'Hit']) table = DisplayHTMLBlockTable('All Downloads', ['URI', 'Hit'])
for (uri, entrance) in top_downloads: for (uri, entrance) in top_downloads:
table.appendRow([uri, entrance]) table.appendRow([uri, entrance])
page.appendBlock(table) page.appendBlock(table)

View File

@ -23,7 +23,7 @@ class IWLADisplayTopPages(IPlugin):
index.appendBlock(table) index.appendBlock(table)
cur_time = self.iwla.getCurTime() cur_time = self.iwla.getCurTime()
title = time.strftime('Top Pages - %B %Y', cur_time) title = time.strftime('All Pages - %B %Y', cur_time)
filename = 'top_pages_%d.html' % (cur_time.tm_mon) filename = 'top_pages_%d.html' % (cur_time.tm_mon)
path = '%d/%s' % (cur_time.tm_year, filename) path = '%d/%s' % (cur_time.tm_year, filename)

View File

@ -23,7 +23,9 @@ class IWLAPostAnalysisTopPages(IPlugin):
for r in super_hit['requests']: for r in super_hit['requests']:
if not r['is_page']: continue if not r['is_page']: continue
if not self.iwla.isValidForCurrentAnalysis(r): continue if not self.iwla.isValidForCurrentAnalysis(r) or\
not self.iwla.hasBeenViewed(r):
continue
uri = r['extract_request']['extract_uri'] uri = r['extract_request']['extract_uri']
if self.index_re.match(uri): if self.index_re.match(uri):

View File

@ -12,10 +12,15 @@ class IWLAPreAnalysisPageToHit(IPlugin):
self.API_VERSION = 1 self.API_VERSION = 1
def load(self): def load(self):
# Remove logo from indefero # Page to hit
self.regexps = self.iwla.getConfValue('page_to_hit_conf', []) self.ph_regexps = self.iwla.getConfValue('page_to_hit_conf', [])
if not self.regexps: return False if not self.ph_regexps: return False
self.regexps = map(lambda(r): re.compile(r), self.regexps) self.ph_regexps = map(lambda(r): re.compile(r), self.ph_regexps)
# Hit to page
self.hp_regexps = self.iwla.getConfValue('hit_to_page_conf', [])
if not self.hp_regexps: return False
self.hp_regexps = map(lambda(r): re.compile(r), self.hp_regexps)
return True return True
@ -29,12 +34,24 @@ class IWLAPreAnalysisPageToHit(IPlugin):
if not self.iwla.isValidForCurrentAnalysis(request) or\ if not self.iwla.isValidForCurrentAnalysis(request) or\
not self.iwla.hasBeenViewed(request): not self.iwla.hasBeenViewed(request):
continue continue
if not request['is_page']: continue
uri = request['extract_request']['extract_uri'] uri = request['extract_request']['extract_uri']
for regexp in self.regexps:
if regexp.match(uri): if request['is_page']:
#print '%s is an hit' % uri # Page to hit
request['is_page'] = False for regexp in self.ph_regexps:
super_hit['viewed_pages'] -= 1 if regexp.match(uri):
super_hit['viewed_hits'] += 1 #print '%s is a hit' % (uri )
break request['is_page'] = False
super_hit['viewed_pages'] -= 1
super_hit['viewed_hits'] += 1
break
else:
# Hit to page
for regexp in self.hp_regexps:
if regexp.match(uri):
#print '%s is a page' % (uri )
request['is_page'] = True
super_hit['viewed_pages'] += 1
super_hit['viewed_hits'] -= 1
break