Add option count_hit_only_visitors and function isValidForCurrentAnalysis()

This commit is contained in:
Grégory Soutadé 2014-11-27 09:01:51 +01:00
parent 6b0ed18f35
commit dd8349ab08
10 changed files with 54 additions and 47 deletions

View File

@ -22,6 +22,5 @@ display_hooks = ['top_visitors', 'all_visits', 'referers', 'top_pages']
reverse_dns_timeout = 0.2
page_to_hit_conf = [r'^.+/logo/$']
# pre_analysis_hooks = ['H002_soutade.py', 'H001_robot.py']
# post_analysis_hooks = ['top_visitors.py']
# display_hooks = ['top_visitors.py']
count_hit_only_visitors = False

View File

@ -22,3 +22,5 @@ display_hooks = []
pages_extensions = ['/', 'html', 'xhtml', 'py', 'pl', 'rb', 'php']
viewed_http_codes = [200, 304]
count_hit_only_visitors = True

32
iwla.py
View File

@ -75,6 +75,10 @@ class IWLA(object):
def getStartAnalysisTime(self):
return self.meta_infos['start_analysis_time']
# Tell whether a request belongs to the analysis currently running.
# Returns True only when request['time_decoded'] is strictly later than
# meta_infos['start_analysis_time'] (both are converted with time.mktime()),
# so a request stamped exactly at the start time is reported as not valid.
def isValidForCurrentAnalysis(self, request):
cur_time = self.meta_infos['start_analysis_time']
return (time.mktime(cur_time) < time.mktime(request['time_decoded']))
def _clearMeta(self):
self.meta_infos = {
'last_time' : None
@ -264,15 +268,15 @@ class IWLA(object):
#stats['requests'] = set()
stats['nb_visitors'] = 0
for k in visits.keys():
super_hit = visits[k]
for (k, super_hit) in visits.items():
if super_hit['robot']:
stats['not_viewed_bandwidth'] += super_hit['bandwidth']
continue
#print "[%s] =>\t%d/%d" % (k, super_hit['viewed_pages'], super_hit['viewed_hits'])
if not super_hit['hit_only']:
if conf.count_hit_only_visitors or\
super_hit['viewed_pages']:
stats['nb_visitors'] += 1
stats['viewed_bandwidth'] += super_hit['bandwidth']
stats['viewed_pages'] += super_hit['viewed_pages']
@ -298,7 +302,14 @@ class IWLA(object):
self.current_analysis['month_stats'] = stats
self.valid_visitors = {k: v for (k,v) in visits.items() if not visits[k]['robot']}
# Build the set of visitors exposed to post-analysis and display plugins.
# Robots are always dropped. A visitor without any viewed page (hit-only)
# is kept only when conf.count_hit_only_visitors is enabled, which is the
# same rule used when counting nb_visitors for the month statistics.
self.valid_visitors = {}
for (k, v) in visits.items():
    if v['robot']:
        continue
    # The previous test ("count_hit_only_visitors and not viewed_pages")
    # was inverted: it removed hit-only visitors precisely when the
    # option asked to count them.
    if not (conf.count_hit_only_visitors or v['viewed_pages']):
        continue
    self.valid_visitors[k] = v
self._callPlugins(conf.POST_HOOK_DIRECTORY)
path = self.getDBFilename(cur_time)
@ -331,9 +342,12 @@ class IWLA(object):
for k in stats.keys():
stats[k] -= self.current_analysis['days_stats'][last_day][k]
stats['nb_visitors'] = 0
for k in visits.keys():
if visits[k]['robot']: continue
if visits[k]['last_access'].tm_mday == cur_time.tm_mday:
# Count the visitors whose last access falls on the current day.
for (k, v) in visits.items():
    if v['robot']:
        continue
    # Hit-only visitors (no viewed page) are counted only when
    # conf.count_hit_only_visitors is enabled. The previous test
    # ("count_hit_only_visitors and not viewed_pages") was inverted and
    # disagreed with the monthly nb_visitors count.
    if not (conf.count_hit_only_visitors or v['viewed_pages']):
        continue
    if v['last_access'].tm_mday == cur_time.tm_mday:
        stats['nb_visitors'] += 1
print stats
@ -349,7 +363,7 @@ class IWLA(object):
self.analyse_started = True
else:
if not self.analyse_started:
if time.mktime(cur_time) >= time.mktime(t):
if not self.isValidForCurrentAnalysis(hit):
return False
else:
self.analyse_started = True
@ -374,7 +388,7 @@ class IWLA(object):
return True
def start(self):
print '==> Analyse previous database'
print '==> Load previous database'
self.meta_infos = self._deserialize(conf.META_PATH) or self._clearMeta()
if self.meta_infos['last_time']:

View File

@ -11,6 +11,8 @@ class IWLADisplayAllVisits(IPlugin):
def hook(self):
hits = self.iwla.getValidVisitors()
display_visitor_ip = self.iwla.getConfValue('display_visitor_ip', False)
last_access = sorted(hits.values(), key=lambda t: t['last_access'], reverse=True)
cur_time = self.iwla.getCurTime()
@ -23,7 +25,7 @@ class IWLADisplayAllVisits(IPlugin):
table = DisplayHTMLBlockTable('Last seen', ['Host', 'Pages', 'Hits', 'Bandwidth', 'Last seen'])
for super_hit in last_access:
address = super_hit['remote_addr']
if self.iwla.getConfValue('display_visitor_ip', False) and\
if display_visitor_ip and\
super_hit.get('dns_name_replaced', False):
address = '%s [%s]' % (address, super_hit['remote_ip'])

View File

@ -91,7 +91,6 @@ class IWLADisplayReferers(IPlugin):
index.appendBlock(table)
# All key phrases in a file
cur_time = self.iwla.getCurTime()
title = time.strftime('Key Phrases - %B %Y', cur_time)
filename = 'key_phrases_%d.html' % (cur_time.tm_mon)

View File

@ -11,8 +11,11 @@ class IWLADisplayTopVisitors(IPlugin):
def hook(self):
hits = self.iwla.getValidVisitors()
count_hit_only = self.iwla.getConfValue('count_hit_only_visitors', False)
display_visitor_ip = self.iwla.getConfValue('display_visitor_ip', False)
top_bandwidth = [(k,hits[k]['bandwidth']) for k in hits.keys()]
top_bandwidth = [(k,v['bandwidth']) for (k,v) in hits.items() \
if count_hit_only or v['viewed_pages']]
top_bandwidth = sorted(top_bandwidth, key=lambda t: t[1], reverse=True)
top_visitors = [hits[h[0]] for h in top_bandwidth[:10]]
@ -20,7 +23,7 @@ class IWLADisplayTopVisitors(IPlugin):
table = DisplayHTMLBlockTable('Top visitors', ['Host', 'Pages', 'Hits', 'Bandwidth', 'Last seen'])
for super_hit in top_visitors:
address = super_hit['remote_addr']
if self.iwla.getConfValue('display_visitor_ip', False) and\
if display_visitor_ip and\
super_hit.get('dns_name_replaced', False):
address = '%s [%s]' % (address, super_hit['remote_ip'])

View File

@ -1,4 +1,3 @@
import time
import re
import xml.sax.saxutils as saxutils
@ -66,8 +65,6 @@ class IWLAPostAnalysisReferers(IPlugin):
break
def hook(self):
start_time = self.iwla.getStartAnalysisTime()
start_time = time.mktime(start_time)
stats = self.iwla.getCurrentVisists()
month_stats = self.iwla.getMonthStats()
@ -78,7 +75,7 @@ class IWLAPostAnalysisReferers(IPlugin):
for (k, super_hit) in stats.items():
for r in super_hit['requests']:
if time.mktime(r['time_decoded']) < start_time: continue
if not self.iwla.isValidForCurrentAnalysis(r): continue
if not r['http_referer']: continue
uri = r['extract_referer']['extract_uri']

View File

@ -1,4 +1,3 @@
import time
import re
from iwla import IWLA
@ -14,9 +13,6 @@ class IWLAPostAnalysisTopPages(IPlugin):
return True
def hook(self):
start_time = self.iwla.getStartAnalysisTime()
start_time = time.mktime(start_time)
stats = self.iwla.getCurrentVisists()
month_stats = self.iwla.getMonthStats()
@ -27,7 +23,7 @@ class IWLAPostAnalysisTopPages(IPlugin):
for r in super_hit['requests']:
if not r['is_page']: continue
if time.mktime(r['time_decoded']) < start_time: continue
if not self.iwla.isValidForCurrentAnalysis(r): continue
uri = r['extract_request']['extract_uri']
if self.index_re.match(uri):

View File

@ -1,5 +1,4 @@
import re
import time
from iwla import IWLA
from iplugin import IPlugin
@ -21,21 +20,18 @@ class IWLAPreAnalysisPageToHit(IPlugin):
return True
def hook(self):
start_time = self.iwla.getStartAnalysisTime()
start_time = time.mktime(start_time)
hits = self.iwla.getCurrentVisists()
viewed_http_codes = self.iwla.getConfValue('viewed_http_codes', [200, 304])
for (k, super_hit) in hits.items():
if super_hit['robot']: continue
for p in super_hit['requests']:
if not p['is_page']: continue
if time.mktime(p['time_decoded']) < start_time: continue
uri = p['extract_request']['extract_uri']
for r in self.regexps:
if r.match(uri):
p['is_page'] = False
for request in super_hit['requests']:
if not request['is_page']: continue
if not self.iwla.isValidForCurrentAnalysis(request): continue
uri = request['extract_request']['extract_uri']
for regexp in self.regexps:
if regexp.match(uri):
request['is_page'] = False
super_hit['viewed_pages'] -= 1
super_hit['viewed_hits'] += 1
break

View File

@ -18,24 +18,23 @@ class IWLAPreAnalysisRobots(IPlugin):
# Basic rule to detect robots
def hook(self):
hits = self.iwla.getCurrentVisists()
for k in hits.keys():
super_hit = hits[k]
for (k, super_hit) in hits.items():
if super_hit['robot']: continue
isRobot = False
referers = 0
first_page = super_hit['requests'][0]
if first_page['time_decoded'].tm_mday == super_hit['last_access'].tm_mday:
for r in self.awstats_robots:
if r.match(first_page['http_user_agent']):
isRobot = True
break
if not self.iwla.isValidForCurrentAnalysis(first_page): continue
if isRobot:
super_hit['robot'] = 1
continue
for r in self.awstats_robots:
if r.match(first_page['http_user_agent']):
isRobot = True
break
if isRobot:
super_hit['robot'] = 1
continue
# 1) no pages view --> robot
# if not super_hit['viewed_pages']: