diff --git a/conf.py b/conf.py index 7b205cb..94b66aa 100644 --- a/conf.py +++ b/conf.py @@ -22,6 +22,5 @@ display_hooks = ['top_visitors', 'all_visits', 'referers', 'top_pages'] reverse_dns_timeout = 0.2 page_to_hit_conf = [r'^.+/logo/$'] -# pre_analysis_hooks = ['H002_soutade.py', 'H001_robot.py'] -# post_analysis_hooks = ['top_visitors.py'] -# display_hooks = ['top_visitors.py'] + +count_hit_only_visitors = False diff --git a/default_conf.py b/default_conf.py index 1b4c62b..765afa8 100644 --- a/default_conf.py +++ b/default_conf.py @@ -22,3 +22,5 @@ display_hooks = [] pages_extensions = ['/', 'html', 'xhtml', 'py', 'pl', 'rb', 'php'] viewed_http_codes = [200, 304] + +count_hit_only_visitors = True diff --git a/iwla.py b/iwla.py index ec8b40c..5fb4a78 100755 --- a/iwla.py +++ b/iwla.py @@ -75,6 +75,10 @@ class IWLA(object): def getStartAnalysisTime(self): return self.meta_infos['start_analysis_time'] + def isValidForCurrentAnalysis(self, request): + cur_time = self.meta_infos['start_analysis_time'] + return (time.mktime(cur_time) < time.mktime(request['time_decoded'])) + def _clearMeta(self): self.meta_infos = { 'last_time' : None @@ -264,15 +268,15 @@ class IWLA(object): #stats['requests'] = set() stats['nb_visitors'] = 0 - for k in visits.keys(): - super_hit = visits[k] + for (k, super_hit) in visits.items(): if super_hit['robot']: stats['not_viewed_bandwidth'] += super_hit['bandwidth'] continue #print "[%s] =>\t%d/%d" % (k, super_hit['viewed_pages'], super_hit['viewed_hits']) - if not super_hit['hit_only']: + if conf.count_hit_only_visitors or\ + super_hit['viewed_pages']: stats['nb_visitors'] += 1 stats['viewed_bandwidth'] += super_hit['bandwidth'] stats['viewed_pages'] += super_hit['viewed_pages'] @@ -298,7 +302,14 @@ class IWLA(object): self.current_analysis['month_stats'] = stats - self.valid_visitors = {k: v for (k,v) in visits.items() if not visits[k]['robot']} + self.valid_visitors = {} + for (k,v) in visits.items(): + if v['robot']: continue + if conf.count_hit_only_visitors and\ + (not v['viewed_pages']): + continue + self.valid_visitors[k] = v + self._callPlugins(conf.POST_HOOK_DIRECTORY) path = self.getDBFilename(cur_time) @@ -331,9 +342,12 @@ class IWLA(object): for k in stats.keys(): stats[k] -= self.current_analysis['days_stats'][last_day][k] stats['nb_visitors'] = 0 - for k in visits.keys(): - if visits[k]['robot']: continue - if visits[k]['last_access'].tm_mday == cur_time.tm_mday: + for (k,v) in visits.items(): + if v['robot']: continue + if conf.count_hit_only_visitors and\ + (not v['viewed_pages']): + continue + if v['last_access'].tm_mday == cur_time.tm_mday: stats['nb_visitors'] += 1 print stats @@ -349,7 +363,7 @@ class IWLA(object): self.analyse_started = True else: if not self.analyse_started: - if time.mktime(cur_time) >= time.mktime(t): + if not self.isValidForCurrentAnalysis(hit): return False else: self.analyse_started = True @@ -374,7 +388,7 @@ class IWLA(object): return True def start(self): - print '==> Analyse previous database' + print '==> Load previous database' self.meta_infos = self._deserialize(conf.META_PATH) or self._clearMeta() if self.meta_infos['last_time']: diff --git a/plugins/display/all_visits.py b/plugins/display/all_visits.py index a0d7152..2374233 100644 --- a/plugins/display/all_visits.py +++ b/plugins/display/all_visits.py @@ -11,6 +11,8 @@ class IWLADisplayAllVisits(IPlugin): def hook(self): hits = self.iwla.getValidVisitors() + display_visitor_ip = self.iwla.getConfValue('display_visitor_ip', False) + last_access = sorted(hits.values(), key=lambda t: t['last_access'], reverse=True) cur_time = self.iwla.getCurTime() @@ -23,7 +25,7 @@ class IWLADisplayAllVisits(IPlugin): table = DisplayHTMLBlockTable('Last seen', ['Host', 'Pages', 'Hits', 'Bandwidth', 'Last seen']) for super_hit in last_access: address = super_hit['remote_addr'] - if self.iwla.getConfValue('display_visitor_ip', False) and\ + if display_visitor_ip and\ super_hit.get('dns_name_replaced', False): address = '%s [%s]' % (address, super_hit['remote_ip']) diff --git a/plugins/display/referers.py b/plugins/display/referers.py index be743f0..318c94f 100644 --- a/plugins/display/referers.py +++ b/plugins/display/referers.py @@ -91,7 +91,6 @@ class IWLADisplayReferers(IPlugin): index.appendBlock(table) # All key phrases in a file - cur_time = self.iwla.getCurTime() title = time.strftime('Key Phrases - %B %Y', cur_time) filename = 'key_phrases_%d.html' % (cur_time.tm_mon) diff --git a/plugins/display/top_visitors.py b/plugins/display/top_visitors.py index b60f5ec..e806209 100644 --- a/plugins/display/top_visitors.py +++ b/plugins/display/top_visitors.py @@ -11,8 +11,11 @@ class IWLADisplayTopVisitors(IPlugin): def hook(self): hits = self.iwla.getValidVisitors() + count_hit_only = self.iwla.getConfValue('count_hit_only_visitors', False) + display_visitor_ip = self.iwla.getConfValue('display_visitor_ip', False) - top_bandwidth = [(k,hits[k]['bandwidth']) for k in hits.keys()] + top_bandwidth = [(k,v['bandwidth']) for (k,v) in hits.items() \ + if count_hit_only or v['viewed_pages']] top_bandwidth = sorted(top_bandwidth, key=lambda t: t[1], reverse=True) top_visitors = [hits[h[0]] for h in top_bandwidth[:10]] @@ -20,7 +23,7 @@ class IWLADisplayTopVisitors(IPlugin): table = DisplayHTMLBlockTable('Top visitors', ['Host', 'Pages', 'Hits', 'Bandwidth', 'Last seen']) for super_hit in top_visitors: address = super_hit['remote_addr'] - if self.iwla.getConfValue('display_visitor_ip', False) and\ + if display_visitor_ip and\ super_hit.get('dns_name_replaced', False): address = '%s [%s]' % (address, super_hit['remote_ip']) diff --git a/plugins/post_analysis/referers.py b/plugins/post_analysis/referers.py index af3b9f5..6619ecd 100644 --- a/plugins/post_analysis/referers.py +++ b/plugins/post_analysis/referers.py @@ -1,4 +1,3 @@ -import time import re import xml.sax.saxutils as saxutils @@ -66,8 +65,6 @@ class IWLAPostAnalysisReferers(IPlugin): break def hook(self): - start_time = self.iwla.getStartAnalysisTime() - start_time = time.mktime(start_time) stats = self.iwla.getCurrentVisists() month_stats = self.iwla.getMonthStats() @@ -78,7 +75,7 @@ class IWLAPostAnalysisReferers(IPlugin): for (k, super_hit) in stats.items(): for r in super_hit['requests']: - if time.mktime(r['time_decoded']) < start_time: continue + if not self.iwla.isValidForCurrentAnalysis(r): continue if not r['http_referer']: continue uri = r['extract_referer']['extract_uri'] diff --git a/plugins/post_analysis/top_pages.py b/plugins/post_analysis/top_pages.py index e4f01e7..106f903 100644 --- a/plugins/post_analysis/top_pages.py +++ b/plugins/post_analysis/top_pages.py @@ -1,4 +1,3 @@ -import time import re from iwla import IWLA @@ -14,9 +13,6 @@ class IWLAPostAnalysisTopPages(IPlugin): return True def hook(self): - start_time = self.iwla.getStartAnalysisTime() - start_time = time.mktime(start_time) - stats = self.iwla.getCurrentVisists() month_stats = self.iwla.getMonthStats() @@ -27,7 +23,7 @@ class IWLAPostAnalysisTopPages(IPlugin): for r in super_hit['requests']: if not r['is_page']: continue - if time.mktime(r['time_decoded']) < start_time: continue + if not self.iwla.isValidForCurrentAnalysis(r): continue uri = r['extract_request']['extract_uri'] if self.index_re.match(uri): diff --git a/plugins/pre_analysis/page_to_hit.py b/plugins/pre_analysis/page_to_hit.py index 0f20c8e..936cf63 100644 --- a/plugins/pre_analysis/page_to_hit.py +++ b/plugins/pre_analysis/page_to_hit.py @@ -1,5 +1,4 @@ import re -import time from iwla import IWLA from iplugin import IPlugin @@ -21,21 +20,18 @@ class IWLAPreAnalysisPageToHit(IPlugin): return True def hook(self): - start_time = self.iwla.getStartAnalysisTime() - start_time = time.mktime(start_time) - hits = self.iwla.getCurrentVisists() viewed_http_codes = self.iwla.getConfValue('viewed_http_codes', [200, 304]) for (k, super_hit) in hits.items(): if super_hit['robot']: continue - for p in super_hit['requests']: - if not p['is_page']: continue - if time.mktime(p['time_decoded']) < start_time: continue - uri = p['extract_request']['extract_uri'] - for r in self.regexps: - if r.match(uri): - p['is_page'] = False + for request in super_hit['requests']: + if not request['is_page']: continue + if not self.iwla.isValidForCurrentAnalysis(request): continue + uri = request['extract_request']['extract_uri'] + for regexp in self.regexps: + if regexp.match(uri): + request['is_page'] = False super_hit['viewed_pages'] -= 1 super_hit['viewed_hits'] += 1 break diff --git a/plugins/pre_analysis/robots.py b/plugins/pre_analysis/robots.py index 2d120c5..0557448 100644 --- a/plugins/pre_analysis/robots.py +++ b/plugins/pre_analysis/robots.py @@ -18,24 +18,23 @@ class IWLAPreAnalysisRobots(IPlugin): # Basic rule to detect robots def hook(self): hits = self.iwla.getCurrentVisists() - for k in hits.keys(): - super_hit = hits[k] - + for (k, super_hit) in hits.items(): if super_hit['robot']: continue isRobot = False referers = 0 first_page = super_hit['requests'][0] - if first_page['time_decoded'].tm_mday == super_hit['last_access'].tm_mday: - for r in self.awstats_robots: - if r.match(first_page['http_user_agent']): - isRobot = True - break + if not self.iwla.isValidForCurrentAnalysis(first_page): continue - if isRobot: - super_hit['robot'] = 1 - continue + for r in self.awstats_robots: + if r.match(first_page['http_user_agent']): + isRobot = True + break + + if isRobot: + super_hit['robot'] = 1 + continue # 1) no pages view --> robot # if not super_hit['viewed_pages']: