diff --git a/iwla.py b/iwla.py index d3361cc..93de2cd 100755 --- a/iwla.py +++ b/iwla.py @@ -256,7 +256,7 @@ class IWLA(object): hit['is_page'] = self.isPage(uri) if super_hit['robot'] or\ - not int(hit['status']) in conf.viewed_http_codes: + not self.hasBeenViewed(hit): page_key = 'not_viewed_pages' hit_key = 'not_viewed_hits' else: @@ -523,7 +523,7 @@ class IWLA(object): if hit['time_decoded'].tm_mday != cur_time.tm_mday: break if super_hit['robot'] or\ - not int(hit['status']) in conf.viewed_http_codes: + not self.hasBeenViewed(hit): stats['not_viewed_bandwidth'] += int(hit['body_bytes_sent']) continue stats['viewed_bandwidth'] += int(hit['body_bytes_sent']) diff --git a/plugins/post_analysis/referers.py b/plugins/post_analysis/referers.py index 100ab6b..f4ef9e9 100644 --- a/plugins/post_analysis/referers.py +++ b/plugins/post_analysis/referers.py @@ -109,15 +109,14 @@ class IWLAPostAnalysisReferers(IPlugin): key_phrases = month_stats.get('key_phrases', {}) for (k, super_hit) in stats.items(): - for r in super_hit['requests']: - if not self.iwla.isValidForCurrentAnalysis(r): continue + for r in super_hit['requests'][::-1]: + if not self.iwla.isValidForCurrentAnalysis(r): break if not r['http_referer']: continue uri = r['extract_referer']['extract_uri'] - is_search_engine = False - if self.own_domain_re.match(uri): continue + is_search_engine = False for (name, engine) in self.search_engines.items(): for (hashid, hashid_re) in engine['hashid']: if not hashid_re.match(uri): continue diff --git a/plugins/post_analysis/top_downloads.py b/plugins/post_analysis/top_downloads.py index ab73ba0..9aa9c6a 100644 --- a/plugins/post_analysis/top_downloads.py +++ b/plugins/post_analysis/top_downloads.py @@ -46,14 +46,12 @@ class IWLAPostAnalysisTopDownloads(IPlugin): for (k, super_hit) in stats.items(): if super_hit['robot']: continue - for r in super_hit['requests']: - if not self.iwla.isValidForCurrentAnalysis(r) or\ - not self.iwla.hasBeenViewed(r): + for r in super_hit['requests'][::-1]: + if not self.iwla.isValidForCurrentAnalysis(r): + break + if not self.iwla.hasBeenViewed(r) or\ + r['is_page']: continue - if r['is_page']: continue - - - if not int(r['status']) in viewed_http_codes: continue uri = r['extract_request']['extract_uri'].lower() diff --git a/plugins/post_analysis/top_hits.py b/plugins/post_analysis/top_hits.py index a6c15e1..05f272a 100644 --- a/plugins/post_analysis/top_hits.py +++ b/plugins/post_analysis/top_hits.py @@ -40,15 +40,14 @@ class IWLAPostAnalysisTopHits(IPlugin): for (k, super_hit) in stats.items(): if super_hit['robot']: continue - for r in super_hit['requests']: - if r['is_page']: continue - - if not self.iwla.isValidForCurrentAnalysis(r) or\ - not self.iwla.hasBeenViewed(r): + for r in super_hit['requests'][::-1]: + if not self.iwla.isValidForCurrentAnalysis(r): + break + if not self.iwla.hasBeenViewed(r) or\ + r['is_page']: continue - uri = r['extract_request']['extract_uri'] - + uri = r['extract_request']['extract_uri'].lower() uri = "%s%s" % (r.get('server_name', ''), uri) if not uri in top_hits.keys(): diff --git a/plugins/post_analysis/top_pages.py b/plugins/post_analysis/top_pages.py index 9c85cd8..8d938dd 100644 --- a/plugins/post_analysis/top_pages.py +++ b/plugins/post_analysis/top_pages.py @@ -46,11 +46,11 @@ class IWLAPostAnalysisTopPages(IPlugin): for (k, super_hit) in stats.items(): if super_hit['robot']: continue - for r in super_hit['requests']: - if not r['is_page']: continue - - if not self.iwla.isValidForCurrentAnalysis(r) or\ - not self.iwla.hasBeenViewed(r): + for r in super_hit['requests'][::-1]: + if not self.iwla.isValidForCurrentAnalysis(r): + break + if not self.iwla.hasBeenViewed(r) or\ + not r['is_page']: continue uri = r['extract_request']['extract_uri'] diff --git a/plugins/pre_analysis/page_to_hit.py b/plugins/pre_analysis/page_to_hit.py index fd8ad87..c202efb 100644 --- a/plugins/pre_analysis/page_to_hit.py +++ b/plugins/pre_analysis/page_to_hit.py @@ -54,9 +54,11 @@ class IWLAPreAnalysisPageToHit(IPlugin): for (k, super_hit) in hits.items(): if super_hit['robot']: continue - for request in super_hit['requests']: - if not self.iwla.isValidForCurrentAnalysis(request) or\ - not self.iwla.hasBeenViewed(request): + for request in super_hit['requests'][::-1]: + if not self.iwla.isValidForCurrentAnalysis(request): + break + + if not self.iwla.hasBeenViewed(request): continue uri = request['extract_request']['extract_uri']