From f593cc78d93d02e080dcd3f71b2993eb4f8dc96b Mon Sep 17 00:00:00 2001 From: Gregory Soutade Date: Thu, 20 Nov 2014 14:09:01 +0100 Subject: [PATCH] Basically seems to work --- hooks/pre_analysis/H001_soutade.py | 2 ++ hooks/pre_analysis/H002_robot.py | 5 ++++- iwla.py | 4 +++- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/hooks/pre_analysis/H001_soutade.py b/hooks/pre_analysis/H001_soutade.py index 50a7932..6d683be 100644 --- a/hooks/pre_analysis/H001_soutade.py +++ b/hooks/pre_analysis/H001_soutade.py @@ -13,6 +13,8 @@ def hook(hits): for p in super_hit['pages']: if not p['is_page']: continue + if int(p['status']) != 200: continue + if logo_re.match(p['extract_request']['extract_uri']): p['is_page'] = False if super_hit['viewed_pages']: diff --git a/hooks/pre_analysis/H002_robot.py b/hooks/pre_analysis/H002_robot.py index 2e59ad5..8a6e721 100644 --- a/hooks/pre_analysis/H002_robot.py +++ b/hooks/pre_analysis/H002_robot.py @@ -19,7 +19,10 @@ def hook(hits): if not super_hit['viewed_hits']: super_hit['robot'] = 1 continue - + elif not super_hit['viewed_pages']: +# Hit only + super_hit['hit_only'] = 1 + for hit in super_hit['pages']: # 3) /robots.txt read if hit['extract_request']['http_uri'] == '/robots.txt': diff --git a/iwla.py b/iwla.py index 5dbb728..25478d9 100755 --- a/iwla.py +++ b/iwla.py @@ -161,6 +161,7 @@ def createUser(hit): super_hit['last_access'] = meta_visit['last_time'] super_hit['pages'] = []; super_hit['robot'] = isRobot(hit); + super_hit['hit_only'] = 0; appendHit(hit) def isRobot(hit): @@ -276,7 +277,8 @@ def generateStats(visits): print "[%s] =>\t%d/%d" % (k, super_hit['viewed_pages'], super_hit['viewed_hits']) - stats['nb_visitors'] += 1 + if not super_hit['hit_only']: + stats['nb_visitors'] += 1 stats['viewed_bandwidth'] += super_hit['bandwith'] stats['viewed_pages'] += super_hit['viewed_pages'] stats['viewed_hits'] += super_hit['viewed_hits']