# Basic rule to detect robots

def hook(hits):
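    """Flag visitors that look like robots.

    Hedged note: the shape of ``hits`` is inferred from the fields this
    hook reads and writes (``robot``, ``hit_only``, ``viewed_pages``,
    ``viewed_hits`` and the ``pages`` list of request dicts); the real
    structure is defined by the surrounding analyzer.
    """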
    for k in hits.keys():
        super_hit = hits[k]

        # Visitor already flagged as a robot: skip it.
        if super_hit['robot']:
            continue

        isRobot = False
        referers = 0

        # 1) No page views --> robot (rule currently disabled)
        # if not super_hit['viewed_pages']:
        #     super_hit['robot'] = 1
        #     continue

        # 2) Pages viewed but no hits --> robot
        if not super_hit['viewed_hits']:
            super_hit['robot'] = 1
            continue
        elif not super_hit['viewed_pages']:
            # Hits only, no page views: remember it, but keep checking
            super_hit['hit_only'] = 1

        for hit in super_hit['pages']:
            # 3) /robots.txt was requested --> robot
            if hit['extract_request']['http_uri'] == '/robots.txt':
                isRobot = True
                break

            # 4) Browsers usually send a Referer with resource hits,
            #    so count the hits that carry one.
            if not hit['is_page'] and hit['http_referer']:
                referers += 1

        if isRobot:
            super_hit['robot'] = 1
            continue

        # 5) Hits were made but none ever carried a referer --> robot
        if super_hit['viewed_hits'] and not referers:
            super_hit['robot'] = 1
            continue
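

# Minimal usage sketch, not part of the original hook: this sample ``hits``
# dict is an assumption built from the fields the hook accesses above.
if __name__ == '__main__':
    hits = {
        '203.0.113.7': {
            'robot': 0,
            'hit_only': 0,
            'viewed_pages': 0,
            'viewed_hits': 2,
            'pages': [
                {'is_page': False,
                 'http_referer': '',
                 'extract_request': {'http_uri': '/robots.txt'}},
            ],
        },
    }
    hook(hits)
    # Expected: flagged as a robot because /robots.txt was requested.
    print(hits['203.0.113.7']['robot'])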