# Basic rule to detect robots

def hook(hits):
    for k in hits.keys():
        super_hit = hits[k]

        if super_hit['robot']: continue

        isRobot = False
        referers = 0

        # 1) No page viewed --> robot
        if not super_hit['viewed_pages']:
            super_hit['robot'] = 1
            continue

        # 2) Pages viewed but no hits --> robot
        if not super_hit['viewed_hits']:
            super_hit['robot'] = 1
            continue

        for hit in super_hit['pages']:
            # 3) /robots.txt read --> robot
            if hit['extract_request']['http_uri'] == '/robots.txt':
                isRobot = True
                break

            # 4) Count hits (non-page requests) that carry a referer
            if not hit['is_page'] and hit['http_referer']:
                referers += 1

        if isRobot:
            super_hit['robot'] = 1
            continue

        if super_hit['viewed_hits'] and not referers:
            super_hit['robot'] = 1
            continue
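
# --- Illustrative usage sketch (not part of the original rule) ---
# The field names below ('robot', 'viewed_pages', 'viewed_hits', 'pages',
# 'is_page', 'http_referer', 'extract_request') are taken from hook() above;
# the visitor keys and sample values are hypothetical and only meant to show
# how each rule fires on a hand-built 'hits' dict.
if __name__ == '__main__':
    hits = {
        '192.0.2.10': {              # pages viewed but no hits --> rule 2
            'robot': 0,
            'viewed_pages': 2,
            'viewed_hits': 0,
            'pages': [],
        },
        '198.51.100.7': {            # fetched /robots.txt --> rule 3
            'robot': 0,
            'viewed_pages': 1,
            'viewed_hits': 3,
            'pages': [
                {'is_page': True,
                 'http_referer': '',
                 'extract_request': {'http_uri': '/robots.txt'}},
            ],
        },
        '203.0.113.5': {             # normal visit: stays human
            'robot': 0,
            'viewed_pages': 1,
            'viewed_hits': 1,
            'pages': [
                {'is_page': True,
                 'http_referer': 'https://example.org/',
                 'extract_request': {'http_uri': '/index.html'}},
                {'is_page': False,   # resource hit with a referer --> rule 4
                 'http_referer': 'https://example.org/index.html',
                 'extract_request': {'http_uri': '/style.css'}},
            ],
        },
    }

    hook(hits)
    for key, super_hit in hits.items():
        print(key, 'robot' if super_hit['robot'] else 'human')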