Add one more rule to robot detection: more than ten 404 pages viewed

Gregory Soutade 2017-05-25 21:04:18 +02:00
parent 4bc2c1ad4b
commit 68a67adecc


@@ -114,12 +114,16 @@ class IWLAPreAnalysisRobots(IPlugin):
                 self._setRobot(k, super_hit)
                 continue
 
+            not_found_pages = 0
             for hit in super_hit['requests']:
                 # 3) /robots.txt read
                 if hit['extract_request']['http_uri'].endswith('/robots.txt'):
                     self._setRobot(k, super_hit)
                     break
 
+                if int(hit['status']) == 404:
+                    not_found_pages += 1
+
                 # 4) Any referer for hits
                 if not hit['is_page'] and hit['http_referer']:
                     referers += 1
@@ -128,6 +132,11 @@ class IWLAPreAnalysisRobots(IPlugin):
                 self._setRobot(k, super_hit)
                 continue
 
+            # 5) more than 10 404 pages
+            if not_found_pages > 10:
+                self._setRobot(k, super_hit)
+                continue
+
             if not super_hit['viewed_pages'] and \
                (super_hit['viewed_hits'] and not referers):
                 self._setRobot(k, super_hit)
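
Taken on its own, the new rule is simple enough to sketch outside the plugin. Below is a minimal, self-contained reconstruction of the check, assuming request entries shaped like the 'hit' dicts in the diff (a 'status' key holding the HTTP status code as a string or int); the function name and the 'threshold' parameter are illustrative, not part of the iwla codebase.

    # Minimal sketch of rule 5, assuming hits are dicts with a 'status'
    # key as in the diff above; name and threshold are hypothetical.
    def too_many_404s(requests, threshold=10):
        """Return True when a visitor triggered more than `threshold` 404s."""
        not_found_pages = 0
        for hit in requests:
            if int(hit['status']) == 404:
                not_found_pages += 1
        return not_found_pages > threshold

    # Example: eleven 404 responses trip the rule, ten do not.
    assert too_many_404s([{'status': '404'}] * 11)
    assert not too_many_404s([{'status': '404'}] * 10)

Note the strict '>' comparison: a visitor with exactly ten 404s is still treated as human traffic, matching the "more than ten" wording of the commit message.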