Add one more rule to robot detection: more than ten 404 pages viewed
This commit is contained in:
parent
4bc2c1ad4b
commit
68a67adecc
@@ -114,12 +114,16 @@ class IWLAPreAnalysisRobots(IPlugin):
                 self._setRobot(k, super_hit)
                 continue
 
+            not_found_pages = 0
             for hit in super_hit['requests']:
                 # 3) /robots.txt read
                 if hit['extract_request']['http_uri'].endswith('/robots.txt'):
                     self._setRobot(k, super_hit)
                     break
 
+                if int(hit['status']) == 404:
+                    not_found_pages += 1
+
                 # 4) Any referer for hits
                 if not hit['is_page'] and hit['http_referer']:
                     referers += 1
@@ -128,6 +132,11 @@ class IWLAPreAnalysisRobots(IPlugin):
                 self._setRobot(k, super_hit)
                 continue
 
+            # 5) more than 10 404 pages
+            if not_found_pages > 10:
+                self._setRobot(k, super_hit)
+                continue
+
             if not super_hit['viewed_pages'] and \
                (super_hit['viewed_hits'] and not referers):
                 self._setRobot(k, super_hit)
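In short, the new rule counts 404 responses per visit and flags the visitor as a robot once the count exceeds ten. Below is a minimal standalone sketch of that heuristic, assuming request dicts carrying the same 'status' field used in the diff; the helper name, threshold parameter, and sample data are illustrative only and not part of the iwla plugin API.

# Standalone sketch of the new rule (hypothetical helper, not iwla API).
def too_many_not_found(requests, threshold=10):
    """Return True when a visit produced more than `threshold` 404 hits."""
    not_found_pages = 0
    for hit in requests:
        if int(hit['status']) == 404:
            not_found_pages += 1
    return not_found_pages > threshold

# A crawler probing dead URLs piles up 404s; a normal visit does not.
probing_visit = [{'status': '404'}] * 11
normal_visit = [{'status': '200'}] * 5 + [{'status': '404'}]

print(too_many_not_found(probing_visit))  # True  -> plugin would call _setRobot()
print(too_many_not_found(normal_visit))   # False -> kept as a regular visit

The rule mirrors the diff above: a high 404 count suggests automated URL probing rather than a human following links, which is why the plugin short-circuits with continue after marking the visit.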