New rule for robots: more than 10 not-modified (HTTP 304) pages in a row

This commit is contained in:
Gregory Soutade 2023-01-28 09:40:26 +01:00
parent ac246eabe2
commit 6a4fd4e9c8

View File

@ -123,20 +123,23 @@ class IWLAPreAnalysisRobots(IPlugin):
self._setRobot(k, super_hit)
continue
# 3) no pages and not hit --> robot
# 3) no pages and not hit --> robot
if not super_hit['viewed_hits'][0] and not super_hit['viewed_pages'][0]:
self._setRobot(k, super_hit)
continue
not_found_pages = 0
not_modified_pages = 0
for hit in super_hit['requests']:
# 5) /robots.txt read
if hit['extract_request']['http_uri'].endswith('/robots.txt'):
self._setRobot(k, super_hit)
break
if int(hit['status']) == 404 or int(hit['status']) == 403:
if int(hit['status']) in (404, 403):
not_found_pages += 1
elif int(hit['status']) in (304,):
not_modified_pages += 1
# 6) Any referer for hits
if not hit['is_page'] and hit['http_referer']:
@ -146,8 +149,8 @@ class IWLAPreAnalysisRobots(IPlugin):
self._setRobot(k, super_hit)
continue
# 7) more than 10 404/403 pages
if not_found_pages > 10:
# 7) more than 10 404/403 or 304 pages
if not_found_pages > 10 or not_modified_pages > 10:
self._setRobot(k, super_hit)
continue