From 6a4fd4e9c8756f9b2c5c9b46541435ed50bf125d Mon Sep 17 00:00:00 2001
From: Gregory Soutade
Date: Sat, 28 Jan 2023 09:40:26 +0100
Subject: [PATCH] New rule for robot : more than 10 not modified pages in a row

---
 plugins/pre_analysis/robots.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/plugins/pre_analysis/robots.py b/plugins/pre_analysis/robots.py
index e2d71dd..922c9d2 100644
--- a/plugins/pre_analysis/robots.py
+++ b/plugins/pre_analysis/robots.py
@@ -123,20 +123,23 @@ class IWLAPreAnalysisRobots(IPlugin):
                 self._setRobot(k, super_hit)
                 continue
 
-# 3) no pages and not hit --> robot
+            # 3) no pages and not hit --> robot
             if not super_hit['viewed_hits'][0] and not super_hit['viewed_pages'][0]:
                 self._setRobot(k, super_hit)
                 continue
 
             not_found_pages = 0
+            not_modified_pages = 0
             for hit in super_hit['requests']:
                 # 5) /robots.txt read
                 if hit['extract_request']['http_uri'].endswith('/robots.txt'):
                     self._setRobot(k, super_hit)
                     break
 
-                if int(hit['status']) == 404 or int(hit['status']) == 403:
+                if int(hit['status']) in (404, 403):
                     not_found_pages += 1
+                elif int(hit['status']) in (304,):
+                    not_modified_pages += 1
 
                 # 6) Any referer for hits
                 if not hit['is_page'] and hit['http_referer']:
@@ -146,8 +149,8 @@ class IWLAPreAnalysisRobots(IPlugin):
                 self._setRobot(k, super_hit)
                 continue
 
-# 7) more than 10 404/403 pages
-            if not_found_pages > 10:
+# 7) more than 10 404/403 or 304 pages
+            if not_found_pages > 10 or not_modified_pages > 10:
                 self._setRobot(k, super_hit)
                 continue
 