diff --git a/plugins/pre_analysis/robots.py b/plugins/pre_analysis/robots.py index 6f89bb3..4bad943 100644 --- a/plugins/pre_analysis/robots.py +++ b/plugins/pre_analysis/robots.py @@ -114,12 +114,16 @@ class IWLAPreAnalysisRobots(IPlugin): self._setRobot(k, super_hit) continue + not_found_pages = 0 for hit in super_hit['requests']: # 3) /robots.txt read if hit['extract_request']['http_uri'].endswith('/robots.txt'): self._setRobot(k, super_hit) break + if int(hit['status']) == 404: + not_found_pages += 1 + # 4) Any referer for hits if not hit['is_page'] and hit['http_referer']: referers += 1 @@ -128,6 +132,11 @@ class IWLAPreAnalysisRobots(IPlugin): self._setRobot(k, super_hit) continue +# 5) more than 10 404 pages + if not_found_pages > 10: + self._setRobot(k, super_hit) + continue + if not super_hit['viewed_pages'] and \ (super_hit['viewed_hits'] and not referers): self._setRobot(k, super_hit)