New rule for robots: more than 10 not-modified (304) pages in a row
This commit is contained in:
parent
ac246eabe2
commit
6a4fd4e9c8
|
@ -129,14 +129,17 @@ class IWLAPreAnalysisRobots(IPlugin):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
not_found_pages = 0
|
not_found_pages = 0
|
||||||
|
not_modified_pages = 0
|
||||||
for hit in super_hit['requests']:
|
for hit in super_hit['requests']:
|
||||||
# 5) /robots.txt read
|
# 5) /robots.txt read
|
||||||
if hit['extract_request']['http_uri'].endswith('/robots.txt'):
|
if hit['extract_request']['http_uri'].endswith('/robots.txt'):
|
||||||
self._setRobot(k, super_hit)
|
self._setRobot(k, super_hit)
|
||||||
break
|
break
|
||||||
|
|
||||||
if int(hit['status']) == 404 or int(hit['status']) == 403:
|
if int(hit['status']) in (404, 403):
|
||||||
not_found_pages += 1
|
not_found_pages += 1
|
||||||
|
elif int(hit['status']) in (304,):
|
||||||
|
not_modified_pages += 1
|
||||||
|
|
||||||
# 6) Any referer for hits
|
# 6) Any referer for hits
|
||||||
if not hit['is_page'] and hit['http_referer']:
|
if not hit['is_page'] and hit['http_referer']:
|
||||||
|
@ -146,8 +149,8 @@ class IWLAPreAnalysisRobots(IPlugin):
|
||||||
self._setRobot(k, super_hit)
|
self._setRobot(k, super_hit)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# 7) more than 10 404/403 pages
|
# 7) more than 10 404/403 or 304 pages
|
||||||
if not_found_pages > 10:
|
if not_found_pages > 10 or not_modified_pages > 10:
|
||||||
self._setRobot(k, super_hit)
|
self._setRobot(k, super_hit)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user