New rule for robot detection: more than 10 not-modified pages in a row
parent ac246eabe2
commit 6a4fd4e9c8
@@ -123,20 +123,23 @@ class IWLAPreAnalysisRobots(IPlugin):
                 self._setRobot(k, super_hit)
                 continue
 
             # 3) no pages and not hit --> robot
             if not super_hit['viewed_hits'][0] and not super_hit['viewed_pages'][0]:
                 self._setRobot(k, super_hit)
                 continue
 
             not_found_pages = 0
+            not_modified_pages = 0
             for hit in super_hit['requests']:
                 # 5) /robots.txt read
                 if hit['extract_request']['http_uri'].endswith('/robots.txt'):
                     self._setRobot(k, super_hit)
                     break
 
-                if int(hit['status']) == 404 or int(hit['status']) == 403:
+                if int(hit['status']) in (404, 403):
                     not_found_pages += 1
+                elif int(hit['status']) in (304,):
+                    not_modified_pages += 1
 
                 # 6) Any referer for hits
                 if not hit['is_page'] and hit['http_referer']:
@@ -146,8 +149,8 @@ class IWLAPreAnalysisRobots(IPlugin):
                     self._setRobot(k, super_hit)
                     continue
 
-            # 7) more than 10 404/403 pages
-            if not_found_pages > 10:
+            # 7) more than 10 404/403 or 304 pages
+            if not_found_pages > 10 or not_modified_pages > 10:
                 self._setRobot(k, super_hit)
                 continue
 
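
For reference, a minimal, self-contained sketch of the heuristic this commit adds: per visitor, count 404/403 responses and 304 (Not Modified) responses over the request list, and flag the visitor as a robot when either counter goes above 10. The function name looks_like_robot, the threshold parameter, and the sample request lists below are illustrative assumptions and not part of iwla; the real check runs inside the plugin's per-visitor loop over super_hit['requests'], as shown in the diff above.

    # Illustrative sketch only: names and sample data are not from iwla.
    def looks_like_robot(requests, threshold=10):
        """Return True if the 404/403 or 304 counting rule is tripped."""
        not_found_pages = 0
        not_modified_pages = 0
        for hit in requests:
            status = int(hit['status'])
            if status in (404, 403):
                not_found_pages += 1
            elif status == 304:
                not_modified_pages += 1
        return not_found_pages > threshold or not_modified_pages > threshold

    if __name__ == '__main__':
        # A client replaying a cached crawl: every request answered with 304.
        cached_crawl = [{'status': '304'} for _ in range(12)]
        print(looks_like_robot(cached_crawl))   # True: 12 not-modified responses > 10
        # A normal visitor with only a couple of 404s.
        normal_visit = [{'status': '200'}] * 8 + [{'status': '404'}] * 2
        print(looks_like_robot(normal_visit))   # False

The design choice mirrors the existing 404/403 rule: a human browser revalidating its cache produces a few 304s, while a crawler re-fetching many unchanged pages in one visit produces a long run of them, so a simple per-visit counter with the same threshold is enough.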