diff --git a/default_conf.py b/default_conf.py index 6aa682d..13ddd70 100644 --- a/default_conf.py +++ b/default_conf.py @@ -66,3 +66,6 @@ keep_requests = False # Domain names that should be ignored excluded_domain_name = [] + +# Domains that set no-referrer as Referrer-Policy +no_referrer_domains = [] diff --git a/plugins/pre_analysis/robots.py b/plugins/pre_analysis/robots.py index 0ce9433..47036d4 100644 --- a/plugins/pre_analysis/robots.py +++ b/plugins/pre_analysis/robots.py @@ -36,7 +36,8 @@ Plugin requirements : None Conf values needed : - None + count_hit_only_visitors + no_referrer_domains Output files : None @@ -63,6 +64,7 @@ class IWLAPreAnalysisRobots(IPlugin): self.compatible_re = re.compile(r'.*\(.*compatible; (.*); \+.*\)*') self.logger = logging.getLogger(self.__class__.__name__) self.one_hit_only = self.iwla.getConfValue('count_hit_only_visitors', False) + self.no_referrer_domains = self.iwla.getConfValue('no_referrer_domains', []) return True @@ -125,11 +127,6 @@ class IWLAPreAnalysisRobots(IPlugin): # 2) Less than 1 hit per page if super_hit['viewed_pages'][0] and (super_hit['viewed_hits'][0] < super_hit['viewed_pages'][0]): isRobot = True - for hit in super_hit['requests']: - if hit['server_name'] == 'indefero.soutade.fr': - if super_hit['viewed_hits'][0]*3 >= super_hit['viewed_pages'][0]: - isRobot = False - break if isRobot: self._setRobot(k, super_hit) @@ -148,7 +145,9 @@ class IWLAPreAnalysisRobots(IPlugin): self._setRobot(k, super_hit) break - if int(hit['status']) >= 400 and int(hit['status']) <= 499: + # Exception for favicon.png and all apple-*icon* + if int(hit['status']) >= 400 and int(hit['status']) <= 499 and\ + 'icon' not in hit['extract_request']['http_uri']: error_codes += 1 elif int(hit['status']) in (304,): not_modified_pages += 1 @@ -161,7 +160,8 @@ class IWLAPreAnalysisRobots(IPlugin): continue # 6) Any referer for hits - if super_hit['viewed_hits'][0] and not referers: + if super_hit['viewed_hits'][0] and not referers and\ + 
not super_hit['requests'][0]['server_name'] in self.no_referrer_domains: self._setRobot(k, super_hit) continue