diff --git a/plugins/post_analysis/feeds.py b/plugins/post_analysis/feeds.py index ec256ea..0b210da 100644 --- a/plugins/post_analysis/feeds.py +++ b/plugins/post_analysis/feeds.py @@ -35,6 +35,7 @@ Plugin requirements : Conf values needed : feeds + feeds_referers* merge_one_hit_only_feeds_parsers* Output files : @@ -63,14 +64,15 @@ class IWLAPostAnalysisFeeds(IPlugin): self.conf_requires = ['feeds'] def load(self): - feeds = self.iwla.getConfValue('feeds', None) + feeds = self.iwla.getConfValue('feeds', []) + feeds_referers = self.iwla.getConfValue('feeds_referers', []) self.merge_one_hit_only_feeds_parsers = self.iwla.getConfValue('merge_one_hit_only_feeds_parsers', True) if feeds is None: return False self.feeds_re = [] for f in feeds: - self.feeds_re.append(re.compile(r'.*%s.*' % (f))) + self.feeds_re.append(re.compile(f)) self.bad_feeds_re = [] self.bad_feeds_re.append(re.compile(r'.*crawl.*')) @@ -80,6 +82,10 @@ class IWLAPostAnalysisFeeds(IPlugin): self.user_agents_re.append(re.compile(r'.*atom.*')) self.user_agents_re.append(re.compile(r'.*feed.*')) + self.referers_uri = [] + for f in feeds_referers: + self.referers_uri.append(f) + return True def mergeOneHitOnlyFeedsParsers(self, isFeedParser, one_hit_only, hit): @@ -115,8 +121,9 @@ class IWLAPostAnalysisFeeds(IPlugin): return return + request = hit['requests'][0] isFeedParser = self.NOT_A_FEED_PARSER - uri = hit['requests'][0]['extract_request']['extract_uri'].lower() + uri = request['extract_request']['extract_uri'].lower() for regexp in self.feeds_re: if regexp.match(uri): isFeedParser = self.FEED_PARSER @@ -127,12 +134,20 @@ class IWLAPostAnalysisFeeds(IPlugin): break if isFeedParser == self.NOT_A_FEED_PARSER: - user_agent = hit['requests'][0]['http_user_agent'].lower() + user_agent = request['http_user_agent'].lower() for regexp in self.user_agents_re: if regexp.match(user_agent): isFeedParser = self.FEED_PARSER break + if isFeedParser == self.NOT_A_FEED_PARSER and\ + request.get('extract_referer', 
False): + referer = request['extract_referer']['extract_uri'].lower() + for uri in self.referers_uri: + if referer == uri: + isFeedParser = self.FEED_PARSER + break + if self.merge_one_hit_only_feeds_parsers: self.mergeOneHitOnlyFeedsParsers(isFeedParser, one_hit_only, hit) else: