Update feed detector: check for the 'feed', 'rss' or 'atom' string in the user agent

This commit is contained in:
Gregory Soutade 2021-04-01 08:22:52 +02:00
parent 0602d3ce4b
commit 8697be26ad

View File

@ -75,6 +75,11 @@ class IWLAPostAnalysisFeeds(IPlugin):
self.bad_feeds_re = []
self.bad_feeds_re.append(re.compile(r'.*crawl.*'))
self.user_agents_re = []
self.user_agents_re.append(re.compile(r'.*rss.*'))
self.user_agents_re.append(re.compile(r'.*atom.*'))
self.user_agents_re.append(re.compile(r'.*feed.*'))
return True
def mergeOneHitOnlyFeedsParsers(self, isFeedParser, one_hit_only, hit):
@ -120,6 +125,14 @@ class IWLAPostAnalysisFeeds(IPlugin):
if hit['not_viewed_pages'][0]:
isFeedParser = self.NOT_A_FEED_PARSER
break
if isFeedParser == self.NOT_A_FEED_PARSER:
user_agent = hit['requests'][0]['http_user_agent'].lower()
for regexp in self.user_agents_re:
if regexp.match(user_agent):
isFeedParser = self.FEED_PARSER
break
if self.merge_one_hit_only_feeds_parsers:
self.mergeOneHitOnlyFeedsParsers(isFeedParser, one_hit_only, hit)
else: