Update feed detector: check for the 'feed', 'rss' or 'atom' string in the user agent
This commit is contained in:
parent
0602d3ce4b
commit
8697be26ad
|
@ -75,6 +75,11 @@ class IWLAPostAnalysisFeeds(IPlugin):
|
||||||
self.bad_feeds_re = []
|
self.bad_feeds_re = []
|
||||||
self.bad_feeds_re.append(re.compile(r'.*crawl.*'))
|
self.bad_feeds_re.append(re.compile(r'.*crawl.*'))
|
||||||
|
|
||||||
|
self.user_agents_re = []
|
||||||
|
self.user_agents_re.append(re.compile(r'.*rss.*'))
|
||||||
|
self.user_agents_re.append(re.compile(r'.*atom.*'))
|
||||||
|
self.user_agents_re.append(re.compile(r'.*feed.*'))
|
||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def mergeOneHitOnlyFeedsParsers(self, isFeedParser, one_hit_only, hit):
|
def mergeOneHitOnlyFeedsParsers(self, isFeedParser, one_hit_only, hit):
|
||||||
|
@ -120,6 +125,14 @@ class IWLAPostAnalysisFeeds(IPlugin):
|
||||||
if hit['not_viewed_pages'][0]:
|
if hit['not_viewed_pages'][0]:
|
||||||
isFeedParser = self.NOT_A_FEED_PARSER
|
isFeedParser = self.NOT_A_FEED_PARSER
|
||||||
break
|
break
|
||||||
|
|
||||||
|
if isFeedParser == self.NOT_A_FEED_PARSER:
|
||||||
|
user_agent = hit['requests'][0]['http_user_agent'].lower()
|
||||||
|
for regexp in self.user_agents_re:
|
||||||
|
if regexp.match(user_agent):
|
||||||
|
isFeedParser = self.FEED_PARSER
|
||||||
|
break
|
||||||
|
|
||||||
if self.merge_one_hit_only_feeds_parsers:
|
if self.merge_one_hit_only_feeds_parsers:
|
||||||
self.mergeOneHitOnlyFeedsParsers(isFeedParser, one_hit_only, hit)
|
self.mergeOneHitOnlyFeedsParsers(isFeedParser, one_hit_only, hit)
|
||||||
else:
|
else:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user