Fix feeds regular expressions and add 'feeds_referers' configuration
This commit is contained in:
parent
2545ca5e52
commit
4cd7712201
|
@ -35,6 +35,7 @@ Plugin requirements :
|
||||||
|
|
||||||
Conf values needed :
|
Conf values needed :
|
||||||
feeds
|
feeds
|
||||||
|
feeds_referers*
|
||||||
merge_one_hit_only_feeds_parsers*
|
merge_one_hit_only_feeds_parsers*
|
||||||
|
|
||||||
Output files :
|
Output files :
|
||||||
|
@ -63,14 +64,15 @@ class IWLAPostAnalysisFeeds(IPlugin):
|
||||||
self.conf_requires = ['feeds']
|
self.conf_requires = ['feeds']
|
||||||
|
|
||||||
def load(self):
|
def load(self):
|
||||||
feeds = self.iwla.getConfValue('feeds', None)
|
feeds = self.iwla.getConfValue('feeds', [])
|
||||||
|
feeds_referers = self.iwla.getConfValue('feeds_referers', [])
|
||||||
self.merge_one_hit_only_feeds_parsers = self.iwla.getConfValue('merge_one_hit_only_feeds_parsers', True)
|
self.merge_one_hit_only_feeds_parsers = self.iwla.getConfValue('merge_one_hit_only_feeds_parsers', True)
|
||||||
|
|
||||||
if feeds is None: return False
|
if feeds is None: return False
|
||||||
|
|
||||||
self.feeds_re = []
|
self.feeds_re = []
|
||||||
for f in feeds:
|
for f in feeds:
|
||||||
self.feeds_re.append(re.compile(r'.*%s.*' % (f)))
|
self.feeds_re.append(re.compile(f))
|
||||||
|
|
||||||
self.bad_feeds_re = []
|
self.bad_feeds_re = []
|
||||||
self.bad_feeds_re.append(re.compile(r'.*crawl.*'))
|
self.bad_feeds_re.append(re.compile(r'.*crawl.*'))
|
||||||
|
@ -80,6 +82,10 @@ class IWLAPostAnalysisFeeds(IPlugin):
|
||||||
self.user_agents_re.append(re.compile(r'.*atom.*'))
|
self.user_agents_re.append(re.compile(r'.*atom.*'))
|
||||||
self.user_agents_re.append(re.compile(r'.*feed.*'))
|
self.user_agents_re.append(re.compile(r'.*feed.*'))
|
||||||
|
|
||||||
|
self.referers_uri = []
|
||||||
|
for f in feeds_referers:
|
||||||
|
self.referers_uri.append(f)
|
||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def mergeOneHitOnlyFeedsParsers(self, isFeedParser, one_hit_only, hit):
|
def mergeOneHitOnlyFeedsParsers(self, isFeedParser, one_hit_only, hit):
|
||||||
|
@ -115,8 +121,9 @@ class IWLAPostAnalysisFeeds(IPlugin):
|
||||||
return
|
return
|
||||||
return
|
return
|
||||||
|
|
||||||
|
request = hit['requests'][0]
|
||||||
isFeedParser = self.NOT_A_FEED_PARSER
|
isFeedParser = self.NOT_A_FEED_PARSER
|
||||||
uri = hit['requests'][0]['extract_request']['extract_uri'].lower()
|
uri = request['extract_request']['extract_uri'].lower()
|
||||||
for regexp in self.feeds_re:
|
for regexp in self.feeds_re:
|
||||||
if regexp.match(uri):
|
if regexp.match(uri):
|
||||||
isFeedParser = self.FEED_PARSER
|
isFeedParser = self.FEED_PARSER
|
||||||
|
@ -127,12 +134,20 @@ class IWLAPostAnalysisFeeds(IPlugin):
|
||||||
break
|
break
|
||||||
|
|
||||||
if isFeedParser == self.NOT_A_FEED_PARSER:
|
if isFeedParser == self.NOT_A_FEED_PARSER:
|
||||||
user_agent = hit['requests'][0]['http_user_agent'].lower()
|
user_agent = request['http_user_agent'].lower()
|
||||||
for regexp in self.user_agents_re:
|
for regexp in self.user_agents_re:
|
||||||
if regexp.match(user_agent):
|
if regexp.match(user_agent):
|
||||||
isFeedParser = self.FEED_PARSER
|
isFeedParser = self.FEED_PARSER
|
||||||
break
|
break
|
||||||
|
|
||||||
|
if isFeedParser == self.NOT_A_FEED_PARSER and\
|
||||||
|
request.get('extract_referer', False):
|
||||||
|
referer = request['extract_referer']['extract_uri'].lower()
|
||||||
|
for uri in self.referers_uri:
|
||||||
|
if referer == uri:
|
||||||
|
isFeedParser = self.FEED_PARSER
|
||||||
|
break
|
||||||
|
|
||||||
if self.merge_one_hit_only_feeds_parsers:
|
if self.merge_one_hit_only_feeds_parsers:
|
||||||
self.mergeOneHitOnlyFeedsParsers(isFeedParser, one_hit_only, hit)
|
self.mergeOneHitOnlyFeedsParsers(isFeedParser, one_hit_only, hit)
|
||||||
else:
|
else:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user