Rework the feed merge algorithm. Allow merging feeds whose name matches a regular expression, configured through the merge_feeds_parsers_list conf value
This commit is contained in:
parent
242bb6cabe
commit
f8b37a625c
|
@ -44,6 +44,7 @@ Output files :
|
|||
Statistics creation :
|
||||
remote_addr =>
|
||||
feed_parser
|
||||
feed_name_analysed
|
||||
|
||||
Statistics update :
|
||||
None
|
||||
|
@ -67,6 +68,7 @@ class IWLAPostAnalysisFeeds(IPlugin):
|
|||
feeds = self.iwla.getConfValue('feeds', [])
|
||||
feeds_referers = self.iwla.getConfValue('feeds_referers', [])
|
||||
self.merge_feeds_parsers = self.iwla.getConfValue('merge_feeds_parsers', False)
|
||||
_merge_feeds_parsers_list = self.iwla.getConfValue('merge_feeds_parsers_list', [])
|
||||
|
||||
if feeds is None: return False
|
||||
|
||||
|
@ -86,35 +88,51 @@ class IWLAPostAnalysisFeeds(IPlugin):
|
|||
for f in feeds_referers:
|
||||
self.referers_uri.append(f)
|
||||
|
||||
self.merge_feeds_parsers_list = []
|
||||
for f in _merge_feeds_parsers_list:
|
||||
self.merge_feeds_parsers_list.append(re.compile(f))
|
||||
|
||||
self.merged_feeds = {}
|
||||
|
||||
return True
|
||||
|
||||
def mergeFeedsParsers(self, isFeedParser, one_hit_only, hit):
|
||||
# One hit only match
|
||||
if isFeedParser: #isFeedParser and (hit['viewed_hits'][0] + hit['not_viewed_hits'][0]) == 1:
|
||||
user_agent = hit['requests'][0]['http_user_agent'].lower()
|
||||
# First time, register into dict
|
||||
if one_hit_only.get(user_agent, None) is None:
|
||||
# Merged
|
||||
one_hit_only[user_agent] = hit
|
||||
else:
|
||||
# Next time
|
||||
# Current must be ignored
|
||||
hit['feed_parser'] = self.NOT_A_FEED_PARSER
|
||||
# Previous matched hit must be set as merged
|
||||
isFeedParser = self.MERGED_FEED_PARSER
|
||||
hit = one_hit_only[user_agent]
|
||||
def _appendToMergeCache(self, isFeedParser, key, hit):
    """Register *hit* under *key* in ``self.merged_feeds``, merging duplicates.

    The first hit seen for a key is stored as the representative for that
    key and is tagged with *isFeedParser*. Any later, different hit with
    the same key is tagged ``self.NOT_A_FEED_PARSER`` and the stored
    representative is tagged ``self.MERGED_FEED_PARSER`` instead.

    Registering the representative itself a second time only refreshes its
    status with *isFeedParser*: nothing was merged into it, so it is not
    promoted to ``self.MERGED_FEED_PARSER``.

    Args:
        isFeedParser: status written to ``hit['feed_parser']`` when the hit
            is (or already was) the representative for *key*.
        key: hashable merge key; callers visible in this diff pass either a
            compiled regular expression or a lower-cased user agent string.
        hit: visit dict; its ``'feed_parser'`` entry is written.
    """
    representative = self.merged_feeds.get(key, None)
    if representative is None:
        # First hit for this key: it becomes the representative.
        self.merged_feeds[key] = hit
    elif representative is not hit:
        # A different hit already owns this key: the current one is ignored
        # and the status is carried by the previously registered hit.
        hit['feed_parser'] = self.NOT_A_FEED_PARSER
        isFeedParser = self.MERGED_FEED_PARSER
        hit = representative
    hit['feed_parser'] = isFeedParser
|
||||
|
||||
def mergeFeedsParsers(self, isFeedParser, hit):
    """Pick the merge key for *hit* and hand it to ``_appendToMergeCache``.

    For a truthy *isFeedParser*, the patterns in
    ``self.merge_feeds_parsers_list`` are tried in order against
    ``hit['remote_addr']`` and ``hit['remote_ip']``; the first pattern that
    matches is used as the merge key. When no pattern matches, the
    lower-cased user agent of the hit's first request is the key.

    For a falsy *isFeedParser* the status is only stored on the hit and
    nothing is registered for merging.
    NOTE(review): the diff this was taken from lost its indentation, so the
    original handling of a falsy status could not be confirmed; storing the
    status mirrors what the caller does when merging is disabled.

    Args:
        isFeedParser: feed-parser status computed for *hit*.
        hit: visit dict providing ``'remote_addr'``, ``'remote_ip'`` and
            ``'requests'`` (a non-empty list whose first item carries
            ``'http_user_agent'``) whenever *isFeedParser* is truthy.
    """
    if not isFeedParser:
        hit['feed_parser'] = isFeedParser
        return

    # Configured name patterns take precedence over the user agent.
    for pattern in self.merge_feeds_parsers_list:
        if pattern.match(hit['remote_addr']) or pattern.match(hit['remote_ip']):
            # The compiled pattern itself is the key, so every host matching
            # the same expression ends up in the same merge group.
            self._appendToMergeCache(isFeedParser, pattern, hit)
            return

    # Other cases, look for user agent
    user_agent = hit['requests'][0]['http_user_agent'].lower()
    self._appendToMergeCache(isFeedParser, user_agent, hit)
|
||||
|
||||
def hook(self):
|
||||
hits = self.iwla.getCurrentVisits()
|
||||
one_hit_only = {}
|
||||
for hit in hits.values():
|
||||
isFeedParser = hit.get('feed_parser', None)
|
||||
|
||||
# Register already tagged feed parser in one_hit_only
|
||||
# Register already tagged feed parser in merged_feeds
|
||||
if self.merge_feeds_parsers and\
|
||||
not isFeedParser in (None, self.BAD_FEED_PARSER):
|
||||
self.mergeFeedsParsers(isFeedParser, one_hit_only, hit)
|
||||
self.mergeFeedsParsers(isFeedParser, hit)
|
||||
continue
|
||||
|
||||
if isFeedParser:
|
||||
|
@ -157,6 +175,6 @@ class IWLAPostAnalysisFeeds(IPlugin):
|
|||
break
|
||||
|
||||
if self.merge_feeds_parsers:
|
||||
self.mergeFeedsParsers(isFeedParser, one_hit_only, hit)
|
||||
self.mergeFeedsParsers(isFeedParser, hit)
|
||||
else:
|
||||
hit['feed_parser'] = isFeedParser
|
||||
|
|
Loading…
Reference in New Issue
Block a user