Rework the feed merge algorithm. Allow merging feeds whose name matches a regular expression listed in the merge_feeds_parsers_list conf value

This commit is contained in:
Gregory Soutade 2022-11-16 21:09:50 +01:00
parent 242bb6cabe
commit f8b37a625c

View File

@ -44,6 +44,7 @@ Output files :
Statistics creation :
remote_addr =>
feed_parser
feed_name_analysed
Statistics update :
None
@ -67,6 +68,7 @@ class IWLAPostAnalysisFeeds(IPlugin):
feeds = self.iwla.getConfValue('feeds', [])
feeds_referers = self.iwla.getConfValue('feeds_referers', [])
self.merge_feeds_parsers = self.iwla.getConfValue('merge_feeds_parsers', False)
_merge_feeds_parsers_list = self.iwla.getConfValue('merge_feeds_parsers_list', [])
if feeds is None: return False
@ -86,35 +88,51 @@ class IWLAPostAnalysisFeeds(IPlugin):
for f in feeds_referers:
self.referers_uri.append(f)
self.merge_feeds_parsers_list = []
for f in _merge_feeds_parsers_list:
self.merge_feeds_parsers_list.append(re.compile(f))
self.merged_feeds = {}
return True
def mergeFeedsParsers(self, isFeedParser, one_hit_only, hit):
# One hit only match
if isFeedParser: #isFeedParser and (hit['viewed_hits'][0] + hit['not_viewed_hits'][0]) == 1:
user_agent = hit['requests'][0]['http_user_agent'].lower()
# First time, register into dict
if one_hit_only.get(user_agent, None) is None:
# Merged
one_hit_only[user_agent] = hit
else:
# Next time
# Current must be ignored
hit['feed_parser'] = self.NOT_A_FEED_PARSER
# Previous matched hit must be set as merged
isFeedParser = self.MERGED_FEED_PARSER
hit = one_hit_only[user_agent]
def _appendToMergeCache(self, isFeedParser, key, hit):
    """Record *hit* under *key* in ``self.merged_feeds`` and tag it.

    The first hit seen for a key is stored in the cache and keeps the
    ``isFeedParser`` value it was called with.  Any later hit arriving
    with the same key is tagged ``self.NOT_A_FEED_PARSER`` and the hit
    already stored for that key is re-tagged ``self.MERGED_FEED_PARSER``.

    Parameters:
        isFeedParser: value written to ``hit['feed_parser']`` when the
            key has not been seen before.
        key: hashable merge key used to index ``self.merged_feeds``.
        hit: dict-like visit record; its ``'feed_parser'`` entry is
            written in place.
    """
    representative = self.merged_feeds.get(key)

    if representative is None:
        # Unknown key: this hit becomes the entry later hits merge into.
        self.merged_feeds[key] = hit
        hit['feed_parser'] = isFeedParser
        return

    # Known key: the current hit is ignored ...
    hit['feed_parser'] = self.NOT_A_FEED_PARSER
    # ... and the previously stored hit is flagged as merged.
    representative['feed_parser'] = self.MERGED_FEED_PARSER
def mergeFeedsParsers(self, isFeedParser, hit):
    """Choose a merge key for *hit* and hand it to the merge cache.

    Nothing is done when ``isFeedParser`` is falsy.  Otherwise the
    compiled patterns in ``self.merge_feeds_parsers_list`` are tried in
    order against ``hit['remote_addr']`` and then ``hit['remote_ip']``;
    the first pattern that matches is itself used as the merge key.
    When no pattern matches, the key is the lower-cased user agent of
    the hit's first request.

    Parameters:
        isFeedParser: feed-parser tag for the hit, forwarded unchanged
            to ``self._appendToMergeCache``.
        hit: dict-like visit record providing ``'remote_addr'``,
            ``'remote_ip'`` and ``'requests'``.
    """
    if not isFeedParser:
        return

    # Pattern matching applies to every flagged hit, whatever its hit
    # count: no restriction on the number of hits is enforced here.
    for pattern in self.merge_feeds_parsers_list:
        if pattern.match(hit['remote_addr']) or pattern.match(hit['remote_ip']):
            self._appendToMergeCache(isFeedParser, pattern, hit)
            return

    # Other cases, look for user agent
    user_agent = hit['requests'][0]['http_user_agent'].lower()
    self._appendToMergeCache(isFeedParser, user_agent, hit)
def hook(self):
hits = self.iwla.getCurrentVisits()
one_hit_only = {}
for hit in hits.values():
isFeedParser = hit.get('feed_parser', None)
# Register already tagged feed parser in one_hit_only
# Register already tagged feed parser in merged_feeds
if self.merge_feeds_parsers and\
not isFeedParser in (None, self.BAD_FEED_PARSER):
self.mergeFeedsParsers(isFeedParser, one_hit_only, hit)
self.mergeFeedsParsers(isFeedParser, hit)
continue
if isFeedParser:
@ -157,6 +175,6 @@ class IWLAPostAnalysisFeeds(IPlugin):
break
if self.merge_feeds_parsers:
self.mergeFeedsParsers(isFeedParser, one_hit_only, hit)
self.mergeFeedsParsers(isFeedParser, hit)
else:
hit['feed_parser'] = isFeedParser