From f8b37a625c06da029137541e47d99f8c94f40aef Mon Sep 17 00:00:00 2001 From: Gregory Soutade Date: Wed, 16 Nov 2022 21:09:50 +0100 Subject: [PATCH] Rework feed merge algorithm. Allow merging feeds based on a name regular expression via the merge_feeds_parsers_list conf value --- plugins/post_analysis/feeds.py | 58 ++++++++++++++++++++++------------ 1 file changed, 38 insertions(+), 20 deletions(-) diff --git a/plugins/post_analysis/feeds.py b/plugins/post_analysis/feeds.py index c42fcdb..4069197 100644 --- a/plugins/post_analysis/feeds.py +++ b/plugins/post_analysis/feeds.py @@ -44,6 +44,7 @@ Output files : Statistics creation : remote_addr => feed_parser + feed_name_analysed Statistics update : None @@ -67,6 +68,7 @@ class IWLAPostAnalysisFeeds(IPlugin): feeds = self.iwla.getConfValue('feeds', []) feeds_referers = self.iwla.getConfValue('feeds_referers', []) self.merge_feeds_parsers = self.iwla.getConfValue('merge_feeds_parsers', False) + _merge_feeds_parsers_list = self.iwla.getConfValue('merge_feeds_parsers_list', []) if feeds is None: return False @@ -86,35 +88,51 @@ class IWLAPostAnalysisFeeds(IPlugin): for f in feeds_referers: self.referers_uri.append(f) + self.merge_feeds_parsers_list = [] + for f in _merge_feeds_parsers_list: + self.merge_feeds_parsers_list.append(re.compile(f)) + + self.merged_feeds = {} + return True - - def mergeFeedsParsers(self, isFeedParser, one_hit_only, hit): - # One hit only match - if isFeedParser: #isFeedParser and (hit['viewed_hits'][0] + hit['not_viewed_hits'][0]) == 1: - user_agent = hit['requests'][0]['http_user_agent'].lower() - # First time, register into dict - if one_hit_only.get(user_agent, None) is None: - # Merged - one_hit_only[user_agent] = hit - else: - # Next time - # Current must be ignored - hit['feed_parser'] = self.NOT_A_FEED_PARSER - # Previous matched hit must be set as merged - isFeedParser = self.MERGED_FEED_PARSER - hit = one_hit_only[user_agent] + + def _appendToMergeCache(self, isFeedParser, key, hit): + # 
First time, register into dict + if self.merged_feeds.get(key, None) is None: + # Merged + self.merged_feeds[key] = hit + else: + # Next time + # Current must be ignored + hit['feed_parser'] = self.NOT_A_FEED_PARSER + # Previous matched hit must be set as merged + isFeedParser = self.MERGED_FEED_PARSER + hit = self.merged_feeds[key] hit['feed_parser'] = isFeedParser + + def mergeFeedsParsers(self, isFeedParser, hit): + if isFeedParser: + # One hit only match + if True or (hit['viewed_hits'][0] + hit['not_viewed_hits'][0]) == 1: + for r in self.merge_feeds_parsers_list: + if r.match(hit['remote_addr']) or r.match(hit['remote_ip']): + #print('hit match %s' % (hit['remote_addr'])) + self._appendToMergeCache(isFeedParser, r, hit) + return + #print("No match for %s : %d" % (hit['remote_addr'], hit['viewed_hits'][0] + hit['not_viewed_hits'][0])) + # Other cases, look for user agent + user_agent = hit['requests'][0]['http_user_agent'].lower() + self._appendToMergeCache(isFeedParser, user_agent, hit) def hook(self): hits = self.iwla.getCurrentVisits() - one_hit_only = {} for hit in hits.values(): isFeedParser = hit.get('feed_parser', None) - # Register already tagged feed parser in one_hit_only + # Register already tagged feed parser in merged_feeds if self.merge_feeds_parsers and\ not isFeedParser in (None, self.BAD_FEED_PARSER): - self.mergeFeedsParsers(isFeedParser, one_hit_only, hit) + self.mergeFeedsParsers(isFeedParser, hit) continue if isFeedParser: @@ -157,6 +175,6 @@ class IWLAPostAnalysisFeeds(IPlugin): break if self.merge_feeds_parsers: - self.mergeFeedsParsers(isFeedParser, one_hit_only, hit) + self.mergeFeedsParsers(isFeedParser, hit) else: hit['feed_parser'] = isFeedParser