Rework feed merge algorithm. Allow merging feeds based on a name regular expression via the merge_feeds_parsers_list conf value
This commit is contained in:
		| @@ -44,6 +44,7 @@ Output files : | |||||||
| Statistics creation : | Statistics creation : | ||||||
|    remote_addr => |    remote_addr => | ||||||
|        feed_parser |        feed_parser | ||||||
|  |        feed_name_analysed | ||||||
|  |  | ||||||
| Statistics update : | Statistics update : | ||||||
|     None |     None | ||||||
| @@ -67,6 +68,7 @@ class IWLAPostAnalysisFeeds(IPlugin): | |||||||
|         feeds = self.iwla.getConfValue('feeds', []) |         feeds = self.iwla.getConfValue('feeds', []) | ||||||
|         feeds_referers = self.iwla.getConfValue('feeds_referers', []) |         feeds_referers = self.iwla.getConfValue('feeds_referers', []) | ||||||
|         self.merge_feeds_parsers = self.iwla.getConfValue('merge_feeds_parsers', False) |         self.merge_feeds_parsers = self.iwla.getConfValue('merge_feeds_parsers', False) | ||||||
|  |         _merge_feeds_parsers_list = self.iwla.getConfValue('merge_feeds_parsers_list', []) | ||||||
|  |  | ||||||
|         if feeds is None: return False |         if feeds is None: return False | ||||||
|  |  | ||||||
| @@ -86,35 +88,51 @@ class IWLAPostAnalysisFeeds(IPlugin): | |||||||
|         for f in feeds_referers: |         for f in feeds_referers: | ||||||
|             self.referers_uri.append(f) |             self.referers_uri.append(f) | ||||||
|  |  | ||||||
|  |         self.merge_feeds_parsers_list = [] | ||||||
|  |         for f in _merge_feeds_parsers_list: | ||||||
|  |             self.merge_feeds_parsers_list.append(re.compile(f)) | ||||||
|  |              | ||||||
|  |         self.merged_feeds = {} | ||||||
|  |  | ||||||
|         return True |         return True | ||||||
|  |  | ||||||
|     def mergeFeedsParsers(self, isFeedParser, one_hit_only, hit): |     def _appendToMergeCache(self, isFeedParser, key, hit): | ||||||
|         # One hit only match |         # First time, register into dict | ||||||
|         if isFeedParser: #isFeedParser and (hit['viewed_hits'][0] + hit['not_viewed_hits'][0]) == 1: |         if self.merged_feeds.get(key, None) is None: | ||||||
|             user_agent = hit['requests'][0]['http_user_agent'].lower() |             # Merged | ||||||
|             # First time, register into dict |             self.merged_feeds[key] = hit | ||||||
|             if one_hit_only.get(user_agent, None) is None: |         else: | ||||||
|                 # Merged |             # Next time | ||||||
|                 one_hit_only[user_agent] = hit |             # Current must be ignored | ||||||
|             else: |             hit['feed_parser'] = self.NOT_A_FEED_PARSER | ||||||
|                 # Next time |             # Previous matched hit must be set as merged | ||||||
|                 # Current must be ignored |             isFeedParser = self.MERGED_FEED_PARSER | ||||||
|                 hit['feed_parser'] = self.NOT_A_FEED_PARSER |             hit = self.merged_feeds[key] | ||||||
|                 # Previous matched hit must be set as merged |  | ||||||
|                 isFeedParser = self.MERGED_FEED_PARSER |  | ||||||
|                 hit = one_hit_only[user_agent] |  | ||||||
|         hit['feed_parser'] = isFeedParser |         hit['feed_parser'] = isFeedParser | ||||||
|      |      | ||||||
|  |     def mergeFeedsParsers(self, isFeedParser, hit): | ||||||
|  |         if isFeedParser: | ||||||
|  |             # One hit only match | ||||||
|  |             if True or (hit['viewed_hits'][0] + hit['not_viewed_hits'][0]) == 1: | ||||||
|  |                 for r in self.merge_feeds_parsers_list: | ||||||
|  |                     if r.match(hit['remote_addr']) or r.match(hit['remote_ip']): | ||||||
|  |                         #print('hit match %s' % (hit['remote_addr'])) | ||||||
|  |                         self._appendToMergeCache(isFeedParser, r, hit) | ||||||
|  |                         return | ||||||
|  |             #print("No match for %s : %d" % (hit['remote_addr'], hit['viewed_hits'][0] + hit['not_viewed_hits'][0])) | ||||||
|  |             # Other cases, look for user agent | ||||||
|  |             user_agent = hit['requests'][0]['http_user_agent'].lower() | ||||||
|  |             self._appendToMergeCache(isFeedParser, user_agent, hit) | ||||||
|  |  | ||||||
|     def hook(self): |     def hook(self): | ||||||
|         hits = self.iwla.getCurrentVisits() |         hits = self.iwla.getCurrentVisits() | ||||||
|         one_hit_only = {} |  | ||||||
|         for hit in hits.values(): |         for hit in hits.values(): | ||||||
|             isFeedParser = hit.get('feed_parser', None) |             isFeedParser = hit.get('feed_parser', None) | ||||||
|  |  | ||||||
|             # Register already tagged feed parser in one_hit_only |             # Register already tagged feed parser in merged_feeds | ||||||
|             if self.merge_feeds_parsers and\ |             if self.merge_feeds_parsers and\ | ||||||
|                not isFeedParser in (None, self.BAD_FEED_PARSER): |                not isFeedParser in (None, self.BAD_FEED_PARSER): | ||||||
|                 self.mergeFeedsParsers(isFeedParser, one_hit_only, hit) |                 self.mergeFeedsParsers(isFeedParser, hit) | ||||||
|                 continue |                 continue | ||||||
|  |  | ||||||
|             if isFeedParser: |             if isFeedParser: | ||||||
| @@ -157,6 +175,6 @@ class IWLAPostAnalysisFeeds(IPlugin): | |||||||
|                         break |                         break | ||||||
|  |  | ||||||
|             if self.merge_feeds_parsers: |             if self.merge_feeds_parsers: | ||||||
|                 self.mergeFeedsParsers(isFeedParser, one_hit_only, hit) |                 self.mergeFeedsParsers(isFeedParser, hit) | ||||||
|             else: |             else: | ||||||
|                 hit['feed_parser'] = isFeedParser |                 hit['feed_parser'] = isFeedParser | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user