Rework the feed merge algorithm. Allow merging feeds based on a name regular expression, using the merge_feeds_parsers_list conf value
This commit is contained in:
parent
242bb6cabe
commit
f8b37a625c
|
@ -44,6 +44,7 @@ Output files :
|
||||||
Statistics creation :
|
Statistics creation :
|
||||||
remote_addr =>
|
remote_addr =>
|
||||||
feed_parser
|
feed_parser
|
||||||
|
feed_name_analysed
|
||||||
|
|
||||||
Statistics update :
|
Statistics update :
|
||||||
None
|
None
|
||||||
|
@ -67,6 +68,7 @@ class IWLAPostAnalysisFeeds(IPlugin):
|
||||||
feeds = self.iwla.getConfValue('feeds', [])
|
feeds = self.iwla.getConfValue('feeds', [])
|
||||||
feeds_referers = self.iwla.getConfValue('feeds_referers', [])
|
feeds_referers = self.iwla.getConfValue('feeds_referers', [])
|
||||||
self.merge_feeds_parsers = self.iwla.getConfValue('merge_feeds_parsers', False)
|
self.merge_feeds_parsers = self.iwla.getConfValue('merge_feeds_parsers', False)
|
||||||
|
_merge_feeds_parsers_list = self.iwla.getConfValue('merge_feeds_parsers_list', [])
|
||||||
|
|
||||||
if feeds is None: return False
|
if feeds is None: return False
|
||||||
|
|
||||||
|
@ -86,35 +88,51 @@ class IWLAPostAnalysisFeeds(IPlugin):
|
||||||
for f in feeds_referers:
|
for f in feeds_referers:
|
||||||
self.referers_uri.append(f)
|
self.referers_uri.append(f)
|
||||||
|
|
||||||
|
self.merge_feeds_parsers_list = []
|
||||||
|
for f in _merge_feeds_parsers_list:
|
||||||
|
self.merge_feeds_parsers_list.append(re.compile(f))
|
||||||
|
|
||||||
|
self.merged_feeds = {}
|
||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def mergeFeedsParsers(self, isFeedParser, one_hit_only, hit):
|
def _appendToMergeCache(self, isFeedParser, key, hit):
|
||||||
# One hit only match
|
# First time, register into dict
|
||||||
if isFeedParser: #isFeedParser and (hit['viewed_hits'][0] + hit['not_viewed_hits'][0]) == 1:
|
if self.merged_feeds.get(key, None) is None:
|
||||||
user_agent = hit['requests'][0]['http_user_agent'].lower()
|
# Merged
|
||||||
# First time, register into dict
|
self.merged_feeds[key] = hit
|
||||||
if one_hit_only.get(user_agent, None) is None:
|
else:
|
||||||
# Merged
|
# Next time
|
||||||
one_hit_only[user_agent] = hit
|
# Current must be ignored
|
||||||
else:
|
hit['feed_parser'] = self.NOT_A_FEED_PARSER
|
||||||
# Next time
|
# Previous matched hit must be set as merged
|
||||||
# Current must be ignored
|
isFeedParser = self.MERGED_FEED_PARSER
|
||||||
hit['feed_parser'] = self.NOT_A_FEED_PARSER
|
hit = self.merged_feeds[key]
|
||||||
# Previous matched hit must be set as merged
|
|
||||||
isFeedParser = self.MERGED_FEED_PARSER
|
|
||||||
hit = one_hit_only[user_agent]
|
|
||||||
hit['feed_parser'] = isFeedParser
|
hit['feed_parser'] = isFeedParser
|
||||||
|
|
||||||
|
def mergeFeedsParsers(self, isFeedParser, hit):
|
||||||
|
if isFeedParser:
|
||||||
|
# One hit only match
|
||||||
|
if True or (hit['viewed_hits'][0] + hit['not_viewed_hits'][0]) == 1:
|
||||||
|
for r in self.merge_feeds_parsers_list:
|
||||||
|
if r.match(hit['remote_addr']) or r.match(hit['remote_ip']):
|
||||||
|
#print('hit match %s' % (hit['remote_addr']))
|
||||||
|
self._appendToMergeCache(isFeedParser, r, hit)
|
||||||
|
return
|
||||||
|
#print("No match for %s : %d" % (hit['remote_addr'], hit['viewed_hits'][0] + hit['not_viewed_hits'][0]))
|
||||||
|
# Other cases, look for user agent
|
||||||
|
user_agent = hit['requests'][0]['http_user_agent'].lower()
|
||||||
|
self._appendToMergeCache(isFeedParser, user_agent, hit)
|
||||||
|
|
||||||
def hook(self):
|
def hook(self):
|
||||||
hits = self.iwla.getCurrentVisits()
|
hits = self.iwla.getCurrentVisits()
|
||||||
one_hit_only = {}
|
|
||||||
for hit in hits.values():
|
for hit in hits.values():
|
||||||
isFeedParser = hit.get('feed_parser', None)
|
isFeedParser = hit.get('feed_parser', None)
|
||||||
|
|
||||||
# Register already tagged feed parser in one_hit_only
|
# Register already tagged feed parser in merged_feeds
|
||||||
if self.merge_feeds_parsers and\
|
if self.merge_feeds_parsers and\
|
||||||
not isFeedParser in (None, self.BAD_FEED_PARSER):
|
not isFeedParser in (None, self.BAD_FEED_PARSER):
|
||||||
self.mergeFeedsParsers(isFeedParser, one_hit_only, hit)
|
self.mergeFeedsParsers(isFeedParser, hit)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if isFeedParser:
|
if isFeedParser:
|
||||||
|
@ -157,6 +175,6 @@ class IWLAPostAnalysisFeeds(IPlugin):
|
||||||
break
|
break
|
||||||
|
|
||||||
if self.merge_feeds_parsers:
|
if self.merge_feeds_parsers:
|
||||||
self.mergeFeedsParsers(isFeedParser, one_hit_only, hit)
|
self.mergeFeedsParsers(isFeedParser, hit)
|
||||||
else:
|
else:
|
||||||
hit['feed_parser'] = isFeedParser
|
hit['feed_parser'] = isFeedParser
|
||||||
|
|
Loading…
Reference in New Issue
Block a user