Add no_merge_feeds_parsers_list conf value

Gregory Soutade 2024-10-27 09:15:39 +01:00
parent 9939922c31
commit 70de0d3aca
2 changed files with 29 additions and 26 deletions
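
For context, a minimal sketch of how the new conf value could sit next to the existing merge settings in an iwla configuration file; the file name (a local conf.py) and the patterns are illustrative assumptions, not part of this commit:

# Illustrative excerpt from a local iwla conf.py; the patterns are made-up examples.

# Merge feed parsers whose address or user agent matches one of these patterns
merge_feeds_parsers = True
merge_feeds_parsers_list = [
    r'.*[Ff]eedly.*',      # group all Feedly fetchers into a single entry
]

# New with this commit: hits matching these patterns are never merged,
# even if they also match merge_feeds_parsers_list
no_merge_feeds_parsers_list = [
    r'192\.168\.0\..*',    # keep readers from the local network as separate entries
]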

View File

@@ -59,7 +59,7 @@ class IWLADisplayFeeds(IPlugin):
         return True

     def hook(self):
-        from plugins.post_analysis.feeds import IWLAPostAnalysisFeeds
+        from plugins.pre_analysis.feeds import IWLAPostAnalysisFeeds

         display = self.iwla.getDisplay()
         hits = self.iwla.getCurrentVisits()

View File

@@ -25,7 +25,7 @@ from iwla import IWLA
 from iplugin import IPlugin

 """
-Post analysis hook
+Pre analysis hook

 Find feeds parsers (first hit in feeds conf value and no viewed pages if it's a robot)
 If merge_feeds_parsers is set to True, merge feeds parsers with the same user agent
@@ -77,6 +77,7 @@ class IWLAPostAnalysisFeeds(IPlugin):
         feeds_agents = self.iwla.getConfValue('feeds_agents', [])
         self.merge_feeds_parsers = self.iwla.getConfValue('merge_feeds_parsers', False)
         _merge_feeds_parsers_list = self.iwla.getConfValue('merge_feeds_parsers_list', [])
+        _no_merge_feeds_parsers_list = self.iwla.getConfValue('no_merge_feeds_parsers_list', [])

         if feeds is None: return False
@@ -104,6 +105,10 @@ class IWLAPostAnalysisFeeds(IPlugin):
         for f in _merge_feeds_parsers_list:
             self.merge_feeds_parsers_list.append(re.compile(f))

+        self.no_merge_feeds_parsers_list = []
+        for f in _no_merge_feeds_parsers_list:
+            self.no_merge_feeds_parsers_list.append(re.compile(f))
+
         self.merged_feeds = {}

         return True
@@ -134,8 +139,11 @@ class IWLAPostAnalysisFeeds(IPlugin):

     def mergeFeedsParsers(self, isFeedParser, hit):
         if isFeedParser in (self.FEED_PARSER, self.MERGED_FEED_PARSER):
+            for r in self.no_merge_feeds_parsers_list:
+                if r.match(hit['remote_addr']) or r.match(hit['remote_ip']) or r.match(hit['requests'][0]['http_user_agent']):
+                    return
             for r in self.merge_feeds_parsers_list:
-                if r.match(hit['remote_addr']) or r.match(hit['remote_ip']):
+                if r.match(hit['remote_addr']) or r.match(hit['remote_ip']) or r.match(hit['requests'][0]['http_user_agent']):
                     # One group can view multiple different feeds
                     key = r.pattern + hit.get('feed_domain', '') + hit.get('feed_uri', '')
                     self._appendToMergeCache(isFeedParser, key, hit)
@@ -159,15 +167,6 @@ class IWLAPostAnalysisFeeds(IPlugin):
             if hit['last_access'] > hit.get('feed_parser_last_access', time.gmtime(0)):
                 hit['feed_parser_last_access'] = hit['last_access']

-            if not hit.get('feed_name_analyzed', False) and\
-               hit.get('dns_name_replaced', False):
-                hit['feed_name_analyzed'] = True
-                addr = hit.get('remote_addr', None)
-                for r in self.bad_feeds_re:
-                    if r.match(addr):
-                        hit['feed_parser'] = self.NOT_A_FEED_PARSER
-                        break
-
             # Register already tagged feed parser in merged_feeds
             if self.merge_feeds_parsers:
                 self.mergeFeedsParsers(isFeedParser, hit)
@ -177,12 +176,12 @@ class IWLAPostAnalysisFeeds(IPlugin):
isFeedParser = self.NOT_A_FEED_PARSER isFeedParser = self.NOT_A_FEED_PARSER
uri = request['extract_request']['extract_uri'].lower() uri = request['extract_request']['extract_uri'].lower()
for regexp in self.feeds_re: for regexp in self.feeds_re:
if regexp.match(uri): if regexp.match(uri) and self.iwla.hasBeenViewed(request):
isFeedParser = self.FEED_PARSER isFeedParser = self.FEED_PARSER
# Robot that views pages -> bot # # Robot that views pages -> bot
if hit['robot']: # if hit['robot']:
if hit['not_viewed_pages'][0]: # if hit['not_viewed_pages'][0]:
isFeedParser = self.NOT_A_FEED_PARSER # isFeedParser = self.NOT_A_FEED_PARSER
break break
user_agent = request['http_user_agent'].lower() user_agent = request['http_user_agent'].lower()
@@ -199,14 +198,17 @@ class IWLAPostAnalysisFeeds(IPlugin):
                     isFeedParser = self.NOT_A_FEED_PARSER
                     break

-            if not hit.get('feed_name_analyzed', False) and\
-               hit.get('dns_name_replaced', False):
-                hit['feed_name_analyzed'] = True
-                addr = hit.get('remote_addr', None)
-                for r in self.bad_feeds_re:
-                    if r.match(addr):
-                        isFeedParser = hit['feed_parser'] = self.NOT_A_FEED_PARSER
-                        break
+            if isFeedParser == self.FEED_PARSER:
+                if not hit.get('dns_name_replaced', False):
+                    self.iwla.reverseDNS(hit)
+
+                if not hit.get('feed_name_analyzed', False):
+                    hit['feed_name_analyzed'] = True
+                    addr = hit.get('remote_addr', None)
+                    for r in self.bad_feeds_re:
+                        if r.match(addr):
+                            isFeedParser = self.NOT_A_FEED_PARSER
+                            break

             if isFeedParser == self.FEED_PARSER:
                 hit['feed_domain'] = request['server_name']
@@ -217,6 +219,7 @@ class IWLAPostAnalysisFeeds(IPlugin):
                 if subscribers:
                     hit['feed_subscribers'] = int(subscribers.groups()[0])

+                hit['robot'] = True
             hit['feed_parser'] = isFeedParser
             if self.merge_feeds_parsers:
                 self.mergeFeedsParsers(isFeedParser, hit)
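
Taken together, the patched mergeFeedsParsers() gives the no-merge list priority over the merge list, and both lists are now matched against remote_addr, remote_ip and the first request's http_user_agent. A minimal standalone sketch of that precedence, with simplified names that are not the plugin's actual API:

import re

def find_merge_pattern(hit, no_merge_res, merge_res):
    """Return the merge pattern a hit falls under, or None to keep it separate.

    no_merge_res / merge_res are lists of compiled regexps, as built from
    no_merge_feeds_parsers_list / merge_feeds_parsers_list.
    """
    fields = (hit['remote_addr'], hit['remote_ip'],
              hit['requests'][0]['http_user_agent'])
    # The no-merge list is checked first and short-circuits any merging
    for r in no_merge_res:
        if any(r.match(f) for f in fields):
            return None
    # Otherwise the first matching merge pattern defines the merge group
    for r in merge_res:
        if any(r.match(f) for f in fields):
            return r
    return None

# Example call with made-up patterns:
# find_merge_pattern(hit, [re.compile(r'192\.168\.')], [re.compile(r'.*feedly.*')])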