Remove crawler from feed parsers

This commit is contained in:
2016-08-20 13:08:02 +02:00
parent 10d087ad70
commit e805e59c10
3 changed files with 19 additions and 3 deletions

View File

@@ -55,6 +55,7 @@ class IWLAPostAnalysisFeeds(IPlugin):
NOT_A_FEED_PARSER = 0
FEED_PARSER = 1
MERGED_FEED_PARSER = 2
BAD_FEED_PARSER = 3
def __init__(self, iwla):
super(IWLAPostAnalysisFeeds, self).__init__(iwla)
@@ -71,6 +72,9 @@ class IWLAPostAnalysisFeeds(IPlugin):
for f in feeds:
self.feeds_re.append(re.compile(r'.*%s.*' % (f)))
self.bad_feeds_re = []
self.bad_feeds_re.append(re.compile(r'.*crawl.*'))
return True
def mergeOneHitOnlyFeedsParsers(self, isFeedParser, one_hit_only, hit):
@@ -94,7 +98,17 @@ class IWLAPostAnalysisFeeds(IPlugin):
self.merge_one_hit_only_feeds_parsers:
self.mergeOneHitOnlyFeedsParsers(isFeedParser, one_hit_only, hit)
if not isFeedParser is None: continue
if isFeedParser:
if hit['feed_parser'] == self.BAD_FEED_PARSER: continue
if not hit.get('feed_name_analysed', False) and\
hit.get('dns_name_replaced', False):
hit['feed_name_analysed'] = True
addr = hit.get('remote_addr', None)
for r in self.bad_feeds_re:
if r.match(addr):
hit['feed_parser'] = self.BAD_FEED_PARSER
return
return
isFeedParser = self.NOT_A_FEED_PARSER
uri = hit['requests'][0]['extract_request']['extract_uri'].lower()

View File

@@ -67,7 +67,7 @@ class IWLAPostAnalysisReverseDNS(IPlugin):
hits = self.iwla.getCurrentVisits()
for (k, hit) in hits.items():
if hit.get('dns_analysed', False): continue
if not hit['feed_parser'] and\
if not hit.get('feed_parser', False) and\
not self.iwla.isValidVisitor(hit):
continue
try: