From cfbd35d818bd495076ce11b416dd84bbb2396803 Mon Sep 17 00:00:00 2001 From: Gregory Soutade Date: Wed, 18 Feb 2015 20:32:04 +0100 Subject: [PATCH] Merge one hit only parsers in feeds parsers detection --- plugins/display/feeds.py | 7 +++++-- plugins/post_analysis/feeds.py | 28 +++++++++++++++++++++++----- 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/plugins/display/feeds.py b/plugins/display/feeds.py index 3466fd9..9e65c17 100644 --- a/plugins/display/feeds.py +++ b/plugins/display/feeds.py @@ -67,7 +67,7 @@ class IWLADisplayFeeds(IPlugin): # All in a page if self.create_all_feeds_page: - title = createCurTitle(self.iwla, u'All Feeds parsers') + title = createCurTitle(self.iwla, self.iwla._(u'All Feeds parsers')) filename = 'all_feeds.html' path = self.iwla.getCurDisplayPath(filename) display_visitor_ip = self.iwla.getConfValue('display_visitor_ip', False) @@ -81,7 +81,10 @@ class IWLADisplayFeeds(IPlugin): if display_visitor_ip and\ super_hit.get('dns_name_replaced', False): address = '%s [%s]' % (address, super_hit['remote_ip']) - table.appendRow([address, super_hit['viewed_pages'], super_hit['viewed_hits']]) + if super_hit['robot']: + table.appendRow([address, super_hit['not_viewed_pages'], super_hit['not_viewed_hits']]) + else: + table.appendRow([address, super_hit['viewed_pages'], super_hit['viewed_hits']]) page.appendBlock(table) display.addPage(page) diff --git a/plugins/post_analysis/feeds.py b/plugins/post_analysis/feeds.py index 96884d8..2d4987c 100644 --- a/plugins/post_analysis/feeds.py +++ b/plugins/post_analysis/feeds.py @@ -27,6 +27,8 @@ from iplugin import IPlugin Post analysis hook Find feeds parsers (first hit in feeds conf value and no viewed pages if it's a robot) +If there is only one hit per day to a feed, merge feeds parsers with the same user agent +as it must be the same person with a different IP address. 
Plugin requirements : None @@ -64,18 +66,34 @@ class IWLAPostAnalysisFeeds(IPlugin): return True + def mergeOneHitOnlyFeedsParsers(self, isFeedParser, one_hit_only, hit): + if isFeedParser and (hit['viewed_hits'] + hit['not_viewed_hits']) == 1: + user_agent = hit['requests'][0]['http_user_agent'].lower() + if one_hit_only.get(user_agent, None) is None: + one_hit_only[user_agent] = (hit) + else: + isFeedParser = False + hit['feed_parser'] = isFeedParser + def hook(self): hits = self.iwla.getCurrentVisists() + one_hit_only = {} for hit in hits.values(): - if not hit.get('feed_parser', None) is None: continue + isFeedParser = hit.get('feed_parser', None) + + if isFeedParser == True: + self.mergeOneHitOnlyFeedsParsers(isFeedParser, one_hit_only, hit) + + if not isFeedParser is None: continue + isFeedParser = False uri = hit['requests'][0]['extract_request']['extract_uri'].lower() for regexp in self.feeds_re: if regexp.match(uri): + isFeedParser = True # Robot that views pages -> bot if hit['robot']: - if hit['viewed_pages']: continue - isFeedParser = True + if hit['viewed_pages']: + isFeedParser = False break - hit['feed_parser'] = isFeedParser - + self.mergeOneHitOnlyFeedsParsers(isFeedParser, one_hit_only, hit)