Merge one hit only parsers in feeds parsers detection

This commit is contained in:
Gregory Soutade 2015-02-18 20:32:04 +01:00
parent efb5ddf761
commit cfbd35d818
2 changed files with 28 additions and 7 deletions

View File

@ -67,7 +67,7 @@ class IWLADisplayFeeds(IPlugin):
# All in a page
if self.create_all_feeds_page:
title = createCurTitle(self.iwla, u'All Feeds parsers')
title = createCurTitle(self.iwla, self.iwla._(u'All Feeds parsers'))
filename = 'all_feeds.html'
path = self.iwla.getCurDisplayPath(filename)
display_visitor_ip = self.iwla.getConfValue('display_visitor_ip', False)
@ -81,6 +81,9 @@ class IWLADisplayFeeds(IPlugin):
if display_visitor_ip and\
super_hit.get('dns_name_replaced', False):
address = '%s [%s]' % (address, super_hit['remote_ip'])
if super_hit['robot']:
table.appendRow([address, super_hit['not_viewed_pages'], super_hit['not_viewed_hits']])
else:
table.appendRow([address, super_hit['viewed_pages'], super_hit['viewed_hits']])
page.appendBlock(table)

View File

@ -27,6 +27,8 @@ from iplugin import IPlugin
Post analysis hook
Find feeds parsers (first hit in feeds conf value and no viewed pages if it's a robot)
If there is only one hit per day to a feed, merge feeds parsers with the same user agent
as it must be the same person with a different IP address.
Plugin requirements :
None
@ -64,18 +66,34 @@ class IWLAPostAnalysisFeeds(IPlugin):
return True
def mergeOneHitOnlyFeedsParsers(self, isFeedParser, one_hit_only, hit):
    """
    Tag ``hit`` as a feed parser, keeping one single-hit visit per user agent.

    isFeedParser -- current verdict for this visit
    one_hit_only -- dict mapping a lower-cased user agent to the first
                    single-hit feed-parser visit registered for it
                    (updated in place)
    hit          -- visit dict; its 'feed_parser' key is always written

    A visit judged to be a feed parser whose viewed + not viewed hits add up
    to exactly one is registered under its user agent. If that user agent is
    already registered, the visit is tagged as not being a feed parser.
    """
    verdict = isFeedParser
    # Hit counters are only read when the visit is a feed parser candidate.
    single_hit = verdict and hit['viewed_hits'] + hit['not_viewed_hits'] == 1
    if single_hit:
        agent = hit['requests'][0]['http_user_agent'].lower()
        if one_hit_only.get(agent) is not None:
            # Another single-hit visit already holds this user agent.
            verdict = False
        else:
            one_hit_only[agent] = hit
    hit['feed_parser'] = verdict
def hook(self):
# Walks every current visit and decides whether it is a feed parser; the
# verdict ends up in hit['feed_parser'].
# NOTE(review): this listing has no indentation and appears to interleave
# the previous and the new version of several statements (two tests of
# 'feed_parser', two writes of the flag) -- confirm statement order and
# nesting against the real file.
hits = self.iwla.getCurrentVisists()
# Shared across the loop: lower-cased user agent -> first single-hit visit
# registered by mergeOneHitOnlyFeedsParsers.
one_hit_only = {}
for hit in hits.values():
if not hit.get('feed_parser', None) is None: continue
# None here means the visit has not been tagged yet.
isFeedParser = hit.get('feed_parser', None)
if isFeedParser == True:
# NOTE(review): mergeOneHitOnlyFeedsParsers is declared with
# (isFeedParser, one_hit_only, hit) but only two arguments are passed
# here, which would raise TypeError -- presumably isFeedParser is missing
# as first argument; confirm.
self.mergeOneHitOnlyFeedsParsers(one_hit_only, hit)
# Visits that already carry a verdict are not analysed again.
if not isFeedParser is None: continue
isFeedParser = False
# Only the URI of the first request of the visit is tested, lower-cased,
# against each entry of self.feeds_re.
uri = hit['requests'][0]['extract_request']['extract_uri'].lower()
for regexp in self.feeds_re:
if regexp.match(uri):
isFeedParser = True
# Robot that views pages -> bot
if hit['robot']:
if hit['viewed_pages']: continue
isFeedParser = True
if hit['viewed_pages']:
isFeedParser = False
break
hit['feed_parser'] = isFeedParser
# Writes hit['feed_parser'], and may turn the verdict to False when the same
# user agent is already registered in one_hit_only.
self.mergeOneHitOnlyFeedsParsers(isFeedParser, one_hit_only, hit)