iwla/plugins/pre_analysis/feeds.py
2024-10-27 09:15:39 +01:00

226 lines
8.4 KiB
Python

# -*- coding: utf-8 -*-
#
# Copyright Grégory Soutadé 2015
# This file is part of iwla
# iwla is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# iwla is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with iwla. If not, see <http://www.gnu.org/licenses/>.
#
import re
import time
from iwla import IWLA
from iplugin import IPlugin
"""
Pre analysis hook
Find feed parsers (first hit matching the "feeds" conf value, and no viewed pages if it's a robot).
If merge_feeds_parsers is set to True, feed parsers with the same user agent are merged,
as they are most likely the same reader using different IP addresses.
Warning: when merge_feeds_parsers is activated, the last access date displayed is the
most recent date among all merged parsers found.
Plugin requirements :
None
Conf values needed :
feeds
feeds_agents*
merge_feeds_parsers*
Output files :
None
Statistics creation :
remote_ip =>
feed_parser
feed_name_analyzed
feed_parser_last_access (for merged parser)
feed_domain
feed_uri
feed_subscribers
Statistics update :
None
Statistics deletion :
None
"""
class IWLAPostAnalysisFeeds(IPlugin):
    """Pre-analysis plugin tagging feed parsers (RSS/Atom readers).

    A visitor is flagged as a feed parser when it requests a URI matching
    one of the configured 'feeds' regexps (and the request has been viewed),
    or when its user agent matches a feed-reader pattern.  When the
    merge_feeds_parsers conf value is True, parsers sharing the same user
    agent (or matching merge_feeds_parsers_list) are merged into a single
    entry, as they are most likely one reader behind several IP addresses.
    """

    # Values stored in hit['feed_parser']
    NOT_A_FEED_PARSER = 0
    FEED_PARSER = 1
    MERGED_FEED_PARSER = 2
    BAD_FEED_PARSER = 3

    def __init__(self, iwla):
        super(IWLAPostAnalysisFeeds, self).__init__(iwla)
        self.API_VERSION = 1
        self.conf_requires = ['feeds']

    def load(self):
        """Read conf values and compile all regexps used by the plugin.

        Returns False (plugin disabled) when 'feeds' is explicitly set to
        None, True otherwise.
        """
        feeds = self.iwla.getConfValue('feeds', [])
        feeds_agents = self.iwla.getConfValue('feeds_agents', [])
        self.merge_feeds_parsers = self.iwla.getConfValue('merge_feeds_parsers', False)
        _merge_feeds_parsers_list = self.iwla.getConfValue('merge_feeds_parsers_list', [])
        _no_merge_feeds_parsers_list = self.iwla.getConfValue('no_merge_feeds_parsers_list', [])

        if feeds is None: return False

        # URI patterns identifying a feed resource
        self.feeds_re = [re.compile(f) for f in feeds]

        # Reverse-DNS names that must never be tagged as feed parsers
        self.bad_feeds_re = [re.compile(r'.*crawl.*')]

        # User agent patterns identifying a feed reader
        self.user_agents_re = [re.compile(r'.*rss.*'),
                               re.compile(r'.*atom.*'),
                               re.compile(r'.*feed.*')]
        for f in feeds_agents:
            self.user_agents_re.append(re.compile(f))

        # User agents matched by the patterns above that are in fact
        # not feed readers (e.g. containing 'feedback')
        self.bad_user_agents_re = [re.compile(r'.*feedback.*')]

        # Some readers advertise their subscriber count in the user agent
        self.subscribers_re = re.compile(r'.* ([0-9]+) subscriber.*')

        self.merge_feeds_parsers_list = [re.compile(f) for f in _merge_feeds_parsers_list]
        self.no_merge_feeds_parsers_list = [re.compile(f) for f in _no_merge_feeds_parsers_list]

        # merge key -> first hit registered under that key
        self.merged_feeds = {}

        return True

    def _appendToMergeCache(self, isFeedParser, key, hit):
        """Merge 'hit' into the cached hit registered under 'key'.

        The first hit seen for a key becomes the merge target; any later
        hit from a different IP address has its counters folded into the
        target (which becomes MERGED_FEED_PARSER) and is itself demoted
        to NOT_A_FEED_PARSER.
        """
        hit['feed_parser'] = isFeedParser
        if self.merged_feeds.get(key, None) is None:
            # First time: register this hit as the merge target
            self.merged_feeds[key] = hit
        elif hit['remote_ip'] != self.merged_feeds[key]['remote_ip']:
            # Next time, from another IP: current hit must be ignored,
            # its counters are accumulated into the merge target
            hit['feed_parser'] = self.NOT_A_FEED_PARSER
            merged_hit = hit
            # Previously matched hit becomes the merged parser
            hit = self.merged_feeds[key]
            hit['feed_parser'] = self.MERGED_FEED_PARSER
            hit['viewed_pages'][0] += merged_hit['viewed_pages'][0]
            hit['viewed_hits'][0] += merged_hit['viewed_hits'][0]
            hit['not_viewed_pages'][0] += merged_hit['not_viewed_pages'][0]
            hit['not_viewed_hits'][0] += merged_hit['not_viewed_hits'][0]
            # Keep the most recent access date of all merged parsers
            if hit['last_access'] < merged_hit['last_access']:
                hit['feed_parser_last_access'] = merged_hit['last_access']
            else:
                hit['feed_parser_last_access'] = hit['last_access']

    def mergeFeedsParsers(self, isFeedParser, hit):
        """Route 'hit' into the merge cache with the appropriate key.

        Hits matching no_merge_feeds_parsers_list are never merged; hits
        matching merge_feeds_parsers_list are grouped per pattern (and per
        feed, since one group can read several feeds); everything else is
        grouped by lowercased user agent.
        """
        if isFeedParser in (self.FEED_PARSER, self.MERGED_FEED_PARSER):
            for r in self.no_merge_feeds_parsers_list:
                if r.match(hit['remote_addr']) or r.match(hit['remote_ip']) or r.match(hit['requests'][0]['http_user_agent']):
                    return
            for r in self.merge_feeds_parsers_list:
                if r.match(hit['remote_addr']) or r.match(hit['remote_ip']) or r.match(hit['requests'][0]['http_user_agent']):
                    # One group can view multiple different feeds
                    key = r.pattern + hit.get('feed_domain', '') + hit.get('feed_uri', '')
                    self._appendToMergeCache(isFeedParser, key, hit)
                    return
            #print("No match for %s : %d" % (hit['remote_addr'], hit['viewed_hits'][0] + hit['not_viewed_hits'][0]))
        # Other cases, look for user agent
        user_agent = hit['requests'][0]['http_user_agent'].lower()
        self._appendToMergeCache(isFeedParser, user_agent, hit)

    def hook(self):
        """Tag each current visit as feed parser (or not) and merge them."""
        hits = self.iwla.getCurrentVisits()
        for hit in hits.values():
            isFeedParser = hit.get('feed_parser', None)

            if isFeedParser == self.NOT_A_FEED_PARSER:
                continue

            # Already tagged on a previous run
            if isFeedParser:
                # Update last access time
                if hit['last_access'] > hit.get('feed_parser_last_access', time.gmtime(0)):
                    hit['feed_parser_last_access'] = hit['last_access']
                # Register already tagged feed parser in merged_feeds
                if self.merge_feeds_parsers:
                    self.mergeFeedsParsers(isFeedParser, hit)
                continue

            request = hit['requests'][0]
            isFeedParser = self.NOT_A_FEED_PARSER

            # First criterion: a viewed request on a feed URI
            uri = request['extract_request']['extract_uri'].lower()
            for regexp in self.feeds_re:
                if regexp.match(uri) and self.iwla.hasBeenViewed(request):
                    isFeedParser = self.FEED_PARSER
                    # # Robot that views pages -> bot
                    # if hit['robot']:
                    #     if hit['not_viewed_pages'][0]:
                    #         isFeedParser = self.NOT_A_FEED_PARSER
                    break

            # Second criterion: a feed-reader-like user agent
            user_agent = request['http_user_agent'].lower()
            if isFeedParser == self.NOT_A_FEED_PARSER:
                for regexp in self.user_agents_re:
                    if regexp.match(user_agent):
                        isFeedParser = self.FEED_PARSER
                        break

            # Exclude false-positive user agents
            if isFeedParser == self.FEED_PARSER:
                for regexp in self.bad_user_agents_re:
                    if regexp.match(user_agent):
                        isFeedParser = self.NOT_A_FEED_PARSER
                        break

            # Exclude known bad reverse-DNS names (checked only once per hit)
            if isFeedParser == self.FEED_PARSER:
                if not hit.get('dns_name_replaced', False):
                    self.iwla.reverseDNS(hit)
                if not hit.get('feed_name_analyzed', False):
                    hit['feed_name_analyzed'] = True
                    # Default to '' (not None): re match on None raises TypeError
                    addr = hit.get('remote_addr', '')
                    for r in self.bad_feeds_re:
                        if r.match(addr):
                            isFeedParser = self.NOT_A_FEED_PARSER
                            break

            if isFeedParser == self.FEED_PARSER:
                hit['feed_domain'] = request['server_name']
                hit['feed_uri'] = uri
                hit['feed_subscribers'] = 0
                subscribers = self.subscribers_re.match(user_agent)
                if subscribers:
                    hit['feed_subscribers'] = int(subscribers.groups()[0])
                hit['robot'] = True

            hit['feed_parser'] = isFeedParser
            if self.merge_feeds_parsers:
                self.mergeFeedsParsers(isFeedParser, hit)