# -*- coding: utf-8 -*-
#
# Copyright Grégory Soutadé 2015
# This file is part of iwla
# iwla is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# iwla is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with iwla. If not, see <http://www.gnu.org/licenses/>.
#

import re
import time

from iwla import IWLA
from iplugin import IPlugin

"""
Pre analysis hook

Find feeds parsers (first hit in feeds conf value and no viewed pages
if it's a robot)

If merge_feeds_parsers is set to True, merge feeds parsers with the same
user agent as it must be the same person with a different IP address.

Warning : When merge_feeds_parsers is activated, last access display date
is the more recent date of all merged parsers found

Plugin requirements :
    None

Conf values needed :
    feeds
    feeds_agents*
    merge_feeds_parsers*

Output files :
    None

Statistics creation :
    remote_ip =>
        feed_parser
        feed_name_analyzed
        feed_parser_last_access (for merged parser)
        feed_domain
        feed_uri
        feed_subscribers

Statistics update :
    None

Statistics deletion :
    None
"""


class IWLAPostAnalysisFeeds(IPlugin):
    """Tag visits that come from feed readers (RSS/Atom parsers).

    A visit is flagged by matching its request URI against the ``feeds``
    configured patterns and/or its user agent against built-in and
    configured patterns.  Optionally, parsers hitting the same feed from
    several IP addresses are merged into a single entry.
    """

    # Classification values stored in hit['feed_parser']:
    NOT_A_FEED_PARSER = 0     # definitely not a feed reader
    FEED_PARSER = 1           # recognized feed reader
    MERGED_FEED_PARSER = 2    # feed reader holding counters of merged peers
    BAD_FEED_PARSER = 3       # declared but unused in this file

    def __init__(self, iwla):
        super(IWLAPostAnalysisFeeds, self).__init__(iwla)
        self.API_VERSION = 1
        # 'feeds' must be present in the configuration for this plugin to load.
        self.conf_requires = ['feeds']

    def load(self):
        """Compile all configuration patterns and reset the merge cache.

        Returns True on success, False when the 'feeds' conf value is
        explicitly set to None.
        """
        feeds = self.iwla.getConfValue('feeds', [])
        feeds_agents = self.iwla.getConfValue('feeds_agents', [])
        self.merge_feeds_parsers = self.iwla.getConfValue('merge_feeds_parsers', False)
        _merge_feeds_parsers_list = self.iwla.getConfValue('merge_feeds_parsers_list', [])
        _no_merge_feeds_parsers_list = self.iwla.getConfValue('no_merge_feeds_parsers_list', [])

        if feeds is None:
            return False

        # URI patterns identifying a feed resource.
        self.feeds_re = []
        for f in feeds:
            self.feeds_re.append(re.compile(f))

        # Reverse-DNS names that disqualify a candidate (crawlers).
        self.bad_feeds_re = []
        self.bad_feeds_re.append(re.compile(r'.*crawl.*'))

        # User agents that identify a feed reader (built-ins + conf additions).
        self.user_agents_re = []
        self.user_agents_re.append(re.compile(r'.*rss.*'))
        self.user_agents_re.append(re.compile(r'.*atom.*'))
        self.user_agents_re.append(re.compile(r'.*feed.*'))
        for f in feeds_agents:
            self.user_agents_re.append(re.compile(f))

        # User agents that look like feed readers but are not
        # (e.g. "feedback" matches the ".*feed.*" pattern above).
        self.bad_user_agents_re = []
        self.bad_user_agents_re.append(re.compile(r'.*feedback.*'))

        # Extracts the subscriber count some aggregators advertise in
        # their user agent, e.g. "... 12 subscribers ...".
        self.subscribers_re = re.compile(r'.* ([0-9]+) subscriber.*')

        # Patterns forcing (resp. forbidding) the merge of matching parsers.
        self.merge_feeds_parsers_list = []
        for f in _merge_feeds_parsers_list:
            self.merge_feeds_parsers_list.append(re.compile(f))

        self.no_merge_feeds_parsers_list = []
        for f in _no_merge_feeds_parsers_list:
            self.no_merge_feeds_parsers_list.append(re.compile(f))

        # merge key -> first hit seen for that key (accumulates counters).
        self.merged_feeds = {}

        return True

    def _appendToMergeCache(self, isFeedParser, key, hit):
        """Register *hit* under *key*, merging counters with a previous hit.

        The first hit seen for a key becomes the cache entry.  Any later
        hit with a different remote_ip is demoted to NOT_A_FEED_PARSER and
        its page/hit counters are folded into the cached entry, which is
        promoted to MERGED_FEED_PARSER.  The cached entry's
        feed_parser_last_access is kept at the most recent last_access of
        all merged hits.
        """
        hit['feed_parser'] = isFeedParser
        # First time, register into dict
        if self.merged_feeds.get(key, None) is None:
            # Merged
            self.merged_feeds[key] = hit
        elif hit['remote_ip'] != self.merged_feeds[key]['remote_ip']:
            # Next time
            # Current must be ignored
            hit['feed_parser'] = self.NOT_A_FEED_PARSER
            merged_hit = hit
            last_access = hit['last_access']
            # Previous matched hit must be set as merged
            hit = self.merged_feeds[key]
            hit['feed_parser'] = self.MERGED_FEED_PARSER
            # Counters are 1-element lists; index 0 holds the value.
            hit['viewed_pages'][0] += merged_hit['viewed_pages'][0]
            hit['viewed_hits'][0] += merged_hit['viewed_hits'][0]
            hit['not_viewed_pages'][0] += merged_hit['not_viewed_pages'][0]
            hit['not_viewed_hits'][0] += merged_hit['not_viewed_hits'][0]
            # Keep the most recent access date of the merged group.
            if hit['last_access'] < merged_hit['last_access']:
                hit['feed_parser_last_access'] = merged_hit['last_access']
            else:
                hit['feed_parser_last_access'] = hit['last_access']

    def mergeFeedsParsers(self, isFeedParser, hit):
        """Choose a merge key for *hit* and feed it to the merge cache.

        Explicit no-merge patterns win over merge patterns; a merge-list
        match groups by pattern + feed location, otherwise hits are
        grouped by lowercased user agent.
        """
        if isFeedParser in (self.FEED_PARSER, self.MERGED_FEED_PARSER):
            # Explicitly excluded from merging: leave the hit untouched.
            for r in self.no_merge_feeds_parsers_list:
                if r.match(hit['remote_addr']) or r.match(hit['remote_ip']) or r.match(hit['requests'][0]['http_user_agent']):
                    return
            for r in self.merge_feeds_parsers_list:
                if r.match(hit['remote_addr']) or r.match(hit['remote_ip']) or r.match(hit['requests'][0]['http_user_agent']):
                    # One group can view multiple different feeds
                    key = r.pattern + hit.get('feed_domain', '') + hit.get('feed_uri', '')
                    self._appendToMergeCache(isFeedParser, key, hit)
                    return
        #print("No match for %s : %d" % (hit['remote_addr'], hit['viewed_hits'][0] + hit['not_viewed_hits'][0]))
        # Other cases, look for user agent
        user_agent = hit['requests'][0]['http_user_agent'].lower()
        self._appendToMergeCache(isFeedParser, user_agent, hit)

    def hook(self):
        """Classify every current visit as feed parser or not.

        Already tagged hits only get their last-access date refreshed (and
        are re-registered in the merge cache).  New hits are matched by
        feed URI, then by user agent, then filtered by bad user agents and
        bad reverse-DNS names before being tagged.
        """
        hits = self.iwla.getCurrentVisits()
        for hit in hits.values():
            isFeedParser = hit.get('feed_parser', None)
            if isFeedParser == self.NOT_A_FEED_PARSER:
                continue

            # Second time
            if isFeedParser:
                # Update last access time
                # (time.gmtime(0) == epoch, so any real date is newer;
                # assumes last_access is comparable with struct_time — it
                # is compared the same way elsewhere in this method).
                if hit['last_access'] > hit.get('feed_parser_last_access', time.gmtime(0)):
                    hit['feed_parser_last_access'] = hit['last_access']
                # Register already tagged feed parser in merged_feeds
                if self.merge_feeds_parsers:
                    self.mergeFeedsParsers(isFeedParser, hit)
                continue

            request = hit['requests'][0]
            isFeedParser = self.NOT_A_FEED_PARSER
            uri = request['extract_request']['extract_uri'].lower()

            # 1) Does the first request target a configured feed URI?
            for regexp in self.feeds_re:
                if regexp.match(uri) and self.iwla.hasBeenViewed(request):
                    isFeedParser = self.FEED_PARSER
                    # # Robot that views pages -> bot
                    # if hit['robot']:
                    #     if hit['not_viewed_pages'][0]:
                    #         isFeedParser = self.NOT_A_FEED_PARSER
                    break

            user_agent = request['http_user_agent'].lower()

            # 2) Otherwise, does the user agent look like a feed reader?
            if isFeedParser == self.NOT_A_FEED_PARSER:
                for regexp in self.user_agents_re:
                    if regexp.match(user_agent):
                        isFeedParser = self.FEED_PARSER
                        break

            # 3) Drop false positives from the bad user-agent list.
            if isFeedParser == self.FEED_PARSER:
                for regexp in self.bad_user_agents_re:
                    if regexp.match(user_agent):
                        isFeedParser = self.NOT_A_FEED_PARSER
                        break

            # 4) Drop candidates whose reverse-DNS name matches a bad
            #    pattern (done once per hit, flagged by feed_name_analyzed).
            if isFeedParser == self.FEED_PARSER:
                if not hit.get('dns_name_replaced', False):
                    self.iwla.reverseDNS(hit)
                if not hit.get('feed_name_analyzed', False):
                    hit['feed_name_analyzed'] = True
                    # NOTE(review): assumes reverseDNS() always sets
                    # remote_addr — r.match(None) would raise TypeError.
                    addr = hit.get('remote_addr', None)
                    for r in self.bad_feeds_re:
                        if r.match(addr):
                            isFeedParser = self.NOT_A_FEED_PARSER
                            break

            # Confirmed parser: record the feed location and the
            # subscriber count when the user agent advertises one.
            if isFeedParser == self.FEED_PARSER:
                hit['feed_domain'] = request['server_name']
                hit['feed_uri'] = uri
                hit['feed_subscribers'] = 0
                subscribers = self.subscribers_re.match(user_agent)
                if subscribers:
                    hit['feed_subscribers'] = int(subscribers.groups()[0])
                # Feed parsers are counted as robots, not human visitors.
                hit['robot'] = True

            hit['feed_parser'] = isFeedParser
            if self.merge_feeds_parsers:
                self.mergeFeedsParsers(isFeedParser, hit)