# -*- coding: utf-8 -*-
#
# Copyright Grégory Soutadé 2015
#
# This file is part of iwla
#
# iwla is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# iwla is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with iwla. If not, see <http://www.gnu.org/licenses/>.
#

import re

from iwla import IWLA
from iplugin import IPlugin

"""
Post analysis hook

Find feed parsers (first hit matching a feeds conf value, and no other
pages viewed if it's a robot).
If merge_feeds_parsers is set to True, feed parsers with the same user
agent are merged, as it's most likely the same person polling from a
different IP address.

Warning : when merge_feeds_parsers is enabled, the displayed last access
date is the most recent date among all merged parsers.

Plugin requirements :
    None

Conf values needed :
    feeds
    feeds_referers*
    merge_feeds_parsers*
    merge_feeds_parsers_list*

Output files :
    None

Statistics creation :
    remote_addr =>
        feed_parser
        feed_parser_last_access
    feed_name_analysed

Statistics update :
    None

Statistics deletion :
    None
"""

class IWLAPostAnalysisFeeds(IPlugin):
    NOT_A_FEED_PARSER = 0
    FEED_PARSER = 1
    MERGED_FEED_PARSER = 2
    BAD_FEED_PARSER = 3

    def __init__(self, iwla):
        super(IWLAPostAnalysisFeeds, self).__init__(iwla)
        self.API_VERSION = 1
        self.conf_requires = ['feeds']

    def load(self):
        feeds = self.iwla.getConfValue('feeds', [])
        feeds_referers = self.iwla.getConfValue('feeds_referers', [])
        self.merge_feeds_parsers = self.iwla.getConfValue('merge_feeds_parsers', False)
        _merge_feeds_parsers_list = self.iwla.getConfValue('merge_feeds_parsers_list', [])

        if feeds is None:
            return False

        self.feeds_re = []
        for f in feeds:
            self.feeds_re.append(re.compile(f))

        self.bad_feeds_re = []
        self.bad_feeds_re.append(re.compile(r'.*crawl.*'))

        self.user_agents_re = []
        self.user_agents_re.append(re.compile(r'.*rss.*'))
        self.user_agents_re.append(re.compile(r'.*atom.*'))
        self.user_agents_re.append(re.compile(r'.*feed.*'))

        self.referers_uri = []
        for f in feeds_referers:
            self.referers_uri.append(f)

        self.merge_feeds_parsers_list = []
        for f in _merge_feeds_parsers_list:
            self.merge_feeds_parsers_list.append(re.compile(f))

        self.merged_feeds = {}

        return True

    def _appendToMergeCache(self, isFeedParser, key, hit):
        # First time, register into dict
        if self.merged_feeds.get(key, None) is None:
            # Merged
            self.merged_feeds[key] = hit
        elif hit['remote_ip'] != self.merged_feeds[key]['remote_ip']:
            # Next time
            # Current must be ignored
            hit['feed_parser'] = self.NOT_A_FEED_PARSER
            last_access = hit['last_access']
            # Previous matched hit must be set as merged
            isFeedParser = self.MERGED_FEED_PARSER
            hit = self.merged_feeds[key]
            # Keep the most recent access date for display
            if hit['last_access'] < last_access:
                hit['feed_parser_last_access'] = last_access
            else:
                hit['feed_parser_last_access'] = hit['last_access']
        hit['feed_parser'] = isFeedParser

    def mergeFeedsParsers(self, isFeedParser, hit):
        if isFeedParser:
            for r in self.merge_feeds_parsers_list:
                if r.match(hit['remote_addr']) or r.match(hit['remote_ip']):
                    self._appendToMergeCache(isFeedParser, r, hit)
                    return
        #print("No match for %s : %d" % (hit['remote_addr'], hit['viewed_hits'][0] + hit['not_viewed_hits'][0]))

        # Other cases, look for user agent
        user_agent = hit['requests'][0]['http_user_agent'].lower()
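        # Key the merge cache on the lowercased user agent, so a parser
        # polling from several IP addresses with the same client is
        # counted as a single feed parser.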
        self._appendToMergeCache(isFeedParser, user_agent, hit)

    def hook(self):
        hits = self.iwla.getCurrentVisits()

        for hit in hits.values():
            isFeedParser = hit.get('feed_parser', None)

            # Register already tagged feed parsers in merged_feeds
            if self.merge_feeds_parsers and\
               isFeedParser not in (None, self.BAD_FEED_PARSER):
                self.mergeFeedsParsers(isFeedParser, hit)
                continue

            if isFeedParser:
                if hit['feed_parser'] == self.BAD_FEED_PARSER:
                    continue

                # Check the reverse DNS name (only once) against bad feeds patterns
                if not hit.get('feed_name_analysed', False) and\
                   hit.get('dns_name_replaced', False):
                    hit['feed_name_analysed'] = True
                    addr = hit.get('remote_addr', None)
                    for r in self.bad_feeds_re:
                        if r.match(addr):
                            hit['feed_parser'] = self.BAD_FEED_PARSER
                            break

                continue

            request = hit['requests'][0]
            isFeedParser = self.NOT_A_FEED_PARSER
            uri = request['extract_request']['extract_uri'].lower()

            for regexp in self.feeds_re:
                if regexp.match(uri):
                    isFeedParser = self.FEED_PARSER
                    # Robot that views pages -> bot
                    if hit['robot']:
                        if hit['not_viewed_pages'][0]:
                            isFeedParser = self.NOT_A_FEED_PARSER
                    break

            if isFeedParser == self.NOT_A_FEED_PARSER:
                user_agent = request['http_user_agent'].lower()
                for regexp in self.user_agents_re:
                    if regexp.match(user_agent):
                        isFeedParser = self.FEED_PARSER
                        break

            if isFeedParser == self.NOT_A_FEED_PARSER and\
               request.get('extract_referer', False):
                referer = request['extract_referer']['extract_uri'].lower()
                for uri in self.referers_uri:
                    if referer == uri:
                        isFeedParser = self.FEED_PARSER
                        break

            if self.merge_feeds_parsers:
                self.mergeFeedsParsers(isFeedParser, hit)
            else:
                hit['feed_parser'] = isFeedParser
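
# A minimal sketch of the conf values this plugin reads (the option names
# come from load() above; the feed patterns, referer URL and address regexp
# below are hypothetical and must be adapted to the monitored site). Note
# that 'feeds' patterns are matched against the lowercased request URI and
# 'feeds_referers' entries are compared for exact equality with the
# lowercased referer URI:
#
#   feeds = [r'.*/feed(/.*)?', r'.*\.rss.*']
#   feeds_referers = ['https://www.feedly.com/']
#   merge_feeds_parsers = True
#   # Merge parsers whose remote address matches one of these regexps
#   merge_feeds_parsers_list = [r'.*\.isp\.example\.net']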