# -*- coding: utf-8 -*-
#
# Copyright Grégory Soutadé 2015
#
# This file is part of iwla
#
# iwla is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# iwla is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with iwla. If not, see <http://www.gnu.org/licenses/>.
#

import re

from iwla import IWLA
from iplugin import IPlugin

"""
Post analysis hook

Find feeds parsers (first hit in feeds conf value and no viewed pages if it's a robot)

Plugin requirements :
    None

Conf values needed :
    feeds

Output files :
    None

Statistics creation :
    remote_addr =>
        feed_parser

Statistics update :
    None

Statistics deletion :
    None
"""


class IWLAPostAnalysisFeeds(IPlugin):
    """Post analysis plugin that flags visits made by feed parsers.

    Each visit that has not been classified yet gets a boolean
    ``feed_parser`` entry, computed from the URI of its first request.
    """

    def __init__(self, iwla):
        """Initialize the plugin and declare the API version it targets."""
        super(IWLAPostAnalysisFeeds, self).__init__(iwla)
        self.API_VERSION = 1

    def load(self):
        """Compile the patterns listed in the ``feeds`` conf value.

        Returns:
            False when the ``feeds`` conf value is not set, True once
            ``self.feeds_re`` holds one compiled pattern per entry.
        """
        feeds = self.iwla.getConfValue('feeds', None)

        if feeds is None:
            return False

        # hook() lowercases the URI before matching, so patterns must be
        # case-insensitive: otherwise an entry containing an uppercase
        # letter could never match anything.
        self.feeds_re = [
            re.compile(r'.*%s.*' % (f), re.IGNORECASE) for f in feeds
        ]

        return True

    def hook(self):
        """Set ``feed_parser`` on every current visit that lacks it.

        A visit is a feed parser when the URI of its first request matches
        one of the compiled feed patterns, unless the visit is a robot that
        also viewed pages.
        """
        visits = self.iwla.getCurrentVisists()
        for hit in visits.values():
            # Already classified: keep the stored value
            if hit.get('feed_parser', None) is not None:
                continue

            uri = hit['requests'][0]['extract_request']['extract_uri'].lower()
            isFeedParser = any(regexp.match(uri) for regexp in self.feeds_re)

            # Robot that views pages -> bot
            # The test does not depend on which pattern matched, so it is
            # done once per visit, and only when a pattern matched.
            if isFeedParser and hit['robot'] and hit['viewed_pages']:
                isFeedParser = False

            hit['feed_parser'] = isFeedParser