178 lines
		
	
	
		
			6.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			178 lines
		
	
	
		
			6.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| # -*- coding: utf-8 -*-
 | |
| #
 | |
| # Copyright Grégory Soutadé 2015
 | |
| 
 | |
| # This file is part of iwla
 | |
| 
 | |
| # iwla is free software: you can redistribute it and/or modify
 | |
| # it under the terms of the GNU General Public License as published by
 | |
| # the Free Software Foundation, either version 3 of the License, or
 | |
| # (at your option) any later version.
 | |
| #
 | |
| # iwla is distributed in the hope that it will be useful,
 | |
| # but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
| # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | |
| # GNU General Public License for more details.
 | |
| #
 | |
| # You should have received a copy of the GNU General Public License
 | |
| # along with iwla.  If not, see <http://www.gnu.org/licenses/>.
 | |
| #
 | |
| 
 | |
| import re
 | |
| 
 | |
| from iwla import IWLA
 | |
| from iplugin import IPlugin
 | |
| 
 | |
| """
 | |
| Post analysis hook
 | |
| 
 | |
| Find feed parsers (the visit's first hit must match a 'feeds' conf value and, for robots, have no viewed pages)
 | |
| If merge_feeds_parsers is set to True, merge feeds parsers with the same user agent
 | |
| as it must be the same person with a different IP address.
 | |
| 
 | |
| Warning: When merge_feeds_parsers is activated, the displayed last access date is
| the most recent date among all merged parsers found
 | |
| 
 | |
| Plugin requirements :
 | |
|     None
 | |
| 
 | |
| Conf values needed :
 | |
|     feeds
 | |
|     feeds_agents*
 | |
|     merge_feeds_parsers*
 | |
| 
 | |
| Output files :
 | |
|     None
 | |
| 
 | |
| Statistics creation :
 | |
|    remote_ip =>
 | |
|        feed_parser
 | |
|        feed_name_analysed
 | |
|        feed_parser_last_access (for merged parser)
 | |
| 
 | |
| Statistics update :
 | |
|     None
 | |
| 
 | |
| Statistics deletion :
 | |
|     None
 | |
| """
 | |
| 
 | |
class IWLAPostAnalysisFeeds(IPlugin):
    """Post analysis hook that tags visits made by feed parsers.

    A visit is considered a feed parser when its first request matches one
    of the configured 'feeds' URI regexps (or a feed-like user agent), and,
    for robots, when it viewed no page.  When merge_feeds_parsers is set,
    parsers sharing the same user agent (or matching a configured address
    pattern) are collapsed into a single entry.
    """

    # Values stored in hit['feed_parser']
    NOT_A_FEED_PARSER = 0
    FEED_PARSER = 1
    MERGED_FEED_PARSER = 2
    BAD_FEED_PARSER = 3

    def __init__(self, iwla):
        # Explicit super() form kept for Python 2 compatibility
        super(IWLAPostAnalysisFeeds, self).__init__(iwla)
        self.API_VERSION = 1
        self.conf_requires = ['feeds']

    def load(self):
        """Compile all configured regexps and reset the merge cache.

        Returns True on success, False when the 'feeds' conf value is None.
        """
        feeds = self.iwla.getConfValue('feeds', [])
        feeds_agents = self.iwla.getConfValue('feeds_agents', [])
        self.merge_feeds_parsers = self.iwla.getConfValue('merge_feeds_parsers', False)
        merge_list = self.iwla.getConfValue('merge_feeds_parsers_list', [])

        if feeds is None: return False

        # URI patterns identifying feed requests
        self.feeds_re = [re.compile(pattern) for pattern in feeds]

        # Reverse-DNS names matching these are crawlers, not feed parsers
        self.bad_feeds_re = [re.compile(r'.*crawl.*')]

        # User agent patterns identifying feed readers (builtin + configured)
        self.user_agents_re = [re.compile(r'.*rss.*'),
                               re.compile(r'.*atom.*'),
                               re.compile(r'.*feed.*')]
        self.user_agents_re += [re.compile(pattern) for pattern in feeds_agents]

        # Address patterns whose matching parsers are merged together
        self.merge_feeds_parsers_list = [re.compile(pattern) for pattern in merge_list]

        # key (regexp object or lowercased user agent) -> representative hit
        self.merged_feeds = {}

        return True

    def _appendToMergeCache(self, isFeedParser, key, hit):
        """Record *hit* in the merge cache under *key*.

        First hit for a key becomes the representative.  A later hit from a
        different IP is demoted to NOT_A_FEED_PARSER while the representative
        is promoted to MERGED_FEED_PARSER, its feed_parser_last_access being
        the most recent last_access of the two.
        """
        cached = self.merged_feeds.get(key, None)
        if cached is None:
            # First parser seen for this key: remember it as representative
            self.merged_feeds[key] = hit
        elif hit['remote_ip'] != cached['remote_ip']:
            # Same key, different IP: drop the current hit and update the
            # representative instead
            hit['feed_parser'] = self.NOT_A_FEED_PARSER
            current_access = hit['last_access']
            isFeedParser = self.MERGED_FEED_PARSER
            hit = cached
            if cached['last_access'] < current_access:
                cached['feed_parser_last_access'] = current_access
            else:
                cached['feed_parser_last_access'] = cached['last_access']
        # Representative (or same-IP/new hit) gets the final tag
        hit['feed_parser'] = isFeedParser

    def mergeFeedsParsers(self, isFeedParser, hit):
        """Dispatch *hit* to the merge cache keyed by address pattern,
        falling back to the (lowercased) user agent of its first request."""
        if not isFeedParser:
            return
        for addr_re in self.merge_feeds_parsers_list:
            if addr_re.match(hit['remote_addr']) or addr_re.match(hit['remote_ip']):
                self._appendToMergeCache(isFeedParser, addr_re, hit)
                return
        # No configured address pattern matched: group by user agent,
        # assuming one agent string == one person behind several IPs
        user_agent = hit['requests'][0]['http_user_agent'].lower()
        self._appendToMergeCache(isFeedParser, user_agent, hit)

    def hook(self):
        """Tag every current visit, then optionally merge tagged parsers."""
        hits = self.iwla.getCurrentVisits()
        for hit in hits.values():
            isFeedParser = hit.get('feed_parser', None)

            # Hits tagged in a previous run only need re-registering
            # in the merge cache
            if self.merge_feeds_parsers and \
               isFeedParser not in (None, self.BAD_FEED_PARSER):
                self.mergeFeedsParsers(isFeedParser, hit)
                continue

            if isFeedParser:
                if hit['feed_parser'] == self.BAD_FEED_PARSER: continue
                # Once the reverse DNS name is available, check it once
                # against the crawler blacklist
                if not hit.get('feed_name_analysed', False) and \
                   hit.get('dns_name_replaced', False):
                    hit['feed_name_analysed'] = True
                    addr = hit.get('remote_addr', None)
                    if any(bad_re.match(addr) for bad_re in self.bad_feeds_re):
                        hit['feed_parser'] = self.BAD_FEED_PARSER
                continue

            # Untagged visit: analyse its first request
            first_request = hit['requests'][0]
            isFeedParser = self.NOT_A_FEED_PARSER
            uri = first_request['extract_request']['extract_uri'].lower()
            for feed_re in self.feeds_re:
                if feed_re.match(uri):
                    isFeedParser = self.FEED_PARSER
                    # A robot that also requested pages is a plain bot,
                    # not a feed parser
                    if hit['robot'] and hit['not_viewed_pages'][0]:
                        isFeedParser = self.NOT_A_FEED_PARSER
                    break

            if isFeedParser == self.NOT_A_FEED_PARSER:
                # URI did not match: fall back to a feed-like user agent
                user_agent = first_request['http_user_agent'].lower()
                if any(ua_re.match(user_agent) for ua_re in self.user_agents_re):
                    isFeedParser = self.FEED_PARSER

            if self.merge_feeds_parsers:
                self.mergeFeedsParsers(isFeedParser, hit)
            else:
                hit['feed_parser'] = isFeedParser
 |