iwla/plugins/pre_analysis/robots.py

# -*- coding: utf-8 -*-
#
# Copyright Grégory Soutadé 2015
# This file is part of iwla
# iwla is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# iwla is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with iwla. If not, see <http://www.gnu.org/licenses/>.
#
import re
import logging
import inspect
from iwla import IWLA
from iplugin import IPlugin
import awstats_data
"""
Pre analysis hook
Filter robots
Plugin requirements :
None
Conf values needed :
page_to_hit_conf*
hit_to_page_conf*
Output files :
None
Statistics creation :
None
Statistics update :
visits :
remote_addr =>
robot
Statistics deletion :
None
"""
class IWLAPreAnalysisRobots(IPlugin):
    def __init__(self, iwla):
        super(IWLAPreAnalysisRobots, self).__init__(iwla)
        self.API_VERSION = 1

    def load(self):
        # Build one case-insensitive pattern per AWStats robot entry.
        # A list comprehension replaces the original Python 2 map()/
        # lambda form: under Python 3, map() returns a one-shot
        # iterator that would be exhausted after the first visit.
        self.awstats_robots = [re.compile('.*%s.*' % robot, re.IGNORECASE)
                               for robot in awstats_data.robots]
        self.robot_re = re.compile(r'.*bot.*', re.IGNORECASE)
        self.crawl_re = re.compile(r'.*crawl.*', re.IGNORECASE)
        self.logger = logging.getLogger(self.__class__.__name__)
        return True
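
    # Note: awstats_data.robots is assumed to hold plain substrings
    # (e.g. 'googlebot'); load() wraps each entry in a case-insensitive
    # '.*<entry>.*' pattern matched against the full user-agent string.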

    def _setRobot(self, k, super_hit):
        # Log which detection rule fired by inspecting the caller's
        # frame (function name and line number).
        callerframerecord = inspect.stack()[1]
        frame = callerframerecord[0]
        info = inspect.getframeinfo(frame)
        self.logger.debug('%s is a robot (caller %s:%d)' % (k, info.function, info.lineno))
        super_hit['robot'] = 1

    # Basic rules to detect robots
    def hook(self):
        hits = self.iwla.getCurrentVisits()
        for (k, super_hit) in hits.items():
            if super_hit['robot']:
                self.logger.debug('%s is a robot' % (k))
                continue

            isRobot = False
            referers = 0
            first_page = super_hit['requests'][0]

            if self.robot_re.match(first_page['http_user_agent']) or \
               self.crawl_re.match(first_page['http_user_agent']):
                self.logger.debug(first_page['http_user_agent'])
                self._setRobot(k, super_hit)
                continue

            for r in self.awstats_robots:
                if r.match(first_page['http_user_agent']):
                    isRobot = True
                    break

            if isRobot:
                self.logger.debug(first_page['http_user_agent'])
                self._setRobot(k, super_hit)
                continue

            # 1) no pages viewed --> robot
            # if not super_hit['viewed_pages'][0]:
            #     super_hit['robot'] = 1
            #     continue

            # 2) pages viewed but no hits --> robot
            if not super_hit['viewed_hits'][0] and super_hit['viewed_pages'][0]:
                self.logger.debug(super_hit)
                self._setRobot(k, super_hit)
                continue

            # 3) no pages and no hits --> robot
            if not super_hit['viewed_hits'][0] and not super_hit['viewed_pages'][0]:
                self._setRobot(k, super_hit)
                continue

            not_found_pages = 0
            for hit in super_hit['requests']:
                # 4) /robots.txt requested --> robot
                # (set the flag and break so the shared check below
                # marks the visit and moves on to the next one)
                if hit['extract_request']['http_uri'].endswith('/robots.txt'):
                    isRobot = True
                    break

                if int(hit['status']) in (404, 403):
                    not_found_pages += 1

                # 5) count referers on plain hits
                if not hit['is_page'] and hit['http_referer']:
                    referers += 1

            if isRobot:
                self._setRobot(k, super_hit)
                continue

            # 6) more than 10 404/403 responses --> robot
            if not_found_pages > 10:
                self._setRobot(k, super_hit)
                continue

            # 7) hits but no pages and no referer --> robot
            if not super_hit['viewed_pages'][0] and \
               (super_hit['viewed_hits'][0] and not referers):
                self._setRobot(k, super_hit)
                continue
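
# Quick, standalone sanity check of the generic user-agent patterns
# used above (illustrative only; the UA strings below are samples, not
# data taken from iwla):
#
#     import re
#     robot_re = re.compile(r'.*bot.*', re.IGNORECASE)
#     crawl_re = re.compile(r'.*crawl.*', re.IGNORECASE)
#     assert robot_re.match('Mozilla/5.0 (compatible; Googlebot/2.1)')
#     assert crawl_re.match('SomeCrawler/1.0 (+http://example.com)')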