bb268114b2
Fix error: Call post hook plugins even in display-only mode. Don't compute unordered hits (remove past hits if they are found after the current one). Remove tags in stats diff. Don't do geolocalisation if visitor is not valid. Don't try to find search engines for robots. Update robot check rules. Add top_pages_diff plugin.
154 lines
4.4 KiB
Python
# -*- coding: utf-8 -*-
#
# Copyright Grégory Soutadé 2015

# This file is part of iwla

# iwla is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# iwla is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with iwla. If not, see <http://www.gnu.org/licenses/>.
#

import re
import logging
import inspect

from iwla import IWLA
from iplugin import IPlugin

import awstats_data

"""
|
|
Pre analysis hook
|
|
|
|
Filter robots
|
|
|
|
Plugin requirements :
|
|
None
|
|
|
|
Conf values needed :
|
|
page_to_hit_conf*
|
|
hit_to_page_conf*
|
|
|
|
Output files :
|
|
None
|
|
|
|
Statistics creation :
|
|
None
|
|
|
|
Statistics update :
|
|
visits :
|
|
remote_addr =>
|
|
robot
|
|
|
|
Statistics deletion :
|
|
None
|
|
"""
|
|
|
|
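
# Illustrative sketch only (assumed layout, not taken from iwla itself): the
# minimal shape of one visit entry ("super_hit") as read and updated below.
# Only the field names appear in hook(); the sample values are invented.
#
#   visit = {
#       'robot': 0,                          # set to 1 by this plugin
#       'viewed_pages': {0: 3},              # index 0 is the counter used here
#       'viewed_hits': {0: 12},              # index 0 is the counter used here
#       'requests': [{
#           'http_user_agent': 'Mozilla/5.0 ...',
#           'http_referer': 'http://example.com/',
#           'status': '200',
#           'is_page': True,
#           'extract_request': {'http_uri': '/index.html'},
#       }],
#   }
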
class IWLAPreAnalysisRobots(IPlugin):
    def __init__(self, iwla):
        super(IWLAPreAnalysisRobots, self).__init__(iwla)
        self.API_VERSION = 1

    def load(self):
        # Compile one case-insensitive matcher per robot user agent listed in
        # awstats_data; kept as a list so it can be scanned once per visit.
        self.awstats_robots = [re.compile('.*%s.*' % robot, re.IGNORECASE)
                               for robot in awstats_data.robots]
        self.robot_re = re.compile(r'.*bot.*', re.IGNORECASE)
        self.crawl_re = re.compile(r'.*crawl.*', re.IGNORECASE)
        self.logger = logging.getLogger(self.__class__.__name__)
        return True
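
    # For illustration only (examples not from the source): the two generic
    # patterns compiled in load() are case-insensitive substring matches, so
    # user agents such as "Googlebot/2.1" or "AhrefsBot/7.0" match robot_re,
    # and "SomeCrawler/1.0" matches crawl_re. The awstats_data patterns are
    # only consulted when neither generic pattern matched.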

    def _setRobot(self, k, super_hit):
        # Identify the calling detection rule for the debug trace.
        callerframerecord = inspect.stack()[1]
        frame = callerframerecord[0]
        info = inspect.getframeinfo(frame)

        self.logger.debug('%s is a robot (caller %s:%d)' % (k, info.function, info.lineno))
        super_hit['robot'] = 1

    # Basic rule to detect robots
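    # Summary of the checks applied to each visit below (derived from the code
    # itself, not from upstream documentation):
    #   - user agent matches *bot*, *crawl* or an awstats_data.robots pattern
    #   - pages viewed without hits, or neither pages nor hits
    #   - a request ending in /robots.txt
    #   - more than 10 responses with status 404 or 403
    #   - hits only (no pages viewed) and no referer on any hit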
    def hook(self):
        hits = self.iwla.getCurrentVisits()
        for (k, super_hit) in hits.items():
            if super_hit['robot']:
                self.logger.debug('%s is a robot' % (k))
                continue

            isRobot = False
            referers = 0

            first_page = super_hit['requests'][0]

            if self.robot_re.match(first_page['http_user_agent']) or\
               self.crawl_re.match(first_page['http_user_agent']):
                self.logger.debug(first_page['http_user_agent'])
                self._setRobot(k, super_hit)
                continue

            for r in self.awstats_robots:
                if r.match(first_page['http_user_agent']):
                    isRobot = True
                    break

            if isRobot:
                self.logger.debug(first_page['http_user_agent'])
                self._setRobot(k, super_hit)
                continue

            # 1) no page viewed --> robot
            # if not super_hit['viewed_pages'][0]:
            #     super_hit['robot'] = 1
            #     continue

            # 2) pages without hit --> robot
            if not super_hit['viewed_hits'][0] and super_hit['viewed_pages'][0]:
                self._setRobot(k, super_hit)
                continue

            # 3) no pages and no hits --> robot
            if not super_hit['viewed_hits'][0] and not super_hit['viewed_pages'][0]:
                self._setRobot(k, super_hit)
                continue

            # 4) pages without hit --> robot (same condition as rule 2, which
            # already continues, so this branch is effectively unreachable)
            if not super_hit['viewed_hits'][0] and super_hit['viewed_pages'][0]:
                self.logger.debug(super_hit)
                self._setRobot(k, super_hit)
                continue

            not_found_pages = 0
            for hit in super_hit['requests']:
                # 5) /robots.txt read
                if hit['extract_request']['http_uri'].endswith('/robots.txt'):
                    self._setRobot(k, super_hit)
                    break

                if int(hit['status']) == 404 or int(hit['status']) == 403:
                    not_found_pages += 1

                # 6) Any referer for hits
                if not hit['is_page'] and hit['http_referer']:
                    referers += 1

            if isRobot:
                self._setRobot(k, super_hit)
                continue

            # 7) more than 10 404/403 pages
            if not_found_pages > 10:
                self._setRobot(k, super_hit)
                continue

            # 8) hits but no pages viewed and no referer --> robot
            if not super_hit['viewed_pages'][0] and \
               (super_hit['viewed_hits'][0] and not referers):
                self._setRobot(k, super_hit)
                continue