bb268114b2
Fix error: call post hook plugins even in display only mode. Don't compute unordered hits (remove past hits if they are found after the current one). Remove tags in stats diff. Don't do geolocalisation if visitor is not valid. Don't try to find search engine on robots. Update robot check rules. Add top_pages_diff plugin.
180 lines
5.9 KiB
Python
180 lines
5.9 KiB
Python
# -*- coding: utf-8 -*-
|
|
#
|
|
# Copyright Grégory Soutadé 2015
|
|
|
|
# This file is part of iwla
|
|
|
|
# iwla is free software: you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# iwla is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with iwla. If not, see <http://www.gnu.org/licenses/>.
|
|
#
|
|
|
|
import re
|
|
import urllib
|
|
|
|
from iwla import IWLA
|
|
from iplugin import IPlugin
|
|
|
|
import awstats_data
|
|
|
|
"""
|
|
Post analysis hook
|
|
|
|
Extract referers and key phrases from requests
|
|
|
|
Plugin requirements :
|
|
None
|
|
|
|
Conf values needed :
|
|
domain_name
|
|
|
|
Output files :
|
|
None
|
|
|
|
Statistics creation :
|
|
None
|
|
|
|
Statistics update :
|
|
month_stats :
|
|
referers =>
|
|
pages => count
|
|
hits => count
|
|
robots_referers =>
|
|
pages => count
|
|
hits => count
|
|
search_engine_referers =>
|
|
pages => count
|
|
hits => count
|
|
key_phrases =>
|
|
phrase => count
|
|
|
|
Statistics deletion :
|
|
None
|
|
"""
|
|
|
|
class IWLAPostAnalysisReferers(IPlugin):
    """Post analysis plugin that counts external referers and key phrases.

    For every request of the current visits that carries a referer outside
    the configured ``domain_name``, the referer is counted (as a page or as
    a hit) in one of three month_stats dictionaries: ``robots_referers``,
    ``search_engine_referers`` or ``referers``.  When the referer is a known
    search engine, the searched phrase is also counted in ``key_phrases``.
    """

    def __init__(self, iwla):
        super(IWLAPostAnalysisReferers, self).__init__(iwla)
        self.API_VERSION = 1
        self.conf_requires = ['domain_name']

    def _getSearchEngine(self, hashid):
        """Return the name of the engine registered with ``hashid``, or None."""
        for (name, engine) in self.search_engines.items():
            for (known_hashid, _) in engine['hashid']:
                if hashid == known_hashid:
                    return name
        return None

    def load(self):
        """Compile every regular expression used by hook().

        Builds ``self.own_domain_re`` from the ``domain_name`` conf value and
        ``self.search_engines`` from the tables of ``awstats_data``.

        Returns False (after printing a message) when ``domain_name`` is
        empty, True otherwise.
        """
        domain_name = self.iwla.getConfValue('domain_name', '')

        if not domain_name:
            print('domain_name must not be empty !')
            return False

        self.own_domain_re = re.compile(r'.*%s.*' % (domain_name))
        self.search_engines = {}

        # name => {'hashid': [(hashid, compiled hashid), ...]}
        for (hashid, name) in awstats_data.search_engines_hashid.items():
            hashid_re = re.compile(r'.*%s.*' % (hashid))
            engine = self.search_engines.setdefault(name, {'hashid': []})
            engine['hashid'].append((hashid, hashid_re))

        # Everything following the known URL prefix is the key phrase.
        # NOTE(review): assumes every key of search_engines_knwown_url is also
        # an engine name registered by the loop above -- confirm.
        for (name, known_url) in awstats_data.search_engines_knwown_url.items():
            self.search_engines[name]['known_url'] = re.compile(known_url + '(?P<key_phrase>.+)')

        # Exclusion patterns are indexed by hashid, engines by name
        for (engine_hashid, not_engine) in awstats_data.not_search_engines_keys.items():
            not_engine_re = re.compile(r'.*%s.*' % (not_engine))
            key = self._getSearchEngine(engine_hashid)
            if key:
                self.search_engines[key]['not_search_engine'] = not_engine_re

        return True

    def _extractKeyPhrase(self, key_phrase_re, parameters, key_phrases):
        """Count the key phrase found in ``parameters`` into ``key_phrases``.

        key_phrase_re -- compiled expression with a ``key_phrase`` group
                         (may be None)
        parameters    -- query string of the referer (may be None or empty)
        key_phrases   -- dictionary phrase => count, updated in place

        Only the first parameter that matches and can be decoded is counted.
        """
        if not parameters or not key_phrase_re: return

        for p in parameters.split('&'):
            groups = key_phrase_re.match(p)
            if not groups: continue

            key_phrase = groups.groupdict()['key_phrase']
            try:
                key_phrase = urllib.unquote_plus(key_phrase).decode('utf8')
            except Exception as e:
                # Best effort : report the failure and try next parameter
                print(e)
                continue
            key_phrases[key_phrase] = key_phrases.get(key_phrase, 0) + 1
            break

    def _findSearchEngine(self, uri):
        """Return ``(name, engine)`` of the first engine matching ``uri``.

        An engine is skipped when its ``not_search_engine`` expression also
        matches ``uri``.  Returns ``(None, None)`` when no engine matches.
        """
        for (name, engine) in self.search_engines.items():
            not_engine = engine.get('not_search_engine', None)
            for (_, hashid_re) in engine['hashid']:
                if not hashid_re.match(uri): continue
                # Try not engine : give up this engine, test next ones
                if not_engine and not_engine.match(uri): break
                return (name, engine)
        return (None, None)

    def hook(self):
        """Update referers and key phrases of month_stats from current visits.

        Requests of each visit are walked from the last one backwards until
        one is not valid for current analysis.  Requests without referer, or
        whose referer matches the own domain, are ignored.
        """
        stats = self.iwla.getCurrentVisits()
        month_stats = self.iwla.getMonthStats()

        referers = month_stats.get('referers', {})
        robots_referers = month_stats.get('robots_referers', {})
        search_engine_referers = month_stats.get('search_engine_referers', {})
        key_phrases = month_stats.get('key_phrases', {})

        for super_hit in stats.values():
            for r in super_hit['requests'][::-1]:
                if not self.iwla.isValidForCurrentAnalysis(r): break
                if not r['http_referer']: continue

                uri = r['extract_referer']['extract_uri']
                if self.own_domain_re.match(uri): continue

                if super_hit['robot']:
                    # Don't try to find search engine on robots
                    dictionary = robots_referers
                else:
                    # Lookup stops at first matching engine, so the engine
                    # name that replaces uri is never tested against the
                    # remaining engines (which could count the key phrase
                    # a second time).
                    (name, engine) = self._findSearchEngine(uri)
                    if name is not None:
                        uri = name
                        parameters = r['extract_referer'].get('extract_parameters', None)
                        key_phrase_re = engine.get('known_url', None)
                        self._extractKeyPhrase(key_phrase_re, parameters, key_phrases)
                        dictionary = search_engine_referers
                    else:
                        dictionary = referers

                if r['is_page']:
                    key = 'pages'
                else:
                    key = 'hits'
                if not uri in dictionary: dictionary[uri] = {'pages':0, 'hits':0}
                dictionary[uri][key] += 1

        month_stats['referers'] = referers
        month_stats['robots_referers'] = robots_referers
        month_stats['search_engine_referers'] = search_engine_referers
        month_stats['key_phrases'] = key_phrases
|