iwla/plugins/post_analysis/referers.py

# -*- coding: utf-8 -*-
#
# Copyright Grégory Soutadé 2015
# This file is part of iwla
# iwla is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# iwla is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with iwla. If not, see <http://www.gnu.org/licenses/>.
#
import re
import urllib
from iwla import IWLA
from iplugin import IPlugin
import awstats_data
"""
Post analysis hook
Extract referers and key phrases from requests
Plugin requirements :
None
Conf values needed :
domain_name
Output files :
None
Statistics creation :
None
Statistics update :
month_stats :
referers =>
pages => count
hits => count
robots_referers =>
pages => count
hits => count
search_engine_referers =>
pages => count
hits => count
key_phrases =>
phrase => count
Statistics deletion :
None
"""
class IWLAPostAnalysisReferers(IPlugin):
def __init__(self, iwla):
super(IWLAPostAnalysisReferers, self).__init__(iwla)
self.API_VERSION = 1
self.conf_requires = ['domain_name']
def _getSearchEngine(self, hashid):
for (k, e) in self.search_engines.items():
for (h,h_re) in e['hashid']:
if hashid == h:
return k
return None
def load(self):
domain_name = self.iwla.getConfValue('domain_name', '')
if not domain_name:
            print('domain_name must not be empty!')
return False
self.own_domain_re = re.compile(r'.*%s.*' % (domain_name))
self.search_engines = {}
for (hashid, name) in awstats_data.search_engines_hashid.items():
hashid_re = re.compile(r'.*%s.*' % (hashid))
            if name not in self.search_engines:
self.search_engines[name] = {
'hashid' : [(hashid, hashid_re)]
}
else:
self.search_engines[name]['hashid'].append((hashid, hashid_re))
#print 'Hashid %s => %s' % (name, hashid)
for (name, known_url) in awstats_data.search_engines_knwown_url.items():
self.search_engines[name]['known_url'] = re.compile(known_url + '(?P<key_phrase>.+)')
for (engine, not_engine) in awstats_data.not_search_engines_keys.items():
not_engine_re = re.compile(r'.*%s.*' % (not_engine))
key = self._getSearchEngine(engine)
if key:
self.search_engines[key]['not_search_engine'] = not_engine_re
return True
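
    # Sketch of the lookup table built by load() above, with illustrative
    # values (the real hashids, engine names and URL patterns all come from
    # awstats_data):
    #   self.search_engines['google'] = {
    #       'hashid': [('google\\.', <compiled re>), ...],
    #       'known_url': <re capturing the key phrase after the query marker>,
    #       'not_search_engine': <re>,   # only set for ambiguous engines
    #   }
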
def _extractKeyPhrase(self, key_phrase_re, parameters, key_phrases):
if not parameters or not key_phrase_re: return
for p in parameters.split('&'):
groups = key_phrase_re.match(p)
if groups:
key_phrase = groups.groupdict()['key_phrase']
try:
key_phrase = urllib.unquote_plus(key_phrase).decode('utf8')
                except Exception as e:
print(e)
continue
                if key_phrase not in key_phrases:
key_phrases[key_phrase] = 1
else:
key_phrases[key_phrase] += 1
break
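
    # Example of the extraction above, assuming an engine whose 'known_url'
    # pattern from awstats_data ends at a 'q=' query marker (illustrative):
    # for the referer parameters 'hl=en&q=free+software', the regex captures
    # 'free+software', which is unquoted to 'free software' and counted in
    # key_phrases.
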
def hook(self):
stats = self.iwla.getCurrentVisits()
month_stats = self.iwla.getMonthStats()
referers = month_stats.get('referers', {})
robots_referers = month_stats.get('robots_referers', {})
search_engine_referers = month_stats.get('search_engine_referers', {})
key_phrases = month_stats.get('key_phrases', {})
for (k, super_hit) in stats.items():
for r in super_hit['requests'][::-1]:
if not self.iwla.isValidForCurrentAnalysis(r): break
if not r['http_referer']: continue
uri = r['extract_referer']['extract_uri']
if self.own_domain_re.match(uri): continue
if super_hit['robot']:
dictionary = robots_referers
# print '%s => %s' % (uri, super_hit['remote_ip'])
else:
is_search_engine = False
for (name, engine) in self.search_engines.items():
for (hashid, hashid_re) in engine['hashid']:
if not hashid_re.match(uri): continue
not_engine = engine.get('not_search_engine', None)
# Try not engine
if not_engine and not_engine.match(uri): break
is_search_engine = True
uri = name
parameters = r['extract_referer'].get('extract_parameters', None)
key_phrase_re = engine.get('known_url', None)
self._extractKeyPhrase(key_phrase_re, parameters, key_phrases)
break
if is_search_engine:
dictionary = search_engine_referers
else:
dictionary = referers
if r['is_page']:
key = 'pages'
else:
key = 'hits'
                if uri not in dictionary: dictionary[uri] = {'pages':0, 'hits':0}
dictionary[uri][key] += 1
month_stats['referers'] = referers
month_stats['robots_referers'] = robots_referers
month_stats['search_engine_referers'] = search_engine_referers
month_stats['key_phrases'] = key_phrases
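
# Illustrative shape of the per-visitor data consumed by hook(); the field
# names are those accessed in this file, the values are made up:
#   super_hit = {
#       'robot': False,
#       'requests': [{
#           'is_page': True,
#           'http_referer': 'http://www.example-search.com/?q=iwla',
#           'extract_referer': {
#               'extract_uri': 'www.example-search.com',
#               'extract_parameters': 'q=iwla',
#           },
#       }],
#   }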