import re
import urllib

from iwla import IWLA
from iplugin import IPlugin

import awstats_data

#
# Post analysis hook
#
# Extract referers and key phrases from requests
#
# Plugin requirements:
#    None
#
# Conf values needed:
#    domain_name
#
# Output files:
#    None
#
# Statistics creation:
#    None
#
# Statistics update:
#    month_stats:
#        referers =>
#            pages
#            hits
#        robots_referers =>
#            pages
#            hits
#        search_engine_referers =>
#            pages
#            hits
#        key_phrases =>
#            phrase
#
# Statistics deletion:
#    None
#
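
# As an illustration (values made up), after this hook has run the month
# statistics could contain entries such as:
#
#   month_stats['search_engine_referers']['Google'] = {'pages': 4, 'hits': 9}
#   month_stats['referers']['example.org/blog'] = {'pages': 1, 'hits': 3}
#   month_stats['key_phrases'][u'web log analyzer'] = 2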

class IWLAPostAnalysisReferers(IPlugin):
    def __init__(self, iwla):
        super(IWLAPostAnalysisReferers, self).__init__(iwla)
        self.API_VERSION = 1
        self.conf_requires = ['domain_name']

    def _getSearchEngine(self, hashid):
        # Return the name of the search engine that declares this hashid,
        # or None if it is unknown
        for (name, engine) in self.search_engines.items():
            for (h, h_re) in engine['hashid']:
                if hashid == h:
                    return name
        return None

    def load(self):
        domain_name = self.iwla.getConfValue('domain_name', '')

        if not domain_name:
            print 'domain_name must not be empty!'
            return False

        # Referers matching our own domain are internal navigation and
        # must not be counted as external referers
        self.own_domain_re = re.compile(r'.*%s.*' % (domain_name))
        self.search_engines = {}

        # Group the AWStats hashid patterns by search engine name
        for (hashid, name) in awstats_data.search_engines_hashid.items():
            hashid_re = re.compile(r'.*%s.*' % (hashid))
            if name not in self.search_engines:
                self.search_engines[name] = {
                    'hashid' : [(hashid, hashid_re)]
                }
            else:
                self.search_engines[name]['hashid'].append((hashid, hashid_re))
            #print 'Hashid %s => %s' % (name, hashid)

        # Known URL patterns capture the query parameter that carries the
        # search terms (key phrase)
        for (name, known_url) in awstats_data.search_engines_knwown_url.items():
            self.search_engines[name]['known_url'] = re.compile(known_url + '(?P<key_phrase>.+)')

        # Some referers only look like a search engine: keep a regexp
        # that rules them out
        for (engine, not_engine) in awstats_data.not_search_engines_keys.items():
            not_engine_re = re.compile(r'.*%s.*' % (not_engine))
            key = self._getSearchEngine(engine)
            if key:
                self.search_engines[key]['not_search_engine'] = not_engine_re

        return True
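
    # After load(), each self.search_engines entry looks roughly like this
    # (patterns come from awstats_data; the values shown are illustrative):
    #
    #   self.search_engines['Google'] = {
    #       'hashid' : [('google', <compiled re>), ...],
    #       'known_url' : <compiled re capturing 'key_phrase'>,
    #       'not_search_engine' : <compiled re>,  # optional
    #   }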

    def _extractKeyPhrase(self, key_phrase_re, parameters, key_phrases):
        if not parameters or not key_phrase_re: return

        # Look for the query parameter that carries the search terms
        for p in parameters.split('&'):
            match = key_phrase_re.match(p)
            if match:
                key_phrase = match.groupdict()['key_phrase']
                # Key phrases are URL encoded ('+' for spaces, %XX escapes)
                key_phrase = urllib.unquote_plus(key_phrase).decode('utf8')
                if key_phrase not in key_phrases:
                    key_phrases[key_phrase] = 1
                else:
                    key_phrases[key_phrase] += 1
                break
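
    # For example (hypothetical values): with a referer such as
    # http://www.google.com/search?q=web+log+analyzer and a known_url
    # regexp of 'q=(?P<key_phrase>.+)', the parameter 'q=web+log+analyzer'
    # yields the key phrase u'web log analyzer'.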

    def hook(self):
        stats = self.iwla.getCurrentVisists()
        month_stats = self.iwla.getMonthStats()

        referers = month_stats.get('referers', {})
        robots_referers = month_stats.get('robots_referers', {})
        search_engine_referers = month_stats.get('search_engine_referers', {})
        key_phrases = month_stats.get('key_phrases', {})

        for (k, super_hit) in stats.items():
            for r in super_hit['requests']:
                if not self.iwla.isValidForCurrentAnalysis(r): continue
                if not r['http_referer']: continue

                uri = r['extract_referer']['extract_uri']
                is_search_engine = False

                # Skip internal referers (visitors navigating our own site)
                if self.own_domain_re.match(uri): continue

                for (name, engine) in self.search_engines.items():
                    for (hashid, hashid_re) in engine['hashid']:
                        if not hashid_re.match(uri): continue

                        # Rule out referers that only look like this engine
                        not_engine = engine.get('not_search_engine', None)
                        if not_engine and not_engine.match(uri): break

                        is_search_engine = True
                        uri = name

                        parameters = r['extract_referer'].get('extract_parameters', None)
                        key_phrase_re = engine.get('known_url', None)

                        self._extractKeyPhrase(key_phrase_re, parameters, key_phrases)
                        break

                if is_search_engine:
                    dictionary = search_engine_referers
                elif super_hit['robot']:
                    dictionary = robots_referers
                    # print '%s => %s' % (uri, super_hit['remote_ip'])
                else:
                    dictionary = referers

                if r['is_page']:
                    key = 'pages'
                else:
                    key = 'hits'

                if uri not in dictionary:
                    dictionary[uri] = {'pages':0, 'hits':0}
                dictionary[uri][key] += 1

        month_stats['referers'] = referers
        month_stats['robots_referers'] = robots_referers
        month_stats['search_engine_referers'] = search_engine_referers
        month_stats['key_phrases'] = key_phrases
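
# To enable this plugin, its module name is typically added to the
# post_analysis_hooks list of the IWLA configuration (assumed setup):
#
#   post_analysis_hooks = ['referers']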