iwla/plugins/display/referers.py

210 lines
8.0 KiB
Python
Raw Normal View History

2014-11-26 16:17:16 +01:00
import time
import re
2014-11-26 19:33:08 +01:00
import xml.sax.saxutils as saxutils
2014-11-26 16:17:16 +01:00
from iwla import IWLA
from iplugin import IPlugin
from display import *
import awstats_data
class IWLADisplayReferers(IPlugin):
    """Display plugin reporting the referers of the current visits.

    For every request that carries a referer, the referer is counted in one
    of three groups: search engines, external URLs, or external URLs seen by
    robots.  Key phrases typed into recognised search engines are counted as
    well.  The ten best entries of each report are appended to the display
    index, and the complete reports are written to dedicated pages.

    Requires the ``domain_name`` configuration value (see ``load``).
    """

    def __init__(self, iwla):
        super(IWLADisplayReferers, self).__init__(iwla)
        self.API_VERSION = 1

    def _getSearchEngine(self, hashid):
        """Return the name of the engine registered with ``hashid``.

        ``hashid`` is compared for string equality against the raw hashid
        strings stored by ``load`` (not matched as a regular expression).
        Returns None when no engine owns it.
        """
        for (name, engine) in self.search_engines.items():
            for (known_hashid, _) in engine['hashid']:
                if hashid == known_hashid:
                    return name
        return None

    def load(self):
        """Compile the patterns used by ``hook``.

        Sets ``self.own_domain_re`` and ``self.search_engines``.  The latter
        maps an engine name to a dict holding:

        * ``'hashid'``: list of ``(hashid, compiled_regex)`` tuples,
        * ``'known_url'`` (optional): regex with a ``key_phrase`` group,
        * ``'not_search_engine'`` (optional): regex of URIs to exclude.

        Returns False (after printing a message) when ``domain_name`` is not
        configured, True otherwise.
        """
        domain_name = self.iwla.getConfValue('domain_name', '')
        if not domain_name:
            # Parenthesised form prints the same text on Python 2 and 3.
            print('domain_name required in conf')
            return False

        # domain_name is interpolated as a regular expression: a '.' in it
        # matches any character.
        self.own_domain_re = re.compile(r'.*%s.*' % (domain_name))

        self.search_engines = {}
        for (hashid, name) in awstats_data.search_engines_hashid.items():
            hashid_re = re.compile(r'.*%s.*' % (hashid))
            engine = self.search_engines.setdefault(name, {'hashid': []})
            engine['hashid'].append((hashid, hashid_re))

        for (name, known_url) in awstats_data.search_engines_knwown_url.items():
            engine = self.search_engines.get(name)
            if engine is None:
                # A known URL without any hashid can never be reached by
                # hook(); skip it rather than fail with a KeyError.
                continue
            engine['known_url'] = re.compile(known_url + '(?P<key_phrase>.+)')

        for (engine, not_engine) in awstats_data.not_search_engines_keys.items():
            not_engine_re = re.compile(r'.*%s.*' % (not_engine))
            key = self._getSearchEngine(engine)
            if key:
                self.search_engines[key]['not_search_engine'] = not_engine_re

        return True

    def _extractKeyPhrase(self, key_phrase_re, parameters, key_phrases):
        """Count the key phrase found in ``parameters`` into ``key_phrases``.

        ``parameters`` is split on '&'; the first piece matched by
        ``key_phrase_re`` provides the phrase ('+' replaced by a space,
        lower-cased, XML entities unescaped).  ``key_phrases`` (phrase ->
        count) is updated in place.  Nothing happens when ``parameters`` or
        ``key_phrase_re`` is empty/None or when no piece matches.
        """
        if not parameters or not key_phrase_re:
            return

        for parameter in parameters.split('&'):
            groups = key_phrase_re.match(parameter)
            if not groups:
                continue
            key_phrase = groups.groupdict()['key_phrase']
            key_phrase = key_phrase.replace('+', ' ').lower()
            key_phrase = saxutils.unescape(key_phrase)
            key_phrases[key_phrase] = key_phrases.get(key_phrase, 0) + 1
            # Only the first matching parameter is taken into account.
            break

    def _findSearchEngine(self, uri):
        """Return ``(name, engine)`` of the first engine recognising ``uri``.

        An engine whose ``'not_search_engine'`` pattern matches ``uri`` is
        skipped and the remaining engines are still tried.  Returns
        ``(None, None)`` when no engine recognises the URI.
        """
        for (name, engine) in self.search_engines.items():
            not_engine = engine.get('not_search_engine', None)
            for (_, hashid_re) in engine['hashid']:
                if not hashid_re.match(uri):
                    continue
                if not_engine and not_engine.match(uri):
                    # Excluded for this engine: give up on its other hashids.
                    break
                # Stop at the first match so that a referer is attributed to
                # exactly one engine and its key phrase is counted once.
                return (name, engine)
        return (None, None)

    @staticmethod
    def _sortByPages(counters):
        """Return the keys of ``counters`` ordered by decreasing 'pages'."""
        return sorted(counters, key=lambda uri: counters[uri]['pages'], reverse=True)

    @staticmethod
    def _buildReferersTable(sections, limit=None):
        """Build the 'Connexion from' table.

        ``sections`` is a list of ``(heading, counters, ordered_keys)``;
        at most ``limit`` keys per section are listed (all when None).
        """
        table = DisplayHTMLBlockTable('Connexion from', ['Origin', 'Pages', 'Hits'])
        for (heading, counters, ordered_keys) in sections:
            table.appendRow(['<b>%s</b>' % (heading), '', ''])
            for uri in ordered_keys[:limit]:
                # Referers are supplied by the client: escape them before
                # they end up in the generated HTML.
                row = [saxutils.escape(uri), counters[uri]['pages'], counters[uri]['hits']]
                table.appendRow(row)
        return table

    @staticmethod
    def _buildKeyPhrasesTable(top_key_phrases, limit=None):
        """Build the 'Top key phrases' table from ``(phrase, count)`` pairs."""
        table = DisplayHTMLBlockTable('Top key phrases', ['Key phrase', 'Search'])
        for (phrase, count) in top_key_phrases[:limit]:
            # Key phrases come from the referer query string: escape them.
            table.appendRow([saxutils.escape(phrase), count])
        return table

    def _publishPage(self, display, index, title_format, filename_format, table, link_text):
        """Add a page holding ``table`` for the current month and link it from ``index``."""
        cur_time = self.iwla.getCurTime()
        title = time.strftime(title_format, cur_time)
        filename = filename_format % (cur_time.tm_mon)
        path = '%d/%s' % (cur_time.tm_year, filename)

        page = DisplayHTMLPage(title, path)
        page.appendBlock(table)
        display.addPage(page)

        block = DisplayHTMLRawBlock()
        block.setRawHTML('<a href=\'%s\'>%s</a>' % (filename, link_text))
        index.appendBlock(block)

    def hook(self, iwla):
        """Compute referer and key phrase statistics and publish them.

        Reads the visits from ``iwla.getCurrentVisists()``.  Requests without
        a referer, or whose referer matches ``self.own_domain_re``, are
        ignored.  The other ones are counted (as 'pages' or 'hits' depending
        on ``r['is_page']``) under the search engine name when the referer is
        recognised as a search engine, otherwise under the referer URI, in
        the robot group when the visit is flagged ``'robot'``.
        """
        stats = iwla.getCurrentVisists()
        referers = {}
        robots_referers = {}
        search_engine_referers = {}
        key_phrases = {}

        for (k, super_hit) in stats.items():
            for r in super_hit['requests']:
                if not r['http_referer']:
                    continue

                uri = r['extract_referer']['extract_uri']
                if self.own_domain_re.match(uri):
                    continue

                (name, engine) = self._findSearchEngine(uri)
                if engine is not None:
                    # Search engines are reported by name, not by URI.
                    uri = name
                    parameters = r['extract_referer'].get('extract_parameters', None)
                    key_phrase_re = engine.get('known_url', None)
                    self._extractKeyPhrase(key_phrase_re, parameters, key_phrases)
                    dictionary = search_engine_referers
                elif super_hit['robot']:
                    dictionary = robots_referers
                else:
                    dictionary = referers

                if r['is_page']:
                    key = 'pages'
                else:
                    key = 'hits'
                if not uri in dictionary:
                    dictionary[uri] = {'pages': 0, 'hits': 0}
                dictionary[uri][key] += 1

        sections = [
            ('Search Engine', search_engine_referers, self._sortByPages(search_engine_referers)),
            ('External URL', referers, self._sortByPages(referers)),
            ('External URL (robot)', robots_referers, self._sortByPages(robots_referers)),
        ]
        top_key_phrases = sorted(key_phrases.items(), key=lambda t: t[1], reverse=True)

        index = self.iwla.getDisplayIndex()
        display = self.iwla.getDisplay()

        # Top referers in index
        index.appendBlock(self._buildReferersTable(sections, 10))

        # All referers in a file
        self._publishPage(display, index, 'Connexion from - %B %Y', 'referers_%d.html',
                          self._buildReferersTable(sections), 'All referers')

        # Top key phrases in index
        index.appendBlock(self._buildKeyPhrasesTable(top_key_phrases, 10))

        # All key phrases in a file
        self._publishPage(display, index, 'Key Phrases - %B %Y', 'key_phrases_%d.html',
                          self._buildKeyPhrasesTable(top_key_phrases), 'All key phrases')