import time
import re
import HTMLParser

from iwla import IWLA
from iplugin import IPlugin
from display import *

import awstats_data


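# Display plugin: classifies HTTP referers into search engines, robots
# and external sites, extracts searched key phrases, and renders the
# 'Connexion from' and 'Top key phrases' blocks of the HTML report.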
class IWLADisplayReferers(IPlugin):

    def __init__(self, iwla):
        super(IWLADisplayReferers, self).__init__(iwla)
        self.API_VERSION = 1

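    # Return the key of the first search engine whose 'hashid' regular
    # expressions match the given engine string, or None (with a notice)
    # when nothing matches.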
    def _getSearchEngine(self, engine):
        for (k, e) in self.search_engines.items():
            for hashid in e['hashid']:
                if hashid.match(engine):
                    return k
        print 'Not found %s' % (engine)
        return None

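    # Build the search engine database from awstats_data. Each entry of
    # self.search_engines maps an engine key to a dict that may contain:
    #   'known_url'         : regex capturing the searched key phrase,
    #   'hashid'            : list of regexes identifying the engine,
    #   'not_search_engine' : regex excluding false positives,
    #   're'                : regex matched against referer URIs.
    # Entries that never receive a 're' pattern are removed at the end.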
    def load(self):
        domain_name = self.iwla.getConfValue('domain_name', '')

        if not domain_name:
            print 'domain_name required in conf'
            return False

        self.own_domain_re = re.compile('.*%s.*' % (domain_name))
        self.search_engines = {}

        for (engine, known_url) in awstats_data.search_engines_knwown_url.items():
            self.search_engines[engine] = {
                'known_url' : re.compile(known_url + '(?P<key_phrase>.+)'),
                'hashid' : []
            }

        for (hashid, engine) in awstats_data.search_engines_hashid.items():
            hashid_re = re.compile('.*%s.*' % (hashid))
            if not engine in self.search_engines.keys():
                self.search_engines[engine] = {
                    'hashid' : [hashid_re]
                }
            else:
                self.search_engines[engine]['hashid'].append(hashid_re)
            print 'Hashid %s => %s' % (engine, hashid)
2014-11-26 16:17:16 +01:00
|
|
|
for (engine, not_engine) in awstats_data.not_search_engines_keys.items():
|
2014-11-26 16:56:33 +01:00
|
|
|
not_engine_re = re.compile('.*%s.*' % (not_engine))
|
|
|
|
key = self._getSearchEngine(engine)
|
|
|
|
if key:
|
|
|
|
self.search_engines[key]['not_search_engine'] = not_engine_re
|
2014-11-26 16:17:16 +01:00
|
|
|
|
2014-11-26 16:56:33 +01:00
|
|
|
        for engine in awstats_data.search_engines:
            engine_re = re.compile('.*%s.*' % (engine), re.IGNORECASE)
            key = self._getSearchEngine(engine)
            if key:
                # Use the pattern compiled just above (matched against referer URIs)
                self.search_engines[key]['re'] = engine_re

        for (k, e) in self.search_engines.items():
            if not 're' in e.keys():
                print 'Remove %s' % k
                del self.search_engines[k]

        print self.search_engines

        self.html_parser = HTMLParser.HTMLParser()

        return True

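    # Extract the searched key phrase from the referer query string: each
    # '&' separated parameter is matched against the engine's 'known_url'
    # regex; the named group 'key_phrase' is HTML unescaped, lowercased
    # and counted in key_phrases.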
    def _extractKeyPhrase(self, key_phrase_re, parameters, key_phrases):
        if not parameters or not key_phrase_re: return

        for p in parameters.split('&'):
            groups = key_phrase_re.match(p)
            if groups:
                print groups.groupdict()
                key_phrase = self.html_parser.unescape(groups.groupdict()['key_phrase']).lower()
                if not key_phrase in key_phrases.keys():
                    key_phrases[key_phrase] = 1
                else:
                    key_phrases[key_phrase] += 1

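    # Analysis hook: walk every request of every visit, skip referers
    # pointing to our own domain, classify the remaining ones as search
    # engine, robot or plain external referers, then build the index
    # tables and the per-month detail pages.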
    def hook(self, iwla):
        stats = iwla.getCurrentVisists()
        referers = {}
        robots_referers = {}
        search_engine_referers = {}
        key_phrases = {}

        for (k, super_hit) in stats.items():
            for r in super_hit['requests']:
                if not r['http_referer']: continue

                uri = r['extract_referer']['extract_uri']
                is_search_engine = False

                if self.own_domain_re.match(uri): continue

                for (name, e) in self.search_engines.items():
                    if e['re'].match(uri):
                        not_engine = e.get('not_search_engine', None)
                        # Try not engine
                        if not_engine and not_engine.match(uri): break

                        is_search_engine = True
                        # Record the hit under the engine name rather than the raw URI
                        uri = name

                        parameters = r['extract_referer'].get('extract_parameters', None)
                        key_phrase_re = e.get('known_url', None)

                        # print parameters
                        # print key_phrase_re

                        self._extractKeyPhrase(key_phrase_re, parameters, key_phrases)

                        break

                if is_search_engine:
                    dictionary = search_engine_referers
                elif super_hit['robot']:
                    dictionary = robots_referers
                    # print '%s => %s' % (uri, super_hit['remote_ip'])
                else:
                    dictionary = referers

                if r['is_page']:
                    key = 'pages'
                else:
                    key = 'hits'
                if not uri in dictionary: dictionary[uri] = {'pages':0, 'hits':0}
                dictionary[uri][key] += 1

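        # Rank each referer category by number of pages, most visited first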
        top_referers = [(k, referers[k]['pages']) for k in referers.keys()]
        top_referers = sorted(top_referers, key=lambda t: t[1], reverse=True)

        top_robots_referers = [(k, robots_referers[k]['pages']) for k in robots_referers.keys()]
        top_robots_referers = sorted(top_robots_referers, key=lambda t: t[1], reverse=True)

        top_search_engine_referers = [(k, search_engine_referers[k]['pages']) for k in search_engine_referers.keys()]
        top_search_engine_referers = sorted(top_search_engine_referers, key=lambda t: t[1], reverse=True)

        top_key_phrases = key_phrases.items()
        top_key_phrases = sorted(top_key_phrases, key=lambda t: t[1], reverse=True)

        # Top referers in index
        index = self.iwla.getDisplayIndex()

        table = DisplayHTMLBlockTable('Connexion from', ['Origin', 'Pages', 'Hits'])
        table.appendRow(['<b>Search Engine</b>', '', ''])
        for r,_ in top_search_engine_referers[:10]:
            row = [r, search_engine_referers[r]['pages'], search_engine_referers[r]['hits']]
            table.appendRow(row)

        table.appendRow(['<b>External URL</b>', '', ''])
        for r,_ in top_referers[:10]:
            row = [r, referers[r]['pages'], referers[r]['hits']]
            table.appendRow(row)

        table.appendRow(['<b>External URL (robot)</b>', '', ''])
        for r,_ in top_robots_referers[:10]:
            row = [r, robots_referers[r]['pages'], robots_referers[r]['hits']]
            table.appendRow(row)

        index.appendBlock(table)

        # All referers in a file
        cur_time = self.iwla.getCurTime()
        title = time.strftime('Connexion from - %B %Y', cur_time)

        filename = 'referers_%d.html' % (cur_time.tm_mon)
        path = '%d/%s' % (cur_time.tm_year, filename)

        page = DisplayHTMLPage(title, path)
        table = DisplayHTMLBlockTable('Connexion from', ['Origin', 'Pages', 'Hits'])

        table.appendRow(['<b>Search Engine</b>', '', ''])
        for r,_ in top_search_engine_referers:
            row = [r, search_engine_referers[r]['pages'], search_engine_referers[r]['hits']]
            table.appendRow(row)

        table.appendRow(['<b>External URL</b>', '', ''])
        for r,_ in top_referers:
            row = [r, referers[r]['pages'], referers[r]['hits']]
            table.appendRow(row)

        table.appendRow(['<b>External URL (robot)</b>', '', ''])
        for r,_ in top_robots_referers:
            row = [r, robots_referers[r]['pages'], robots_referers[r]['hits']]
            table.appendRow(row)

        page.appendBlock(table)

        display = self.iwla.getDisplay()
        display.addPage(page)

        block = DisplayHTMLRawBlock()
        block.setRawHTML('<a href=\'%s\'>All referers</a>' % (filename))
        index.appendBlock(block)

        # Top key phrases in index
        table = DisplayHTMLBlockTable('Top key phrases', ['Key phrase', 'Search'])
        for phrase in top_key_phrases[:10]:
            table.appendRow([phrase[0], phrase[1]])
        index.appendBlock(table)

        # All key phrases in a file
        cur_time = self.iwla.getCurTime()
        title = time.strftime('Key Phrases - %B %Y', cur_time)

        filename = 'key_phrases_%d.html' % (cur_time.tm_mon)
        path = '%d/%s' % (cur_time.tm_year, filename)

        page = DisplayHTMLPage(title, path)
        table = DisplayHTMLBlockTable('Top key phrases', ['Key phrase', 'Search'])
        for phrase in top_key_phrases:
            table.appendRow([phrase[0], phrase[1]])
        page.appendBlock(table)

        display.addPage(page)