import time
import re
import HTMLParser

from iwla import IWLA
from iplugin import IPlugin
from display import *

import awstats_data

class IWLADisplayReferers(IPlugin):
    def __init__(self, iwla):
        super(IWLADisplayReferers, self).__init__(iwla)
        self.API_VERSION = 1

    def load(self):
        domain_name = self.iwla.getConfValue('domain_name', '')

        if not domain_name:
            print 'domain_name required in conf'
            return False

        self.own_domain_re = re.compile('.*%s.*' % (domain_name))
        self.search_engines = {}

        # One entry per search engine, keyed by its matching regexp
        for engine in awstats_data.search_engines:
            self.search_engines[engine] = {
                're' : re.compile(engine, re.IGNORECASE)
            }

        # Regexps that exclude false positives for some engines
        for (engine, not_engine) in awstats_data.not_search_engines_keys.items():
            if not engine in self.search_engines: continue
            self.search_engines[engine]['not_search_engine'] = \
                re.compile(not_engine, re.IGNORECASE)

        # Human readable engine names
        for (engine, name) in awstats_data.search_engines_hashid.items():
            if not engine in self.search_engines: continue
            self.search_engines[engine]['name'] = name

        # Known URL parameters used to extract key phrases
        for (engine, known_url) in awstats_data.search_engines_knwown_url.items():
            engine = engine[2:-2]
            if not engine in self.search_engines: continue
            self.search_engines[engine]['known_url'] = re.compile(known_url + '(?P<key_phrase>.+)')

        self.html_parser = HTMLParser.HTMLParser()

        return True

    def _extractKeyPhrase(self, key_phrase_re, parameters, key_phrases):
        if not parameters or not key_phrase_re: return

        for p in parameters.split('&'):
            groups = key_phrase_re.match(p)
            if groups:
                key_phrase = self.html_parser.unescape(groups.groupdict()['key_phrase']).lower()
                if not key_phrase in key_phrases.keys():
                    key_phrases[key_phrase] = 1
                else:
                    key_phrases[key_phrase] += 1

    def hook(self, iwla):
        stats = iwla.getCurrentVisists()
        referers = {}
        robots_referers = {}
        search_engine_referers = {}
        key_phrases = {}

        for (k, super_hit) in stats.items():
            for r in super_hit['requests']:
                if not r['http_referer']: continue

                uri = r['extract_referer']['extract_uri']
                is_search_engine = False

                # Skip referers from our own domain
                if self.own_domain_re.match(uri): continue

                for e in self.search_engines.values():
                    if e['re'].match(uri):
                        not_engine = e.get('not_search_engine', None)
                        # Try not engine
                        if not_engine and not_engine.match(uri): break
                        is_search_engine = True
                        uri = e['name']

                        parameters = r['extract_referer'].get('extract_parameters', None)
                        key_phrase_re = e.get('known_url', None)

                        self._extractKeyPhrase(key_phrase_re, parameters, key_phrases)

                        break

                if is_search_engine:
                    dictionary = search_engine_referers
                elif super_hit['robot']:
                    dictionary = robots_referers
                else:
                    dictionary = referers

                if r['is_page']:
                    key = 'pages'
                else:
                    key = 'hits'
                if not uri in dictionary: dictionary[uri] = {'pages':0, 'hits':0}
                dictionary[uri][key] += 1

        top_referers = [(k, referers[k]['pages']) for k in referers.keys()]
        top_referers = sorted(top_referers, key=lambda t: t[1], reverse=True)

        top_robots_referers = [(k, robots_referers[k]['pages']) for k in robots_referers.keys()]
        top_robots_referers = sorted(top_robots_referers, key=lambda t: t[1], reverse=True)

        top_search_engine_referers = [(k, search_engine_referers[k]['pages']) for k in search_engine_referers.keys()]
        top_search_engine_referers = sorted(top_search_engine_referers, key=lambda t: t[1], reverse=True)

        top_key_phrases = key_phrases.items()
        top_key_phrases = sorted(top_key_phrases, key=lambda t: t[1], reverse=True)

        # Top referers in index
        index = self.iwla.getDisplayIndex()
        table = DisplayHTMLBlockTable('Connexion from', ['Origin', 'Pages', 'Hits'])
        table.appendRow(['Search Engine', '', ''])
        for r,_ in top_search_engine_referers[:10]:
            row = [r, search_engine_referers[r]['pages'], search_engine_referers[r]['hits']]
            table.appendRow(row)

        table.appendRow(['External URL', '', ''])
        for r,_ in top_referers[:10]:
            row = [r, referers[r]['pages'], referers[r]['hits']]
            table.appendRow(row)

        table.appendRow(['External URL (robot)', '', ''])
        for r,_ in top_robots_referers[:10]:
            row = [r, robots_referers[r]['pages'], robots_referers[r]['hits']]
            table.appendRow(row)
        index.appendBlock(table)

        # All referers in a file
        cur_time = self.iwla.getCurTime()
        title = time.strftime('Connexion from - %B %Y', cur_time)
        filename = 'referers_%d.html' % (cur_time.tm_mon)
        path = '%d/%s' % (cur_time.tm_year, filename)

        page = DisplayHTMLPage(title, path)
        table = DisplayHTMLBlockTable('Connexion from', ['Origin', 'Pages', 'Hits'])

        table.appendRow(['Search Engine', '', ''])
        for r,_ in top_search_engine_referers:
            row = [r, search_engine_referers[r]['pages'], search_engine_referers[r]['hits']]
            table.appendRow(row)

        table.appendRow(['External URL', '', ''])
        for r,_ in top_referers:
            row = [r, referers[r]['pages'], referers[r]['hits']]
            table.appendRow(row)

        table.appendRow(['External URL (robot)', '', ''])
        for r,_ in top_robots_referers:
            row = [r, robots_referers[r]['pages'], robots_referers[r]['hits']]
            table.appendRow(row)
        page.appendBlock(table)

        display = self.iwla.getDisplay()
        display.addPage(page)

        # Link from the index to the full referers page
        block = DisplayHTMLRawBlock()
        block.setRawHTML('<a href="%s">All referers</a>' % (filename))
        index.appendBlock(block)

        # Top key phrases in index
        table = DisplayHTMLBlockTable('Top key phrases', ['Key phrase', 'Search'])
        for phrase in top_key_phrases[:10]:
            table.appendRow([phrase[0], phrase[1]])
        index.appendBlock(table)

        # All key phrases in a file
        cur_time = self.iwla.getCurTime()
        title = time.strftime('Key Phrases - %B %Y', cur_time)
        filename = 'key_phrases_%d.html' % (cur_time.tm_mon)
        path = '%d/%s' % (cur_time.tm_year, filename)

        page = DisplayHTMLPage(title, path)
        table = DisplayHTMLBlockTable('Top key phrases', ['Key phrase', 'Search'])
        for phrase in top_key_phrases:
            table.appendRow([phrase[0], phrase[1]])
        page.appendBlock(table)

        display.addPage(page)