import time
import re
import xml.sax.saxutils as saxutils

from iwla import IWLA
from iplugin import IPlugin
from display import *

import awstats_data


class IWLADisplayReferers(IPlugin):
    """Display plugin that reports where visits come from.

    Referers of the current visits are split into three groups (search
    engines, external URLs, external URLs seen from robots) and rendered
    as a "top 10" table in the index plus a full listing on a dedicated
    page.  Key phrases extracted from search-engine referers get the same
    treatment.
    """

    def __init__(self, iwla):
        super(IWLADisplayReferers, self).__init__(iwla)
        self.API_VERSION = 1

    def _getSearchEngine(self, hashid):
        """Return the name of the engine registered under ``hashid``.

        ``hashid`` is compared for equality with the raw hashid strings
        stored by ``load()``.  Returns ``None`` when no engine owns it.
        """
        for (name, engine) in self.search_engines.items():
            for (known_hashid, _) in engine['hashid']:
                if hashid == known_hashid:
                    return name
        return None

    def load(self):
        """Compile the domain and search-engine regular expressions.

        Builds ``self.own_domain_re`` from the ``domain_name``
        configuration value and ``self.search_engines`` from the tables in
        ``awstats_data``.  Each engine entry is a dict with:

        * ``'hashid'``: list of ``(hashid, compiled_re)`` tuples,
        * ``'known_url'`` (optional): compiled pattern whose ``key_phrase``
          group captures the searched phrase in a query parameter,
        * ``'not_search_engine'`` (optional): compiled pattern of URLs
          that match the engine but must not be counted as searches.

        Returns ``False`` (after printing a message) when ``domain_name``
        is not configured, ``True`` otherwise.
        """
        domain_name = self.iwla.getConfValue('domain_name', '')

        if not domain_name:
            # Parenthesised single-argument form prints identically on
            # Python 2 and is valid syntax on Python 3.
            print('domain_name required in conf')
            return False

        self.own_domain_re = re.compile(r'.*%s.*' % (domain_name))
        self.search_engines = {}

        for (hashid, name) in awstats_data.search_engines_hashid.items():
            hashid_re = re.compile(r'.*%s.*' % (hashid))
            engine = self.search_engines.setdefault(name, {'hashid': []})
            engine['hashid'].append((hashid, hashid_re))

        for (name, known_url) in awstats_data.search_engines_knwown_url.items():
            # The group MUST be named 'key_phrase': _extractKeyPhrase()
            # reads it back by that name.
            self.search_engines[name]['known_url'] = re.compile(
                known_url + '(?P<key_phrase>.+)')

        for (engine, not_engine) in awstats_data.not_search_engines_keys.items():
            not_engine_re = re.compile(r'.*%s.*' % (not_engine))
            key = self._getSearchEngine(engine)
            if key:
                self.search_engines[key]['not_search_engine'] = not_engine_re

        return True

    def _extractKeyPhrase(self, key_phrase_re, parameters, key_phrases):
        """Count the key phrase found in a referer query string.

        ``parameters`` is split on ``'&'``; the first parameter matching
        ``key_phrase_re`` provides the phrase ('+' replaced by a space,
        lower-cased, XML entities unescaped) and increments its counter in
        ``key_phrases`` (updated in place).  Nothing happens when either
        ``parameters`` or ``key_phrase_re`` is empty/None.
        """
        if not parameters or not key_phrase_re:
            return

        for parameter in parameters.split('&'):
            groups = key_phrase_re.match(parameter)
            if not groups:
                continue
            key_phrase = groups.group('key_phrase')
            key_phrase = key_phrase.replace('+', ' ').lower()
            key_phrase = saxutils.unescape(key_phrase)
            key_phrases[key_phrase] = key_phrases.get(key_phrase, 0) + 1
            # Only the first matching parameter is taken into account.
            break

    def _matchSearchEngine(self, uri, request, key_phrases):
        """Return the search-engine name ``uri`` belongs to, or ``None``.

        An engine is accepted when one of its hashid patterns matches
        ``uri`` and its optional 'not_search_engine' pattern does not.  On
        acceptance the key phrase of ``request`` is recorded in
        ``key_phrases`` and the scan stops: continuing would test the
        remaining engines against an already-classified referer and could
        count a key phrase twice.
        """
        for (name, engine) in self.search_engines.items():
            if not any(hashid_re.match(uri)
                       for (_, hashid_re) in engine['hashid']):
                continue

            not_engine = engine.get('not_search_engine', None)
            if not_engine and not_engine.match(uri):
                # Known false positive for this engine, try the next one.
                continue

            parameters = request['extract_referer'].get('extract_parameters', None)
            key_phrase_re = engine.get('known_url', None)
            self._extractKeyPhrase(key_phrase_re, parameters, key_phrases)
            return name

        return None

    def _sortByPages(self, counters):
        """Return ``(uri, stats)`` pairs of ``counters``, most pages first."""
        return sorted(counters.items(), key=lambda t: t[1]['pages'], reverse=True)

    def _buildReferersTable(self, sections, limit=None):
        """Build the 'Connexion from' table.

        ``sections`` is a list of ``(label, ranked)`` tuples where
        ``ranked`` comes from ``_sortByPages()``.  At most ``limit`` rows
        are emitted per section (all of them when ``limit`` is None).
        """
        table = DisplayHTMLBlockTable('Connexion from', ['Origin', 'Pages', 'Hits'])
        for (label, ranked) in sections:
            table.appendRow([label, '', ''])
            for (uri, stats) in ranked[:limit]:
                table.appendRow([uri, stats['pages'], stats['hits']])
        return table

    def _buildKeyPhrasesTable(self, top_key_phrases, limit=None):
        """Build the 'Top key phrases' table, limited to ``limit`` rows."""
        table = DisplayHTMLBlockTable('Top key phrases', ['Key phrase', 'Search'])
        for (phrase, count) in top_key_phrases[:limit]:
            table.appendRow([phrase, count])
        return table

    def hook(self, iwla):
        """Collect referer statistics and add them to the display.

        Every request of every current visit that carries a referer is
        examined.  Referers matching the site's own domain are ignored;
        the others are counted (pages / hits) as search engine, robot
        referer or plain external referer.  The index receives top-10
        tables and links; full tables go to ``referers_<month>.html`` and
        ``key_phrases_<month>.html`` under the year directory.
        """
        stats = iwla.getCurrentVisists()
        referers = {}
        robots_referers = {}
        search_engine_referers = {}
        key_phrases = {}

        for (k, super_hit) in stats.items():
            for r in super_hit['requests']:
                if not r['http_referer']:
                    continue

                uri = r['extract_referer']['extract_uri']

                if self.own_domain_re.match(uri):
                    continue

                engine_name = self._matchSearchEngine(uri, r, key_phrases)

                if engine_name is not None:
                    # Search-engine referers are aggregated per engine.
                    uri = engine_name
                    dictionary = search_engine_referers
                elif super_hit['robot']:
                    dictionary = robots_referers
                else:
                    dictionary = referers

                key = 'pages' if r['is_page'] else 'hits'
                if uri not in dictionary:
                    dictionary[uri] = {'pages': 0, 'hits': 0}
                dictionary[uri][key] += 1

        sections = [
            ('Search Engine', self._sortByPages(search_engine_referers)),
            ('External URL', self._sortByPages(referers)),
            ('External URL (robot)', self._sortByPages(robots_referers)),
        ]
        top_key_phrases = sorted(key_phrases.items(), key=lambda t: t[1], reverse=True)

        cur_time = self.iwla.getCurTime()

        # Top referers in index
        index = self.iwla.getDisplayIndex()
        index.appendBlock(self._buildReferersTable(sections, 10))

        # All referers in a file
        title = time.strftime('Connexion from - %B %Y', cur_time)
        filename = 'referers_%d.html' % (cur_time.tm_mon)
        path = '%d/%s' % (cur_time.tm_year, filename)

        page = DisplayHTMLPage(title, path)
        page.appendBlock(self._buildReferersTable(sections))

        display = self.iwla.getDisplay()
        display.addPage(page)

        # NOTE(review): the relative link assumes the index page is written
        # in the same directory as the referers page - confirm.
        block = DisplayHTMLRawBlock()
        block.setRawHTML('<a href="%s">All referers</a>' % (filename))
        index.appendBlock(block)

        # Top key phrases in index
        index.appendBlock(self._buildKeyPhrasesTable(top_key_phrases, 10))

        # All key phrases in a file
        title = time.strftime('Key Phrases - %B %Y', cur_time)
        filename = 'key_phrases_%d.html' % (cur_time.tm_mon)
        path = '%d/%s' % (cur_time.tm_year, filename)

        page = DisplayHTMLPage(title, path)
        page.appendBlock(self._buildKeyPhrasesTable(top_key_phrases))
        display.addPage(page)

        block = DisplayHTMLRawBlock()
        block.setRawHTML('<a href="%s">All key phrases</a>' % (filename))
        index.appendBlock(block)