import re
import urllib

from iwla import IWLA
from iplugin import IPlugin

import awstats_data


class IWLAPostAnalysisReferers(IPlugin):
    """Post-analysis plugin that sorts HTTP referers into categories.

    For every request of the current visits that carries a referer, the
    referer is counted (as a page or as a hit) in one of three month-level
    dictionaries: 'referers', 'robots_referers' or 'search_engine_referers'.
    Search phrases found in search engine referers are counted in
    'key_phrases'.

    Requires the 'domain_name' configuration value; referers matching it
    are ignored.
    """

    def __init__(self, iwla):
        super(IWLAPostAnalysisReferers, self).__init__(iwla)
        self.API_VERSION = 1
        self.conf_requires = ['domain_name']

    def _getSearchEngine(self, hashid):
        """Return the name of the engine registered with *hashid*.

        The comparison is done on the raw hashid string (not on the compiled
        regular expression). Returns None when no engine owns this hashid.
        """
        for (name, engine) in self.search_engines.items():
            for (known_hashid, _) in engine['hashid']:
                if hashid == known_hashid:
                    return name
        return None

    def load(self):
        """Compile the regular expressions used by hook().

        Builds self.own_domain_re from the 'domain_name' configuration value
        and self.search_engines from the tables in awstats_data. Each entry
        of self.search_engines maps an engine name to a dict with:
          - 'hashid': list of (hashid, compiled regex) tuples,
          - 'known_url' (optional): compiled regex capturing 'key_phrase',
          - 'not_search_engine' (optional): compiled exclusion regex.

        Returns False (after printing a message) if 'domain_name' is empty,
        True otherwise.
        """
        domain_name = self.iwla.getConfValue('domain_name', '')

        if not domain_name:
            print('domain_name must not be empty !')
            return False

        # NOTE(review): domain_name is inserted unescaped, so it is
        # interpreted as a regular expression fragment.
        self.own_domain_re = re.compile(r'.*%s.*' % (domain_name))
        self.search_engines = {}

        for (hashid, name) in awstats_data.search_engines_hashid.items():
            hashid_re = re.compile(r'.*%s.*' % (hashid))
            engine = self.search_engines.setdefault(name, {'hashid': []})
            engine['hashid'].append((hashid, hashid_re))

        for (name, known_url) in awstats_data.search_engines_knwown_url.items():
            engine = self.search_engines.get(name)
            if engine is None:
                # No hashid registered under this name: nothing to attach
                # the pattern to (same policy as the exclusion table below).
                continue
            # The group must be named 'key_phrase': _extractKeyPhrase() reads
            # it back through groupdict()['key_phrase'].
            engine['known_url'] = re.compile(known_url + '(?P<key_phrase>.+)')

        for (hashid, not_engine) in awstats_data.not_search_engines_keys.items():
            not_engine_re = re.compile(r'.*%s.*' % (not_engine))
            key = self._getSearchEngine(hashid)
            if key:
                self.search_engines[key]['not_search_engine'] = not_engine_re

        return True

    def _extractKeyPhrase(self, key_phrase_re, parameters, key_phrases):
        """Count the search phrase found in *parameters* into *key_phrases*.

        parameters    -- query string of the referer ('a=b&c=d'), or None
        key_phrase_re -- compiled regex with a 'key_phrase' named group,
                         or None
        key_phrases   -- dict phrase -> count, updated in place

        Only the first parameter matching key_phrase_re is taken into
        account. Nothing is done when parameters or key_phrase_re is empty.
        """
        if not parameters or not key_phrase_re:
            return

        for parameter in parameters.split('&'):
            match = key_phrase_re.match(parameter)
            if not match:
                continue
            key_phrase = match.groupdict()['key_phrase']
            # Undo URL encoding ('+' and %xx), then decode the UTF-8 bytes.
            key_phrase = urllib.unquote_plus(key_phrase).decode('utf8')
            key_phrases[key_phrase] = key_phrases.get(key_phrase, 0) + 1
            break

    def hook(self):
        """Classify the referer of every valid request of the current visits.

        Reads (or creates) the 'referers', 'robots_referers',
        'search_engine_referers' and 'key_phrases' dictionaries of the month
        statistics, updates them and stores them back. Referer dictionaries
        map a URI (or a search engine name) to {'pages': n, 'hits': m}.
        """
        stats = self.iwla.getCurrentVisists()
        month_stats = self.iwla.getMonthStats()

        referers = month_stats.get('referers', {})
        robots_referers = month_stats.get('robots_referers', {})
        search_engine_referers = month_stats.get('search_engine_referers', {})
        key_phrases = month_stats.get('key_phrases', {})

        for super_hit in stats.values():
            for r in super_hit['requests']:
                if not self.iwla.isValidForCurrentAnalysis(r):
                    continue
                if not r['http_referer']:
                    continue

                uri = r['extract_referer']['extract_uri']

                # Internal navigation is not a referer.
                if self.own_domain_re.match(uri):
                    continue

                is_search_engine = False
                for (name, engine) in self.search_engines.items():
                    for (_, hashid_re) in engine['hashid']:
                        if not hashid_re.match(uri):
                            continue

                        # The URI looks like this engine but is explicitly
                        # excluded: give up on this engine, try the others.
                        not_engine = engine.get('not_search_engine', None)
                        if not_engine and not_engine.match(uri):
                            break

                        is_search_engine = True
                        # Search engine referers are grouped by engine name.
                        uri = name

                        parameters = r['extract_referer'].get('extract_parameters', None)
                        key_phrase_re = engine.get('known_url', None)
                        self._extractKeyPhrase(key_phrase_re, parameters, key_phrases)
                        break

                    if is_search_engine:
                        # uri now holds the engine name; testing the other
                        # engines against it could re-classify the referer
                        # and count its key phrase twice.
                        break

                if is_search_engine:
                    dictionary = search_engine_referers
                elif super_hit['robot']:
                    dictionary = robots_referers
                else:
                    dictionary = referers

                if r['is_page']:
                    key = 'pages'
                else:
                    key = 'hits'

                if uri not in dictionary:
                    dictionary[uri] = {'pages': 0, 'hits': 0}
                dictionary[uri][key] += 1

        month_stats['referers'] = referers
        month_stats['robots_referers'] = robots_referers
        month_stats['search_engine_referers'] = search_engine_referers
        month_stats['key_phrases'] = key_phrases