Fix key_phrases

This commit is contained in:
Grégory Soutadé 2014-11-26 19:33:08 +01:00
parent e6b31fbf8a
commit 92533cc244

View File

@@ -1,6 +1,6 @@
import time import time
import re import re
import HTMLParser import xml.sax.saxutils as saxutils
from iwla import IWLA from iwla import IWLA
from iplugin import IPlugin from iplugin import IPlugin
@@ -13,12 +13,13 @@ class IWLADisplayReferers(IPlugin):
super(IWLADisplayReferers, self).__init__(iwla) super(IWLADisplayReferers, self).__init__(iwla)
self.API_VERSION = 1 self.API_VERSION = 1
def _getSearchEngine(self, engine): def _getSearchEngine(self, hashid):
#print 'Look for %s' % engine
for (k, e) in self.search_engines.items(): for (k, e) in self.search_engines.items():
for hashid in e['hashid']: for (h,h_re) in e['hashid']:
if hashid.match(engine): if hashid == h:
return k return k
print 'Not found %s' % (engine) #print 'Not found %s' % (hashid)
return None return None
def load(self): def load(self):
@@ -28,45 +29,29 @@ class IWLADisplayReferers(IPlugin):
print 'domain_name required in conf' print 'domain_name required in conf'
return False return False
self.own_domain_re = re.compile('.*%s.*' % (domain_name)) self.own_domain_re = re.compile(r'.*%s.*' % (domain_name))
self.search_engines = {} self.search_engines = {}
for (engine, known_url) in awstats_data.search_engines_knwown_url.items(): for (hashid, name) in awstats_data.search_engines_hashid.items():
self.search_engines[engine] = { hashid_re = re.compile(r'.*%s.*' % (hashid))
'known_url' : re.compile(known_url + '(?P<key_phrase>.+)'), if not name in self.search_engines.keys():
'hashid' : [] self.search_engines[name] = {
} 'hashid' : [(hashid, hashid_re)]
for (hashid, engine) in awstats_data.search_engines_hashid.items():
hashid_re = re.compile('.*%s.*' % (hashid))
if not engine in self.search_engines.keys():
self.search_engines[engine] = {
'hashid' : [hashid_re]
} }
else: else:
self.search_engines[engine]['hashid'].append(hashid_re) self.search_engines[name]['hashid'].append((hashid, hashid_re))
print 'Hashid %s => %s' % (engine, hashid) #print 'Hashid %s => %s' % (name, hashid)
for (name, known_url) in awstats_data.search_engines_knwown_url.items():
self.search_engines[name]['known_url'] = re.compile(known_url + '(?P<key_phrase>.+)')
for (engine, not_engine) in awstats_data.not_search_engines_keys.items(): for (engine, not_engine) in awstats_data.not_search_engines_keys.items():
not_engine_re = re.compile('.*%s.*' % (not_engine)) not_engine_re = re.compile(r'.*%s.*' % (not_engine))
key = self._getSearchEngine(engine) key = self._getSearchEngine(engine)
if key: if key:
self.search_engines[key]['not_search_engine'] = not_engine_re self.search_engines[key]['not_search_engine'] = not_engine_re
for engine in awstats_data.search_engines: #self.html_parser = html.parser.HTMLParser()
engine_re = re.compile('.*%s.*' % (engine), re.IGNORECASE)
key = self._getSearchEngine(engine)
if key:
self.search_engines[key]['re'] = not_engine_re
for (k,e) in self.search_engines.items():
if not 're' in e.keys():
print 'Remove %s' % k
del self.search_engines[k]
print self.search_engines
self.html_parser = HTMLParser.HTMLParser()
return True return True
@@ -76,12 +61,14 @@ class IWLADisplayReferers(IPlugin):
for p in parameters.split('&'): for p in parameters.split('&'):
groups = key_phrase_re.match(p) groups = key_phrase_re.match(p)
if groups: if groups:
print groups.groupddict() key_phrase = groups.groupdict()['key_phrase']
key_phrase = self.html_parser.unescape(groups.groupddict()['key_phrase']).lower() key_phrase = key_phrase.replace('+', ' ').lower()
key_phrase = saxutils.unescape(key_phrase)
if not key_phrase in key_phrases.keys(): if not key_phrase in key_phrases.keys():
key_phrases[key_phrase] = 1 key_phrases[key_phrase] = 1
else: else:
key_phrases[key_phrase] += 1 key_phrases[key_phrase] += 1
break
def hook(self, iwla): def hook(self, iwla):
stats = iwla.getCurrentVisists() stats = iwla.getCurrentVisists()
@@ -99,22 +86,20 @@ class IWLADisplayReferers(IPlugin):
if self.own_domain_re.match(uri): continue if self.own_domain_re.match(uri): continue
for e in self.search_engines.values(): for (name, engine) in self.search_engines.items():
if e['re'].match(uri): for (hashid, hashid_re) in engine['hashid']:
not_engine = e.get('not_search_engine', None) if not hashid_re.match(uri): continue
not_engine = engine.get('not_search_engine', None)
# Try not engine # Try not engine
if not_engine and not_engine.match(uri): break if not_engine and not_engine.match(uri): break
is_search_engine = True is_search_engine = True
uri = e['name'] uri = name
parameters = r['extract_referer'].get('extract_parameters', None) parameters = r['extract_referer'].get('extract_parameters', None)
key_phrase_re = e.get('known_url', None) key_phrase_re = engine.get('known_url', None)
# print parameters
# print key_phrase_re
self._extractKeyPhrase(key_phrase_re, parameters, key_phrases) self._extractKeyPhrase(key_phrase_re, parameters, key_phrases)
break break
if is_search_engine: if is_search_engine:
@@ -218,3 +203,7 @@ class IWLADisplayReferers(IPlugin):
page.appendBlock(table) page.appendBlock(table)
display.addPage(page) display.addPage(page)
block = DisplayHTMLRawBlock()
block.setRawHTML('<a href=\'%s\'>All key phrases</a>' % (filename))
index.appendBlock(block)