Make the plugin more generic: page_to_hit

This commit is contained in:
Grégory Soutadé 2014-11-25 16:22:07 +01:00
parent d5db763b48
commit 7405cf237a
5 changed files with 48 additions and 43 deletions

View File

@ -11,11 +11,12 @@ analyzed_filename = 'access.log'
# Output locations; both currently point at the same ./output/ directory.
DB_ROOT = './output/'
DISPLAY_ROOT = './output/'
# NOTE(review): pre_analysis_hooks is assigned twice in this view. The first
# assignment names the former 'soutade' plugin, the second its generic
# replacement 'page_to_hit'; in Python the later assignment is the one in effect.
pre_analysis_hooks = ['soutade', 'robots']
pre_analysis_hooks = ['page_to_hit', 'robots']
post_analysis_hooks = ['top_visitors', 'reverse_dns']
display_hooks = ['top_visitors']
# presumably a timeout in seconds for the reverse_dns hook -- TODO confirm
reverse_dns_timeout = 0.2
# List of regular expression strings read by the page_to_hit plugin through
# getConfValue('page_to_hit_conf'): a page whose URI matches one of them is
# counted as a hit instead of a page.
page_to_hit_conf = [r'^.+/logo/$']
# pre_analysis_hooks = ['H002_soutade.py', 'H001_robot.py']
# post_analysis_hooks = ['top_visitors.py']
# display_hooks = ['top_visitors.py']

View File

@ -31,6 +31,8 @@ class IPlugin(object):
def preloadPlugins(plugins, iwla):
cache_plugins = {}
print "==> Preload plugins"
for root in plugins.keys():
for plugin_filename in plugins[root]:
plugin_path = root + '.' + plugin_filename

13
iwla.py
View File

@ -1,11 +1,8 @@
#!/usr/bin/env python
import sys
import os
import re
import time
import glob
import imp
import pickle
import gzip
import importlib
@ -126,7 +123,7 @@ class IWLA(object):
remote_addr = hit['remote_addr']
if not remote_addr in self.current_analysis['visits'].keys():
self._createUser(hit)
self._createVisitor(hit)
return
super_hit = self.current_analysis['visits'][remote_addr]
@ -160,7 +157,7 @@ class IWLA(object):
else:
super_hit[hit_key] += 1
def _createUser(self, hit):
def _createVisitor(self, hit):
super_hit = self.current_analysis['visits'][hit['remote_addr']] = {}
super_hit['remote_addr'] = hit['remote_addr']
super_hit['viewed_pages'] = 0
@ -347,7 +344,7 @@ class IWLA(object):
else:
if not self.analyse_started:
if time.mktime(cur_time) >= time.mktime(t):
return
return False
else:
self.analyse_started = True
if cur_time.tm_mon != t.tm_mon:
@ -370,7 +367,7 @@ class IWLA(object):
def start(self):
self.cache_plugins = preloadPlugins(self.plugins, self)
print '==> Analysing log'
print '==> Analyse previous database'
self.meta_infos = self._deserialize(conf.META_PATH) or self._clearMeta()
if self.meta_infos['last_time']:
@ -378,6 +375,8 @@ class IWLA(object):
else:
self._clearVisits()
print '==> Analysing log'
with open(conf.analyzed_filename) as f:
for l in f:
# print "line " + l

View File

@ -0,0 +1,38 @@
import re
from iwla import IWLA
from iplugin import IPlugin
# Count pages whose URI matches a configured regexp as hits instead of pages
class IWLAPreAnalysisPageToHit(IPlugin):
    """Pre-analysis plugin that reclassifies matching pages as plain hits.

    The patterns are taken from the ``page_to_hit_conf`` configuration value,
    a list of regular expression strings. Every page request whose URI
    matches one of them is flagged as a non-page and the visitor counters
    are adjusted accordingly.
    """

    def __init__(self, iwla):
        super(IWLAPreAnalysisPageToHit, self).__init__(iwla)
        self.API_VERSION = 1

    def load(self):
        """Compile the configured patterns.

        Returns:
            False when ``page_to_hit_conf`` is unset or empty, True once the
            patterns have been compiled into ``self.regexps``.
        """
        patterns = self.iwla.getConfValue('page_to_hit_conf', [])
        if not patterns:
            # Keep the attribute iterable even when the plugin has nothing
            # to do, whatever falsy value the configuration held.
            self.regexps = []
            return False
        # Build a real list: hook() walks the patterns once per request, which
        # a one-shot iterator (what map() yields on Python 3) would not allow.
        self.regexps = [re.compile(pattern) for pattern in patterns]
        return True

    def hook(self, iwla):
        """Turn matching page requests of the current visits into hits.

        Robots are skipped. A request is only considered when it is still
        flagged as a page, has status 200 and its day of month equals the
        day of month of the visitor's last access. On a match the request's
        ``is_page`` flag is cleared, ``viewed_pages`` is decremented and
        ``viewed_hits`` is incremented (at most once per request).
        """
        visits = iwla.getCurrentVisists()
        for super_hit in visits.values():
            if super_hit['robot']:
                continue
            for request in super_hit['requests']:
                if not request['is_page']:
                    continue
                if int(request['status']) != 200:
                    continue
                if request['time_decoded'].tm_mday != super_hit['last_access'].tm_mday:
                    continue
                uri = request['extract_request']['extract_uri']
                # any() stops at the first matching pattern, so a request is
                # never counted twice.
                if any(regexp.match(uri) for regexp in self.regexps):
                    request['is_page'] = False
                    super_hit['viewed_pages'] -= 1
                    super_hit['viewed_hits'] += 1

View File

@ -1,35 +0,0 @@
import re
from iwla import IWLA
from iplugin import IPlugin
# Count indefero logo requests as hits instead of pages
class IWLAPreAnalysisSoutade(IPlugin):
    """Pre-analysis plugin counting indefero logo requests as hits, not pages."""

    def __init__(self, iwla):
        super(IWLAPreAnalysisSoutade, self).__init__(iwla)
        self.API_VERSION = 1

    def load(self):
        """Compile the logo URI pattern; always returns True."""
        # URIs ending in /logo/ are the logo served by indefero.
        self.logo_re = re.compile(r'^.+/logo/$')
        return True

    def hook(self, iwla):
        """Reclassify logo page requests of the current visits as hits.

        Robots are ignored. Only requests still flagged as pages, answered
        with status 200 and dated on the same day of month as the visitor's
        last access are examined.
        """
        visits = iwla.getCurrentVisists()
        for visit in visits.values():
            if super_hit_is_robot(visit):
                continue
            for request in visit['requests']:
                # Same checks, in the same order, folded into one expression.
                eligible = (
                    request['is_page']
                    and int(request['status']) == 200
                    and request['time_decoded'].tm_mday == visit['last_access'].tm_mday
                )
                if not eligible:
                    continue
                uri = request['extract_request']['extract_uri']
                if not self.logo_re.match(uri):
                    continue
                request['is_page'] = False
                visit['viewed_pages'] -= 1
                visit['viewed_hits'] += 1


def super_hit_is_robot(visit):
    """Return the truth value of the visit's ``robot`` flag."""
    return bool(visit['robot'])