Make the plugin more generic: page_to_hit
This commit is contained in:
parent
d5db763b48
commit
7405cf237a
3
conf.py
3
conf.py
|
@ -11,11 +11,12 @@ analyzed_filename = 'access.log'
|
||||||
DB_ROOT = './output/'
|
DB_ROOT = './output/'
|
||||||
DISPLAY_ROOT = './output/'
|
DISPLAY_ROOT = './output/'
|
||||||
|
|
||||||
pre_analysis_hooks = ['soutade', 'robots']
|
pre_analysis_hooks = ['page_to_hit', 'robots']
|
||||||
post_analysis_hooks = ['top_visitors', 'reverse_dns']
|
post_analysis_hooks = ['top_visitors', 'reverse_dns']
|
||||||
display_hooks = ['top_visitors']
|
display_hooks = ['top_visitors']
|
||||||
|
|
||||||
reverse_dns_timeout = 0.2
|
reverse_dns_timeout = 0.2
|
||||||
|
page_to_hit_conf = [r'^.+/logo/$']
|
||||||
# pre_analysis_hooks = ['H002_soutade.py', 'H001_robot.py']
|
# pre_analysis_hooks = ['H002_soutade.py', 'H001_robot.py']
|
||||||
# post_analysis_hooks = ['top_visitors.py']
|
# post_analysis_hooks = ['top_visitors.py']
|
||||||
# display_hooks = ['top_visitors.py']
|
# display_hooks = ['top_visitors.py']
|
||||||
|
|
|
@ -31,6 +31,8 @@ class IPlugin(object):
|
||||||
def preloadPlugins(plugins, iwla):
|
def preloadPlugins(plugins, iwla):
|
||||||
cache_plugins = {}
|
cache_plugins = {}
|
||||||
|
|
||||||
|
print "==> Preload plugins"
|
||||||
|
|
||||||
for root in plugins.keys():
|
for root in plugins.keys():
|
||||||
for plugin_filename in plugins[root]:
|
for plugin_filename in plugins[root]:
|
||||||
plugin_path = root + '.' + plugin_filename
|
plugin_path = root + '.' + plugin_filename
|
||||||
|
|
13
iwla.py
13
iwla.py
|
@ -1,11 +1,8 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
import sys
|
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import time
|
import time
|
||||||
import glob
|
|
||||||
import imp
|
|
||||||
import pickle
|
import pickle
|
||||||
import gzip
|
import gzip
|
||||||
import importlib
|
import importlib
|
||||||
|
@ -126,7 +123,7 @@ class IWLA(object):
|
||||||
remote_addr = hit['remote_addr']
|
remote_addr = hit['remote_addr']
|
||||||
|
|
||||||
if not remote_addr in self.current_analysis['visits'].keys():
|
if not remote_addr in self.current_analysis['visits'].keys():
|
||||||
self._createUser(hit)
|
self._createVisitor(hit)
|
||||||
return
|
return
|
||||||
|
|
||||||
super_hit = self.current_analysis['visits'][remote_addr]
|
super_hit = self.current_analysis['visits'][remote_addr]
|
||||||
|
@ -160,7 +157,7 @@ class IWLA(object):
|
||||||
else:
|
else:
|
||||||
super_hit[hit_key] += 1
|
super_hit[hit_key] += 1
|
||||||
|
|
||||||
def _createUser(self, hit):
|
def _createVisitor(self, hit):
|
||||||
super_hit = self.current_analysis['visits'][hit['remote_addr']] = {}
|
super_hit = self.current_analysis['visits'][hit['remote_addr']] = {}
|
||||||
super_hit['remote_addr'] = hit['remote_addr']
|
super_hit['remote_addr'] = hit['remote_addr']
|
||||||
super_hit['viewed_pages'] = 0
|
super_hit['viewed_pages'] = 0
|
||||||
|
@ -347,7 +344,7 @@ class IWLA(object):
|
||||||
else:
|
else:
|
||||||
if not self.analyse_started:
|
if not self.analyse_started:
|
||||||
if time.mktime(cur_time) >= time.mktime(t):
|
if time.mktime(cur_time) >= time.mktime(t):
|
||||||
return
|
return False
|
||||||
else:
|
else:
|
||||||
self.analyse_started = True
|
self.analyse_started = True
|
||||||
if cur_time.tm_mon != t.tm_mon:
|
if cur_time.tm_mon != t.tm_mon:
|
||||||
|
@ -370,7 +367,7 @@ class IWLA(object):
|
||||||
def start(self):
|
def start(self):
|
||||||
self.cache_plugins = preloadPlugins(self.plugins, self)
|
self.cache_plugins = preloadPlugins(self.plugins, self)
|
||||||
|
|
||||||
print '==> Analysing log'
|
print '==> Analyse previous database'
|
||||||
|
|
||||||
self.meta_infos = self._deserialize(conf.META_PATH) or self._clearMeta()
|
self.meta_infos = self._deserialize(conf.META_PATH) or self._clearMeta()
|
||||||
if self.meta_infos['last_time']:
|
if self.meta_infos['last_time']:
|
||||||
|
@ -378,6 +375,8 @@ class IWLA(object):
|
||||||
else:
|
else:
|
||||||
self._clearVisits()
|
self._clearVisits()
|
||||||
|
|
||||||
|
print '==> Analysing log'
|
||||||
|
|
||||||
with open(conf.analyzed_filename) as f:
|
with open(conf.analyzed_filename) as f:
|
||||||
for l in f:
|
for l in f:
|
||||||
# print "line " + l
|
# print "line " + l
|
||||||
|
|
38
plugins/pre_analysis/page_to_hit.py
Normal file
38
plugins/pre_analysis/page_to_hit.py
Normal file
|
@ -0,0 +1,38 @@
|
||||||
|
import re
|
||||||
|
|
||||||
|
from iwla import IWLA
|
||||||
|
from iplugin import IPlugin
|
||||||
|
|
||||||
|
# Turn page requests whose URI matches a configured regexp into plain hits
|
||||||
|
|
||||||
|
class IWLAPreAnalysisPageToHit(IPlugin):
    """Pre-analysis plugin that reclassifies some page views as hits.

    The patterns are read from the ``page_to_hit_conf`` configuration
    value (a list of regular expression strings).  Any request currently
    flagged as a page whose URI matches one of them is re-flagged as a
    hit, and the visitor's page/hit counters are adjusted accordingly.
    """

    def __init__(self, iwla):
        super(IWLAPreAnalysisPageToHit, self).__init__(iwla)
        self.API_VERSION = 1

    def load(self):
        """Compile the configured patterns.

        Returns False when ``page_to_hit_conf`` is unset or empty (there
        is nothing to match), True once the patterns are compiled.
        """
        self.regexps = self.iwla.getConfValue('page_to_hit_conf', [])
        if not self.regexps:
            return False

        # Build a real list: hook() walks the patterns once per request,
        # so they must be re-iterable.  The comprehension also avoids the
        # ``lambda(r):`` tuple-parameter form, which only parses on
        # Python 2.
        self.regexps = [re.compile(pattern) for pattern in self.regexps]

        return True

    def hook(self, iwla):
        """Downgrade matching page requests of non-robot visitors to hits.

        ``iwla`` is the analyser object; its current visits mapping is
        modified in place.
        """
        hits = iwla.getCurrentVisists()

        # Only the per-visitor records are needed, not their keys.
        for super_hit in hits.values():
            if super_hit['robot']:
                continue

            for p in super_hit['requests']:
                if not p['is_page']:
                    continue
                if int(p['status']) != 200:
                    continue
                # Skip requests whose day of month differs from the
                # visitor's last access.
                if p['time_decoded'].tm_mday != super_hit['last_access'].tm_mday:
                    continue

                uri = p['extract_request']['extract_uri']
                # One matching pattern is enough; any() stops at the
                # first match, so counters are adjusted at most once.
                if any(r.match(uri) for r in self.regexps):
                    p['is_page'] = False
                    super_hit['viewed_pages'] -= 1
                    super_hit['viewed_hits'] += 1
|
|
@ -1,35 +0,0 @@
|
||||||
import re
|
|
||||||
|
|
||||||
from iwla import IWLA
|
|
||||||
from iplugin import IPlugin
|
|
||||||
|
|
||||||
# Count requests for indefero logos as hits instead of pages
|
|
||||||
|
|
||||||
class IWLAPreAnalysisSoutade(IPlugin):
    """Pre-analysis plugin that counts logo requests as hits, not pages.

    A request is affected when it is flagged as a page, has status 200,
    falls on the same day of month as the visitor's last access, and its
    URI matches ``^.+/logo/$``.
    """

    def __init__(self, iwla):
        super(IWLAPreAnalysisSoutade, self).__init__(iwla)
        self.API_VERSION = 1

    def load(self):
        """Compile the logo pattern; always returns True."""
        # Remove logo from indefero
        self.logo_re = re.compile(r'^.+/logo/$')
        return True

    def hook(self, iwla):
        """Re-flag matching page requests of non-robot visitors as hits."""
        visits = iwla.getCurrentVisists()

        for visitor in visits.values():
            if super_hit_is_robot(visitor):
                continue

            last_day = None
            for request in visitor['requests']:
                # Conditions are checked in the original order and
                # short-circuit the same way.
                if not request['is_page']:
                    continue
                if int(request['status']) != 200:
                    continue
                last_day = visitor['last_access'].tm_mday
                if request['time_decoded'].tm_mday != last_day:
                    continue

                uri = request['extract_request']['extract_uri']
                if not self.logo_re.match(uri):
                    continue

                request['is_page'] = False
                visitor['viewed_pages'] -= 1
                visitor['viewed_hits'] += 1


def super_hit_is_robot(visitor):
    """Return the visitor record's ``robot`` flag."""
    return visitor['robot']
|
|
Loading…
Reference in New Issue
Block a user