diff --git a/conf.py b/conf.py
index 9a0f235..5a850e4 100644
--- a/conf.py
+++ b/conf.py
@@ -11,4 +11,10 @@ analyzed_filename = 'access.log'
DB_ROOT = './output/'
DISPLAY_ROOT = './output/'
-pre_analysis_hooks = ['H002_soutade.py', 'H001_robot.py']
+pre_analysis_hooks = ['H002_soutade', 'H001_robot']
+post_analysis_hooks = ['top_visitors']
+display_hooks = ['top_visitors']
+
+# pre_analysis_hooks = ['H002_soutade.py', 'H001_robot.py']
+# post_analysis_hooks = ['top_visitors.py']
+# display_hooks = ['top_visitors.py']
diff --git a/display.py b/display.py
index aa11976..4de1bd6 100644
--- a/display.py
+++ b/display.py
@@ -1,3 +1,4 @@
+
def createPage(display, filename, title):
page = {}
page['title'] = title;
@@ -14,27 +15,37 @@ def createTable(title, cols):
table['cols'] = cols
table['rows'] = []
+ return table
+
def appendRowToTable(table, row):
table['rows'].append(row)
-def buildPages(display):
+def buildTable(block, f):
+ print 'Write table %s' % block['title']
+    f.write('<table>\n')
+    f.write('<tr>')
+    for title in block['cols']:
+        f.write('<td>%s</td>' % (title))
+    f.write('</tr>\n')
+    for row in block['rows']:
+        f.write('<tr>')
+        for v in row:
+            f.write('<td>%s</td>' % (v))
+        f.write('</tr>\n')
+    f.write('</table>\n')
+
+def buildPages(display_root, display):
for filename in display.keys():
page = display[filename]
- with open(DISPLAY_ROOT + filename, 'w') as f:
+ print "OPEN %s" % (display_root + filename)
+ with open(display_root + filename, 'w') as f:
            f.write('<html><title>%s</title><body>' % (page['title']))
for block in page['blocks']:
+            print "Build block"
+ print block
+ print "End block"
if block['type'] == 'html':
f.write(block['value'])
elif block['type'] == 'table':
-                f.write('<table>')
-                f.write('<tr>')
-                for title in block['cols']:
-                    f.write('<td>%s</td>' % (title))
-                f.write('</tr>\n')
-                for row in block['rows']:
-                    f.write('<tr>')
-                    for v in row:
-                        f.write('<td>%s</td>' % (v))
-                    f.write('</tr>\n')
-                f.write('</table>\n')
+ buildTable(block, f)
        f.write('</body></html>')
diff --git a/iwla.py b/iwla.py
index f8441f2..d14695b 100755
--- a/iwla.py
+++ b/iwla.py
@@ -17,7 +17,7 @@ DISPLAY_ROOT = './output/'
log_format = '$server_name:$server_port $remote_addr - $remote_user [$time_local] ' +\
'"$request" $status $body_bytes_sent ' +\
- '"$http_referer" "$http_user_agent"';
+ '"$http_referer" "$http_user_agent"'
time_format = '%d/%b/%Y:%H:%M:%S +0100'
@@ -35,7 +35,7 @@ current_visits = {}
cache_plugins = {}
display = {}
-log_format_extracted = re.sub(r'([^\$\w])', r'\\\g<1>', log_format);
+log_format_extracted = re.sub(r'([^\$\w])', r'\\\g<1>', log_format)
log_format_extracted = re.sub(r'\$(\w+)', '(?P<\g<1>>.+)', log_format_extracted)
http_request_extracted = re.compile(r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')
@@ -57,11 +57,18 @@ ANALYSIS_CLASS = 'HTTP'
API_VERSION = 1
def preloadPlugins():
+ ret = True
for root in plugins.keys():
for plugin_name in plugins[root]:
p = root + '/' + plugin_name
try:
- mod = cache_plugins[p] = imp.load_source('hook', p)
+ fp, pathname, description = imp.find_module(plugin_name, [root])
+ cache_plugins[p] = imp.load_module(plugin_name, fp, pathname, description)
+ #cache_plugins[p] = imp.load_module(p,None,p,("py","r",imp.PKG_DIRECTORY))
+ #cache_plugins[p] = imp.load_source(p, p)
+ mod = cache_plugins[p]
+ #print dir(mod)
+ #print "Register %s -> %s" % (p, mod)
infos = mod.get_plugins_infos()
if infos['class'] != ANALYSIS_CLASS or \
API_VERSION < infos['min_version'] or\
@@ -71,8 +78,8 @@ def preloadPlugins():
del cache_plugins[p]
except Exception as e:
print 'Error loading \'%s\' => %s' % (p, e)
- return False
- return True
+ ret = False
+ return ret
def createEmptyVisits():
@@ -113,12 +120,12 @@ def deserialize(filename):
return pickle.load(f)
return None
-def callPlugins(root, *kwargs):
+def callPlugins(root, *args):
print '==> Call plugins (%s)' % root
for p in plugins[root]:
print '\t%s' % (p)
mod = cache_plugins[root + '/' + p]
- mod.hook(*kwargs)
+ mod.hook(*args)
def isPage(request):
for e in pages_extensions:
@@ -135,8 +142,8 @@ def appendHit(hit):
return
super_hit = current_visits['visits'][remote_addr]
- super_hit['pages'].append(hit)
- super_hit['bandwith'] += int(hit['body_bytes_sent'])
+ super_hit['requests'].append(hit)
+ super_hit['bandwidth'] += int(hit['body_bytes_sent'])
super_hit['last_access'] = meta_visit['last_time']
request = hit['extract_request']
@@ -167,15 +174,16 @@ def appendHit(hit):
def createUser(hit):
super_hit = current_visits['visits'][hit['remote_addr']] = {}
- super_hit['viewed_pages'] = 0;
- super_hit['viewed_hits'] = 0;
- super_hit['not_viewed_pages'] = 0;
- super_hit['not_viewed_hits'] = 0;
- super_hit['bandwith'] = 0;
+ super_hit['remote_addr'] = hit['remote_addr']
+ super_hit['viewed_pages'] = 0
+ super_hit['viewed_hits'] = 0
+ super_hit['not_viewed_pages'] = 0
+ super_hit['not_viewed_hits'] = 0
+ super_hit['bandwidth'] = 0
super_hit['last_access'] = meta_visit['last_time']
- super_hit['pages'] = [];
+ super_hit['requests'] = []
super_hit['robot'] = False
- super_hit['hit_only'] = 0;
+ super_hit['hit_only'] = 0
appendHit(hit)
def decodeHTTPRequest(hit):
@@ -185,7 +193,7 @@ def decodeHTTPRequest(hit):
if groups:
hit['extract_request'] = groups.groupdict()
- uri_groups = uri_re.match(hit['extract_request']['http_uri']);
+ uri_groups = uri_re.match(hit['extract_request']['http_uri'])
if uri_groups:
d = uri_groups.groupdict()
hit['extract_request']['extract_uri'] = d['extract_uri']
@@ -195,7 +203,7 @@ def decodeHTTPRequest(hit):
print "Bad request extraction " + hit['request']
return False
- referer_groups = uri_re.match(hit['http_referer']);
+ referer_groups = uri_re.match(hit['http_referer'])
if referer_groups:
referer = hit['extract_referer'] = referer_groups.groupdict()
return True
@@ -205,13 +213,19 @@ def decodeTime(hit):
hit['time_decoded'] = time.strptime(t, time_format)
+def getDisplayIndex():
+ cur_time = meta_visit['last_time']
+ filename = '%d/index_%d.html' % (cur_time.tm_year, cur_time.tm_mon)
+
+ return display.get(filename, None)
+
def generateDisplayDaysStat():
cur_time = meta_visit['last_time']
title = 'Stats %d/%d' % (cur_time.tm_mon, cur_time.tm_year)
filename = '%d/index_%d.html' % (cur_time.tm_year, cur_time.tm_mon)
page = createPage(display, filename, title)
- days = createTable('By day', ['Day', 'Visits', 'Pages', 'Hits', 'Bandwith', 'Robot Bandwith'])
+ days = createTable('By day', ['Day', 'Visits', 'Pages', 'Hits', 'Bandwidth', 'Robot Bandwidth'])
keys = current_visits['days_stats'].keys()
keys.sort()
@@ -243,7 +257,7 @@ def generateDisplayDaysStat():
def generateDisplay():
generateDisplayDaysStat()
callPlugins(DISPLAY_HOOK_DIRECTORY, current_visits, display)
- buildPages()
+ buildPages(DISPLAY_ROOT, display)
def generateStats(visits):
stats = {}
@@ -251,27 +265,27 @@ def generateStats(visits):
stats['not_viewed_bandwidth'] = 0
stats['viewed_pages'] = 0
stats['viewed_hits'] = 0
- #stats['pages'] = set()
+ #stats['requests'] = set()
stats['nb_visitors'] = 0
for k in visits.keys():
super_hit = visits[k]
if super_hit['robot']:
- stats['not_viewed_bandwidth'] += super_hit['bandwith']
+ stats['not_viewed_bandwidth'] += super_hit['bandwidth']
continue
- print "[%s] =>\t%d/%d" % (k, super_hit['viewed_pages'], super_hit['viewed_hits'])
+ #print "[%s] =>\t%d/%d" % (k, super_hit['viewed_pages'], super_hit['viewed_hits'])
if not super_hit['hit_only']:
stats['nb_visitors'] += 1
- stats['viewed_bandwidth'] += super_hit['bandwith']
+ stats['viewed_bandwidth'] += super_hit['bandwidth']
stats['viewed_pages'] += super_hit['viewed_pages']
stats['viewed_hits'] += super_hit['viewed_hits']
- # for p in super_hit['pages']:
+ # for p in super_hit['requests']:
# if not p['is_page']: continue
# req = p['extract_request']
- # stats['pages'].add(req['extract_uri'])
+ # stats['requests'].add(req['extract_uri'])
return stats
@@ -287,7 +301,7 @@ def generateMonthStats():
print stats
valid_visitors = {k: v for (k,v) in visits.items() if not visits[k]['robot']}
- callPlugins(POST_HOOK_DIRECTORY, valid_visitors)
+ callPlugins(POST_HOOK_DIRECTORY, valid_visitors, stats)
current_visits['month_stats'] = stats
@@ -348,7 +362,6 @@ def newHit(hit):
return
else:
analyse_started = True
- current_visits = deserialize(getDBFilename(t)) or createEmptyVisits()
if cur_time.tm_mon != t.tm_mon:
generateMonthStats()
current_visits = deserialize(getDBFilename(t)) or createEmptyVisits()
@@ -371,12 +384,14 @@ preloadPlugins()
print '==> Analysing log'
meta_visit = deserialize(META_PATH) or createEmptyMeta()
-
-current_visits = createEmptyVisits()
+if meta_visit['last_time']:
+ current_visits = deserialize(getDBFilename(meta_visit['last_time'])) or createEmptyVisits()
+else:
+ current_visits = createEmptyVisits()
f = open(analyzed_filename)
for l in f:
- # print "line " + l;
+ # print "line " + l
groups = log_re.match(l)
@@ -385,7 +400,7 @@ for l in f:
break
else:
print "No match " + l
-f.close();
+f.close()
if analyse_started:
generateDayStats()
@@ -393,3 +408,4 @@ if analyse_started:
serialize(meta_visit, META_PATH)
else:
print '==> Analyse not started : nothing to do'
+ generateMonthStats()
diff --git a/plugins/pre_analysis/H001_robot.py b/plugins/pre_analysis/H001_robot.py
index 91cd5fc..a096dc8 100644
--- a/plugins/pre_analysis/H001_robot.py
+++ b/plugins/pre_analysis/H001_robot.py
@@ -30,10 +30,12 @@ def hook(hits):
isRobot = False
referers = 0
- for r in awstats_robots:
- if r.match(super_hit['pages'][0]['http_user_agent']):
- super_hit['robot'] = 1
- continue
+ first_page = super_hit['requests'][0]
+ if first_page['time_decoded'].tm_mday == super_hit['last_access'].tm_mday:
+ for r in awstats_robots:
+ if r.match(first_page['http_user_agent']):
+ super_hit['robot'] = 1
+ continue
# 1) no pages view --> robot
if not super_hit['viewed_pages']:
@@ -45,7 +47,7 @@ def hook(hits):
super_hit['robot'] = 1
continue
- for hit in super_hit['pages']:
+ for hit in super_hit['requests']:
# 3) /robots.txt read
if hit['extract_request']['http_uri'] == '/robots.txt':
isRobot = True
diff --git a/plugins/pre_analysis/H002_soutade.py b/plugins/pre_analysis/H002_soutade.py
index f546d76..5b70f64 100644
--- a/plugins/pre_analysis/H002_soutade.py
+++ b/plugins/pre_analysis/H002_soutade.py
@@ -7,9 +7,11 @@ PLUGIN_CLASS = 'HTTP'
API_VERSION = 1
def get_plugins_infos():
- infos = {'class' : PLUGIN_CLASS,
- 'min_version' : API_VERSION,
- 'max_version' : -1}
+ infos = {
+ 'class' : PLUGIN_CLASS,
+ 'min_version' : API_VERSION,
+ 'max_version' : -1
+ }
return infos
def load():
@@ -23,9 +25,10 @@ def hook(hits):
if super_hit['robot']: continue
- for p in super_hit['pages']:
+ for p in super_hit['requests']:
if not p['is_page']: continue
if int(p['status']) != 200: continue
+ if p['time_decoded'].tm_mday != super_hit['last_access'].tm_mday: continue
if logo_re.match(p['extract_request']['extract_uri']):
p['is_page'] = False
super_hit['viewed_pages'] -= 1