#!/usr/bin/env python
# iwla/iwla.py -- IWLA: intelligent web (HTTP) log analyzer, main module.

import os
import re
import time
import pickle
import gzip
import importlib

# Start from the default configuration and overlay the user's conf.py on
# top of it, so any value missing from conf.py keeps its default.
import default_conf as conf
import conf as _
conf.__dict__.update(_.__dict__)
del _

from iplugin import *
from display import *
2014-11-21 14:46:12 +01:00
class IWLA(object):
ANALYSIS_CLASS = 'HTTP'
API_VERSION = 1
def __init__(self):
print '==> Start'
self.meta_infos = {}
self.analyse_started = False
self.current_analysis = {}
self.cache_plugins = {}
2014-11-21 16:56:58 +01:00
self.display = DisplayHTMLBuild()
2014-11-21 14:46:12 +01:00
self.valid_visitors = None
2014-11-24 21:37:37 +01:00
self.log_format_extracted = re.sub(r'([^\$\w])', r'\\\g<1>', conf.log_format)
2014-11-21 14:46:12 +01:00
self.log_format_extracted = re.sub(r'\$(\w+)', '(?P<\g<1>>.+)', self.log_format_extracted)
self.http_request_extracted = re.compile(r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')
self.log_re = re.compile(self.log_format_extracted)
2014-11-26 16:17:16 +01:00
self.uri_re = re.compile(r'(?P<extract_uri>[^\?]+)(\?(?P<extract_parameters>.+))?')
2014-11-24 21:37:37 +01:00
self.plugins = {conf.PRE_HOOK_DIRECTORY : conf.pre_analysis_hooks,
conf.POST_HOOK_DIRECTORY : conf.post_analysis_hooks,
conf.DISPLAY_HOOK_DIRECTORY : conf.display_hooks}
2014-11-21 14:46:12 +01:00
2014-11-24 21:42:57 +01:00
def getConfValue(self, key, default):
if not key in dir(conf):
return default
else:
return conf.__dict__[key]
2014-11-21 14:46:12 +01:00
def _clearVisits(self):
self.current_analysis = {
'days_stats' : {},
'month_stats' : {},
'visits' : {}
}
self.valid_visitors = None
return self.current_analysis
def getDaysStats(self):
return self.current_analysis['days_stats']
2014-11-21 16:56:58 +01:00
def getMonthStats(self):
2014-11-21 14:46:12 +01:00
return self.current_analysis['month_stats']
def getCurrentVisists(self):
return self.current_analysis['visits']
def getValidVisitors(self):
2014-11-21 16:56:58 +01:00
return self.valid_visitors
def getDisplay(self):
return self.display
2014-11-21 14:46:12 +01:00
2014-11-25 16:59:29 +01:00
def getCurTime(self):
return self.meta_infos['last_time']
2014-11-21 14:46:12 +01:00
def _clearMeta(self):
self.meta_infos = {
'last_time' : None
}
return self.meta_infos
def _clearDisplay(self):
2014-11-21 16:56:58 +01:00
self.display = DisplayHTMLBuild()
2014-11-21 14:46:12 +01:00
return self.display
def getDBFilename(self, time):
2014-11-24 21:37:37 +01:00
return (conf.DB_ROOT + '%d/%d_%s') % (time.tm_year, time.tm_mon, conf.DB_FILENAME)
2014-11-21 14:46:12 +01:00
def _serialize(self, obj, filename):
base = os.path.dirname(filename)
if not os.path.exists(base):
os.makedirs(base)
# TODO : remove return
2014-11-24 21:37:37 +01:00
#return
2014-11-20 08:18:31 +01:00
2014-11-21 16:56:58 +01:00
with open(filename + '.tmp', 'wb+') as f:
pickle.dump(obj, f)
f.seek(0)
with gzip.open(filename, 'w') as fzip:
fzip.write(f.read())
2014-11-19 19:34:16 +01:00
os.remove(filename + '.tmp')
2014-11-19 08:01:12 +01:00
2014-11-21 14:46:12 +01:00
def _deserialize(self, filename):
if not os.path.exists(filename):
return None
with gzip.open(filename, 'r') as f:
return pickle.load(f)
2014-11-19 19:34:16 +01:00
return None
2014-11-19 08:01:12 +01:00
2014-11-21 14:46:12 +01:00
def _callPlugins(self, root, *args):
print '==> Call plugins (%s)' % root
for p in self.plugins[root]:
print '\t%s' % (p)
2014-11-22 19:23:56 +01:00
mod = self.cache_plugins[root + '.' + p]
2014-11-21 14:46:12 +01:00
mod.hook(*args)
2014-11-19 08:01:12 +01:00
2014-11-21 14:46:12 +01:00
def isPage(self, request):
2014-11-24 21:37:37 +01:00
for e in conf.pages_extensions:
2014-11-21 14:46:12 +01:00
if request.endswith(e):
return True
2014-11-19 08:01:12 +01:00
2014-11-21 14:46:12 +01:00
return False
2014-11-18 20:18:53 +01:00
2014-11-21 14:46:12 +01:00
def _appendHit(self, hit):
remote_addr = hit['remote_addr']
if not remote_addr in self.current_analysis['visits'].keys():
2014-11-25 16:22:07 +01:00
self._createVisitor(hit)
2014-11-21 14:46:12 +01:00
return
super_hit = self.current_analysis['visits'][remote_addr]
super_hit['requests'].append(hit)
super_hit['bandwidth'] += int(hit['body_bytes_sent'])
super_hit['last_access'] = self.meta_infos['last_time']
request = hit['extract_request']
if 'extract_uri' in request.keys():
uri = request['extract_uri']
else:
uri = request['http_uri']
hit['is_page'] = self.isPage(uri)
status = int(hit['status'])
2014-11-26 16:17:16 +01:00
if status not in conf.viewed_http_codes:
return
2014-11-21 14:46:12 +01:00
if super_hit['robot'] or\
2014-11-24 21:37:37 +01:00
not status in conf.viewed_http_codes:
2014-11-21 14:46:12 +01:00
page_key = 'not_viewed_pages'
hit_key = 'not_viewed_hits'
else:
page_key = 'viewed_pages'
hit_key = 'viewed_hits'
if hit['is_page']:
super_hit[page_key] += 1
else:
super_hit[hit_key] += 1
2014-11-25 16:22:07 +01:00
def _createVisitor(self, hit):
2014-11-21 14:46:12 +01:00
super_hit = self.current_analysis['visits'][hit['remote_addr']] = {}
super_hit['remote_addr'] = hit['remote_addr']
2014-11-26 16:17:16 +01:00
super_hit['remote_ip'] = hit['remote_addr']
2014-11-21 14:46:12 +01:00
super_hit['viewed_pages'] = 0
super_hit['viewed_hits'] = 0
super_hit['not_viewed_pages'] = 0
super_hit['not_viewed_hits'] = 0
super_hit['bandwidth'] = 0
super_hit['last_access'] = self.meta_infos['last_time']
super_hit['requests'] = []
super_hit['robot'] = False
super_hit['hit_only'] = 0
self._appendHit(hit)
def _decodeHTTPRequest(self, hit):
if not 'request' in hit.keys(): return False
groups = self.http_request_extracted.match(hit['request'])
if groups:
hit['extract_request'] = groups.groupdict()
uri_groups = self.uri_re.match(hit['extract_request']['http_uri'])
if uri_groups:
d = uri_groups.groupdict()
hit['extract_request']['extract_uri'] = d['extract_uri']
if 'extract_parameters' in d.keys():
hit['extract_request']['extract_parameters'] = d['extract_parameters']
else:
print "Bad request extraction " + hit['request']
return False
2014-11-26 16:17:16 +01:00
if hit['http_referer']:
referer_groups = self.uri_re.match(hit['http_referer'])
if referer_groups:
hit['extract_referer'] = referer_groups.groupdict()
2014-11-21 14:46:12 +01:00
return True
def _decodeTime(self, hit):
2014-11-24 21:37:37 +01:00
hit['time_decoded'] = time.strptime(hit['time_local'], conf.time_format)
2014-11-21 14:46:12 +01:00
def getDisplayIndex(self):
cur_time = self.meta_infos['last_time']
filename = '%d/index_%d.html' % (cur_time.tm_year, cur_time.tm_mon)
2014-11-21 16:56:58 +01:00
return self.display.getPage(filename)
2014-11-21 14:46:12 +01:00
def _generateDisplayDaysStat(self):
cur_time = self.meta_infos['last_time']
title = 'Stats %d/%d' % (cur_time.tm_mon, cur_time.tm_year)
filename = '%d/index_%d.html' % (cur_time.tm_year, cur_time.tm_mon)
2014-11-21 16:56:58 +01:00
print '==> Generate display (%s)' % (filename)
page = DisplayHTMLPage(title, filename)
2014-11-21 14:46:12 +01:00
2014-11-21 16:56:58 +01:00
days = DisplayHTMLBlockTable('By day', ['Day', 'Visits', 'Pages', 'Hits', 'Bandwidth', 'Not viewed Bandwidth'])
2014-11-21 14:46:12 +01:00
keys = self.current_analysis['days_stats'].keys()
keys.sort()
nb_visits = 0
for k in keys:
stats = self.current_analysis['days_stats'][k]
row = [k, stats['nb_visitors'], stats['viewed_pages'], stats['viewed_hits'],
bytesToStr(stats['viewed_bandwidth']), bytesToStr(stats['not_viewed_bandwidth'])]
2014-11-21 16:56:58 +01:00
days.appendRow(row)
2014-11-21 14:46:12 +01:00
nb_visits += stats['nb_visitors']
stats = self.current_analysis['month_stats']
nb_days = len(keys)
row = [0, nb_visits, stats['viewed_pages'], stats['viewed_hits'], stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
if nb_days:
2014-11-26 16:17:16 +01:00
average_row = map(lambda(v): int(v/nb_days), row)
2014-11-21 14:46:12 +01:00
else:
2014-11-26 16:17:16 +01:00
average_row = map(lambda(v): 0, row)
2014-11-21 14:46:12 +01:00
average_row[0] = 'Average'
2014-11-26 16:17:16 +01:00
average_row[4] = bytesToStr(average_row[4])
average_row[5] = bytesToStr(average_row[5])
days.appendRow(average_row)
2014-11-18 20:18:53 +01:00
row[0] = 'Total'
row[4] = bytesToStr(row[4])
row[5] = bytesToStr(row[5])
2014-11-21 16:56:58 +01:00
days.appendRow(row)
page.appendBlock(days)
self.display.addPage(page)
2014-11-20 08:18:31 +01:00
2014-11-21 14:46:12 +01:00
def _generateDisplay(self):
self._generateDisplayDaysStat()
2014-11-24 21:37:37 +01:00
self._callPlugins(conf.DISPLAY_HOOK_DIRECTORY, self)
self.display.build(conf.DISPLAY_ROOT)
2014-11-20 08:18:31 +01:00
2014-11-21 14:46:12 +01:00
def _generateStats(self, visits):
stats = {}
stats['viewed_bandwidth'] = 0
stats['not_viewed_bandwidth'] = 0
stats['viewed_pages'] = 0
stats['viewed_hits'] = 0
#stats['requests'] = set()
stats['nb_visitors'] = 0
2014-11-18 20:18:53 +01:00
2014-11-21 14:46:12 +01:00
for k in visits.keys():
super_hit = visits[k]
if super_hit['robot']:
stats['not_viewed_bandwidth'] += super_hit['bandwidth']
continue
2014-11-18 20:18:53 +01:00
2014-11-21 14:46:12 +01:00
#print "[%s] =>\t%d/%d" % (k, super_hit['viewed_pages'], super_hit['viewed_hits'])
2014-11-18 20:18:53 +01:00
2014-11-21 14:46:12 +01:00
if not super_hit['hit_only']:
stats['nb_visitors'] += 1
stats['viewed_bandwidth'] += super_hit['bandwidth']
stats['viewed_pages'] += super_hit['viewed_pages']
stats['viewed_hits'] += super_hit['viewed_hits']
2014-11-18 20:18:53 +01:00
2014-11-21 14:46:12 +01:00
# for p in super_hit['requests']:
# if not p['is_page']: continue
# req = p['extract_request']
# stats['requests'].add(req['extract_uri'])
2014-11-21 10:41:29 +01:00
2014-11-21 14:46:12 +01:00
return stats
2014-11-21 10:41:29 +01:00
2014-11-21 14:46:12 +01:00
def _generateMonthStats(self):
self._clearDisplay()
2014-11-21 14:46:12 +01:00
visits = self.current_analysis['visits']
stats = self._generateStats(visits)
cur_time = self.meta_infos['last_time']
print "== Stats for %d/%d ==" % (cur_time.tm_year, cur_time.tm_mon)
print stats
self.current_analysis['month_stats'] = stats
2014-11-21 16:56:58 +01:00
self.valid_visitors = {k: v for (k,v) in visits.items() if not visits[k]['robot']}
2014-11-24 21:37:37 +01:00
self._callPlugins(conf.POST_HOOK_DIRECTORY, self)
2014-11-21 16:56:58 +01:00
2014-11-21 14:46:12 +01:00
path = self.getDBFilename(cur_time)
if os.path.exists(path):
os.remove(path)
print "==> Serialize to %s" % path
self._serialize(self.current_analysis, path)
self._generateDisplay()
def _generateDayStats(self):
visits = self.current_analysis['visits']
2014-11-24 21:37:37 +01:00
self._callPlugins(conf.PRE_HOOK_DIRECTORY, self)
2014-11-21 14:46:12 +01:00
stats = self._generateStats(visits)
cur_time = self.meta_infos['last_time']
print "== Stats for %d/%d/%d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday)
if cur_time.tm_mday > 1:
last_day = cur_time.tm_mday - 1
while last_day:
if last_day in self.current_analysis['days_stats'].keys():
break
last_day -= 1
if last_day:
for k in stats.keys():
stats[k] -= self.current_analysis['days_stats'][last_day][k]
stats['nb_visitors'] = 0
for k in visits.keys():
if visits[k]['robot']: continue
if visits[k]['last_access'].tm_mday == cur_time.tm_mday:
stats['nb_visitors'] += 1
print stats
self.current_analysis['days_stats'][cur_time.tm_mday] = stats
def _newHit(self, hit):
self._decodeTime(hit)
t = hit['time_decoded']
cur_time = self.meta_infos['last_time']
if cur_time == None:
self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
self.analyse_started = True
else:
if not self.analyse_started:
if time.mktime(cur_time) >= time.mktime(t):
2014-11-25 16:22:07 +01:00
return False
2014-11-21 14:46:12 +01:00
else:
self.analyse_started = True
if cur_time.tm_mon != t.tm_mon:
self._generateMonthStats()
self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
elif cur_time.tm_mday != t.tm_mday:
self._generateDayStats()
self.meta_infos['last_time'] = t
if not self._decodeHTTPRequest(hit): return False
for k in hit.keys():
if hit[k] == '-': hit[k] = ''
self._appendHit(hit)
return True
def start(self):
2014-11-24 21:37:37 +01:00
self.cache_plugins = preloadPlugins(self.plugins, self)
2014-11-21 14:46:12 +01:00
2014-11-25 16:22:07 +01:00
print '==> Analyse previous database'
2014-11-21 14:46:12 +01:00
2014-11-24 21:37:37 +01:00
self.meta_infos = self._deserialize(conf.META_PATH) or self._clearMeta()
2014-11-21 14:46:12 +01:00
if self.meta_infos['last_time']:
self.current_analysis = self._deserialize(self.getDBFilename(self.meta_infos['last_time'])) or self._clearVisits()
else:
self._clearVisits()
2014-11-25 16:22:07 +01:00
print '==> Analysing log'
2014-11-24 21:37:37 +01:00
with open(conf.analyzed_filename) as f:
2014-11-21 14:46:12 +01:00
for l in f:
# print "line " + l
groups = self.log_re.match(l)
if groups:
if not self._newHit(groups.groupdict()):
break
else:
print "No match for " + l
2014-11-21 16:56:58 +01:00
#break
2014-11-21 14:46:12 +01:00
if self.analyse_started:
self._generateDayStats()
self._generateMonthStats()
2014-11-24 21:37:37 +01:00
self._serialize(self.meta_infos, conf.META_PATH)
2014-11-21 14:46:12 +01:00
else:
print '==> Analyse not started : nothing to do'
self._generateMonthStats()
2014-11-21 16:56:58 +01:00
if __name__ == '__main__':
    # Run a full analysis when invoked as a script.
    IWLA().start()