iwla/iwla.py

664 lines
22 KiB
Python
Executable File

#!/usr/bin/env python
import os
import shutil
import sys
import re
import time
import pickle
import gzip
import importlib
import argparse
from calendar import monthrange
from datetime import date
import default_conf as conf
import conf as _
conf.__dict__.update(_.__dict__)
del _
from iplugin import *
from display import *
#
# Main class IWLA
# Parse Log, compute them, call plugins and produce output
# For now, only HTTP log are valid
#
# Plugin requirements :
# None
#
# Conf values needed :
# analyzed_filename
# domain_name
#
# Output files :
# DB_ROOT/meta.db
# DB_ROOT/year/month/iwla.db
# OUTPUT_ROOT/index.html
# OUTPUT_ROOT/year/month/index.html
#
# Statistics creation :
#
# meta =>
# last_time
# start_analysis_time
# stats =>
# year =>
# month =>
# viewed_bandwidth
# not_viewed_bandwidth
# viewed_pages
# viewed_hits
# nb_visitors
#
# month_stats :
# viewed_bandwidth
# not_viewed_bandwidth
# viewed_pages
# viewed_hits
# nb_visitors
#
# days_stats :
# day =>
# viewed_bandwidth
# not_viewed_bandwidth
# viewed_pages
# viewed_hits
# nb_visitors
#
# visits :
# remote_addr =>
# remote_addr
# remote_ip
# viewed_pages
# viewed_hits
# not_viewed_pages
# not_viewed_hits
# bandwidth
# last_access
# requests =>
# [fields_from_format_log]
# extract_request =>
# extract_uri
# extract_parameters*
# extract_referer* =>
# extract_uri
# extract_parameters*
# robot
# hit_only
# is_page
#
# valid_visitors:
# month_stats without robot and hit only visitors (if not conf.count_hit_only_visitors)
#
# Statistics update :
# None
#
# Statistics deletion :
# None
#
class IWLA(object):
ANALYSIS_CLASS = 'HTTP'
API_VERSION = 1
IWLA_VERSION = '0.1'
def __init__(self):
print '==> Start'
self.meta_infos = {}
self.analyse_started = False
self.current_analysis = {}
self.cache_plugins = {}
self.display = DisplayHTMLBuild(self)
self.valid_visitors = None
self.log_format_extracted = re.sub(r'([^\$\w])', r'\\\g<1>', conf.log_format)
self.log_format_extracted = re.sub(r'\$(\w+)', '(?P<\g<1>>.+)', self.log_format_extracted)
self.http_request_extracted = re.compile(r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')
self.log_re = re.compile(self.log_format_extracted)
self.uri_re = re.compile(r'(?P<extract_uri>[^\?]+)(\?(?P<extract_parameters>.+))?')
self.plugins = [(conf.PRE_HOOK_DIRECTORY , conf.pre_analysis_hooks),
(conf.POST_HOOK_DIRECTORY , conf.post_analysis_hooks),
(conf.DISPLAY_HOOK_DIRECTORY , conf.display_hooks)]
def getVersion(self):
return IWLA.IWLA_VERSION
def getConfValue(self, key, default=None):
if not key in dir(conf):
return default
else:
return conf.__dict__[key]
def _clearVisits(self):
self.current_analysis = {
'days_stats' : {},
'month_stats' : {},
'visits' : {}
}
self.valid_visitors = None
return self.current_analysis
def getDaysStats(self):
return self.current_analysis['days_stats']
def getMonthStats(self):
return self.current_analysis['month_stats']
def getCurrentVisists(self):
return self.current_analysis['visits']
def getValidVisitors(self):
return self.valid_visitors
def getDisplay(self):
return self.display
def getCurTime(self):
return self.meta_infos['last_time']
def getStartAnalysisTime(self):
return self.meta_infos['start_analysis_time']
def isValidForCurrentAnalysis(self, request):
cur_time = self.meta_infos['start_analysis_time']
# Analyse not started
if not cur_time: return False
return (time.mktime(cur_time) < time.mktime(request['time_decoded']))
def hasBeenViewed(self, request):
return int(request['status']) in conf.viewed_http_codes
def getCurDisplayPath(self, filename):
cur_time = self.meta_infos['last_time']
return os.path.join(str(cur_time.tm_year), str(cur_time.tm_mon), filename)
def getResourcesPath(self):
return conf.resources_path
def getCSSPath(self):
return conf.css_path
def _clearMeta(self):
self.meta_infos = {
'last_time' : None,
'start_analysis_time' : None
}
return self.meta_infos
def _clearDisplay(self):
self.display = DisplayHTMLBuild(self)
return self.display
def getDBFilename(self, time):
return os.path.join(conf.DB_ROOT, str(time.tm_year), str(time.tm_mon), conf.DB_FILENAME)
def _serialize(self, obj, filename):
base = os.path.dirname(filename)
if not os.path.exists(base):
os.makedirs(base)
# TODO : remove return
#return
with open(filename + '.tmp', 'wb+') as f:
pickle.dump(obj, f)
f.seek(0)
with gzip.open(filename, 'w') as fzip:
fzip.write(f.read())
os.remove(filename + '.tmp')
def _deserialize(self, filename):
if not os.path.exists(filename):
return None
with gzip.open(filename, 'r') as f:
return pickle.load(f)
return None
def _callPlugins(self, target_root, *args):
print '==> Call plugins (%s)' % target_root
for (root, plugins) in self.plugins:
if root != target_root: continue
for p in plugins:
mod = self.cache_plugins.get(root + '.' + p, None)
if mod:
print '\t%s' % (p)
mod.hook(*args)
def isPage(self, request):
for e in conf.pages_extensions:
if request.endswith(e):
return True
return False
def _appendHit(self, hit):
remote_addr = hit['remote_addr']
if not remote_addr: return
if not remote_addr in self.current_analysis['visits'].keys():
self._createVisitor(hit)
super_hit = self.current_analysis['visits'][remote_addr]
super_hit['requests'].append(hit)
super_hit['bandwidth'] += int(hit['body_bytes_sent'])
super_hit['last_access'] = self.meta_infos['last_time']
request = hit['extract_request']
uri = request.get('extract_uri', request['http_uri'])
hit['is_page'] = self.isPage(uri)
status = int(hit['status'])
if status not in conf.viewed_http_codes:
return
if super_hit['robot'] or\
not status in conf.viewed_http_codes:
page_key = 'not_viewed_pages'
hit_key = 'not_viewed_hits'
else:
page_key = 'viewed_pages'
hit_key = 'viewed_hits'
if hit['is_page']:
super_hit[page_key] += 1
else:
super_hit[hit_key] += 1
def _createVisitor(self, hit):
super_hit = self.current_analysis['visits'][hit['remote_addr']] = {}
super_hit['remote_addr'] = hit['remote_addr']
super_hit['remote_ip'] = hit['remote_addr']
super_hit['viewed_pages'] = 0
super_hit['viewed_hits'] = 0
super_hit['not_viewed_pages'] = 0
super_hit['not_viewed_hits'] = 0
super_hit['bandwidth'] = 0
super_hit['last_access'] = self.meta_infos['last_time']
super_hit['requests'] = []
super_hit['robot'] = False
super_hit['hit_only'] = 0
def _decodeHTTPRequest(self, hit):
if not 'request' in hit.keys(): return False
groups = self.http_request_extracted.match(hit['request'])
if groups:
hit['extract_request'] = groups.groupdict()
uri_groups = self.uri_re.match(hit['extract_request']['http_uri'])
if uri_groups:
d = uri_groups.groupdict()
hit['extract_request']['extract_uri'] = d['extract_uri']
if 'extract_parameters' in d.keys():
hit['extract_request']['extract_parameters'] = d['extract_parameters']
else:
print "Bad request extraction " + hit['request']
return False
if hit['http_referer']:
referer_groups = self.uri_re.match(hit['http_referer'])
if referer_groups:
hit['extract_referer'] = referer_groups.groupdict()
return True
def _decodeTime(self, hit):
try:
hit['time_decoded'] = time.strptime(hit['time_local'], conf.time_format)
except ValueError, e:
if sys.version_info < (3, 2):
# Try without UTC value at the end (%z not recognized)
gmt_offset_str = hit['time_local'][-5:]
gmt_offset_hours = int(gmt_offset_str[1:3])*60*60
gmt_offset_minutes = int(gmt_offset_str[3:5])*60
gmt_offset = gmt_offset_hours + gmt_offset_minutes
hit['time_decoded'] = time.strptime(hit['time_local'][:-6], conf.time_format[:-3])
if gmt_offset_str[0] == '+':
hit['time_decoded'] = time.localtime(time.mktime(hit['time_decoded'])+gmt_offset)
else:
hit['time_decoded'] = time.localtime(time.mktime(hit['time_decoded'])-gmt_offset)
else:
raise e
return hit['time_decoded']
def getDisplayIndex(self):
cur_time = self.meta_infos['last_time']
filename = self.getCurDisplayPath('index.html')
return self.display.getPage(filename)
def _generateDisplayDaysStats(self):
cur_time = self.meta_infos['last_time']
title = 'Stats %d/%d' % (cur_time.tm_mon, cur_time.tm_year)
filename = self.getCurDisplayPath('index.html')
print '==> Generate display (%s)' % (filename)
page = self.display.createPage(title, filename, conf.css_path)
_, nb_month_days = monthrange(cur_time.tm_year, cur_time.tm_mon)
days = self.display.createBlock(DisplayHTMLBlockTableWithGraph, 'By day', ['Day', 'Visitors', 'Pages', 'Hits', 'Bandwidth', 'Not viewed Bandwidth'], None, nb_month_days, range(1,6))
days.setColsCSSClass(['', 'iwla_visitor', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth'])
nb_visits = 0
nb_days = 0
for i in range(1, nb_month_days+1):
day = '%d<br/>%s' % (i, time.strftime('%b', cur_time))
full_day = '%d %s %d' % (i, time.strftime('%b', cur_time), cur_time.tm_year)
if i in self.current_analysis['days_stats'].keys():
stats = self.current_analysis['days_stats'][i]
row = [full_day, stats['nb_visitors'], stats['viewed_pages'], stats['viewed_hits'],
stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
nb_visits += stats['nb_visitors']
nb_days += 1
else:
row = [full_day, 0, 0, 0, 0, 0]
days.appendRow(row)
days.setCellValue(i-1, 4, bytesToStr(row[4]))
days.setCellValue(i-1, 5, bytesToStr(row[5]))
days.appendShortTitle(day)
adate = date(cur_time.tm_year, cur_time.tm_mon, i)
week_day = adate.weekday()
if week_day == 5 or week_day == 6:
days.setRowCSSClass(i-1, 'iwla_weekend')
if adate == date.today():
css = days.getCellCSSClass(i-1, 0)
if css: css = '%s %s' % (css, 'iwla_curday')
else: css = 'iwla_curday'
days.setCellCSSClass(i-1, 0, css)
stats = self.current_analysis['month_stats']
row = [0, nb_visits, stats['viewed_pages'], stats['viewed_hits'], stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
if nb_days:
average_row = map(lambda(v): int(v/nb_days), row)
else:
average_row = map(lambda(v): 0, row)
average_row[0] = 'Average'
average_row[4] = bytesToStr(average_row[4])
average_row[5] = bytesToStr(average_row[5])
days.appendRow(average_row)
row[0] = 'Total'
row[4] = bytesToStr(row[4])
row[5] = bytesToStr(row[5])
days.appendRow(row)
page.appendBlock(days)
self.display.addPage(page)
def _generateDisplayMonthStats(self, page, year, month_stats):
cur_time = time.localtime()
months_name = ['', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'June', 'July', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
title = 'Summary %d' % (year)
cols = ['Month', 'Visitors', 'Pages', 'Hits', 'Bandwidth', 'Not viewed Bandwidth', 'Details']
graph_cols=range(1,6)
months = self.display.createBlock(DisplayHTMLBlockTableWithGraph, title, cols, None, 12, graph_cols)
months.setColsCSSClass(['', 'iwla_visitor', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth', ''])
total = [0] * len(cols)
for i in range(1, 13):
month = '%s<br/>%d' % (months_name[i], year)
full_month = '%s %d' % (months_name[i], year)
if i in month_stats.keys():
stats = month_stats[i]
link = '<a href="%d/%d/index.html">Details</a>' % (year, i)
row = [full_month, stats['nb_visitors'], stats['viewed_pages'], stats['viewed_hits'],
stats['viewed_bandwidth'], stats['not_viewed_bandwidth'], link]
for j in graph_cols:
total[j] += row[j]
else:
row = [full_month, 0, 0, 0, 0, 0, '']
months.appendRow(row)
months.setCellValue(i-1, 4, bytesToStr(row[4]))
months.setCellValue(i-1, 5, bytesToStr(row[5]))
months.appendShortTitle(month)
if year == cur_time.tm_year and i == cur_time.tm_mon:
css = months.getCellCSSClass(i-1, 0)
if css: css = '%s %s' % (css, 'iwla_curday')
else: css = 'iwla_curday'
months.setCellCSSClass(i-1, 0, css)
total[0] = 'Total'
total[4] = bytesToStr(total[4])
total[5] = bytesToStr(total[5])
months.appendRow(total)
page.appendBlock(months)
def _generateDisplayWholeMonthStats(self):
title = 'Stats for %s' % (conf.domain_name)
filename = 'index.html'
print '==> Generate main page (%s)' % (filename)
page = self.display.createPage(title, filename, conf.css_path)
last_update = '<b>Last update</b> %s<br />' % (time.strftime('%d %b %Y %H:%M', time.localtime()))
page.appendBlock(self.display.createBlock(DisplayHTMLRaw, last_update))
for year in self.meta_infos['stats'].keys():
self._generateDisplayMonthStats(page, year, self.meta_infos['stats'][year])
self.display.addPage(page)
def _generateDisplay(self):
self._generateDisplayDaysStats()
self._callPlugins(conf.DISPLAY_HOOK_DIRECTORY)
self._generateDisplayWholeMonthStats()
self.display.build(conf.DISPLAY_ROOT)
def _generateStats(self, visits):
stats = {}
stats['viewed_bandwidth'] = 0
stats['not_viewed_bandwidth'] = 0
stats['viewed_pages'] = 0
stats['viewed_hits'] = 0
#stats['requests'] = set()
stats['nb_visitors'] = 0
for (k, super_hit) in visits.items():
if super_hit['robot']:
stats['not_viewed_bandwidth'] += super_hit['bandwidth']
continue
#print "[%s] =>\t%d/%d" % (k, super_hit['viewed_pages'], super_hit['viewed_hits'])
if conf.count_hit_only_visitors or\
super_hit['viewed_pages']:
stats['nb_visitors'] += 1
stats['viewed_bandwidth'] += super_hit['bandwidth']
stats['viewed_pages'] += super_hit['viewed_pages']
stats['viewed_hits'] += super_hit['viewed_hits']
# for p in super_hit['requests']:
# if not p['is_page']: continue
# req = p['extract_request']
# stats['requests'].add(req['extract_uri'])
return stats
def _generateMonthStats(self):
self._clearDisplay()
visits = self.current_analysis['visits']
stats = self._generateStats(visits)
duplicated_stats = {k:v for (k,v) in stats.items()}
cur_time = self.meta_infos['last_time']
print "== Stats for %d/%d ==" % (cur_time.tm_year, cur_time.tm_mon)
print stats
if not 'month_stats' in self.current_analysis.keys():
self.current_analysis['month_stats'] = stats
else:
for (k,v) in stats.items():
self.current_analysis['month_stats'][k] = v
self.valid_visitors = {}
for (k,v) in visits.items():
if v['robot']: continue
if conf.count_hit_only_visitors and\
(not v['viewed_pages']):
continue
self.valid_visitors[k] = v
duplicated_stats['visitors'] = stats['visitors'] = len(self.valid_visitors.keys())
self._callPlugins(conf.POST_HOOK_DIRECTORY)
path = self.getDBFilename(cur_time)
if os.path.exists(path):
os.remove(path)
print "==> Serialize to %s" % path
self._serialize(self.current_analysis, path)
# Save month stats
year = cur_time.tm_year
month = cur_time.tm_mon
if not 'stats' in self.meta_infos.keys():
self.meta_infos['stats'] = {}
if not year in self.meta_infos['stats'].keys():
self.meta_infos['stats'][year] = {}
self.meta_infos['stats'][year][month] = duplicated_stats
self._generateDisplay()
def _generateDayStats(self):
visits = self.current_analysis['visits']
self._callPlugins(conf.PRE_HOOK_DIRECTORY)
stats = self._generateStats(visits)
cur_time = self.meta_infos['last_time']
print "== Stats for %d/%d/%d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday)
if cur_time.tm_mday > 1:
last_day = cur_time.tm_mday - 1
while last_day:
if last_day in self.current_analysis['days_stats'].keys():
break
last_day -= 1
if last_day:
for k in stats.keys():
stats[k] -= self.current_analysis['days_stats'][last_day][k]
stats['nb_visitors'] = 0
for (k,v) in visits.items():
if v['robot']: continue
if conf.count_hit_only_visitors and\
(not v['viewed_pages']):
continue
if v['last_access'].tm_mday == cur_time.tm_mday:
stats['nb_visitors'] += 1
print stats
self.current_analysis['days_stats'][cur_time.tm_mday] = stats
def _newHit(self, hit):
t = self._decodeTime(hit)
cur_time = self.meta_infos['last_time']
if cur_time == None:
self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
self.analyse_started = True
else:
if not self.analyse_started:
if time.mktime(t) < time.mktime(cur_time):
return False
else:
self.analyse_started = True
if cur_time.tm_mon != t.tm_mon:
self._generateMonthStats()
self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
elif cur_time.tm_mday != t.tm_mday:
self._generateDayStats()
self.meta_infos['last_time'] = t
if not self.meta_infos['start_analysis_time']:
self.meta_infos['start_analysis_time'] = t
if not self._decodeHTTPRequest(hit): return False
for k in hit.keys():
if hit[k] == '-' or hit[k] == '*':
hit[k] = ''
self._appendHit(hit)
return True
def start(self, _file):
print '==> Load previous database'
self.meta_infos = self._deserialize(conf.META_PATH) or self._clearMeta()
if self.meta_infos['last_time']:
self.current_analysis = self._deserialize(self.getDBFilename(self.meta_infos['last_time'])) or self._clearVisits()
else:
self._clearVisits()
self.meta_infos['start_analysis_time'] = None
self.cache_plugins = preloadPlugins(self.plugins, self)
print '==> Analysing log'
for l in _file:
# print "line " + l
groups = self.log_re.match(l)
if groups:
if not self._newHit(groups.groupdict()):
continue
else:
print "No match for " + l
#break
if self.analyse_started:
self._generateDayStats()
self._generateMonthStats()
del self.meta_infos['start_analysis_time']
self._serialize(self.meta_infos, conf.META_PATH)
else:
print '==> Analyse not started : nothing to do'
self._generateMonthStats()
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Intelligent Web Log Analyzer')
parser.add_argument('-c', '--clean-output', dest='clean_output', action='store_true',
default=False,
help='Clean output before starting')
parser.add_argument('-i', '--stdin', dest='stdin', action='store_true',
default=False,
help='Read data from stdin instead of conf.analyzed_filename')
args = parser.parse_args()
if args.clean_output:
if os.path.exists(conf.DB_ROOT): shutil.rmtree(conf.DB_ROOT)
if os.path.exists(conf.DISPLAY_ROOT): shutil.rmtree(conf.DISPLAY_ROOT)
iwla = IWLA()
required_conf = ['analyzed_filename', 'domain_name']
if not validConfRequirements(required_conf, iwla, 'Main Conf'):
sys.exit(0)
if args.stdin:
iwla.start(sys.stdin)
else:
if not os.path.exists(conf.analyzed_filename):
print 'No such file \'%s\'' % (conf.analyzed_filename)
sys.exit(-1)
with open(conf.analyzed_filename) as f:
iwla.start(f)