iwla/iwla.py

719 lines
25 KiB
Python
Executable File

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright Grégory Soutadé 2015
# This file is part of iwla
# iwla is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# iwla is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with iwla. If not, see <http://www.gnu.org/licenses/>.
#
import os
import shutil
import sys
import re
import time
import pickle
import gzip
import importlib
import argparse
import logging
import gettext
from calendar import monthrange
from datetime import date
import default_conf as conf
import conf as _
conf.__dict__.update(_.__dict__)
del _
from iplugin import *
from display import *
#
# Main class IWLA
# Parse Log, compute them, call plugins and produce output
# For now, only HTTP log are valid
#
# Plugin requirements :
# None
#
# Conf values needed :
# analyzed_filename
# domain_name
# locales_path
# compress_output_files*
#
# Output files :
# DB_ROOT/meta.db
# DB_ROOT/year/month/iwla.db
# OUTPUT_ROOT/index.html
# OUTPUT_ROOT/year/month/index.html
#
# Statistics creation :
#
# meta :
# last_time
# start_analysis_time
# stats =>
# year =>
# month =>
# viewed_bandwidth
# not_viewed_bandwidth
# viewed_pages
# viewed_hits
# nb_visits
# nb_visitors
#
# month_stats :
# viewed_bandwidth
# not_viewed_bandwidth
# viewed_pages
# viewed_hits
# nb_visits
#
# days_stats :
# day =>
# viewed_bandwidth
# not_viewed_bandwidth
# viewed_pages
# viewed_hits
# nb_visits
# nb_visitors
#
# visits :
# remote_addr =>
# remote_addr
# remote_ip
# viewed_pages
# viewed_hits
# not_viewed_pages
# not_viewed_hits
# bandwidth
# last_access
# requests =>
# [fields_from_format_log]
# extract_request =>
# extract_uri
# extract_parameters*
# extract_referer* =>
# extract_uri
# extract_parameters*
# robot
# hit_only
# is_page
#
# valid_visitors:
# month_stats without robot and hit only visitors (if not conf.count_hit_only_visitors)
#
# Statistics update :
# None
#
# Statistics deletion :
# None
#
class IWLA(object):
ANALYSIS_CLASS = 'HTTP'
API_VERSION = 1
IWLA_VERSION = '0.1'
def __init__(self, logLevel):
self.meta_infos = {}
self.analyse_started = False
self.current_analysis = {}
self.cache_plugins = {}
self.display = DisplayHTMLBuild(self)
self.valid_visitors = None
self.log_format_extracted = re.sub(r'([^\$\w])', r'\\\g<1>', conf.log_format)
self.log_format_extracted = re.sub(r'\$(\w+)', '(?P<\g<1>>.+)', self.log_format_extracted)
self.http_request_extracted = re.compile(r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')
self.log_re = re.compile(self.log_format_extracted)
self.uri_re = re.compile(r'(?P<extract_uri>[^\?]+)(\?(?P<extract_parameters>.+))?')
self.domain_name_re = re.compile(r'.*%s' % conf.domain_name)
self.plugins = [(conf.PRE_HOOK_DIRECTORY , conf.pre_analysis_hooks),
(conf.POST_HOOK_DIRECTORY , conf.post_analysis_hooks),
(conf.DISPLAY_HOOK_DIRECTORY , conf.display_hooks)]
logging.basicConfig(format='%(name)s %(message)s', level=logLevel)
self.logger = logging.getLogger(self.__class__.__name__)
self.logger.info('==> Start')
try:
t = gettext.translation('iwla', localedir=conf.locales_path, languages=[conf.locale], codeset='utf8')
self.logger.info('\tUsing locale %s' % (conf.locale))
except IOError:
t = gettext.NullTranslations()
self.logger.info('\tUsing default locale en_EN')
self._ = t.ugettext
def getVersion(self):
return IWLA.IWLA_VERSION
def getConfValue(self, key, default=None):
if not key in dir(conf):
return default
else:
return conf.__dict__[key]
def _clearVisits(self):
self.current_analysis = {
'days_stats' : {},
'month_stats' : {},
'visits' : {}
}
self.valid_visitors = None
return self.current_analysis
def getDaysStats(self):
return self.current_analysis['days_stats']
def getMonthStats(self):
return self.current_analysis['month_stats']
def getCurrentVisists(self):
return self.current_analysis['visits']
def getValidVisitors(self):
return self.valid_visitors
def getDisplay(self):
return self.display
def getCurTime(self):
return self.meta_infos['last_time']
def getStartAnalysisTime(self):
return self.meta_infos['start_analysis_time']
def isValidForCurrentAnalysis(self, request):
cur_time = self.meta_infos['start_analysis_time']
# Analyse not started
if not cur_time: return False
return (time.mktime(cur_time) < time.mktime(request['time_decoded']))
def hasBeenViewed(self, request):
return int(request['status']) in conf.viewed_http_codes
def getCurDisplayPath(self, filename):
cur_time = self.meta_infos['last_time']
return os.path.join(str(cur_time.tm_year), '%02d' % (cur_time.tm_mon), filename)
def getResourcesPath(self):
return conf.resources_path
def getCSSPath(self):
return conf.css_path
def _clearMeta(self):
self.meta_infos = {
'last_time' : None,
'start_analysis_time' : None
}
return self.meta_infos
def _clearDisplay(self):
self.display = DisplayHTMLBuild(self)
return self.display
def getDBFilename(self, time):
return os.path.join(conf.DB_ROOT, str(time.tm_year), '%02d' % (time.tm_mon), conf.DB_FILENAME)
def _serialize(self, obj, filename):
base = os.path.dirname(filename)
if not os.path.exists(base):
os.makedirs(base)
# TODO : remove return
#return
with open(filename + '.tmp', 'wb+') as f:
pickle.dump(obj, f)
f.seek(0)
with gzip.open(filename, 'w') as fzip:
fzip.write(f.read())
os.remove(filename + '.tmp')
def _deserialize(self, filename):
if not os.path.exists(filename):
return None
with gzip.open(filename, 'r') as f:
return pickle.load(f)
return None
def _callPlugins(self, target_root, *args):
self.logger.info('==> Call plugins (%s)' % (target_root))
for (root, plugins) in self.plugins:
if root != target_root: continue
for p in plugins:
mod = self.cache_plugins.get(root + '.' + p, None)
if mod:
self.logger.info('\t%s' % (p))
mod.hook(*args)
def isPage(self, request):
for e in conf.pages_extensions:
if request.endswith(e):
return True
return False
def _appendHit(self, hit):
remote_addr = hit['remote_addr']
if not remote_addr: return
if not remote_addr in self.current_analysis['visits'].keys():
self._createVisitor(hit)
super_hit = self.current_analysis['visits'][remote_addr]
super_hit['requests'].append(hit)
super_hit['bandwidth'] += int(hit['body_bytes_sent'])
super_hit['last_access'] = self.meta_infos['last_time']
request = hit['extract_request']
uri = request.get('extract_uri', request['http_uri'])
hit['is_page'] = self.isPage(uri)
if super_hit['robot'] or\
not self.hasBeenViewed(hit):
page_key = 'not_viewed_pages'
hit_key = 'not_viewed_hits'
else:
page_key = 'viewed_pages'
hit_key = 'viewed_hits'
if hit['is_page']:
super_hit[page_key] += 1
else:
super_hit[hit_key] += 1
def _createVisitor(self, hit):
super_hit = self.current_analysis['visits'][hit['remote_addr']] = {}
super_hit['remote_addr'] = hit['remote_addr']
super_hit['remote_ip'] = hit['remote_addr']
super_hit['viewed_pages'] = 0
super_hit['viewed_hits'] = 0
super_hit['not_viewed_pages'] = 0
super_hit['not_viewed_hits'] = 0
super_hit['bandwidth'] = 0
super_hit['last_access'] = self.meta_infos['last_time']
super_hit['requests'] = []
super_hit['robot'] = False
super_hit['hit_only'] = 0
def _decodeHTTPRequest(self, hit):
if not 'request' in hit.keys(): return False
groups = self.http_request_extracted.match(hit['request'])
if groups:
hit['extract_request'] = groups.groupdict()
uri_groups = self.uri_re.match(hit['extract_request']['http_uri'])
if uri_groups:
d = uri_groups.groupdict()
hit['extract_request']['extract_uri'] = d['extract_uri']
if 'extract_parameters' in d.keys():
hit['extract_request']['extract_parameters'] = d['extract_parameters']
else:
self.logger.warning("Bad request extraction %s" % (hit['request']))
return False
if hit['http_referer']:
referer_groups = self.uri_re.match(hit['http_referer'])
if referer_groups:
hit['extract_referer'] = referer_groups.groupdict()
return True
def _decodeTime(self, hit):
try:
hit['time_decoded'] = time.strptime(hit['time_local'], conf.time_format)
except ValueError, e:
if sys.version_info < (3, 2):
# Try without UTC value at the end (%z not recognized)
gmt_offset_str = hit['time_local'][-5:]
gmt_offset_hours = int(gmt_offset_str[1:3])*60*60
gmt_offset_minutes = int(gmt_offset_str[3:5])*60
gmt_offset = gmt_offset_hours + gmt_offset_minutes
hit['time_decoded'] = time.strptime(hit['time_local'][:-6], conf.time_format[:-3])
if gmt_offset_str[0] == '+':
hit['time_decoded'] = time.localtime(time.mktime(hit['time_decoded'])+gmt_offset)
else:
hit['time_decoded'] = time.localtime(time.mktime(hit['time_decoded'])-gmt_offset)
else:
raise e
return hit['time_decoded']
def getDisplayIndex(self):
cur_time = self.meta_infos['last_time']
filename = self.getCurDisplayPath('index.html')
return self.display.getPage(filename)
def _generateDisplayDaysStats(self):
cur_time = self.meta_infos['last_time']
title = '%s %d/%02d' % (self._('Statistics'), cur_time.tm_year, cur_time.tm_mon)
filename = self.getCurDisplayPath('index.html')
self.logger.info('==> Generate display (%s)' % (filename))
page = self.display.createPage(title, filename, conf.css_path)
_, nb_month_days = monthrange(cur_time.tm_year, cur_time.tm_mon)
days = self.display.createBlock(DisplayHTMLBlockTableWithGraph, self._('By day'), [self._('Day'), self._('Visits'), self._('Pages'), self._('Hits'), self._('Bandwidth'), self._('Not viewed Bandwidth')], None, nb_month_days, range(1,6))
days.setColsCSSClass(['', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth'])
nb_visits = 0
nb_days = 0
for i in range(1, nb_month_days+1):
day = '%d<br/>%s' % (i, time.strftime('%b', cur_time))
full_day = '%02d %s %d' % (i, time.strftime('%b', cur_time), cur_time.tm_year)
if i in self.current_analysis['days_stats'].keys():
stats = self.current_analysis['days_stats'][i]
row = [full_day, stats['nb_visits'], stats['viewed_pages'], stats['viewed_hits'],
stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
nb_visits += stats['nb_visits']
nb_days += 1
else:
row = [full_day, 0, 0, 0, 0, 0]
days.appendRow(row)
days.setCellValue(i-1, 4, bytesToStr(row[4]))
days.setCellValue(i-1, 5, bytesToStr(row[5]))
days.appendShortTitle(day)
adate = date(cur_time.tm_year, cur_time.tm_mon, i)
week_day = adate.weekday()
if week_day == 5 or week_day == 6:
days.setRowCSSClass(i-1, 'iwla_weekend')
if adate == date.today():
css = days.getCellCSSClass(i-1, 0)
if css: css = '%s %s' % (css, 'iwla_curday')
else: css = 'iwla_curday'
days.setCellCSSClass(i-1, 0, css)
stats = self.current_analysis['month_stats']
row = [0, nb_visits, stats['viewed_pages'], stats['viewed_hits'], stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
if nb_days:
average_row = map(lambda(v): int(v/nb_days), row)
else:
average_row = map(lambda(v): 0, row)
average_row[0] = self._('Average')
average_row[4] = bytesToStr(average_row[4])
average_row[5] = bytesToStr(average_row[5])
days.appendRow(average_row)
row[0] = self._('Total')
row[4] = bytesToStr(row[4])
row[5] = bytesToStr(row[5])
days.appendRow(row)
page.appendBlock(days)
self.display.addPage(page)
def _generateDisplayMonthStats(self, page, year, month_stats):
cur_time = time.localtime()
months_name = ['', self._('Jan'), self._('Feb'), self._('Mar'), self._('Apr'), self._('May'), self._('June'), self._('July'), self._('Aug'), self._('Sep'), self._('Oct'), self._('Nov'), self._('Dec')]
title = '%s %d' % (self._('Summary'), year)
cols = [self._('Month'), self._('Visitors'), self._('Visits'), self._('Pages'), self._('Hits'), self._('Bandwidth'), self._('Not viewed Bandwidth'), self._('Details')]
graph_cols=range(1,7)
months = self.display.createBlock(DisplayHTMLBlockTableWithGraph, title, cols, None, 12, graph_cols)
months.setColsCSSClass(['', 'iwla_visitor', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth', ''])
total = [0] * len(cols)
for i in range(1, 13):
month = '%s<br/>%d' % (months_name[i], year)
full_month = '%s %d' % (months_name[i], year)
if i in month_stats.keys():
stats = month_stats[i]
link = '<a href="%d/%02d/index.html">%s</a>' % (year, i, self._('Details'))
row = [full_month, stats['nb_visitors'], stats['nb_visits'], stats['viewed_pages'], stats['viewed_hits'],
stats['viewed_bandwidth'], stats['not_viewed_bandwidth'], link]
for j in graph_cols:
total[j] += row[j]
else:
row = [full_month, 0, 0, 0, 0, 0, 0, '']
months.appendRow(row)
months.setCellValue(i-1, 5, bytesToStr(row[5]))
months.setCellValue(i-1, 6, bytesToStr(row[6]))
months.appendShortTitle(month)
if year == cur_time.tm_year and i == cur_time.tm_mon:
css = months.getCellCSSClass(i-1, 0)
if css: css = '%s %s' % (css, 'iwla_curday')
else: css = 'iwla_curday'
months.setCellCSSClass(i-1, 0, css)
total[0] = self._('Total')
total[5] = bytesToStr(total[5])
total[6] = bytesToStr(total[6])
months.appendRow(total)
page.appendBlock(months)
def _generateDisplayWholeMonthStats(self):
title = '%s %s' % (self._('Statistics for'), conf.domain_name)
filename = 'index.html'
self.logger.info('==> Generate main page (%s)' % (filename))
page = self.display.createPage(title, filename, conf.css_path)
last_update = '<b>%s</b> %s<br />' % (self._('Last update'), time.strftime('%02d %b %Y %H:%M', time.localtime()))
page.appendBlock(self.display.createBlock(DisplayHTMLRaw, last_update))
for year in sorted(self.meta_infos['stats'].keys(), reverse=True):
self._generateDisplayMonthStats(page, year, self.meta_infos['stats'][year])
self.display.addPage(page)
def _compressFile(self, build_time, root, filename):
path = os.path.join(root, filename)
gz_path = path + '.gz'
self.logger.debug('Compress %s => %s' % (path, gz_path))
if not os.path.exists(gz_path) or\
os.stat(path).st_mtime > build_time:
with open(path, 'rb') as f_in:
with gzip.open(gz_path, 'wb') as f_out:
f_out.write(f_in.read())
def _compressFiles(self, build_time, root):
if not conf.compress_output_files: return
for rootdir, subdirs, files in os.walk(root, followlinks=True):
for f in files:
for ext in conf.compress_output_files:
if f.endswith(ext):
self._compressFile(build_time, rootdir, f)
break
def _generateDisplay(self):
self._generateDisplayDaysStats()
self._callPlugins(conf.DISPLAY_HOOK_DIRECTORY)
self._generateDisplayWholeMonthStats()
build_time = time.localtime()
self.display.build(conf.DISPLAY_ROOT)
self._compressFiles(build_time, conf.DISPLAY_ROOT)
def _createEmptyStats(self):
stats = {}
stats['viewed_bandwidth'] = 0
stats['not_viewed_bandwidth'] = 0
stats['viewed_pages'] = 0
stats['viewed_hits'] = 0
stats['nb_visits'] = 0
return stats
def _generateMonthStats(self):
self._clearDisplay()
visits = self.current_analysis['visits']
stats = self._createEmptyStats()
for (day, stat) in self.current_analysis['days_stats'].items():
for k in stats.keys():
stats[k] += stat[k]
duplicated_stats = {k:v for (k,v) in stats.items()}
cur_time = self.meta_infos['last_time']
self.logger.info("== Stats for %d/%02d ==" % (cur_time.tm_year, cur_time.tm_mon))
self.logger.info(stats)
if not 'month_stats' in self.current_analysis.keys():
self.current_analysis['month_stats'] = stats
else:
for (k,v) in stats.items():
self.current_analysis['month_stats'][k] = v
self.valid_visitors = {}
for (k,v) in visits.items():
if v['robot']: continue
if conf.count_hit_only_visitors and\
(not v['viewed_pages']):
continue
self.valid_visitors[k] = v
duplicated_stats['nb_visitors'] = stats['nb_visitors'] = len(self.valid_visitors.keys())
self._callPlugins(conf.POST_HOOK_DIRECTORY)
path = self.getDBFilename(cur_time)
if os.path.exists(path):
os.remove(path)
self.logger.info("==> Serialize to %s" % (path))
self._serialize(self.current_analysis, path)
# Save month stats
year = cur_time.tm_year
month = cur_time.tm_mon
if not 'stats' in self.meta_infos.keys():
self.meta_infos['stats'] = {}
if not year in self.meta_infos['stats'].keys():
self.meta_infos['stats'][year] = {}
self.meta_infos['stats'][year][month] = duplicated_stats
self._generateDisplay()
def _generateDayStats(self):
visits = self.current_analysis['visits']
cur_time = self.meta_infos['last_time']
self._callPlugins(conf.PRE_HOOK_DIRECTORY)
stats = self._createEmptyStats()
for (k, super_hit) in visits.items():
if super_hit['last_access'].tm_mday != cur_time.tm_mday:
continue
viewed_page = False
for hit in super_hit['requests'][::-1]:
if hit['time_decoded'].tm_mday != cur_time.tm_mday:
break
if super_hit['robot'] or\
not self.hasBeenViewed(hit):
stats['not_viewed_bandwidth'] += int(hit['body_bytes_sent'])
continue
stats['viewed_bandwidth'] += int(hit['body_bytes_sent'])
if hit['is_page']:
stats['viewed_pages'] += 1
viewed_pages = True
else:
stats['viewed_hits'] += 1
if (conf.count_hit_only_visitors or\
viewed_pages) and\
not super_hit['robot']:
stats['nb_visits'] += 1
self.logger.info("== Stats for %d/%02d/%02d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday))
self.logger.info(stats)
self.current_analysis['days_stats'][cur_time.tm_mday] = stats
def _newHit(self, hit):
if not self.domain_name_re.match(hit['server_name']):
return False
t = self._decodeTime(hit)
cur_time = self.meta_infos['last_time']
if cur_time == None:
self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
self.analyse_started = True
else:
if time.mktime(t) <= time.mktime(cur_time):
return False
self.analyse_started = True
if cur_time.tm_mon != t.tm_mon:
self._generateMonthStats()
self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
elif cur_time.tm_mday != t.tm_mday:
self._generateDayStats()
self.meta_infos['last_time'] = t
if not self.meta_infos['start_analysis_time']:
self.meta_infos['start_analysis_time'] = t
if not self._decodeHTTPRequest(hit): return False
for k in hit.keys():
if hit[k] == '-' or hit[k] == '*':
hit[k] = ''
self._appendHit(hit)
return True
def start(self, _file):
self.logger.info('==> Load previous database')
self.meta_infos = self._deserialize(conf.META_PATH) or self._clearMeta()
if self.meta_infos['last_time']:
self.logger.info('Last time')
self.logger.info(self.meta_infos['last_time'])
self.current_analysis = self._deserialize(self.getDBFilename(self.meta_infos['last_time'])) or self._clearVisits()
else:
self._clearVisits()
self.meta_infos['start_analysis_time'] = None
self.cache_plugins = preloadPlugins(self.plugins, self)
self.logger.info('==> Analysing log')
for l in _file:
# print "line " + l
groups = self.log_re.match(l)
if groups:
self._newHit(groups.groupdict())
else:
self.logger.warning("No match for %s" % (l))
#break
if self.analyse_started:
self._generateDayStats()
self._generateMonthStats()
del self.meta_infos['start_analysis_time']
self._serialize(self.meta_infos, conf.META_PATH)
else:
self.logger.info('==> Analyse not started : nothing new')
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Intelligent Web Log Analyzer')
parser.add_argument('-c', '--clean-output', dest='clean_output', action='store_true',
default=False,
help='Clean output before starting')
parser.add_argument('-i', '--stdin', dest='stdin', action='store_true',
default=False,
help='Read data from stdin instead of conf.analyzed_filename')
parser.add_argument('-f', '--file', dest='file',
help='Analyse this log file')
parser.add_argument('-d', '--log-level', dest='loglevel',
default='INFO', type=str,
help='Loglevel in %s, default : %s' % (['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], 'INFO'))
args = parser.parse_args()
if args.clean_output:
if os.path.exists(conf.DB_ROOT): shutil.rmtree(conf.DB_ROOT)
if os.path.exists(conf.DISPLAY_ROOT): shutil.rmtree(conf.DISPLAY_ROOT)
loglevel = getattr(logging, args.loglevel.upper(), None)
if not isinstance(loglevel, int):
raise ValueError('Invalid log level: %s' % (args.loglevel))
iwla = IWLA(loglevel)
required_conf = ['analyzed_filename', 'domain_name']
if not validConfRequirements(required_conf, iwla, 'Main Conf'):
sys.exit(0)
if args.stdin:
iwla.start(sys.stdin)
else:
filename = args.file or conf.analyzed_filename
if not os.path.exists(filename):
print 'No such file \'%s\'' % (filename)
sys.exit(-1)
with open(filename) as f:
iwla.start(f)