iwla/iwla.py

721 lines
25 KiB
Python
Executable File

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright Grégory Soutadé 2015
# This file is part of iwla
# iwla is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# iwla is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with iwla. If not, see <http://www.gnu.org/licenses/>.
#
import os
import shutil
import sys
import re
import time
import pickle
import gzip
import importlib
import argparse
import logging
import gettext
from calendar import monthrange
from datetime import date
import default_conf as conf
import conf as _
conf.__dict__.update(_.__dict__)
del _
from iplugin import *
from display import *
"""
Main class IWLA
Parse Log, compute them, call plugins and produce output
For now, only HTTP log are valid
Plugin requirements :
None
Conf values needed :
analyzed_filename
domain_name
locales_path
compress_output_files*
Output files :
DB_ROOT/meta.db
DB_ROOT/year/month/iwla.db
OUTPUT_ROOT/index.html
OUTPUT_ROOT/year/month/index.html
Statistics creation :
meta :
last_time
start_analysis_time
stats =>
year =>
month =>
viewed_bandwidth
not_viewed_bandwidth
viewed_pages
viewed_hits
nb_visits
nb_visitors
month_stats :
viewed_bandwidth
not_viewed_bandwidth
viewed_pages
viewed_hits
nb_visits
days_stats :
day =>
viewed_bandwidth
not_viewed_bandwidth
viewed_pages
viewed_hits
nb_visits
nb_visitors
visits :
remote_addr =>
remote_addr
remote_ip
viewed_pages
viewed_hits
not_viewed_pages
not_viewed_hits
bandwidth
last_access
requests =>
[fields_from_format_log]
extract_request =>
extract_uri
extract_parameters*
extract_referer* =>
extract_uri
extract_parameters*
robot
hit_only
is_page
valid_visitors:
month_stats without robot and hit only visitors (if not conf.count_hit_only_visitors)
Statistics update :
None
Statistics deletion :
None
"""
class IWLA(object):
ANALYSIS_CLASS = 'HTTP'
API_VERSION = 1
IWLA_VERSION = '0.1'
def __init__(self, logLevel):
self.meta_infos = {}
self.analyse_started = False
self.current_analysis = {}
self.cache_plugins = {}
self.display = DisplayHTMLBuild(self)
self.valid_visitors = None
self.log_format_extracted = re.sub(r'([^\$\w])', r'\\\g<1>', conf.log_format)
self.log_format_extracted = re.sub(r'\$(\w+)', '(?P<\g<1>>.+)', self.log_format_extracted)
self.http_request_extracted = re.compile(r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')
self.log_re = re.compile(self.log_format_extracted)
self.uri_re = re.compile(r'(?P<extract_uri>[^\?#]+)(\?(?P<extract_parameters>[^#]+))?(#.*)?')
self.domain_name_re = re.compile(r'.*%s' % conf.domain_name)
self.plugins = [(conf.PRE_HOOK_DIRECTORY , conf.pre_analysis_hooks),
(conf.POST_HOOK_DIRECTORY , conf.post_analysis_hooks),
(conf.DISPLAY_HOOK_DIRECTORY , conf.display_hooks)]
logging.basicConfig(format='%(name)s %(message)s', level=logLevel)
self.logger = logging.getLogger(self.__class__.__name__)
self.logger.info('==> Start')
try:
t = gettext.translation('iwla', localedir=conf.locales_path, languages=[conf.locale], codeset='utf8')
self.logger.info('\tUsing locale %s' % (conf.locale))
except IOError:
t = gettext.NullTranslations()
self.logger.info('\tUsing default locale en_EN')
self._ = t.ugettext
def getVersion(self):
return IWLA.IWLA_VERSION
def getConfValue(self, key, default=None):
if not key in dir(conf):
return default
else:
return conf.__dict__[key]
def _clearVisits(self):
self.current_analysis = {
'days_stats' : {},
'month_stats' : {},
'visits' : {}
}
self.valid_visitors = None
return self.current_analysis
def getDaysStats(self):
return self.current_analysis['days_stats']
def getMonthStats(self):
return self.current_analysis['month_stats']
def getCurrentVisists(self):
return self.current_analysis['visits']
def getValidVisitors(self):
return self.valid_visitors
def getDisplay(self):
return self.display
def getCurTime(self):
return self.meta_infos['last_time']
def getStartAnalysisTime(self):
return self.meta_infos['start_analysis_time']
def isValidForCurrentAnalysis(self, request):
cur_time = self.meta_infos['start_analysis_time']
# Analyse not started
if not cur_time: return False
return (time.mktime(cur_time) < time.mktime(request['time_decoded']))
def hasBeenViewed(self, request):
return int(request['status']) in conf.viewed_http_codes
def getCurDisplayPath(self, filename):
cur_time = self.meta_infos['last_time']
return os.path.join(str(cur_time.tm_year), '%02d' % (cur_time.tm_mon), filename)
def getResourcesPath(self):
return conf.resources_path
def getCSSPath(self):
return conf.css_path
def _clearMeta(self):
self.meta_infos = {
'last_time' : None,
'start_analysis_time' : None
}
return self.meta_infos
def _clearDisplay(self):
self.display = DisplayHTMLBuild(self)
return self.display
def getDBFilename(self, time):
return os.path.join(conf.DB_ROOT, str(time.tm_year), '%02d' % (time.tm_mon), conf.DB_FILENAME)
def _serialize(self, obj, filename):
base = os.path.dirname(filename)
if not os.path.exists(base):
os.makedirs(base)
# TODO : remove return
#return
with open(filename + '.tmp', 'wb+') as f:
pickle.dump(obj, f)
f.seek(0)
with gzip.open(filename, 'w') as fzip:
fzip.write(f.read())
os.remove(filename + '.tmp')
def _deserialize(self, filename):
if not os.path.exists(filename):
return None
with gzip.open(filename, 'r') as f:
return pickle.load(f)
return None
def _callPlugins(self, target_root, *args):
self.logger.info('==> Call plugins (%s)' % (target_root))
for (root, plugins) in self.plugins:
if root != target_root: continue
for p in plugins:
mod = self.cache_plugins.get(root + '.' + p, None)
if mod:
self.logger.info('\t%s' % (p))
mod.hook(*args)
def isPage(self, request):
for e in conf.pages_extensions:
if request.endswith(e):
return True
return False
def _appendHit(self, hit):
remote_addr = hit['remote_addr']
if not remote_addr: return
if not remote_addr in self.current_analysis['visits'].keys():
self._createVisitor(hit)
super_hit = self.current_analysis['visits'][remote_addr]
super_hit['requests'].append(hit)
super_hit['bandwidth'] += int(hit['body_bytes_sent'])
super_hit['last_access'] = self.meta_infos['last_time']
request = hit['extract_request']
uri = request.get('extract_uri', request['http_uri'])
hit['is_page'] = self.isPage(uri)
if super_hit['robot'] or\
not self.hasBeenViewed(hit):
page_key = 'not_viewed_pages'
hit_key = 'not_viewed_hits'
else:
page_key = 'viewed_pages'
hit_key = 'viewed_hits'
if hit['is_page']:
super_hit[page_key] += 1
else:
super_hit[hit_key] += 1
def _createVisitor(self, hit):
super_hit = self.current_analysis['visits'][hit['remote_addr']] = {}
super_hit['remote_addr'] = hit['remote_addr']
super_hit['remote_ip'] = hit['remote_addr']
super_hit['viewed_pages'] = 0
super_hit['viewed_hits'] = 0
super_hit['not_viewed_pages'] = 0
super_hit['not_viewed_hits'] = 0
super_hit['bandwidth'] = 0
super_hit['last_access'] = self.meta_infos['last_time']
super_hit['requests'] = []
super_hit['robot'] = False
super_hit['hit_only'] = 0
def _decodeHTTPRequest(self, hit):
if not 'request' in hit.keys(): return False
groups = self.http_request_extracted.match(hit['request'])
if groups:
hit['extract_request'] = groups.groupdict()
uri_groups = self.uri_re.match(hit['extract_request']['http_uri'])
if uri_groups:
d = uri_groups.groupdict()
hit['extract_request']['extract_uri'] = d['extract_uri']
if 'extract_parameters' in d.keys():
hit['extract_request']['extract_parameters'] = d['extract_parameters']
else:
self.logger.warning("Bad request extraction %s" % (hit['request']))
return False
if hit['http_referer']:
referer_groups = self.uri_re.match(hit['http_referer'])
if referer_groups:
hit['extract_referer'] = referer_groups.groupdict()
return True
def _decodeTime(self, hit):
try:
hit['time_decoded'] = time.strptime(hit['time_local'], conf.time_format)
except ValueError, e:
if sys.version_info < (3, 2):
# Try without UTC value at the end (%z not recognized)
gmt_offset_str = hit['time_local'][-5:]
gmt_offset_hours = int(gmt_offset_str[1:3])*60*60
gmt_offset_minutes = int(gmt_offset_str[3:5])*60
gmt_offset = gmt_offset_hours + gmt_offset_minutes
hit['time_decoded'] = time.strptime(hit['time_local'][:-6], conf.time_format[:-3])
if gmt_offset_str[0] == '+':
hit['time_decoded'] = time.localtime(time.mktime(hit['time_decoded'])+gmt_offset)
else:
hit['time_decoded'] = time.localtime(time.mktime(hit['time_decoded'])-gmt_offset)
else:
raise e
return hit['time_decoded']
def getDisplayIndex(self):
cur_time = self.meta_infos['last_time']
filename = self.getCurDisplayPath('index.html')
return self.display.getPage(filename)
def _generateDisplayDaysStats(self):
cur_time = self.meta_infos['last_time']
title = '%s %d/%02d' % (self._('Statistics'), cur_time.tm_year, cur_time.tm_mon)
filename = self.getCurDisplayPath('index.html')
self.logger.info('==> Generate display (%s)' % (filename))
page = self.display.createPage(title, filename, conf.css_path)
_, nb_month_days = monthrange(cur_time.tm_year, cur_time.tm_mon)
days = self.display.createBlock(DisplayHTMLBlockTableWithGraph, self._('By day'), [self._('Day'), self._('Visits'), self._('Pages'), self._('Hits'), self._('Bandwidth'), self._('Not viewed Bandwidth')], None, nb_month_days, range(1,6))
days.setColsCSSClass(['', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth'])
nb_visits = 0
nb_days = 0
for i in range(1, nb_month_days+1):
day = '%d<br/>%s' % (i, time.strftime('%b', cur_time))
full_day = '%02d %s %d' % (i, time.strftime('%b', cur_time), cur_time.tm_year)
if i in self.current_analysis['days_stats'].keys():
stats = self.current_analysis['days_stats'][i]
row = [full_day, stats['nb_visits'], stats['viewed_pages'], stats['viewed_hits'],
stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
nb_visits += stats['nb_visits']
nb_days += 1
else:
row = [full_day, 0, 0, 0, 0, 0]
days.appendRow(row)
days.setCellValue(i-1, 4, bytesToStr(row[4]))
days.setCellValue(i-1, 5, bytesToStr(row[5]))
days.appendShortTitle(day)
adate = date(cur_time.tm_year, cur_time.tm_mon, i)
week_day = adate.weekday()
if week_day == 5 or week_day == 6:
days.setRowCSSClass(i-1, 'iwla_weekend')
if adate == date.today():
css = days.getCellCSSClass(i-1, 0)
if css: css = '%s %s' % (css, 'iwla_curday')
else: css = 'iwla_curday'
days.setCellCSSClass(i-1, 0, css)
stats = self.current_analysis['month_stats']
row = [0, nb_visits, stats['viewed_pages'], stats['viewed_hits'], stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
if nb_days:
average_row = map(lambda(v): int(v/nb_days), row)
else:
average_row = map(lambda(v): 0, row)
average_row[0] = self._('Average')
average_row[4] = bytesToStr(average_row[4])
average_row[5] = bytesToStr(average_row[5])
days.appendRow(average_row)
row[0] = self._('Total')
row[4] = bytesToStr(row[4])
row[5] = bytesToStr(row[5])
days.appendRow(row)
page.appendBlock(days)
self.display.addPage(page)
def _generateDisplayMonthStats(self, page, year, month_stats):
cur_time = time.localtime()
months_name = ['', self._('Jan'), self._('Feb'), self._('Mar'), self._('Apr'), self._('May'), self._('June'), self._('July'), self._('Aug'), self._('Sep'), self._('Oct'), self._('Nov'), self._('Dec')]
title = '%s %d' % (self._('Summary'), year)
cols = [self._('Month'), self._('Visitors'), self._('Visits'), self._('Pages'), self._('Hits'), self._('Bandwidth'), self._('Not viewed Bandwidth'), self._('Details')]
graph_cols=range(1,7)
months = self.display.createBlock(DisplayHTMLBlockTableWithGraph, title, cols, None, 12, graph_cols)
months.setColsCSSClass(['', 'iwla_visitor', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth', ''])
total = [0] * len(cols)
for i in range(1, 13):
month = '%s<br/>%d' % (months_name[i], year)
full_month = '%s %d' % (months_name[i], year)
if i in month_stats.keys():
stats = month_stats[i]
link = '<a href="%d/%02d/index.html">%s</a>' % (year, i, self._('Details'))
row = [full_month, stats['nb_visitors'], stats['nb_visits'], stats['viewed_pages'], stats['viewed_hits'],
stats['viewed_bandwidth'], stats['not_viewed_bandwidth'], link]
for j in graph_cols:
total[j] += row[j]
else:
row = [full_month, 0, 0, 0, 0, 0, 0, '']
months.appendRow(row)
months.setCellValue(i-1, 5, bytesToStr(row[5]))
months.setCellValue(i-1, 6, bytesToStr(row[6]))
months.appendShortTitle(month)
if year == cur_time.tm_year and i == cur_time.tm_mon:
css = months.getCellCSSClass(i-1, 0)
if css: css = '%s %s' % (css, 'iwla_curday')
else: css = 'iwla_curday'
months.setCellCSSClass(i-1, 0, css)
total[0] = self._('Total')
total[5] = bytesToStr(total[5])
total[6] = bytesToStr(total[6])
total[7] = u''
months.appendRow(total)
page.appendBlock(months)
def _generateDisplayWholeMonthStats(self):
title = '%s %s' % (self._('Statistics for'), conf.domain_name)
filename = 'index.html'
self.logger.info('==> Generate main page (%s)' % (filename))
page = self.display.createPage(title, filename, conf.css_path)
last_update = '<b>%s</b> %s<br />' % (self._('Last update'), time.strftime('%02d %b %Y %H:%M', time.localtime()))
page.appendBlock(self.display.createBlock(DisplayHTMLRaw, last_update))
for year in sorted(self.meta_infos['stats'].keys(), reverse=True):
self._generateDisplayMonthStats(page, year, self.meta_infos['stats'][year])
self.display.addPage(page)
def _compressFile(self, build_time, root, filename):
path = os.path.join(root, filename)
gz_path = path + '.gz'
self.logger.debug('Compress %s => %s' % (path, gz_path))
if not os.path.exists(gz_path) or\
os.stat(path).st_mtime > build_time:
with open(path, 'rb') as f_in:
with gzip.open(gz_path, 'wb') as f_out:
f_out.write(f_in.read())
def _compressFiles(self, build_time, root):
if not conf.compress_output_files: return
for rootdir, subdirs, files in os.walk(root, followlinks=True):
for f in files:
for ext in conf.compress_output_files:
if f.endswith(ext):
self._compressFile(build_time, rootdir, f)
break
def _generateDisplay(self):
self._generateDisplayDaysStats()
self._callPlugins(conf.DISPLAY_HOOK_DIRECTORY)
self._generateDisplayWholeMonthStats()
build_time = time.localtime()
self.display.build(conf.DISPLAY_ROOT)
self._compressFiles(build_time, conf.DISPLAY_ROOT)
def _createEmptyStats(self):
stats = {}
stats['viewed_bandwidth'] = 0
stats['not_viewed_bandwidth'] = 0
stats['viewed_pages'] = 0
stats['viewed_hits'] = 0
stats['nb_visits'] = 0
return stats
def _generateMonthStats(self):
self._clearDisplay()
visits = self.current_analysis['visits']
stats = self._createEmptyStats()
for (day, stat) in self.current_analysis['days_stats'].items():
for k in stats.keys():
stats[k] += stat[k]
duplicated_stats = {k:v for (k,v) in stats.items()}
cur_time = self.meta_infos['last_time']
self.logger.info("== Stats for %d/%02d ==" % (cur_time.tm_year, cur_time.tm_mon))
self.logger.info(stats)
if not 'month_stats' in self.current_analysis.keys():
self.current_analysis['month_stats'] = stats
else:
for (k,v) in stats.items():
self.current_analysis['month_stats'][k] = v
self.valid_visitors = {}
for (k,v) in visits.items():
if v['robot']: continue
if not (conf.count_hit_only_visitors or\
v['viewed_pages']):
continue
self.valid_visitors[k] = v
duplicated_stats['nb_visitors'] = stats['nb_visitors'] = len(self.valid_visitors.keys())
self._callPlugins(conf.POST_HOOK_DIRECTORY)
path = self.getDBFilename(cur_time)
if os.path.exists(path):
os.remove(path)
self.logger.info("==> Serialize to %s" % (path))
self._serialize(self.current_analysis, path)
# Save month stats
year = cur_time.tm_year
month = cur_time.tm_mon
if not 'stats' in self.meta_infos.keys():
self.meta_infos['stats'] = {}
if not year in self.meta_infos['stats'].keys():
self.meta_infos['stats'][year] = {}
self.meta_infos['stats'][year][month] = duplicated_stats
self._generateDisplay()
def _generateDayStats(self):
visits = self.current_analysis['visits']
cur_time = self.meta_infos['last_time']
self._callPlugins(conf.PRE_HOOK_DIRECTORY)
stats = self._createEmptyStats()
for (k, super_hit) in visits.items():
if super_hit['last_access'].tm_mday != cur_time.tm_mday:
continue
viewed_page = False
for hit in super_hit['requests'][::-1]:
if hit['time_decoded'].tm_mday != cur_time.tm_mday:
break
if super_hit['robot'] or\
not self.hasBeenViewed(hit):
stats['not_viewed_bandwidth'] += int(hit['body_bytes_sent'])
continue
stats['viewed_bandwidth'] += int(hit['body_bytes_sent'])
if hit['is_page']:
stats['viewed_pages'] += 1
viewed_pages = True
else:
stats['viewed_hits'] += 1
if (conf.count_hit_only_visitors or\
viewed_pages) and\
not super_hit['robot']:
stats['nb_visits'] += 1
self.logger.info("== Stats for %d/%02d/%02d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday))
self.logger.info(stats)
self.current_analysis['days_stats'][cur_time.tm_mday] = stats
def _newHit(self, hit):
if not self.domain_name_re.match(hit['server_name']):
return False
t = self._decodeTime(hit)
cur_time = self.meta_infos['last_time']
if cur_time == None:
self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
self.analyse_started = True
else:
if time.mktime(t) <= time.mktime(cur_time):
return False
self.analyse_started = True
if cur_time.tm_mon != t.tm_mon:
self._generateMonthStats()
self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
elif cur_time.tm_mday != t.tm_mday:
self._generateDayStats()
self.meta_infos['last_time'] = t
if not self.meta_infos['start_analysis_time']:
self.meta_infos['start_analysis_time'] = t
if not self._decodeHTTPRequest(hit): return False
for k in hit.keys():
if hit[k] == '-' or hit[k] == '*':
hit[k] = ''
self._appendHit(hit)
return True
def start(self, _file):
self.logger.info('==> Load previous database')
self.meta_infos = self._deserialize(conf.META_PATH) or self._clearMeta()
if self.meta_infos['last_time']:
self.logger.info('Last time')
self.logger.info(self.meta_infos['last_time'])
self.current_analysis = self._deserialize(self.getDBFilename(self.meta_infos['last_time'])) or self._clearVisits()
else:
self._clearVisits()
self.meta_infos['start_analysis_time'] = None
self.cache_plugins = preloadPlugins(self.plugins, self)
self.logger.info('==> Analysing log')
for l in _file:
# print "line " + l
groups = self.log_re.match(l)
if groups:
self._newHit(groups.groupdict())
else:
self.logger.warning("No match for %s" % (l))
#break
if self.analyse_started:
self._generateDayStats()
self._generateMonthStats()
del self.meta_infos['start_analysis_time']
self._serialize(self.meta_infos, conf.META_PATH)
else:
self.logger.info('==> Analyse not started : nothing new')
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Intelligent Web Log Analyzer')
parser.add_argument('-c', '--clean-output', dest='clean_output', action='store_true',
default=False,
help='Clean output before starting')
parser.add_argument('-i', '--stdin', dest='stdin', action='store_true',
default=False,
help='Read data from stdin instead of conf.analyzed_filename')
parser.add_argument('-f', '--file', dest='file',
help='Analyse this log file')
parser.add_argument('-d', '--log-level', dest='loglevel',
default='INFO', type=str,
help='Loglevel in %s, default : %s' % (['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], 'INFO'))
args = parser.parse_args()
if args.clean_output:
if os.path.exists(conf.DB_ROOT): shutil.rmtree(conf.DB_ROOT)
if os.path.exists(conf.DISPLAY_ROOT): shutil.rmtree(conf.DISPLAY_ROOT)
loglevel = getattr(logging, args.loglevel.upper(), None)
if not isinstance(loglevel, int):
raise ValueError('Invalid log level: %s' % (args.loglevel))
iwla = IWLA(loglevel)
required_conf = ['analyzed_filename', 'domain_name']
if not validConfRequirements(required_conf, iwla, 'Main Conf'):
sys.exit(0)
if args.stdin:
iwla.start(sys.stdin)
else:
filename = args.file or conf.analyzed_filename
if not os.path.exists(filename):
print 'No such file \'%s\'' % (filename)
sys.exit(-1)
with open(filename) as f:
iwla.start(f)