iwla/iwla.py

876 lines
31 KiB
Python
Raw Normal View History

2014-11-18 20:18:53 +01:00
#!/usr/bin/env python
2014-12-18 19:54:31 +01:00
# -*- coding: utf-8 -*-
#
# Copyright Grégory Soutadé 2015
# This file is part of iwla
# iwla is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# iwla is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with iwla. If not, see <http://www.gnu.org/licenses/>.
#
2014-11-18 20:18:53 +01:00
import os
2014-12-03 10:55:32 +01:00
import shutil
import sys
2014-11-18 20:18:53 +01:00
import re
import time
2014-11-19 19:34:16 +01:00
import pickle
import gzip
2014-11-22 19:23:56 +01:00
import importlib
2014-12-03 10:55:32 +01:00
import argparse
2014-12-15 22:30:49 +01:00
import logging
import gettext
2014-11-28 16:26:11 +01:00
from calendar import monthrange
2015-01-02 19:27:57 +01:00
from datetime import date, datetime
2014-11-19 19:45:41 +01:00
2014-11-24 21:37:37 +01:00
import default_conf as conf
import conf as user_conf
2014-11-24 21:37:37 +01:00
2014-11-24 17:13:59 +01:00
from iplugin import *
from display import *
"""
Main class IWLA
Parse logs, compute statistics, call plugins and produce output
For now, only HTTP logs are valid
Plugin requirements :
None
Conf values needed :
analyzed_filename
domain_name
locales_path
compress_output_files*
Output files :
DB_ROOT/meta.db
DB_ROOT/year/month/iwla.db
OUTPUT_ROOT/index.html
2014-12-31 14:22:46 +01:00
OUTPUT_ROOT/year/_stats.html
OUTPUT_ROOT/year/month/index.html
Statistics creation :
meta :
last_time
start_analysis_time
stats =>
year =>
month =>
viewed_bandwidth
not_viewed_bandwidth
viewed_pages
viewed_hits
nb_visits
nb_visitors
month_stats :
viewed_bandwidth
not_viewed_bandwidth
viewed_pages
viewed_hits
nb_visits
days_stats :
day =>
viewed_bandwidth
not_viewed_bandwidth
viewed_pages
viewed_hits
nb_visits
nb_visitors
visits :
remote_addr =>
remote_addr
remote_ip
viewed_pages
viewed_hits
not_viewed_pages
not_viewed_hits
bandwidth
last_access
requests =>
[fields_from_format_log]
extract_request =>
http_method
http_uri
http_version
extract_uri
extract_parameters*
extract_referer* =>
extract_uri
extract_parameters*
robot
hit_only
is_page
valid_visitors:
month_stats without robot and hit only visitors (if not conf.count_hit_only_visitors)
Statistics update :
None
Statistics deletion :
None
"""
2014-11-21 14:46:12 +01:00
# Main analyzer class: parses log lines, computes statistics, calls the
# plugins and produces the output (see the description string above).
class IWLA(object):
# Kind of logs handled by this analyzer
ANALYSIS_CLASS = 'HTTP'
# NOTE(review): presumably the version of the plugin API - confirm against iplugin
API_VERSION = 1
2016-04-13 17:50:23 +02:00
IWLA_VERSION = '0.4-dev'
2014-11-21 14:46:12 +01:00
2014-12-16 20:23:33 +01:00
def __init__(self, logLevel):
    """Initialise the analysis state, compile the parsing regexps and load translations.

    logLevel is handed to logging.basicConfig() as the logging level.
    """
    self.meta_infos = {}
    self.analyse_started = False
    self.current_analysis = {}
    self.start_time = 0
    self.cache_plugins = {}
    self.display = DisplayHTMLBuild(self)
    self.valid_visitors = None

    # Turn conf.log_format into a regexp: escape every character that is
    # neither '$' nor a word character, then replace each $name by a named
    # group (raw strings keep the \g back-references intact)
    self.log_format_extracted = re.sub(r'([^\$\w])', r'\\\g<1>', conf.log_format)
    self.log_format_extracted = re.sub(r'\$(\w+)', r'(?P<\g<1>>.+)', self.log_format_extracted)
    self.http_request_extracted = re.compile(r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')
    self.log_re = re.compile(self.log_format_extracted)
    self.uri_re = re.compile(r'(?P<extract_uri>[^\?#]+)(\?(?P<extract_parameters>[^#]+))?(#.*)?')
    # NOTE(review): conf.domain_name is inserted unescaped, so it is
    # interpreted as a regexp ('.' matches any character) - confirm intended
    self.domain_name_re = re.compile(r'.*%s' % conf.domain_name)
    self.final_slashes_re = re.compile(r'/+$')
    # (hook directory, plugin names) pairs, looked up by _callPlugins()
    self.plugins = [(conf.PRE_HOOK_DIRECTORY, conf.pre_analysis_hooks),
                    (conf.POST_HOOK_DIRECTORY, conf.post_analysis_hooks),
                    (conf.DISPLAY_HOOK_DIRECTORY, conf.display_hooks)]

    logging.basicConfig(format='%(name)s %(message)s', level=logLevel)
    self.logger = logging.getLogger(self.__class__.__name__)
    self.logger.info('==> Start')
    try:
        t = gettext.translation('iwla', localedir=conf.locales_path, languages=[conf.locale], codeset='utf8')
        self.logger.info('\tUsing locale %s' % (conf.locale))
    except IOError:
        # No catalog found: fall back to untranslated strings
        t = gettext.NullTranslations()
        self.logger.info('\tUsing default locale en_EN')
    # ugettext only exists on Python 2 translation objects; gettext is the
    # equivalent elsewhere
    self._ = getattr(t, 'ugettext', t.gettext)
2014-12-17 19:00:42 +01:00
def getVersion(self):
return IWLA.IWLA_VERSION
def getConfValue(self, key, default=None):
    """Return the configuration value named key, or default when conf has no such name."""
    if key in dir(conf):
        return conf.__dict__[key]
    return default
2014-11-21 14:46:12 +01:00
def _clearVisits(self):
self.current_analysis = {
'days_stats' : {},
'month_stats' : {},
'visits' : {}
}
self.valid_visitors = None
return self.current_analysis
def getDaysStats(self):
    """Return the 'days_stats' entry of the current analysis."""
    analysis = self.current_analysis
    return analysis['days_stats']
2014-11-21 16:56:58 +01:00
def getMonthStats(self):
    """Return the 'month_stats' entry of the current analysis."""
    analysis = self.current_analysis
    return analysis['month_stats']
def getCurrentVisits(self):
    """Return the 'visits' entry of the current analysis."""
    analysis = self.current_analysis
    return analysis['visits']
def getValidVisitors(self):
    """Return the valid visitors (None until month statistics have been generated)."""
    visitors = self.valid_visitors
    return visitors
def getDisplay(self):
    """Return the display object created in __init__."""
    display = self.display
    return display
2014-11-21 14:46:12 +01:00
2014-11-25 16:59:29 +01:00
def getCurTime(self):
    """Return meta_infos['last_time'] (the time of the last hit taken into account)."""
    meta = self.meta_infos
    return meta['last_time']
2014-11-26 19:53:00 +01:00
def getStartAnalysisTime(self):
    """Return meta_infos['start_analysis_time']."""
    meta = self.meta_infos
    return meta['start_analysis_time']
def isValidForCurrentAnalysis(self, request):
    """Tell whether request is strictly newer than the start of the current analysis.

    Returns False while the analysis has not started (no start time recorded).
    """
    start = self.meta_infos['start_analysis_time']
    if not start:
        # Analysis not started
        return False
    started_at = time.mktime(start)
    requested_at = time.mktime(request['time_decoded'])
    return started_at < requested_at
2014-11-27 13:07:14 +01:00
def hasBeenViewed(self, request):
    """Tell whether the HTTP status of request is one of conf.viewed_http_codes."""
    status = int(request['status'])
    return status in conf.viewed_http_codes
def getCurDisplayPath(self, filename):
    """Return 'YYYY/MM/filename' built from the year and month of the last analysed time."""
    last_time = self.meta_infos['last_time']
    year_dir = str(last_time.tm_year)
    month_dir = '%02d' % (last_time.tm_mon)
    return os.path.join(year_dir, month_dir, filename)
2014-11-27 14:11:47 +01:00
2014-11-30 19:05:17 +01:00
def getResourcesPath(self):
    """Return the configured resources path (conf.resources_path)."""
    path = conf.resources_path
    return path
def getCSSPath(self):
    """Return the configured CSS path (conf.css_path)."""
    path = conf.css_path
    return path
2014-11-21 14:46:12 +01:00
def _clearMeta(self):
self.meta_infos = {
'last_time' : None,
'start_analysis_time' : None
2014-11-21 14:46:12 +01:00
}
return self.meta_infos
def _clearDisplay(self):
2016-02-04 20:46:12 +01:00
self.display.clear()
2014-11-21 14:46:12 +01:00
return self.display
def getDBFilename(self, time):
    """Return the database path DB_ROOT/YYYY/MM/DB_FILENAME for the given struct_time.

    Note: the parameter shadows the time module inside this method.
    """
    year_dir = str(time.tm_year)
    month_dir = '%02d' % (time.tm_mon)
    return os.path.join(conf.DB_ROOT, year_dir, month_dir, conf.DB_FILENAME)
2014-11-21 14:46:12 +01:00
2015-05-23 16:38:39 +02:00
def _openDB(self, filename, prot='r'):
if self.args.dont_compress:
return open(filename, prot)
else:
return gzip.open(filename, prot)
2014-11-21 14:46:12 +01:00
def _serialize(self, obj, filename):
base = os.path.dirname(filename)
if not os.path.exists(base):
os.makedirs(base)
2015-05-23 16:38:39 +02:00
with open(filename + '.tmp', 'wb+') as f, self._openDB(filename, 'w') as fzip:
2014-11-21 16:56:58 +01:00
pickle.dump(obj, f)
f.seek(0)
2014-12-19 18:06:02 +01:00
fzip.write(f.read())
2014-11-19 19:34:16 +01:00
os.remove(filename + '.tmp')
2014-11-19 08:01:12 +01:00
2014-11-21 14:46:12 +01:00
def _deserialize(self, filename):
if not os.path.exists(filename):
return None
2015-05-23 16:38:39 +02:00
with self._openDB(filename) as f:
2014-11-21 14:46:12 +01:00
return pickle.load(f)
2014-11-19 19:34:16 +01:00
return None
2014-11-19 08:01:12 +01:00
def _callPlugins(self, target_root, *args):
2014-12-16 20:23:33 +01:00
self.logger.info('==> Call plugins (%s)' % (target_root))
for (root, plugins) in self.plugins:
if root != target_root: continue
for p in plugins:
mod = self.cache_plugins.get(root + '.' + p, None)
if mod:
2014-12-16 20:23:33 +01:00
self.logger.info('\t%s' % (p))
mod.hook(*args)
2014-11-19 08:01:12 +01:00
2014-11-21 14:46:12 +01:00
def isPage(self, request):
    """Tell whether request (an URI) ends with one of conf.pages_extensions."""
    self.logger.debug("Is page %s" % (request))
    is_page = any(request.endswith(ext) for ext in conf.pages_extensions)
    self.logger.debug("True" if is_page else "False")
    return is_page
2014-11-18 20:18:53 +01:00
def isMultimediaFile(self, request):
    """Tell whether request (an URI) ends with one of conf.multimedia_files."""
    self.logger.debug("Is multimedia %s" % (request))
    is_multimedia = any(request.endswith(ext) for ext in conf.multimedia_files)
    self.logger.debug("True" if is_multimedia else "False")
    return is_multimedia
2016-01-18 07:33:48 +01:00
def isValidVisitor(self, hit):
    """Tell whether a visitor counts: not a robot and, unless
    conf.count_hit_only_visitors is set, with at least one viewed page."""
    if hit['robot']:
        return False
    if conf.count_hit_only_visitors or hit['viewed_pages']:
        return True
    return False
2014-11-21 14:46:12 +01:00
def _appendHit(self, hit):
    """Attach hit to its visitor (created on first sight) and update the visitor counters.

    Hits without remote address are ignored. The hit gets an 'is_page' flag;
    it is counted as "not viewed" when the visitor is a robot or when the
    HTTP status is not a viewed one.
    """
    remote_addr = hit['remote_addr']

    if not remote_addr: return

    # Test membership on the dict itself: .keys() builds a list on Python 2,
    # which made this lookup linear in the number of visitors for every hit
    if remote_addr not in self.current_analysis['visits']:
        self._createVisitor(hit)

    super_hit = self.current_analysis['visits'][remote_addr]
    super_hit['requests'].append(hit)
    super_hit['bandwidth'] += int(hit['body_bytes_sent'])
    super_hit['last_access'] = self.meta_infos['last_time']

    request = hit['extract_request']

    # Prefer the normalized URI (without parameters) when it was extracted
    uri = request.get('extract_uri', request['http_uri'])

    hit['is_page'] = self.isPage(uri)

    if super_hit['robot'] or\
       not self.hasBeenViewed(hit):
        page_key = 'not_viewed_pages'
        hit_key = 'not_viewed_hits'
    else:
        page_key = 'viewed_pages'
        hit_key = 'viewed_hits'

    if hit['is_page']:
        super_hit[page_key] += 1
    else:
        super_hit[hit_key] += 1
2014-11-25 16:22:07 +01:00
def _createVisitor(self, hit):
2014-11-21 14:46:12 +01:00
super_hit = self.current_analysis['visits'][hit['remote_addr']] = {}
super_hit['remote_addr'] = hit['remote_addr']
2014-11-26 16:17:16 +01:00
super_hit['remote_ip'] = hit['remote_addr']
2014-11-21 14:46:12 +01:00
super_hit['viewed_pages'] = 0
super_hit['viewed_hits'] = 0
super_hit['not_viewed_pages'] = 0
super_hit['not_viewed_hits'] = 0
super_hit['bandwidth'] = 0
super_hit['last_access'] = self.meta_infos['last_time']
super_hit['requests'] = []
super_hit['robot'] = False
super_hit['hit_only'] = 0
def _normalizeURI(self, uri):
if uri == '/': return uri
2016-01-16 08:43:29 +01:00
uri = self.final_slashes_re.sub('/', uri)
return uri
2016-01-16 08:43:29 +01:00
def _removeFinalSlashes(self, uri):
if uri == '/': return uri
return self.final_slashes_re.sub('', uri)
def _normalizeParameters(self, parameters):
# No parameters
if parameters == '?': return None
return parameters
2014-11-21 14:46:12 +01:00
def _decodeHTTPRequest(self, hit):
if not 'request' in hit.keys(): return False
groups = self.http_request_extracted.match(hit['request'])
if groups:
hit['extract_request'] = groups.groupdict()
uri_groups = self.uri_re.match(hit['extract_request']['http_uri'])
if uri_groups:
d = uri_groups.groupdict()
hit['extract_request']['extract_uri'] = self._normalizeURI(d['extract_uri'])
2014-11-21 14:46:12 +01:00
if 'extract_parameters' in d.keys():
parameters = self._normalizeParameters(d['extract_parameters'])
if parameters:
hit['extract_request']['extract_parameters'] = parameters
2014-11-21 14:46:12 +01:00
else:
2014-12-16 20:23:33 +01:00
self.logger.warning("Bad request extraction %s" % (hit['request']))
2014-11-21 14:46:12 +01:00
return False
2014-11-26 16:17:16 +01:00
if hit['http_referer']:
referer_groups = self.uri_re.match(hit['http_referer'])
if referer_groups:
hit['extract_referer'] = referer_groups.groupdict()
2016-01-16 08:43:29 +01:00
hit['extract_referer']['extract_uri'] = self._removeFinalSlashes(hit['extract_referer']['extract_uri'])
hit['extract_referer']['extract_parameters'] = self._normalizeParameters(hit['extract_referer']['extract_parameters'])
2014-11-21 14:46:12 +01:00
return True
def _decodeTime(self, hit):
    """Parse hit['time_local'] with conf.time_format, store the result in
    hit['time_decoded'] and return it.

    On interpreters older than 3.2 a second attempt is made without the
    trailing UTC offset; otherwise the ValueError is re-raised.
    """
    try:
        hit['time_decoded'] = time.strptime(hit['time_local'], conf.time_format)
    except ValueError:
        if sys.version_info >= (3, 2):
            # Bare raise keeps the original traceback
            raise
        # Try without UTC value at the end (%z not recognized)
        gmt_offset_str = hit['time_local'][-5:]
        # The offset is parsed (which rejects malformed values) but it is
        # deliberately not applied to the decoded time
        gmt_offset_hours = int(gmt_offset_str[1:3])*60*60
        gmt_offset_minutes = int(gmt_offset_str[3:5])*60
        gmt_offset = gmt_offset_hours + gmt_offset_minutes
        hit['time_decoded'] = time.strptime(hit['time_local'][:-6], conf.time_format[:-3])
    return hit['time_decoded']
2014-11-21 14:46:12 +01:00
def getDisplayIndex(self):
    """Return the display page 'index.html' of the month currently analysed."""
    index_path = self.getCurDisplayPath('index.html')
    return self.display.getPage(index_path)
2014-11-21 14:46:12 +01:00
2014-12-03 21:58:55 +01:00
def _generateDisplayDaysStats(self):
    """Build the month page: one table row per day, then 'Average' and 'Total' rows."""
    cur_time = self.meta_infos['last_time']
    title = createCurTitle(self, self._('Statistics'))
    filename = self.getCurDisplayPath('index.html')
    self.logger.info('==> Generate display (%s)' % (filename))
    page = self.display.createPage(title, filename, conf.css_path)
    link = DisplayHTMLRaw(self, '<iframe src="../_stats.html"></iframe>')
    page.appendBlock(link)

    nb_month_days = monthrange(cur_time.tm_year, cur_time.tm_mon)[1]
    days = self.display.createBlock(DisplayHTMLBlockTableWithGraph, self._('By day'), [self._('Day'), self._('Visits'), self._('Pages'), self._('Hits'), self._('Bandwidth'), self._('Not viewed Bandwidth')], None, nb_month_days, list(range(1, 6)))
    days.setColsCSSClass(['', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth'])
    nb_visits = 0
    nb_days = 0
    # Loop invariants
    month_abbr = time.strftime('%b', cur_time)
    today = date.today()
    days_stats = self.current_analysis['days_stats']
    for i in range(1, nb_month_days+1):
        day = '%d<br/>%s' % (i, month_abbr)
        full_day = '%02d %s %d' % (i, month_abbr, cur_time.tm_year)
        if i in days_stats:
            stats = days_stats[i]
            row = [full_day, stats['nb_visits'], stats['viewed_pages'], stats['viewed_hits'],
                   stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
            nb_visits += stats['nb_visits']
            nb_days += 1
        else:
            row = [full_day, 0, 0, 0, 0, 0]
        days.appendRow(row)
        # Bandwidth cells are displayed in a human readable form
        days.setCellValue(i-1, 4, bytesToStr(row[4]))
        days.setCellValue(i-1, 5, bytesToStr(row[5]))
        days.appendShortTitle(day)
        adate = date(cur_time.tm_year, cur_time.tm_mon, i)
        week_day = adate.weekday()
        # weekday() is 5 for Saturday and 6 for Sunday
        if week_day == 5 or week_day == 6:
            days.setRowCSSClass(i-1, 'iwla_weekend')
        if adate == today:
            css = days.getCellCSSClass(i-1, 0)
            if css: css = '%s %s' % (css, 'iwla_curday')
            else: css = 'iwla_curday'
            days.setCellCSSClass(i-1, 0, css)

    stats = self.current_analysis['month_stats']

    row = [0, nb_visits, stats['viewed_pages'], stats['viewed_hits'], stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
    # Real lists are needed here: the cells are replaced by index below
    # (the former map(lambda(v): ...) form is Python 2 only syntax)
    if nb_days:
        average_row = [int(v/nb_days) for v in row]
    else:
        average_row = [0] * len(row)

    average_row[0] = self._('Average')
    average_row[4] = bytesToStr(average_row[4])
    average_row[5] = bytesToStr(average_row[5])
    days.appendRow(average_row)

    row[0] = self._('Total')
    row[4] = bytesToStr(row[4])
    row[5] = bytesToStr(row[5])
    days.appendRow(row)
    page.appendBlock(days)
    self.display.addPage(page)
2014-11-20 08:18:31 +01:00
2014-12-03 21:58:55 +01:00
def _generateDisplayMonthStats(self, page, year, month_stats):
    """Append the summary table of year to page and build 'YEAR/_stats.html'.

    Two tables are filled in lock-step: the full one (with a 'Details' link
    column) goes into page, the other one (without that column) is written
    to its own page.
    """
    now = time.localtime()
    months_name = ['', self._('Jan'), self._('Feb'), self._('Mar'), self._('Apr'), self._('May'), self._('June'), self._('Jul'), self._('Aug'), self._('Sep'), self._('Oct'), self._('Nov'), self._('Dec')]
    title = '%s %d' % (self._('Summary'), year)
    cols = [self._('Month'), self._('Visitors'), self._('Visits'), self._('Pages'), self._('Hits'), self._('Bandwidth'), self._('Not viewed Bandwidth'), self._('Details')]
    graph_cols = range(1,7)
    months = self.display.createBlock(DisplayHTMLBlockTableWithGraph, title, cols, None, 12, graph_cols)
    months.setColsCSSClass(['', 'iwla_visitor', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth', ''])
    # NOTE(review): the reduced table also drops the last graph column - confirm intended
    months_ = self.display.createBlock(DisplayHTMLBlockTableWithGraph, title, cols[:-1], None, 12, graph_cols[:-1])
    months_.setColsCSSClass(['', 'iwla_visitor', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth'])
    total = [0] * len(cols)
    for month_no in range(1, 13):
        row_idx = month_no - 1
        month = '%s<br/>%d' % (months_name[month_no], year)
        full_month = '%s %d' % (months_name[month_no], year)
        if month_no not in month_stats:
            row = [full_month, 0, 0, 0, 0, 0, 0, '']
        else:
            stats = month_stats[month_no]
            link = '<a href="%d/%02d/index.html">%s</a>' % (year, month_no, self._('Details'))
            row = [full_month, stats['nb_visitors'], stats['nb_visits'], stats['viewed_pages'], stats['viewed_hits'],
                   stats['viewed_bandwidth'], stats['not_viewed_bandwidth'], link]
            for col in graph_cols:
                total[col] += row[col]
        # Statement order below is kept as is: the display calls may share
        # the row object
        months.appendRow(row)
        months.setCellValue(row_idx, 5, bytesToStr(row[5]))
        months.setCellValue(row_idx, 6, bytesToStr(row[6]))
        months.appendShortTitle(month)
        months_.appendRow(row[:-1])
        months_.setCellValue(row_idx, 5, bytesToStr(row[5]))
        months_.setCellValue(row_idx, 6, bytesToStr(row[6]))
        months_.appendShortTitle(month)
        # Highlight the current month
        if year == now.tm_year and month_no == now.tm_mon:
            css = months.getCellCSSClass(row_idx, 0)
            if css:
                css = '%s %s' % (css, 'iwla_curday')
            else:
                css = 'iwla_curday'
            months.setCellCSSClass(row_idx, 0, css)
            months_.setCellCSSClass(row_idx, 0, css)

    total[0] = self._('Total')
    total[5] = bytesToStr(total[5])
    total[6] = bytesToStr(total[6])
    total[7] = u''
    months.appendRow(total)
    page.appendBlock(months)

    months_.appendRow(total[:-1])
    filename = '%d/_stats.html' % (year)
    page_ = self.display.createPage(u'', filename, conf.css_path)
    page_.appendBlock(months_)
    page_.build(conf.DISPLAY_ROOT, False)
2014-12-03 21:58:55 +01:00
def _generateDisplayWholeMonthStats(self):
    """Build the main 'index.html': last update, analysis duration and one summary per year."""
    title = '%s %s' % (self._('Statistics for'), conf.domain_name)
    filename = 'index.html'
    self.logger.info('==> Generate main page (%s)' % (filename))

    page = self.display.createPage(title, filename, conf.css_path)

    # '%d' is already zero padded on two digits; the former '%02d' relied on
    # a non standard strftime extension
    last_update = u'<b>%s</b> %s<br />' % (self._(u'Last update'), time.strftime('%d %b %Y %H:%M', time.localtime()))
    page.appendBlock(self.display.createBlock(DisplayHTMLRaw, last_update))

    # Use the whole duration: timedelta.seconds alone drops complete days
    elapsed = datetime.now() - self.start_time
    total_seconds = max(0, elapsed.days * 86400 + elapsed.seconds)
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    time_analysis = u'<b>%s</b> ' % (self._('Time analysis'))
    if hours:
        time_analysis += u'%d %s, ' % (hours, self._(u'hours'))
    time_analysis += u'%d %s and %d %s<br />' % (minutes, self._(u'minutes'), seconds, self._(u'seconds'))
    page.appendBlock(self.display.createBlock(DisplayHTMLRaw, time_analysis))

    # Most recent year first
    for year in sorted(self.meta_infos['stats'].keys(), reverse=True):
        self._generateDisplayMonthStats(page, year, self.meta_infos['stats'][year])

    self.display.addPage(page)
2014-12-15 21:28:25 +01:00
def _compressFile(self, build_time, root, filename):
path = os.path.join(root, filename)
gz_path = path + '.gz'
2014-12-16 20:23:33 +01:00
self.logger.debug('Compress %s => %s' % (path, gz_path))
2014-12-15 21:28:25 +01:00
if not os.path.exists(gz_path) or\
os.stat(path).st_mtime > build_time:
2014-12-19 18:06:02 +01:00
with open(path, 'rb') as f_in, gzip.open(gz_path, 'wb') as f_out:
f_out.write(f_in.read())
2014-12-15 21:28:25 +01:00
def _compressFiles(self, build_time, root):
    """Compress every file below root whose name ends with one of
    conf.compress_output_files (nothing is done when that list is empty)."""
    if not conf.compress_output_files:
        return
    for rootdir, subdirs, files in os.walk(root, followlinks=True):
        for name in files:
            if any(name.endswith(ext) for ext in conf.compress_output_files):
                self._compressFile(build_time, rootdir, name)
2014-11-21 14:46:12 +01:00
def _generateDisplay(self):
    """Generate the pages, let the display plugins run, then build and compress the output."""
    self._generateDisplayDaysStats()
    self._callPlugins(conf.DISPLAY_HOOK_DIRECTORY)
    self._generateDisplayWholeMonthStats()
    # Seconds since the epoch, so that it can be compared with the st_mtime
    # of the built files (a struct_time cannot)
    build_time = time.time()
    self.display.build(conf.DISPLAY_ROOT)
    self._compressFiles(build_time, conf.DISPLAY_ROOT)
2014-11-20 08:18:31 +01:00
def _createEmptyStats(self):
2014-11-21 14:46:12 +01:00
stats = {}
stats['viewed_bandwidth'] = 0
stats['not_viewed_bandwidth'] = 0
stats['viewed_pages'] = 0
stats['viewed_hits'] = 0
2014-12-15 20:43:43 +01:00
stats['nb_visits'] = 0
2014-11-18 20:18:53 +01:00
2014-11-21 14:46:12 +01:00
return stats
2014-11-21 10:41:29 +01:00
2014-11-21 14:46:12 +01:00
def _generateMonthStats(self):
    """Sum the days statistics into month statistics, run the post analysis
    plugins, save both databases and generate the display."""
    self._clearDisplay()

    visits = self.current_analysis['visits']

    # Month counters are the sum of the day counters
    stats = self._createEmptyStats()
    for day_stats in self.current_analysis['days_stats'].values():
        for name in stats.keys():
            stats[name] += day_stats[name]

    # Independent copy stored in the meta database
    duplicated_stats = dict(stats)

    cur_time = self.meta_infos['last_time']
    self.logger.info("== Stats for %d/%02d ==" % (cur_time.tm_year, cur_time.tm_mon))
    self.logger.info(stats)

    if 'month_stats' in self.current_analysis:
        # Keep the existing dictionary and only overwrite the counters
        month_stats = self.current_analysis['month_stats']
        for (name, value) in stats.items():
            month_stats[name] = value
    else:
        self.current_analysis['month_stats'] = stats

    self.valid_visitors = {}
    for (address, visitor) in visits.items():
        if not self.isValidVisitor(visitor):
            continue
        self.valid_visitors[address] = visitor

    nb_visitors = len(self.valid_visitors.keys())
    stats['nb_visitors'] = nb_visitors
    duplicated_stats['nb_visitors'] = nb_visitors

    self._callPlugins(conf.POST_HOOK_DIRECTORY)

    path = self.getDBFilename(cur_time)
    if os.path.exists(path):
        os.remove(path)

    self.logger.info("==> Serialize to %s" % (path))
    self._serialize(self.current_analysis, path)

    # Save month stats
    year = cur_time.tm_year
    month = cur_time.tm_mon
    if 'stats' not in self.meta_infos:
        self.meta_infos['stats'] = {}
    if year not in self.meta_infos['stats']:
        self.meta_infos['stats'][year] = {}
    self.meta_infos['stats'][year][month] = duplicated_stats

    self.logger.info("==> Serialize to %s" % (conf.META_PATH))
    self._serialize(self.meta_infos, conf.META_PATH)

    self._generateDisplay()
2014-11-21 14:46:12 +01:00
def _generateDayStats(self):
    """Run the pre analysis plugins, then compute and store the statistics
    of the day of the last analysed hit."""
    visits = self.current_analysis['visits']
    cur_time = self.meta_infos['last_time']

    self._callPlugins(conf.PRE_HOOK_DIRECTORY)

    stats = self._createEmptyStats()
    today = cur_time.tm_mday

    for visitor in visits.values():
        # Only visitors seen on the current day
        if visitor['last_access'].tm_mday != today:
            continue
        has_viewed_page = False
        # Walk the requests from the most recent one and stop at the first
        # request of another day
        for hit in visitor['requests'][::-1]:
            if hit['time_decoded'].tm_mday != today:
                break
            sent = int(hit['body_bytes_sent'])
            if visitor['robot'] or not self.hasBeenViewed(hit):
                stats['not_viewed_bandwidth'] += sent
                continue
            stats['viewed_bandwidth'] += sent
            if not hit['is_page']:
                stats['viewed_hits'] += 1
            else:
                stats['viewed_pages'] += 1
                has_viewed_page = True
        if (conf.count_hit_only_visitors or has_viewed_page) and not visitor['robot']:
            stats['nb_visits'] += 1

    self.logger.info("== Stats for %d/%02d/%02d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday))
    self.logger.info(stats)

    self.current_analysis['days_stats'][today] = stats
def _newHit(self, hit):
    """Take one parsed log line into account.

    Day statistics are generated when the day changes and month statistics
    when the month changes. Returns True when the hit has been appended to
    the current analysis, False when it has been ignored (other domain,
    already analysed, bad request, method other than GET/POST).
    """
    if not self.domain_name_re.match(hit['server_name']):
        # Lazy formatting: the hit is only rendered when debug is enabled
        self.logger.debug("Not in domain %s", hit)
        return False

    t = self._decodeTime(hit)

    cur_time = self.meta_infos['last_time']

    if cur_time is None:
        self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
        self.analyse_started = True
    else:
        # Skip everything up to the last hit of the previous run
        if not self.analyse_started and\
           time.mktime(t) <= time.mktime(cur_time):
            self.logger.debug("Not in time")
            return False
        self.analyse_started = True
        # Compare year and month: the same month of another year is a new
        # month too
        if (cur_time.tm_year, cur_time.tm_mon) != (t.tm_year, t.tm_mon):
            self._generateDayStats()
            self._generateMonthStats()
            self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
        elif cur_time.tm_mday != t.tm_mday:
            self._generateDayStats()

    self.meta_infos['last_time'] = t

    if not self.meta_infos['start_analysis_time']:
        self.meta_infos['start_analysis_time'] = t

    if not self._decodeHTTPRequest(hit): return False

    if hit['extract_request']['http_method'] not in ('GET', 'POST'):
        return False

    # '-' and '*' stand for empty fields in the log
    for k in hit.keys():
        if hit[k] == '-' or hit[k] == '*':
            hit[k] = ''

    self._appendHit(hit)

    return True
def _reset(self):
    """Restart the analysis at the month given by the --reset option ('MM/YYYY').

    The last analysed time is set to the beginning of that month, and the
    database and output directories of every month from the reset month up
    to the current month are removed. Months before the reset month are
    kept, including when the reset year is not the current year.
    """
    reset_time = time.strptime(self.args.reset, '%m/%Y')

    self.logger.info('Reset time')
    self.logger.info(reset_time)

    self.meta_infos['last_time'] = reset_time

    cur_time = time.localtime()
    last_month = (cur_time.tm_year, cur_time.tm_mon)
    year, month = reset_time.tm_year, reset_time.tm_mon
    while (year, month) <= last_month:
        for root in (conf.DB_ROOT, conf.DISPLAY_ROOT):
            path = os.path.join(root, str(year), '%02d' % (month))
            if os.path.exists(path): shutil.rmtree(path)
        month += 1
        if month > 12:
            year, month = year + 1, 1
def start(self, _file, args):
    """Run a whole analysis: load the previous databases, analyse every line
    of _file and generate statistics if something new has been found.

    _file -- iterable of log lines
    args  -- parsed command line arguments (reset and dont_compress are read)
    """
    self.args = args
    self.start_time = datetime.now()

    self.logger.info('==> Load previous database')

    self.meta_infos = self._deserialize(conf.META_PATH) or self._clearMeta()
    if not self.meta_infos['last_time']:
        self._clearVisits()
    else:
        if args.reset:
            self._reset()
        self.logger.info('Last time')
        self.logger.info(self.meta_infos['last_time'])
        db_filename = self.getDBFilename(self.meta_infos['last_time'])
        self.current_analysis = self._deserialize(db_filename) or self._clearVisits()

    self.meta_infos['start_analysis_time'] = None

    self.cache_plugins = preloadPlugins(self.plugins, self)

    self.logger.info('==> Analysing log')

    for line in _file:
        match = self.log_re.match(line)
        if not match:
            self.logger.warning("No match for %s" % (line))
            continue
        self._newHit(match.groupdict())

    if not self.analyse_started:
        self.logger.info('==> Analyse not started : nothing new')
    else:
        self._generateDayStats()
        self._generateMonthStats()
        del self.meta_infos['start_analysis_time']
2014-11-21 14:46:12 +01:00
class FileIter(object):
    """Iterate over the lines of several files as if they were a single one.

    filenames is a comma separated list of paths; names ending with 'gz'
    are opened with gzip. Lines are returned without their trailing newline.
    The program exits with an error when one of the files does not exist.
    """

    def __init__(self, filenames):
        self.filenames = [f for f in filenames.split(',') if f]
        for f in self.filenames:
            if not os.path.exists(f):
                print('No such file \'%s\'' % (f))
                sys.exit(-1)
        self.cur_file = None
        # Nothing to open when no file name was given: iteration is then
        # simply empty
        if self.filenames:
            self._openNextFile()

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def _openNextFile(self):
        """Close the current file and open the next one (StopIteration when none is left)."""
        if self.cur_file:
            self.cur_file.close()
            self.cur_file = None
        if not self.filenames:
            raise StopIteration()
        filename = self.filenames.pop(0)
        if filename.endswith('gz'):
            self.cur_file = gzip.open(filename, 'r')
        else:
            self.cur_file = open(filename)

    def next(self):
        """Return the next line, moving on to the following files when needed."""
        # Loop so that empty files are skipped instead of producing a
        # spurious empty line
        while self.cur_file is not None:
            l = self.cur_file.readline()
            if l:
                # Only strip a real newline: the last line of a file may
                # not have one
                if l[-1:] in ('\n', b'\n'):
                    return l[:-1]
                return l
            self._openNextFile()
        raise StopIteration()
2014-11-21 16:56:58 +01:00
if __name__ == '__main__':
    # Command line entry point: parse the options, merge the user
    # configuration into the default one, then run the analysis
    parser = argparse.ArgumentParser(description='Intelligent Web Log Analyzer')

    parser.add_argument('-c', '--clean-output', dest='clean_output', action='store_true',
                        default=False,
                        help='Clean output before starting')

    parser.add_argument('-i', '--stdin', dest='stdin', action='store_true',
                        default=False,
                        help='Read data from stdin instead of conf.analyzed_filename')

    parser.add_argument('-f', '--file', dest='file',
                        help='Analyse this log file, multiple files can be specified (comma separated). gz files are accepted')

    parser.add_argument('-d', '--log-level', dest='loglevel',
                        default='INFO', type=str,
                        help='Loglevel in %s, default : %s' % (['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], 'INFO'))

    parser.add_argument('-r', '--reset', dest='reset',
                        default=False,
                        help='Reset analysis to a specific date (month/year)')

    parser.add_argument('-z', '--dont-compress', dest='dont_compress', action='store_true',
                        default=False,
                        help='Don\'t compress databases (bigger but faster, not compatible with compressed databases)')

    args = parser.parse_args()

    # Load user conf
    for (k,v) in user_conf.__dict__.items():
        # Module internals (__name__, __file__, __builtins__...) are not
        # configuration values and must not replace those of the default conf
        if k.startswith('__') and k.endswith('__'):
            continue
        if k.endswith('_append'):
            # "<name>_append" extends the list <name> of the default conf
            new_k = k[:-7]
            if new_k in dir(conf):
                if isinstance(conf.__dict__[new_k], list):
                    if isinstance(v, list):
                        conf.__dict__[new_k] += v
                    else:
                        conf.__dict__[new_k].append(v)
                else:
                    print("Error %s is not a list" % (new_k))
            else:
                print("Error %s doesn't exists in default conf" % (new_k))
        else:
            conf.__dict__.update({k:v})

    if args.clean_output:
        if os.path.exists(conf.DB_ROOT): shutil.rmtree(conf.DB_ROOT)
        if os.path.exists(conf.DISPLAY_ROOT): shutil.rmtree(conf.DISPLAY_ROOT)

    loglevel = getattr(logging, args.loglevel.upper(), None)
    if not isinstance(loglevel, int):
        raise ValueError('Invalid log level: %s' % (args.loglevel))

    iwla = IWLA(loglevel)

    required_conf = ['analyzed_filename', 'domain_name']
    if not validConfRequirements(required_conf, iwla, 'Main Conf'):
        # NOTE(review): exits with status 0 although the configuration is
        # invalid - confirm whether callers rely on it
        sys.exit(0)

    if args.stdin:
        iwla.start(sys.stdin, args)
    else:
        filename = args.file or conf.analyzed_filename
        iwla.start(FileIter(filename), args)