bb268114b2
Fix error: call post hook plugins even in display only mode. Don't compute unordered hits (remove past hits if they are found after the current one). Remove tags in stats diff. Don't do geolocalisation if visitor is not valid. Don't try to find search engine on robots. Update robot check rules. Add top_pages_diff plugin.
907 lines
33 KiB
Python
Executable File
907 lines
33 KiB
Python
Executable File
#!/usr/bin/env python
|
|
# -*- coding: utf-8 -*-
|
|
#
|
|
# Copyright Grégory Soutadé 2015
|
|
|
|
# This file is part of iwla
|
|
|
|
# iwla is free software: you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# iwla is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with iwla. If not, see <http://www.gnu.org/licenses/>.
|
|
#
|
|
|
|
import os
|
|
import shutil
|
|
import sys
|
|
import re
|
|
import time
|
|
import cPickle
|
|
import gzip
|
|
import importlib
|
|
import argparse
|
|
import logging
|
|
import gettext
|
|
from calendar import monthrange
|
|
from datetime import date, datetime
|
|
|
|
import default_conf as conf
|
|
import conf as user_conf
|
|
|
|
from iplugin import *
|
|
from display import *
|
|
|
|
"""
|
|
Main class IWLA
|
|
Parse logs, compute statistics, call plugins and produce output
|
|
For now, only HTTP logs are valid
|
|
|
|
Plugin requirements :
|
|
None
|
|
|
|
Conf values needed :
|
|
analyzed_filename
|
|
domain_name
|
|
locales_path
|
|
compress_output_files*
|
|
|
|
Output files :
|
|
DB_ROOT/meta.db
|
|
DB_ROOT/year/month/iwla.db
|
|
OUTPUT_ROOT/index.html
|
|
OUTPUT_ROOT/year/_stats.html
|
|
OUTPUT_ROOT/year/month/index.html
|
|
|
|
Statistics creation :
|
|
|
|
meta :
|
|
last_time
|
|
start_analysis_time
|
|
stats =>
|
|
year =>
|
|
month =>
|
|
viewed_bandwidth
|
|
not_viewed_bandwidth
|
|
viewed_pages
|
|
viewed_hits
|
|
nb_visits
|
|
nb_visitors
|
|
|
|
month_stats :
|
|
viewed_bandwidth
|
|
not_viewed_bandwidth
|
|
viewed_pages
|
|
viewed_hits
|
|
nb_visits
|
|
|
|
days_stats :
|
|
day =>
|
|
viewed_bandwidth
|
|
not_viewed_bandwidth
|
|
viewed_pages
|
|
viewed_hits
|
|
nb_visits
|
|
nb_visitors
|
|
|
|
visits :
|
|
remote_addr =>
|
|
remote_addr
|
|
remote_ip
|
|
viewed_pages{0..31} # 0 contains total
|
|
viewed_hits{0..31} # 0 contains total
|
|
not_viewed_pages{0..31}
|
|
not_viewed_hits{0..31}
|
|
bandwidth{0..31}
|
|
last_access
|
|
requests =>
|
|
[fields_from_format_log]
|
|
extract_request =>
|
|
http_method
|
|
http_uri
|
|
http_version
|
|
extract_uri
|
|
extract_parameters*
|
|
extract_referer* =>
|
|
extract_uri
|
|
extract_parameters*
|
|
robot
|
|
hit_only
|
|
is_page
|
|
|
|
valid_visitors:
|
|
month_stats without robot and hit only visitors (if not conf.count_hit_only_visitors)
|
|
|
|
Statistics update :
|
|
None
|
|
|
|
Statistics deletion :
|
|
None
|
|
"""
|
|
|
|
|
|
class IWLA(object):
    """
    Main analyzer: parse log lines, aggregate them per visitor, per day and
    per month, call the configured plugins and build the HTML display.

    State kept on the instance:
      meta_infos        -- dict with 'last_time', 'start_analysis_time' and,
                           once a month has been saved, 'stats'[year][month]
      current_analysis  -- dict with 'days_stats', 'month_stats', 'visits'
      valid_visitors    -- subset of visits accepted by isValidVisitor()
                           (filled by _generateMonthStats())
      args              -- command line namespace, set by start()
    """

    ANALYSIS_CLASS = 'HTTP'
    API_VERSION = 1
    IWLA_VERSION = '0.5-dev'

    def __init__(self, logLevel, dry_run):
        """
        Build regular expressions from the configuration, set up logging
        and load the translation catalog.

        logLevel -- numeric level given to logging.basicConfig()
        dry_run  -- when true, _serialize() and _compressFile() write nothing
        """
        self.meta_infos = {}
        self.analyse_started = False
        self.current_analysis = {}
        self.start_time = 0
        self.cache_plugins = {}
        self.display = DisplayHTMLBuild(self)
        self.valid_visitors = None
        self.dry_run = dry_run

        # Escape every character of the log format that is not a word
        # character, '$' or '?', then turn each $name into a named group.
        self.log_format_extracted = re.sub(r'([^\$?\w])', r'\\\g<1>', conf.log_format)
        self.log_format_extracted = re.sub(r'\$(\w+)', r'(?P<\g<1>>.+)', self.log_format_extracted)
        self.http_request_extracted = re.compile(r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')
        self.log_re = re.compile(self.log_format_extracted)
        self.uri_re = re.compile(r'(?P<extract_uri>[^\?#]+)(\?(?P<extract_parameters>[^#]+))?(#.*)?')
        # NOTE(review): domain_name is inserted unescaped, so it is
        # interpreted as a regular expression -- confirm this is intended.
        self.domain_name_re = re.compile(r'.*%s' % conf.domain_name)
        self.final_slashes_re = re.compile(r'/+$')
        self.plugins = [(conf.PRE_HOOK_DIRECTORY , conf.pre_analysis_hooks),
                        (conf.POST_HOOK_DIRECTORY , conf.post_analysis_hooks),
                        (conf.DISPLAY_HOOK_DIRECTORY , conf.display_hooks)]

        logging.basicConfig(format='%(name)s %(message)s', level=logLevel)
        self.logger = logging.getLogger(self.__class__.__name__)
        if self.dry_run:
            self.logger.info('==> Start (DRY RUN)')
        else:
            self.logger.info('==> Start')
        try:
            t = gettext.translation('iwla', localedir=conf.locales_path, languages=[conf.locale], codeset='utf8')
            self.logger.info('\tUsing locale %s' % (conf.locale))
        except IOError:
            t = gettext.NullTranslations()
            self.logger.info('\tUsing default locale en_EN')
        # ugettext is the Python 2 unicode translation entry point
        self._ = t.ugettext

    def getVersion(self):
        """Return the IWLA version string."""
        return IWLA.IWLA_VERSION

    def getConfValue(self, key, default=None):
        """Return configuration value `key`, or `default` if it is not defined."""
        return getattr(conf, key, default)

    def _clearVisits(self):
        """Reset the current (monthly) analysis and return the new empty dict."""
        self.current_analysis = {
            'days_stats' : {},
            'month_stats' : {},
            'visits' : {}
        }
        self.valid_visitors = None
        return self.current_analysis

    def getDaysStats(self):
        """Return the per day statistics of the current analysis."""
        return self.current_analysis['days_stats']

    def getMonthStats(self):
        """Return the month statistics of the current analysis."""
        return self.current_analysis['month_stats']

    def getCurrentVisits(self):
        """Return the visits (remote_addr => visitor dict) of the current analysis."""
        return self.current_analysis['visits']

    def getValidVisitors(self):
        """Return valid visitors (None until _generateMonthStats() has run)."""
        return self.valid_visitors

    def getDisplay(self):
        """Return the display builder."""
        return self.display

    def getCurTime(self):
        """Return the time of the last accepted hit."""
        return self.meta_infos['last_time']

    def getStartAnalysisTime(self):
        """Return the time of the first hit accepted during this run."""
        return self.meta_infos['start_analysis_time']

    def isValidForCurrentAnalysis(self, request):
        """
        Return True if `request` is strictly newer than the start of the
        current analysis, False otherwise or if analysis is not started.
        """
        cur_time = self.meta_infos['start_analysis_time']
        # Analyse not started
        if not cur_time: return False
        return (time.mktime(cur_time) < time.mktime(request['time_decoded']))

    def hasBeenViewed(self, request):
        """Return True if the request status is one of conf.viewed_http_codes."""
        return int(request['status']) in conf.viewed_http_codes

    def getCurDisplayPath(self, filename):
        """Return 'year/month/filename' for the month of the last hit."""
        cur_time = self.meta_infos['last_time']
        return os.path.join(str(cur_time.tm_year), '%02d' % (cur_time.tm_mon), filename)

    def getResourcesPath(self):
        """Return conf.resources_path."""
        return conf.resources_path

    def getCSSPath(self):
        """Return conf.css_path."""
        return conf.css_path

    def _clearMeta(self):
        """Reset meta information and return the new dict."""
        self.meta_infos = {
            'last_time' : None,
            'start_analysis_time' : None
        }
        return self.meta_infos

    def _clearDisplay(self):
        """Clear the display builder and return it."""
        self.display.clear()
        return self.display

    def getDBFilename(self, time):
        """Return the database path for the month of `time` (a struct_time)."""
        return os.path.join(conf.DB_ROOT, str(time.tm_year), '%02d' % (time.tm_mon), conf.DB_FILENAME)

    def _openDB(self, filename, prot='r'):
        """Open a database file, through gzip unless --dont-compress was given."""
        if self.args.dont_compress:
            return open(filename, prot)
        else:
            return gzip.open(filename, prot)

    def _serialize(self, obj, filename):
        """
        Pickle `obj` into `filename` (nothing is written in dry run mode).

        The previous file, if any, is kept as filename + '.bak' until the
        new content has been written.
        """
        if self.dry_run: return
        base = os.path.dirname(filename)
        # dirname is empty for a bare filename: nothing to create then
        if base and not os.path.exists(base):
            os.makedirs(base)

        # Make a backup in case of something fails
        if os.path.exists(filename):
            shutil.copy(filename, filename + '.bak')

        with open(filename + '.tmp', 'wb+') as f, self._openDB(filename, 'w') as fzip:
            cPickle.dump(obj, f)
            f.seek(0)
            fzip.write(f.read())
            # Push buffered data down to the OS, otherwise fsync() has
            # nothing to synchronize yet
            fzip.flush()
            os.fsync(fzip.fileno())
        os.remove(filename + '.tmp')
        if os.path.exists(filename + '.bak'):
            os.remove(filename + '.bak')

    def _deserialize(self, filename):
        """
        Return the object pickled in `filename`, or None if it doesn't exist.

        NOTE: pickle must only be used on trusted files; databases are
        expected to have been written by _serialize().
        """
        if not os.path.exists(filename):
            return None

        with self._openDB(filename) as f:
            return cPickle.load(f)

    def _callPlugins(self, target_root, *args):
        """Call hook(*args) of every loaded plugin registered for `target_root`."""
        self.logger.info('==> Call plugins (%s)' % (target_root))
        for (root, plugins) in self.plugins:
            if root != target_root: continue
            for p in plugins:
                mod = self.cache_plugins.get(root + '.' + p, None)
                if mod:
                    self.logger.info('\t%s' % (p))
                    mod.hook(*args)

    def isPage(self, request):
        """Return True if `request` ends with one of conf.pages_extensions."""
        self.logger.debug("Is page %s", request)
        result = request.endswith(tuple(conf.pages_extensions))
        self.logger.debug(str(result))
        return result

    def isMultimediaFile(self, request):
        """Return True if `request` ends with one of conf.multimedia_files."""
        self.logger.debug("Is multimedia %s", request)
        result = request.endswith(tuple(conf.multimedia_files))
        self.logger.debug(str(result))
        return result

    def isValidVisitor(self, hit):
        """
        Return False for robots and, unless conf.count_hit_only_visitors is
        set, for visitors without any viewed page.
        """
        if hit['robot']: return False
        if not conf.count_hit_only_visitors and not hit['viewed_pages'][0]:
            return False
        return True

    def isRobot(self, hit):
        """Return the 'robot' flag of a visitor."""
        return hit['robot']

    def _appendHit(self, hit):
        """Account a decoded hit to its visitor (created if needed)."""
        remote_addr = hit['remote_addr']

        if not remote_addr: return

        visits = self.current_analysis['visits']
        if remote_addr not in visits:
            self._createVisitor(hit)

        super_hit = visits[remote_addr]
        # Don't keep all requests for robots
        if not super_hit['robot']:
            super_hit['requests'].append(hit)

        day = self.meta_infos['last_time'].tm_mday
        viewed = self.hasBeenViewed(hit)
        if viewed:
            bytes_sent = int(hit['body_bytes_sent'])
            super_hit['bandwidth'][day] = super_hit['bandwidth'].get(day, 0) + bytes_sent
            super_hit['bandwidth'][0] += bytes_sent
        super_hit['last_access'] = self.meta_infos['last_time']

        request = hit['extract_request']
        uri = request.get('extract_uri', request['http_uri'])
        hit['is_page'] = self.isPage(uri)

        if super_hit['robot'] or not viewed:
            page_key = 'not_viewed_pages'
            hit_key = 'not_viewed_hits'
        else:
            page_key = 'viewed_pages'
            hit_key = 'viewed_hits'

        # Index 0 holds the total, other indexes are days of month
        key = page_key if hit['is_page'] else hit_key
        super_hit[key][day] = super_hit[key].get(day, 0) + 1
        super_hit[key][0] += 1

    def _createVisitor(self, hit):
        """Register an empty visitor for hit['remote_addr']."""
        super_hit = self.current_analysis['visits'][hit['remote_addr']] = {}
        super_hit['remote_addr'] = hit['remote_addr']
        super_hit['remote_ip'] = hit['remote_addr']
        super_hit['viewed_pages'] = {0:0}
        super_hit['viewed_hits'] = {0:0}
        super_hit['not_viewed_pages'] = {0:0}
        super_hit['not_viewed_hits'] = {0:0}
        super_hit['bandwidth'] = {0:0}
        super_hit['last_access'] = self.meta_infos['last_time']
        super_hit['requests'] = []
        super_hit['robot'] = False
        super_hit['hit_only'] = 0

    def _normalizeURI(self, uri):
        """Collapse trailing slashes of `uri` into a single one."""
        if uri == '/': return uri
        return self.final_slashes_re.sub('/', uri)

    def _removeFinalSlashes(self, uri):
        """Remove trailing slashes of `uri` (except for '/')."""
        if uri == '/': return uri
        return self.final_slashes_re.sub('', uri)

    def _normalizeParameters(self, parameters):
        """Return `parameters`, or None if it is only '?'."""
        # No parameters
        if parameters == '?': return None
        return parameters

    def _decodeHTTPRequest(self, hit):
        """
        Split hit['request'] into hit['extract_request'] and, if a referer
        is present, hit['http_referer'] into hit['extract_referer'].

        Return False if there is no request or if it cannot be parsed.
        """
        if 'request' not in hit: return False

        groups = self.http_request_extracted.match(hit['request'])

        if not groups:
            self.logger.warning("Bad request extraction %s" % (hit['request']))
            return False

        hit['extract_request'] = groups.groupdict("")
        uri_groups = self.uri_re.match(hit['extract_request']['http_uri'])
        if uri_groups:
            d = uri_groups.groupdict("")
            hit['extract_request']['extract_uri'] = self._normalizeURI(d['extract_uri'])
            parameters = self._normalizeParameters(d.get('extract_parameters'))
            if parameters:
                hit['extract_request']['extract_parameters'] = parameters

        # The log format may not provide $http_referer
        referer = hit.get('http_referer')
        if referer:
            referer_groups = self.uri_re.match(referer)
            if referer_groups:
                extracted = referer_groups.groupdict("")
                extracted['extract_uri'] = self._removeFinalSlashes(extracted['extract_uri'])
                extracted['extract_parameters'] = self._normalizeParameters(extracted['extract_parameters'])
                hit['extract_referer'] = extracted
        return True

    def _decodeTime(self, hit):
        """
        Parse hit['time_local'] with conf.time_format, store the result in
        hit['time_decoded'] and return it.

        Raises ValueError if the time cannot be parsed.
        """
        try:
            hit['time_decoded'] = time.strptime(hit['time_local'], conf.time_format)
        except ValueError:
            if sys.version_info >= (3, 2):
                raise
            # Try without UTC value at the end (%z not recognized).
            # The offset itself is ignored: time is kept as written in log.
            hit['time_decoded'] = time.strptime(hit['time_local'][:-6], conf.time_format[:-3])
        return hit['time_decoded']

    def getDisplayIndex(self):
        """Return the index page of the current month from the display."""
        filename = self.getCurDisplayPath('index.html')

        return self.display.getPage(filename)

    def _generateDisplayDaysStats(self):
        """Build the 'By day' table of the current month index page."""
        cur_time = self.meta_infos['last_time']
        title = createCurTitle(self, self._('Statistics'))
        filename = self.getCurDisplayPath('index.html')
        self.logger.info('==> Generate display (%s)' % (filename))
        page = self.display.createPage(title, filename, conf.css_path)
        link = DisplayHTMLRaw(self, '<iframe src="../_stats.html"></iframe>')
        page.appendBlock(link)

        _, nb_month_days = monthrange(cur_time.tm_year, cur_time.tm_mon)
        days = self.display.createBlock(DisplayHTMLBlockTableWithGraph, self._('By day'), [self._('Day'), self._('Visits'), self._('Pages'), self._('Hits'), self._('Bandwidth'), self._('Not viewed Bandwidth')], None, nb_month_days, list(range(1,6)))
        days.setColsCSSClass(['', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth'])
        nb_visits = 0
        nb_days = 0
        days_stats = self.current_analysis['days_stats']
        month_name = time.strftime('%b', cur_time)
        today = date.today()
        for i in range(1, nb_month_days+1):
            day = '%d<br/>%s' % (i, month_name)
            full_day = '%02d %s %d' % (i, month_name, cur_time.tm_year)
            if i in days_stats:
                stats = days_stats[i]
                row = [full_day, stats['nb_visits'], stats['viewed_pages'], stats['viewed_hits'],
                       stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
                nb_visits += stats['nb_visits']
                nb_days += 1
            else:
                row = [full_day, 0, 0, 0, 0, 0]
            days.appendRow(row)
            # Displayed values are human readable, raw ones feed the graph
            days.setCellValue(i-1, 4, bytesToStr(row[4]))
            days.setCellValue(i-1, 5, bytesToStr(row[5]))
            days.appendShortTitle(day)
            adate = date(cur_time.tm_year, cur_time.tm_mon, i)
            if adate.weekday() in (5, 6):
                days.setRowCSSClass(i-1, 'iwla_weekend')
            if adate == today:
                css = days.getCellCSSClass(i-1, 0)
                if css: css = '%s %s' % (css, 'iwla_curday')
                else: css = 'iwla_curday'
                days.setCellCSSClass(i-1, 0, css)

        stats = self.current_analysis['month_stats']

        row = [0, nb_visits, stats['viewed_pages'], stats['viewed_hits'], stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
        # Average is computed on days that have statistics only
        if nb_days:
            average_row = [int(v/nb_days) for v in row]
        else:
            average_row = [0] * len(row)

        average_row[0] = self._('Average')
        average_row[4] = bytesToStr(average_row[4])
        average_row[5] = bytesToStr(average_row[5])
        days.appendRow(average_row)

        row[0] = self._('Total')
        row[4] = bytesToStr(row[4])
        row[5] = bytesToStr(row[5])
        days.appendRow(row)
        page.appendBlock(days)
        self.display.addPage(page)

    def _generateDisplayMonthStats(self, page, year, month_stats):
        """
        Append the summary table of `year` to `page` and build
        'year/_stats.html' with the same table minus the 'Details' column.

        month_stats -- dict month number => statistics of this month
        """
        cur_time = time.localtime()
        months_name = ['', self._('Jan'), self._('Feb'), self._('Mar'), self._('Apr'), self._('May'), self._('June'), self._('Jul'), self._('Aug'), self._('Sep'), self._('Oct'), self._('Nov'), self._('Dec')]
        title = '%s %d' % (self._('Summary'), year)
        cols = [self._('Month'), self._('Visitors'), self._('Visits'), self._('Pages'), self._('Hits'), self._('Bandwidth'), self._('Not viewed Bandwidth'), self._('Details')]
        graph_cols = list(range(1,7))
        months = self.display.createBlock(DisplayHTMLBlockTableWithGraph, title, cols, None, 12, graph_cols)
        months.setColsCSSClass(['', 'iwla_visitor', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth', ''])
        # Same table without last column, for year/_stats.html
        months_ = self.display.createBlock(DisplayHTMLBlockTableWithGraph, title, cols[:-1], None, 12, graph_cols[:-1])
        months_.setColsCSSClass(['', 'iwla_visitor', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth'])
        total = [0] * len(cols)
        for i in range(1, 13):
            month = '%s<br/>%d' % (months_name[i], year)
            full_month = '%s %d' % (months_name[i], year)
            if i in month_stats:
                stats = month_stats[i]
                link = '<a href="%d/%02d/index.html">%s</a>' % (year, i, self._('Details'))
                row = [full_month, stats['nb_visitors'], stats['nb_visits'], stats['viewed_pages'], stats['viewed_hits'],
                       stats['viewed_bandwidth'], stats['not_viewed_bandwidth'], link]
                for j in graph_cols:
                    total[j] += row[j]
            else:
                row = [full_month, 0, 0, 0, 0, 0, 0, '']
            for (table, table_row) in ((months, row), (months_, row[:-1])):
                table.appendRow(table_row)
                table.setCellValue(i-1, 5, bytesToStr(row[5]))
                table.setCellValue(i-1, 6, bytesToStr(row[6]))
                table.appendShortTitle(month)
            if year == cur_time.tm_year and i == cur_time.tm_mon:
                css = months.getCellCSSClass(i-1, 0)
                if css: css = '%s %s' % (css, 'iwla_curday')
                else: css = 'iwla_curday'
                months.setCellCSSClass(i-1, 0, css)
                months_.setCellCSSClass(i-1, 0, css)

        total[0] = self._('Total')
        total[5] = bytesToStr(total[5])
        total[6] = bytesToStr(total[6])
        total[7] = u''
        months.appendRow(total)
        page.appendBlock(months)

        months_.appendRow(total[:-1])
        filename = '%d/_stats.html' % (year)
        page_ = self.display.createPage(u'', filename, conf.css_path)
        page_.appendBlock(months_)
        page_.build(conf.DISPLAY_ROOT, False)

    def _generateDisplayWholeMonthStats(self):
        """Build the main index page : one summary table per analysed year."""
        title = '%s %s' % (self._('Statistics for'), conf.domain_name)
        filename = 'index.html'

        self.logger.info('==> Generate main page (%s)' % (filename))

        page = self.display.createPage(title, filename, conf.css_path)

        # %d is already zero padded; '%02d' relies on a C library extension
        last_update = u'<b>%s</b> %s<br />' % (self._(u'Last update'), time.strftime('%d %b %Y %H:%M', time.localtime()))
        page.appendBlock(self.display.createBlock(DisplayHTMLRaw, last_update))
        duration = datetime.now() - self.start_time
        duration = time.gmtime(duration.seconds)
        time_analysis = u'<b>%s</b> ' % (self._('Time analysis'))
        if duration.tm_hour:
            time_analysis += u'%d %s, ' % (duration.tm_hour, self._(u'hours'))
        time_analysis += u'%d %s and %d %s<br />' % (duration.tm_min, self._(u'minutes'), duration.tm_sec, self._(u'seconds'))
        page.appendBlock(self.display.createBlock(DisplayHTMLRaw, time_analysis))

        # 'stats' only exists once a month has been saved
        all_stats = self.meta_infos.get('stats', {})
        for year in sorted(all_stats.keys(), reverse=True):
            self._generateDisplayMonthStats(page, year, all_stats[year])

        self.display.addPage(page)

    def _compressFile(self, build_time, root, filename):
        """
        Create root/filename.gz if it doesn't exist or if root/filename
        has been modified since `build_time` (nothing is done in dry run).
        """
        path = os.path.join(root, filename)
        gz_path = path + '.gz'

        self.logger.debug('Compress %s => %s' % (path, gz_path))

        if not os.path.exists(gz_path) or\
           os.stat(path).st_mtime >= build_time:
            if self.dry_run: return
            with open(path, 'rb') as f_in, gzip.open(gz_path, 'wb') as f_out:
                # Copy by chunks instead of loading the whole file
                shutil.copyfileobj(f_in, f_out)

    def _compressFiles(self, build_time, root):
        """Compress files under `root` whose name ends with one of conf.compress_output_files."""
        if not conf.compress_output_files: return
        extensions = tuple(conf.compress_output_files)
        for rootdir, subdirs, files in os.walk(root, followlinks=True):
            for f in files:
                if f.endswith(extensions):
                    self._compressFile(build_time, rootdir, f)

    def _generateDisplay(self):
        """Generate all pages, call display plugins, build and compress output."""
        self._generateDisplayDaysStats()
        self._callPlugins(conf.DISPLAY_HOOK_DIRECTORY)
        self._generateDisplayWholeMonthStats()
        build_time = time.mktime(time.localtime())
        self.display.build(conf.DISPLAY_ROOT)
        self._compressFiles(build_time, conf.DISPLAY_ROOT)

    def _createEmptyStats(self):
        """Return a statistics dict with all counters set to 0."""
        return {
            'viewed_bandwidth' : 0,
            'not_viewed_bandwidth' : 0,
            'viewed_pages' : 0,
            'viewed_hits' : 0,
            'nb_visits' : 0,
        }

    def _generateMonthStats(self):
        """
        Sum days statistics into month statistics, compute valid visitors,
        call post analysis plugins, then (unless in display only mode)
        save databases. Display is generated in both cases.
        """
        self._clearDisplay()

        visits = self.current_analysis['visits']

        stats = self._createEmptyStats()
        for stat in self.current_analysis['days_stats'].values():
            for k in stats:
                stats[k] += stat[k]

        # Copy stored in meta database
        duplicated_stats = dict(stats)

        cur_time = self.meta_infos['last_time']
        self.logger.info("== Stats for %d/%02d ==" % (cur_time.tm_year, cur_time.tm_mon))
        self.logger.info(stats)

        if 'month_stats' not in self.current_analysis:
            self.current_analysis['month_stats'] = stats
        else:
            self.current_analysis['month_stats'].update(stats)

        self.valid_visitors = {}
        for (k,v) in visits.items():
            if self.isValidVisitor(v):
                self.valid_visitors[k] = v

        duplicated_stats['nb_visitors'] = stats['nb_visitors'] = len(self.valid_visitors)

        self._callPlugins(conf.POST_HOOK_DIRECTORY)

        # Use arguments given to start(), not the global of __main__
        if self.args.display_only:
            self._generateDisplay()
            return

        path = self.getDBFilename(cur_time)

        self.logger.info("==> Serialize to %s" % (path))
        self._serialize(self.current_analysis, path)

        # Save month stats
        year_stats = self.meta_infos.setdefault('stats', {}).setdefault(cur_time.tm_year, {})
        year_stats[cur_time.tm_mon] = duplicated_stats

        self.logger.info("==> Serialize to %s" % (conf.META_PATH))
        self._serialize(self.meta_infos, conf.META_PATH)

        self._generateDisplay()

    def _generateDayStats(self):
        """
        Call pre analysis plugins and compute statistics of the current day
        (day of the last hit). Nothing is done in display only mode.
        """
        # Use arguments given to start(), not the global of __main__
        if self.args.display_only:
            return

        visits = self.current_analysis['visits']
        cur_time = self.meta_infos['last_time']

        self._callPlugins(conf.PRE_HOOK_DIRECTORY)

        stats = self._createEmptyStats()

        day = cur_time.tm_mday
        for super_hit in visits.values():
            if super_hit['last_access'].tm_mday != day:
                continue
            if super_hit['robot']:
                stats['not_viewed_bandwidth'] += super_hit['bandwidth'].get(day, 0)
                continue
            day_hits = super_hit['viewed_hits'].get(day, 0)
            day_pages = super_hit['viewed_pages'].get(day, 0)
            stats['viewed_bandwidth'] += super_hit['bandwidth'].get(day, 0)
            stats['viewed_hits'] += day_hits
            stats['viewed_pages'] += day_pages
            if (conf.count_hit_only_visitors and day_hits) or day_pages:
                stats['nb_visits'] += 1

        self.logger.info("== Stats for %d/%02d/%02d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday))
        self.logger.info(stats)

        self.current_analysis['days_stats'][cur_time.tm_mday] = stats

    def _newHit(self, hit):
        """
        Handle one parsed log line (dict of log format fields).

        Return True if the hit has been accounted, False if it has been
        rejected (other domain, already analysed, in the past, bad request,
        method other than GET/POST).
        """
        if not self.domain_name_re.match(hit['server_name']):
            self.logger.debug("Not in domain %s", hit)
            return False

        t = self._decodeTime(hit)

        cur_time = self.meta_infos['last_time']

        if cur_time is None:
            self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
            self.analyse_started = True
        else:
            if not self.analyse_started and\
               time.mktime(t) <= time.mktime(cur_time):
                self.logger.debug("Not in time")
                return False
            self.analyse_started = True
            if t < cur_time: # Don't accept past hits
                return False
            # Compare year too : same month of another year is a new month
            if (cur_time.tm_year, cur_time.tm_mon) != (t.tm_year, t.tm_mon):
                self._generateDayStats()
                self._generateMonthStats()
                self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
            elif cur_time.tm_mday != t.tm_mday:
                self._generateDayStats()

        self.meta_infos['last_time'] = t

        if not self.meta_infos['start_analysis_time']:
            self.meta_infos['start_analysis_time'] = t

        if not self._decodeHTTPRequest(hit): return False

        if hit['extract_request']['http_method'] not in ['GET', 'POST']:
            return False

        # Empty fields are logged as '-' or '*'
        for k in hit.keys():
            if hit[k] == '-' or hit[k] == '*':
                hit[k] = ''

        self._appendHit(hit)

        return True

    def _reset(self):
        """
        Restart analysis from the month given by --reset (month/year) :
        set last_time to it and remove databases and output directories of
        every month from this one up to the current month.
        """
        reset_time = time.strptime(self.args.reset, '%m/%Y')

        self.logger.info('Reset time')
        self.logger.info(reset_time)

        self.meta_infos['last_time'] = reset_time

        cur_time = time.localtime()
        for year in range(reset_time.tm_year, cur_time.tm_year + 1):
            # Months before reset date are kept, and following years are
            # cleaned from January
            first_month = reset_time.tm_mon if year == reset_time.tm_year else 1
            last_month = cur_time.tm_mon if year == cur_time.tm_year else 12
            for month in range(first_month, last_month + 1):
                for root in (conf.DB_ROOT, conf.DISPLAY_ROOT):
                    path = os.path.join(root, str(year), '%02d' % (month))
                    if os.path.exists(path): shutil.rmtree(path)

    def start(self, _file, args):
        """
        Run the analysis.

        _file -- iterable of log lines
        args  -- command line namespace (reset, dont_compress and
                 display_only are read by this class)
        """
        self.args = args
        self.start_time = datetime.now()

        self.logger.info('==> Load previous database')

        self.meta_infos = self._deserialize(conf.META_PATH) or self._clearMeta()
        if self.meta_infos['last_time']:
            if args.reset:
                self._reset()
            self.logger.info('Last time')
            self.logger.info(self.meta_infos['last_time'])
            self.current_analysis = self._deserialize(self.getDBFilename(self.meta_infos['last_time'])) or self._clearVisits()
        else:
            self._clearVisits()

        self.meta_infos['start_analysis_time'] = None

        self.cache_plugins = preloadPlugins(self.plugins, self)

        self.logger.info('==> Analysing log')

        for l in _file:
            groups = self.log_re.match(l)

            if groups:
                self._newHit(groups.groupdict(""))
            else:
                self.logger.warning("No match for %s" % (l))

        if self.analyse_started:
            self._generateDayStats()
            self._generateMonthStats()
            del self.meta_infos['start_analysis_time']
        else:
            self.logger.info('==> Analyse not started : nothing new')
|
|
|
|
|
|
class FileIter(object):
    """
    Iterate over the lines of several files, one file after the other.

    Lines are returned without their trailing newline. Files whose name
    ends with 'gz' are opened with gzip.
    """

    def __init__(self, filenames):
        """
        filenames -- comma separated list of paths (empty names are ignored)

        Exit the program with status -1 if one of the files doesn't exist.
        """
        self.filenames = [f for f in filenames.split(',') if f]
        for f in self.filenames:
            if not os.path.exists(f):
                print('No such file \'%s\'' % (f))
                sys.exit(-1)
        self.cur_file = None
        try:
            self._openNextFile()
        except StopIteration:
            # No file given : iteration will simply be empty
            pass

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def _openNextFile(self):
        """
        Close the current file and open the next one.

        Raises StopIteration when there is no more file.
        """
        if self.cur_file:
            self.cur_file.close()
            self.cur_file = None
        if not self.filenames:
            raise StopIteration()
        filename = self.filenames.pop(0)
        if filename.endswith('gz'):
            self.cur_file = gzip.open(filename, 'r')
        else:
            self.cur_file = open(filename)

    def next(self):
        """
        Return the next line, moving on to the next file(s) when the
        current one is exhausted.

        Raises StopIteration after the last line of the last file.
        """
        # cur_file is None once every file has been read
        while self.cur_file is not None:
            l = self.cur_file.readline()
            if l:
                # Only strip the newline : last line of a file may not
                # have one
                return l.rstrip('\n')
            # End of file (an empty file is skipped the same way)
            self._openNextFile()
        raise StopIteration()
|
|
|
|
def mergeUserConf(default_conf, overrides):
    """
    Merge values of module `overrides` into module `default_conf`.

    A value named xxx_append is appended to the list `xxx` of default_conf
    (an error is printed if `xxx` doesn't exist or is not a list), any
    other value replaces the one with the same name.
    Names starting with '__' (module internals like __name__ or __file__)
    are not copied.
    """
    suffix = '_append'
    for (k,v) in overrides.__dict__.items():
        if k.startswith('__'):
            continue
        if not k.endswith(suffix):
            setattr(default_conf, k, v)
            continue
        new_k = k[:-len(suffix)]
        if not hasattr(default_conf, new_k):
            print("Error %s doesn't exists in default conf" % (new_k))
            continue
        target = getattr(default_conf, new_k)
        if not isinstance(target, list):
            print("Error %s is not a list" % (new_k))
        elif isinstance(v, list):
            target.extend(v)
        else:
            target.append(v)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Intelligent Web Log Analyzer')

    parser.add_argument('-c', '--clean-output', dest='clean_output', action='store_true',
                        default=False,
                        help='Clean output before starting')

    parser.add_argument('-i', '--stdin', dest='stdin', action='store_true',
                        default=False,
                        help='Read data from stdin instead of conf.analyzed_filename')

    parser.add_argument('-f', '--file', dest='file',
                        help='Analyse this log file, multiple files can be specified (comma separated). gz files are accepted')

    parser.add_argument('-d', '--log-level', dest='loglevel',
                        default='INFO', type=str,
                        help='Loglevel in %s, default : %s' % (['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], 'INFO'))

    parser.add_argument('-r', '--reset', dest='reset',
                        default=False,
                        help='Reset analysis to a specific date (month/year)')

    parser.add_argument('-z', '--dont-compress', dest='dont_compress', action='store_true',
                        default=False,
                        help='Don\'t compress databases (bigger but faster, not compatible with compressed databases)')

    parser.add_argument('-p', '--display-only', dest='display_only', action='store_true',
                        default=False,
                        help='Only generate display')

    parser.add_argument('-D', '--dry-run', dest='dry_run', action='store_true',
                        default=False,
                        help='Process log but don\'t write files (database and HTML) to disk')

    args = parser.parse_args()

    # Load user conf
    mergeUserConf(conf, user_conf)

    if args.clean_output and not args.dry_run:
        if os.path.exists(conf.DB_ROOT): shutil.rmtree(conf.DB_ROOT)
        if os.path.exists(conf.DISPLAY_ROOT): shutil.rmtree(conf.DISPLAY_ROOT)

    # Convert level name (any case) to logging numeric value
    loglevel = getattr(logging, args.loglevel.upper(), None)
    if not isinstance(loglevel, int):
        raise ValueError('Invalid log level: %s' % (args.loglevel))

    iwla = IWLA(loglevel, args.dry_run)

    required_conf = ['analyzed_filename', 'domain_name']
    if not validConfRequirements(required_conf, iwla, 'Main Conf'):
        sys.exit(0)

    if args.stdin:
        iwla.start(sys.stdin, args)
    else:
        filename = args.file or conf.analyzed_filename
        iwla.start(FileIter(filename), args)
|