#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Copyright Grégory Soutadé 2015
# This file is part of iwla
# iwla is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# iwla is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with iwla. If not, see <http://www.gnu.org/licenses/>.
#
import os
import shutil
import sys
import re
import time
import pickle
import gzip
import importlib
import argparse
import logging
import gettext
from calendar import monthrange
from datetime import date, datetime
import socket
import default_conf as conf
from iplugin import *
from display import *
"""
Main class IWLA
Parse Log, compute them, call plugins and produce output
For now, only HTTP log are valid
Plugin requirements :
None
Conf values needed :
analyzed_filename
domain_name
locales_path
compress_output_files
excluded_ip
excluded_domain_name
reverse_dns_timeout*
Output files :
DB_ROOT/meta.db
DB_ROOT/year/month/iwla.db
OUTPUT_ROOT/index.html
OUTPUT_ROOT/year/_stats.html
OUTPUT_ROOT/year/month/index.html
Statistics creation :
meta :
last_time
start_analysis_time
stats =>
year =>
month =>
viewed_bandwidth
not_viewed_bandwidth
viewed_pages
viewed_hits
nb_visits
nb_visitors
month_stats :
viewed_bandwidth
not_viewed_bandwidth
viewed_pages
viewed_hits
nb_visits
days_stats :
day =>
viewed_bandwidth
not_viewed_bandwidth
viewed_pages
viewed_hits
nb_visits
nb_visitors
visits :
remote_ip =>
remote_addr
remote_ip
viewed_pages{0..31} # 0 contains total
viewed_hits{0..31} # 0 contains total
not_viewed_pages{0..31}
not_viewed_hits{0..31}
bandwidth{0..31}
last_access
requests =>
[fields_from_format_log]
extract_request =>
http_method
http_uri
http_version
extract_uri
extract_parameters*
extract_referer* =>
extract_uri
extract_parameters*
robot
hit_only
is_page
keep_requests
valid_visitors:
month_stats without robot and hit only visitors (if not conf.count_hit_only_visitors)
Statistics update :
None
Statistics deletion :
None
"""
class IWLA(object):
ANALYSIS_CLASS = 'HTTP'
API_VERSION = 1
IWLA_VERSION = '0.8'
DEFAULT_DNS_TIMEOUT = 0.5
def __init__(self, logLevel, args):
self.meta_infos = {}
self.analyse_started = False
self.current_analysis = {}
self.start_time = 0
self.cache_plugins = {}
self.display = DisplayHTMLBuild(self)
self.valid_visitors = None
self.args = args
self.reverse_dns_timeout = self.getConfValue('reverse_dns_timeout',
IWLA.DEFAULT_DNS_TIMEOUT)
self.log_format_extracted = re.sub(r'([^\$?\w])', r'\\\g<1>', conf.log_format)
        self.log_format_extracted = re.sub(r'\$(\w+)', r'(?P<\g<1>>.+)', self.log_format_extracted)
self.http_request_extracted = re.compile(r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')
self.log_re = re.compile(self.log_format_extracted)
self.uri_re = re.compile(r'(?P<extract_uri>[^\?#]+)(\?(?P<extract_parameters>[^#]+))?(#.*)?')
self.domain_name_re = re.compile(r'.*%s' % conf.domain_name)
self.slash_re = re.compile(r'//')
self.protocol_re = re.compile(r'^.*://')
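        # Illustrative sketch of the two re.sub() passes above, assuming a
        # simplified $name-style log format (example values only):
        #
        #   fmt = '$remote_addr - $remote_user [$time_local] "$request" $status'
        #   escaped = re.sub(r'([^\$?\w])', r'\\\g<1>', fmt)
        #   pattern = re.sub(r'\$(\w+)', r'(?P<\g<1>>.+)', escaped)
        #   m = re.match(pattern, '1.2.3.4 - bob [10/Oct/2015:13:55:36 +0200] '
        #                         '"GET / HTTP/1.1" 200')
        #   m.group('remote_addr')  # -> '1.2.3.4'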
self.excluded_ip = []
for ip in conf.excluded_ip:
self.excluded_ip += [re.compile(ip)]
self.excluded_domain_name = []
for domain_name in conf.excluded_domain_name:
self.excluded_domain_name += [re.compile(domain_name)]
self.multimedia_files_re = []
for file_re in conf.multimedia_files_re:
self.multimedia_files_re += [re.compile(file_re)]
self.plugins = [(conf.PRE_HOOK_DIRECTORY , conf.pre_analysis_hooks),
(conf.POST_HOOK_DIRECTORY , conf.post_analysis_hooks),
(conf.DISPLAY_HOOK_DIRECTORY , conf.display_hooks)]
logging.basicConfig(format='%(name)s %(message)s', level=logLevel)
self.logger = logging.getLogger(self.__class__.__name__)
if self.args.dry_run:
self.logger.info('==> Start (DRY RUN)')
else:
self.logger.info('==> Start')
try:
t = gettext.translation('iwla', localedir=conf.locales_path, languages=[conf.locale])
self.logger.info('\tUsing locale %s' % (conf.locale))
except IOError:
t = gettext.NullTranslations()
self.logger.info('\tUsing default locale en_EN')
self._ = t.gettext
def getVersion(self):
return IWLA.IWLA_VERSION
    def getConfValue(self, key, default=None):
        return getattr(conf, key, default)
def _clearVisits(self):
self.current_analysis = {
'days_stats' : {},
'month_stats' : {},
'visits' : {}
}
self.valid_visitors = None
return self.current_analysis
def getDaysStats(self):
return self.current_analysis['days_stats']
def getMonthStats(self):
return self.current_analysis['month_stats']
def getCurrentVisits(self):
return self.current_analysis['visits']
def getSortedCurrentVisits(self):
visits = self.current_analysis['visits'].values()
return sorted(visits, key=lambda hit: hit['last_access'])
def getValidVisitors(self):
return self.valid_visitors
def getDisplay(self):
return self.display
def getCurTime(self):
return self.meta_infos['last_time']
def getStartAnalysisTime(self):
return self.meta_infos['start_analysis_time']
def isValidForCurrentAnalysis(self, request):
cur_time = self.meta_infos['start_analysis_time']
# Analyse not started
if not cur_time: return False
return (time.mktime(cur_time) < time.mktime(request['time_decoded']))
def hasBeenViewed(self, request):
return int(request['status']) in conf.viewed_http_codes
def getCurDisplayPath(self, filename):
cur_time = self.meta_infos['last_time']
return os.path.join(str(cur_time.tm_year), '%02d' % (cur_time.tm_mon), filename)
def getResourcesPath(self):
return conf.resources_path
def getCSSPath(self):
return conf.css_path
def reverseDNS(self, hit):
if hit.get('dns_name_replaced', False):
return hit['remote_addr']
try:
timeout = socket.getdefaulttimeout()
if timeout != self.reverse_dns_timeout:
socket.setdefaulttimeout(self.reverse_dns_timeout)
name, _, _ = socket.gethostbyaddr(hit['remote_ip'])
if timeout != self.reverse_dns_timeout:
socket.setdefaulttimeout(timeout)
hit['remote_addr'] = name.lower()
hit['dns_name_replaced'] = True
except socket.herror:
pass
finally:
hit['dns_analysed'] = True
return hit['remote_addr']
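    # Illustrative usage sketch for reverseDNS() (example values):
    #
    #   hit = {'remote_ip': '8.8.8.8', 'remote_addr': '8.8.8.8'}
    #   iwla.reverseDNS(hit)     # -> 'dns.google' if the PTR record resolves
    #                            #    within reverse_dns_timeout, else the IP
    #   hit['dns_analysed']      # -> True either way; 'dns_name_replaced' is
    #                            #    only set on success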
def _clearMeta(self):
self.meta_infos = {
'last_time' : None,
'start_analysis_time' : None
}
return self.meta_infos
def _clearDisplay(self):
self.display.clear()
return self.display
def getDBFilename(self, time):
return os.path.join(conf.DB_ROOT, str(time.tm_year), '%02d' % (time.tm_mon), conf.DB_FILENAME)
    def _openDB(self, filename, prot='r'):
        # Pickled data is binary: plain files must be opened in binary mode
        # (gzip.open() already defaults to binary)
        if self.args.dont_compress:
            return open(filename, prot + 'b')
        else:
            return gzip.open(filename, prot)
def _serialize(self, obj, filename):
if self.args.dry_run: return
self.logger.info("==> Serialize to %s" % (filename))
base = os.path.dirname(filename)
if not os.path.exists(base):
os.makedirs(base)
        # Make a backup in case something fails
if os.path.exists(filename):
shutil.copy(filename, filename + '.bak')
        with open(filename + '.tmp', 'wb+') as f, self._openDB(filename, 'w') as fzip:
            pickle.dump(obj, f)
            f.seek(0)
            fzip.write(f.read())
            # Flush application buffers before asking the OS to sync the file
            fzip.flush()
            os.fsync(fzip)
        os.remove(filename + '.tmp')
if os.path.exists(filename + '.bak'):
os.remove(filename + '.bak')
def _deserialize(self, filename):
if not os.path.exists(filename):
return None
res = None
with self._openDB(filename) as f:
res = pickle.load(f)
return res
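    # Illustrative sketch (standalone): inspecting a month database outside of
    # iwla, assuming the default compressed format and the DB layout described
    # in the module docstring (the path is an example):
    #
    #   import gzip, pickle
    #   with gzip.open('output_db/2015/01/iwla.db') as f:
    #       analysis = pickle.load(f)
    #   print(analysis['month_stats']['nb_visits'])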
def _callPlugins(self, target_root, *args):
self.logger.info('==> Call plugins (%s)' % (target_root))
for (root, plugins) in self.plugins:
if root != target_root: continue
for p in plugins:
mod = self.cache_plugins.get(root + '.' + p, None)
if mod:
self.logger.info('\t%s' % (p))
mod.hook(*args)
def isPage(self, request):
self.logger.debug("Is page %s" % (request))
for e in conf.pages_extensions:
if request.endswith(e):
self.logger.debug("True")
return True
# No extension -> page
if not '.' in request.split('/')[-1]:
self.logger.debug("True")
return True
self.logger.debug("False")
return False
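    # Illustrative examples, assuming conf.pages_extensions contains 'html':
    #
    #   isPage('/blog/index.html')  # -> True (known page extension)
    #   isPage('/blog/my-post')     # -> True (no extension in last segment)
    #   isPage('/images/logo.png')  # -> False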
def isMultimediaFile(self, uri):
self.logger.debug("Is multimedia %s" % (uri))
for e in conf.multimedia_files:
if uri.lower().endswith(e):
self.logger.debug("True")
return True
self.logger.debug("False")
for file_re in self.multimedia_files_re:
if file_re.match(uri):
self.logger.debug("Is multimedia re True")
return True
return False
def isValidVisitor(self, hit):
if hit['robot']: return False
if not conf.count_hit_only_visitors and not hit['viewed_pages'][0]:
return False
return True
def isRobot(self, hit):
# By default robot is None
return hit['robot'] == True
def _appendHit(self, hit):
remote_ip = hit['remote_ip']
if not remote_ip: return
for ip in self.excluded_ip:
if ip.match(remote_ip):
return
# Redirected page/hit
if int(hit['status']) in (301, 302, 307, 308):
return
if not remote_ip in self.current_analysis['visits'].keys():
self._createVisitor(hit)
super_hit = self.current_analysis['visits'][remote_ip]
# Don't keep all requests for robots
if not super_hit['robot']:
super_hit['requests'].append(hit)
day = self.meta_infos['last_time'].tm_mday
if self.hasBeenViewed(hit):
super_hit['bandwidth'][day] = super_hit['bandwidth'].get(day, 0) + int(hit['body_bytes_sent'])
super_hit['bandwidth'][0] += int(hit['body_bytes_sent'])
super_hit['last_access'] = self.meta_infos['last_time']
request = hit['extract_request']
uri = request.get('extract_uri', request['http_uri'])
hit['is_page'] = self.isPage(uri)
if super_hit['robot'] or\
not self.hasBeenViewed(hit):
page_key = 'not_viewed_pages'
hit_key = 'not_viewed_hits'
else:
page_key = 'viewed_pages'
hit_key = 'viewed_hits'
if hit['is_page']:
super_hit[page_key][day] = super_hit[page_key].get(day, 0) + 1
super_hit[page_key][0] += 1
else:
super_hit[hit_key][day] = super_hit[hit_key].get(day, 0) + 1
super_hit[hit_key][0] += 1
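    # The per-visitor counters updated above are day-indexed dicts whose key 0
    # holds the running monthly total. Illustrative shape:
    #
    #   super_hit['viewed_pages']  # -> {0: 12, 3: 5, 4: 7}
    #   # 5 pages viewed on the 3rd, 7 on the 4th, 12 for the whole month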
    def _createVisitor(self, hit):
        # At creation time remote_addr still equals remote_ip (reverse DNS,
        # if any, happens later), so both key the same visitor entry
        super_hit = self.current_analysis['visits'][hit['remote_addr']] = {}
        super_hit['remote_addr'] = hit['remote_addr']
        super_hit['remote_ip'] = hit['remote_addr']
super_hit['viewed_pages'] = {0:0}
super_hit['viewed_hits'] = {0:0}
super_hit['not_viewed_pages'] = {0:0}
super_hit['not_viewed_hits'] = {0:0}
super_hit['bandwidth'] = {0:0}
super_hit['last_access'] = self.meta_infos['last_time']
super_hit['requests'] = []
super_hit['robot'] = None
super_hit['hit_only'] = 0
def _normalizeURI(self, uri, removeFileSlash=True):
if uri == '/': return uri
# Remove protocol
uri = self.protocol_re.sub('', uri)
# Remove double /
uri = self.slash_re.sub('/', uri)
if removeFileSlash:
while len(uri) > 1 and uri[-1] == '/':
uri = uri[:-1]
return uri
def _normalizeParameters(self, parameters):
# No parameters
if parameters == '?': return None
return parameters
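    # Illustrative examples of the two normalizers above:
    #
    #   _normalizeURI('https://example.com//blog/post/')
    #   # -> 'example.com/blog/post' (protocol and duplicate slashes removed,
    #   #    trailing slashes stripped)
    #   _normalizeURI('/blog/', removeFileSlash=False)  # -> '/blog/'
    #   _normalizeParameters('?')                       # -> None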
def _decodeHTTPRequest(self, hit):
if not 'request' in hit.keys(): return False
groups = self.http_request_extracted.match(hit['request'])
if groups:
hit['extract_request'] = groups.groupdict("")
uri_groups = self.uri_re.match(hit['extract_request']['http_uri'])
if uri_groups:
d = uri_groups.groupdict("")
hit['extract_request']['extract_uri'] = self._normalizeURI(d['extract_uri'])
if 'extract_parameters' in d.keys():
parameters = self._normalizeParameters(d['extract_parameters'])
if parameters:
hit['extract_request']['extract_parameters'] = parameters
else:
self.logger.warning("Bad request extraction %s" % (hit['request']))
return False
if hit['http_referer']:
referer_groups = self.uri_re.match(hit['http_referer'])
if referer_groups:
hit['extract_referer'] = referer_groups.groupdict("")
hit['extract_referer']['extract_uri'] = self._normalizeURI(hit['extract_referer']['extract_uri'])
hit['extract_referer']['extract_parameters'] = self._normalizeParameters(hit['extract_referer']['extract_parameters'])
hit['remote_ip'] = hit['remote_addr']
return True
def _decodeTime(self, hit):
try:
hit['time_decoded'] = time.strptime(hit['time_local'], conf.time_format)
except ValueError as e:
if sys.version_info < (3, 2):
# Try without UTC value at the end (%z not recognized)
gmt_offset_str = hit['time_local'][-5:]
gmt_offset_hours = int(gmt_offset_str[1:3])*60*60
gmt_offset_minutes = int(gmt_offset_str[3:5])*60
gmt_offset = gmt_offset_hours + gmt_offset_minutes
hit['time_decoded'] = time.strptime(hit['time_local'][:-6], conf.time_format[:-3])
# if gmt_offset_str[0] == '-':
# hit['time_decoded'] = time.localtime(time.mktime(hit['time_decoded'])+gmt_offset)
# else:
# hit['time_decoded'] = time.localtime(time.mktime(hit['time_decoded'])-gmt_offset)
else:
raise e
return hit['time_decoded']
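    # Illustrative example, assuming the common Apache/nginx time format
    # conf.time_format = '%d/%b/%Y:%H:%M:%S %z':
    #
    #   time.strptime('10/Oct/2015:13:55:36 +0200', '%d/%b/%Y:%H:%M:%S %z')
    #   # -> time.struct_time(tm_year=2015, tm_mon=10, tm_mday=10, ...)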
def getDisplayIndex(self):
cur_time = self.meta_infos['last_time']
filename = self.getCurDisplayPath('index.html')
return self.display.getPage(filename)
def _generateDisplayDaysStats(self):
cur_time = self.meta_infos['last_time']
title = createCurTitle(self, self._('Statistics'))
filename = self.getCurDisplayPath('index.html')
self.logger.info('==> Generate display (%s)' % (filename))
page = self.display.createPage(title, filename, conf.css_path)
link = DisplayHTMLRaw(self, '<iframe src="../_stats.html"></iframe>')
page.appendBlock(link)
months_name = ['', self._('Jan'), self._('Feb'), self._('Mar'), self._('Apr'), self._('May'), self._('June'), self._('Jul'), self._('Aug'), self._('Sep'), self._('Oct'), self._('Nov'), self._('Dec')]
_, nb_month_days = monthrange(cur_time.tm_year, cur_time.tm_mon)
days = self.display.createBlock(DisplayHTMLBlockTableWithGraph, self._('By day'), [self._('Day'), self._('Visits'), self._('Pages'), self._('Hits'), self._('Bandwidth'), self._('Not viewed Bandwidth')], None, nb_month_days, range(1,6), [4, 5])
days.setColsCSSClass(['', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth'])
nb_visits = 0
nb_days = 0
for i in range(1, nb_month_days+1):
            month = months_name[cur_time.tm_mon]
day = '%d<br/>%s' % (i, month)
full_day = '%02d %s %d' % (i, month, cur_time.tm_year)
if i in self.current_analysis['days_stats'].keys():
stats = self.current_analysis['days_stats'][i]
row = [full_day, stats['nb_visits'], stats['viewed_pages'], stats['viewed_hits'],
stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
nb_visits += stats['nb_visits']
nb_days += 1
else:
row = [full_day, 0, 0, 0, 0, 0]
days.appendRow(row)
viewed_bandwidth = row[4]
not_viewed_bandwidth = row[5]
days.setCellValue(i-1, 4, viewed_bandwidth)
days.setCellValue(i-1, 5, not_viewed_bandwidth)
days.appendShortTitle(day)
adate = date(cur_time.tm_year, cur_time.tm_mon, i)
week_day = adate.weekday()
if week_day == 5 or week_day == 6:
days.setRowCSSClass(i-1, 'iwla_weekend')
if adate == date.today():
css = days.getCellCSSClass(i-1, 0)
if css: css = '%s %s' % (css, 'iwla_curday')
else: css = 'iwla_curday'
days.setCellCSSClass(i-1, 0, css)
stats = self.current_analysis['month_stats']
row = [0, nb_visits, stats['viewed_pages'], stats['viewed_hits'], stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
if nb_days:
average_row = list(map(lambda v: int(v/nb_days), row))
else:
average_row = list(map(lambda v: 0, row))
average_row[0] = self._('Average')
days.appendRow(average_row)
row[0] = self._('Total')
days.appendRow(row)
page.appendBlock(days)
self.display.addPage(page)
def _generateDisplayMonthStats(self, page, year, month_stats):
cur_time = time.localtime()
months_name = ['', self._('Jan'), self._('Feb'), self._('Mar'), self._('Apr'), self._('May'), self._('June'), self._('Jul'), self._('Aug'), self._('Sep'), self._('Oct'), self._('Nov'), self._('Dec')]
title = '%s %d' % (self._('Summary'), year)
cols = [self._('Month'), self._('Visitors'), self._('Visits'), self._('Pages'), self._('Hits'), self._('Bandwidth'), self._('Not viewed Bandwidth')]
graph_cols=range(1,6)
months = self.display.createBlock(DisplayHTMLBlockTableWithGraph, title, cols, None, 12, graph_cols, [5, 6])
months.setColsCSSClass(['', 'iwla_visitor', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth'])
total = [0] * len(cols)
for i in range(1, 13):
month = '%s<br/>%d' % (months_name[i], year)
full_month = '%s %d' % (months_name[i], year)
link_month = '<a target="_top" href="/%d/%02d/index.html">%s</a>' % (year, i, full_month)
if i in month_stats.keys():
stats = month_stats[i]
row = [link_month, stats['nb_visitors'], stats['nb_visits'], stats['viewed_pages'], stats['viewed_hits'],
stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
for j in range(1,7):
total[j] += row[j]
else:
row = [full_month, 0, 0, 0, 0, 0, 0]
months.appendRow(row)
months.appendShortTitle(month)
if year == cur_time.tm_year and i == cur_time.tm_mon:
css = months.getCellCSSClass(i-1, 0)
if css: css = '%s %s' % (css, 'iwla_curday')
else: css = 'iwla_curday'
months.setCellCSSClass(i-1, 0, css)
total[0] = self._('Total')
months.appendRow(total)
page.appendBlock(months)
filename = '%d/_stats.html' % (year)
page_ = self.display.createPage(u'', filename, conf.css_path)
page_.appendBlock(months)
page_.build(conf.DISPLAY_ROOT, False)
months.resetHTML()
def _generateDisplayWholeMonthStats(self):
title = '%s %s' % (self._('Statistics for'), conf.domain_name)
filename = 'index.html'
self.logger.info('==> Generate main page (%s)' % (filename))
page = self.display.createPage(title, filename, conf.css_path)
        last_update = u'<b>%s</b> %s<br />' % (self._(u'Last update'), time.strftime('%d %b %Y %H:%M', time.localtime()))
page.appendBlock(self.display.createBlock(DisplayHTMLRaw, last_update))
duration = datetime.now() - self.start_time
duration = time.gmtime(duration.seconds)
time_analysis = u'<b>%s</b> ' % (self._('Time analysis'))
if duration.tm_hour:
time_analysis += u'%d %s, ' % (duration.tm_hour, self._(u'hours'))
time_analysis += u'%d %s and %d %s<br />' % (duration.tm_min, self._(u'minutes'), duration.tm_sec, self._(u'seconds'))
page.appendBlock(self.display.createBlock(DisplayHTMLRaw, time_analysis))
for year in sorted(self.meta_infos['stats'].keys(), reverse=True):
self._generateDisplayMonthStats(page, year, self.meta_infos['stats'][year])
self.display.addPage(page)
def _compressFile(self, root, filename):
path = os.path.join(root, filename)
gz_path = path + '.gz'
self.logger.debug('Compress %s => %s' % (path, gz_path))
if not os.path.exists(gz_path) or\
os.stat(path).st_mtime > os.stat(gz_path).st_mtime:
if self.args.dry_run: return
with open(path, 'rb') as f_in, gzip.open(gz_path, 'wb') as f_out:
f_out.write(f_in.read())
def _compressFiles(self, root):
if not conf.compress_output_files: return
for rootdir, subdirs, files in os.walk(root, followlinks=True):
for f in files:
for ext in conf.compress_output_files:
if f.endswith(ext):
self._compressFile(rootdir, f)
break
def _generateDisplay(self):
if self.args.disable_display: return
self._generateDisplayDaysStats()
self._callPlugins(conf.DISPLAY_HOOK_DIRECTORY)
self._generateDisplayWholeMonthStats()
if self.args.dry_run: return
self.display.build(conf.DISPLAY_ROOT)
self._compressFiles(conf.DISPLAY_ROOT)
def _createEmptyStats(self):
stats = {}
stats['viewed_bandwidth'] = 0
stats['not_viewed_bandwidth'] = 0
stats['viewed_pages'] = 0
stats['viewed_hits'] = 0
stats['nb_visits'] = 0
return stats
def _generateMonthStats(self):
self._clearDisplay()
visits = self.current_analysis['visits']
stats = self._createEmptyStats()
for (day, stat) in self.current_analysis['days_stats'].items():
for k in stats.keys():
stats[k] += stat[k]
duplicated_stats = {k:v for (k,v) in stats.items()}
cur_time = self.meta_infos['last_time']
self.logger.info("== Stats for %d/%02d ==" % (cur_time.tm_year, cur_time.tm_mon))
self.logger.info(stats)
if not 'month_stats' in self.current_analysis.keys():
self.current_analysis['month_stats'] = stats
else:
for (k,v) in stats.items():
self.current_analysis['month_stats'][k] = v
self.valid_visitors = {}
for (k,v) in visits.items():
if self.isValidVisitor(v):
self.valid_visitors[k] = v
duplicated_stats['nb_visitors'] = stats['nb_visitors'] = len(self.valid_visitors.keys())
self._callPlugins(conf.POST_HOOK_DIRECTORY)
if self.args.display_only:
if not 'stats' in self.meta_infos.keys():
self.meta_infos['stats'] = {}
self._generateDisplay()
return
for (k,v) in visits.items():
# Keep at least one request (for referers...)
if not v.get('keep_requests', conf.keep_requests):
if len(v['requests']) > 1:
v['requests'] = [v['requests'][0]]
path = self.getDBFilename(cur_time)
self._serialize(self.current_analysis, path)
# Save month stats
year = cur_time.tm_year
month = cur_time.tm_mon
if not 'stats' in self.meta_infos.keys():
self.meta_infos['stats'] = {}
if not year in self.meta_infos['stats'].keys():
self.meta_infos['stats'][year] = {}
self.meta_infos['stats'][year][month] = duplicated_stats
meta_path = os.path.join(conf.DB_ROOT, conf.META_FILENAME)
self._serialize(self.meta_infos, meta_path)
self._generateDisplay()
def _generateDayStats(self):
self._callPlugins(conf.PRE_HOOK_DIRECTORY)
visits = self.current_analysis['visits']
cur_time = self.meta_infos['last_time']
stats = self._createEmptyStats()
day = cur_time.tm_mday
for (k, super_hit) in visits.items():
if super_hit['last_access'].tm_mday != day:
continue
if super_hit['robot']:
stats['not_viewed_bandwidth'] += super_hit['bandwidth'].get(day, 0)
continue
stats['viewed_bandwidth'] += super_hit['bandwidth'].get(day, 0)
stats['viewed_hits'] += super_hit['viewed_hits'].get(day, 0)
stats['viewed_pages'] += super_hit['viewed_pages'].get(day, 0)
if ((conf.count_hit_only_visitors and super_hit['viewed_hits'].get(day, 0)) or\
super_hit['viewed_pages'].get(day, 0)):
stats['nb_visits'] += 1
self.logger.info("== Stats for %d/%02d/%02d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday))
self.logger.info(stats)
self.current_analysis['days_stats'][cur_time.tm_mday] = stats
def _newHit(self, hit):
if not self.domain_name_re.match(hit['server_name']):
self.logger.debug("Not in domain %s" % (hit))
return False
for domain_name in self.excluded_domain_name:
if domain_name.match(hit['server_name']):
self.logger.debug("Domain name %s excluded" % (hit['server_name']))
return False
t = self._decodeTime(hit)
cur_time = self.meta_infos['last_time']
if cur_time == None:
self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
self.analyse_started = True
else:
if not self.analyse_started and\
time.mktime(t) <= time.mktime(cur_time):
self.logger.debug("Not in time")
return False
self.analyse_started = True
if t < cur_time: # Don't accept past hits
return False
if cur_time.tm_mon != t.tm_mon:
self._generateDayStats()
self._generateMonthStats()
self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
elif cur_time.tm_mday != t.tm_mday:
self._generateDayStats()
self.meta_infos['last_time'] = t
if not self.meta_infos['start_analysis_time']:
self.meta_infos['start_analysis_time'] = t
if not self._decodeHTTPRequest(hit): return False
if hit['extract_request']['http_method'] not in ['GET', 'POST']:
return False
for k in hit.keys():
if hit[k] == '-' or hit[k] == '*':
hit[k] = ''
self._appendHit(hit)
return True
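    # Timeline sketch of the rollover handling above (illustrative dates):
    #
    #   hit 31/01 23:59  -> counters updated for day 31
    #   hit 01/02 00:05  -> month changed: January's day and month stats are
    #                       finalized and serialized, then February's database
    #                       is loaded (or a fresh one is created)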
def _reset(self):
reset_time = time.strptime(self.args.reset, '%m/%Y')
self.logger.info('Reset time')
self.logger.info(reset_time)
self.meta_infos['last_time'] = reset_time
cur_time = time.localtime()
year = reset_time.tm_year
while year < cur_time.tm_year:
db_path = os.path.join(conf.DB_ROOT, str(year))
if os.path.exists(db_path): shutil.rmtree(db_path)
output_path = os.path.join(conf.DISPLAY_ROOT, str(year))
if os.path.exists(output_path): shutil.rmtree(output_path)
year += 1
        # If whole years were removed above, restart at January in the current
        # year; otherwise start at the requested month
        if reset_time.tm_year == cur_time.tm_year:
            month = reset_time.tm_mon
        else:
            month = 1
while month <= cur_time.tm_mon:
db_path = os.path.join(conf.DB_ROOT, str(year), '%02d' % (month))
if os.path.exists(db_path): shutil.rmtree(db_path)
output_path = os.path.join(conf.DISPLAY_ROOT, str(year), '%02d' % (month))
if os.path.exists(output_path): shutil.rmtree(output_path)
month += 1
def start(self, _file):
self.start_time = datetime.now()
meta_path = os.path.join(conf.DB_ROOT, conf.META_FILENAME)
if os.path.exists(meta_path):
self.logger.info('==> Load previous database')
self.meta_infos = self._deserialize(meta_path) or self._clearMeta()
if self.meta_infos['last_time']:
            if self.args.reset:
self._reset()
self.logger.info('Last time')
self.logger.info(self.meta_infos['last_time'])
self.current_analysis = self._deserialize(self.getDBFilename(self.meta_infos['last_time'])) or self._clearVisits()
else:
self._clearVisits()
self.meta_infos['start_analysis_time'] = None
self.cache_plugins = preloadPlugins(self.plugins, self)
self.logger.info('==> Analysing log')
for l in _file:
# print "line " + l
sanitized = l.replace('<', '')
sanitized = sanitized.replace('>', '')
groups = self.log_re.match(sanitized)
if groups:
self._newHit(groups.groupdict(""))
else:
self.logger.warning("No match for %s" % (sanitized))
#break
if self.analyse_started:
self._generateDayStats()
self._generateMonthStats()
del self.meta_infos['start_analysis_time']
else:
self.logger.info('==> Analyse not started : nothing new')
def displayOnly(self, start_time):
self.start_time = datetime.now()
meta_path = os.path.join(conf.DB_ROOT, conf.META_FILENAME)
if os.path.exists(meta_path):
self.logger.info('==> Load previous database')
self.meta_infos = self._deserialize(meta_path) or self._clearMeta()
self.meta_infos['last_time'] = time.strptime(start_time, '%m/%Y')
if self.meta_infos['last_time']:
self.logger.info('Last time')
self.logger.info(self.meta_infos['last_time'])
self.current_analysis = self._deserialize(self.getDBFilename(self.meta_infos['last_time'])) or self._clearVisits()
else:
self._clearVisits()
self.meta_infos['start_analysis_time'] = None
self.cache_plugins = preloadPlugins(self.plugins, self)
self.logger.info('==> Analysing log')
self._generateDayStats()
self._generateMonthStats()
class FileIter(object):
def __init__(self, filenames):
self.filenames = [f for f in filenames.split(',') if f]
for f in self.filenames:
if not os.path.exists(f):
print('No such file \'%s\'' % (f))
sys.exit(-1)
self.cur_file = None
self._openNextFile()
def __iter__(self):
return self
def __next__(self):
return self.next()
def _openNextFile(self):
if self.cur_file:
self.cur_file.close()
self.cur_file = None
if not self.filenames:
raise StopIteration()
filename = self.filenames.pop(0)
if filename.endswith('gz'):
self.cur_file = gzip.open(filename, 'rt')
else:
self.cur_file = open(filename, 'rt')
    def next(self):
        l = self.cur_file.readline()
        # Loop in case the next file is empty or already exhausted
        while not l:
            self._openNextFile()
            l = self.cur_file.readline()
        return l[:-1]
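
# Illustrative usage sketch: FileIter chains plain and gzipped logs and yields
# one stripped line at a time (file names are examples). Pass the oldest file
# first, since _newHit() refuses hits older than the last analysed one:
#
#   for line in FileIter('access.log.2.gz,access.log.1,access.log'):
#       ...
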
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Intelligent Web Log Analyzer')
parser.add_argument('-c', '--config-file', dest='config_file',
default='conf.py', type=str,
help='Config file to use (default conf.py)')
parser.add_argument('-C', '--clean-output', dest='clean_output', action='store_true',
default=False,
help='Clean output before starting')
parser.add_argument('-i', '--stdin', dest='stdin', action='store_true',
default=False,
help='Read data from stdin instead of conf.analyzed_filename')
parser.add_argument('-f', '--file', dest='file',
help='Analyse this log file, multiple files can be specified (comma separated). gz files are accepted')
parser.add_argument('-d', '--log-level', dest='loglevel',
default='INFO', type=str,
help='Loglevel in %s, default : %s' % (['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], 'INFO'))
parser.add_argument('-r', '--reset', dest='reset',
default=False,
help='Reset analysis to a specific date (month/year)')
parser.add_argument('-z', '--dont-compress', dest='dont_compress', action='store_true',
default=False,
help='Don\'t compress databases (bigger but faster, not compatible with compressed databases)')
parser.add_argument('-p', '--display-only', dest='display_only',
default='', type=str,
help='Only generate display for a specific date (month/year)')
parser.add_argument('-P', '--disable-display', dest='disable_display', action='store_true',
default=False,
help='Don\'t generate display')
parser.add_argument('-D', '--dry-run', dest='dry_run', action='store_true',
default=False,
help='Process log but don\'t write files (database and HTML) to disk')
args = parser.parse_args()
if args.config_file.endswith('.py'):
args.config_file = args.config_file[:-3]
user_conf = importlib.import_module(args.config_file)
# Load user conf
for (k,v) in user_conf.__dict__.items():
if k.endswith('_append'):
new_k = k[:-7]
if new_k in dir(conf):
if type(conf.__dict__[new_k]) == list:
if type(v) == list:
conf.__dict__[new_k] += v
else:
conf.__dict__[new_k].append(v)
else:
print("Error %s is not a list" % (new_k))
else:
print("Error %s doesn't exists in default conf" % (new_k))
else:
conf.__dict__.update({k:v})
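    # Illustrative sketch of the '_append' convention handled above: in the
    # user conf.py, a key ending in '_append' extends the matching default
    # list instead of replacing it (values are examples):
    #
    #   # conf.py
    #   multimedia_files_append = ['webp']    # extends conf.multimedia_files
    #   viewed_http_codes = [200, 304]        # replaces the default value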
if args.clean_output and not args.dry_run:
if os.path.exists(conf.DB_ROOT): shutil.rmtree(conf.DB_ROOT)
if os.path.exists(conf.DISPLAY_ROOT): shutil.rmtree(conf.DISPLAY_ROOT)
loglevel = getattr(logging, args.loglevel.upper(), None)
if not isinstance(loglevel, int):
raise ValueError('Invalid log level: %s' % (args.loglevel))
iwla = IWLA(loglevel, args)
required_conf = ['analyzed_filename', 'domain_name']
if not validConfRequirements(required_conf, iwla, 'Main Conf'):
sys.exit(0)
if args.display_only:
iwla.displayOnly(args.display_only)
else:
if args.stdin:
iwla.start(sys.stdin)
else:
filename = args.file or conf.analyzed_filename
iwla.start(FileIter(filename))