#!/usr/bin/env python
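
# IWLA: a static web log analyzer. It matches each line of an HTTP access
# log against conf.log_format, aggregates per-visitor statistics into a
# per-month database, and renders HTML reports through plugin hooks.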

import os
import shutil
import sys
import re
import time
import pickle
import gzip
import importlib
import argparse
from calendar import monthrange
from datetime import date

# Load the default configuration, then let the user's conf.py override
# any of its values.
import default_conf as conf
import conf as _
conf.__dict__.update(_.__dict__)
del _
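
# A user conf.py only has to define the values that differ from
# default_conf.py. A minimal example (illustrative values):
#   analyzed_filename = '/var/log/apache2/access.log'
#   domain_name = 'example.com'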

from iplugin import *
from display import *


class IWLA(object):

    ANALYSIS_CLASS = 'HTTP'
    API_VERSION = 1

    def __init__(self):
        print '==> Start'

        self.meta_infos = {}
        self.analyse_started = False
        self.current_analysis = {}
        self.cache_plugins = {}
        self.display = DisplayHTMLBuild(self)
        self.valid_visitors = None

        # Build the log line regexp: escape every literal character of
        # conf.log_format, then turn each $variable into a named group.
        self.log_format_extracted = re.sub(r'([^\$\w])', r'\\\g<1>', conf.log_format)
        self.log_format_extracted = re.sub(r'\$(\w+)', '(?P<\g<1>>.+)', self.log_format_extracted)
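        # For example (hypothetical format; the real one comes from conf):
        #   '$remote_addr [$time_local]'
        # becomes
        #   (?P<remote_addr>.+)\ \[(?P<time_local>.+)\]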
        self.http_request_extracted = re.compile(r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')
        self.log_re = re.compile(self.log_format_extracted)
        self.uri_re = re.compile(r'(?P<extract_uri>[^\?]+)(\?(?P<extract_parameters>.+))?')

        # Plugin hooks: run before the analysis, after it, and at display
        # generation time.
        self.plugins = [(conf.PRE_HOOK_DIRECTORY, conf.pre_analysis_hooks),
                        (conf.POST_HOOK_DIRECTORY, conf.post_analysis_hooks),
                        (conf.DISPLAY_HOOK_DIRECTORY, conf.display_hooks)]

    def getConfValue(self, key, default=None):
        if key not in dir(conf):
            return default
        else:
            return conf.__dict__[key]

    def _clearVisits(self):
        self.current_analysis = {
            'days_stats' : {},
            'month_stats' : {},
            'visits' : {}
        }
        self.valid_visitors = None
        return self.current_analysis

    def getDaysStats(self):
        return self.current_analysis['days_stats']

    def getMonthStats(self):
        return self.current_analysis['month_stats']

    def getCurrentVisits(self):
        return self.current_analysis['visits']

    def getValidVisitors(self):
        return self.valid_visitors

    def getDisplay(self):
        return self.display

    def getCurTime(self):
        return self.meta_infos['last_time']

    def getStartAnalysisTime(self):
        return self.meta_infos['start_analysis_time']

    def isValidForCurrentAnalysis(self, request):
        cur_time = self.meta_infos['start_analysis_time']
        # Analysis not started yet
        if not cur_time: return False
        return (time.mktime(cur_time) < time.mktime(request['time_decoded']))

    def hasBeenViewed(self, request):
        return int(request['status']) in conf.viewed_http_codes

    def getCurDisplayPath(self, filename):
        cur_time = self.meta_infos['last_time']
        return os.path.join(str(cur_time.tm_year), str(cur_time.tm_mon), filename)

    def getResourcesPath(self):
        return conf.resources_path

    def getCSSPath(self):
        return conf.css_path

    def _clearMeta(self):
        self.meta_infos = {
            'last_time' : None
        }
        return self.meta_infos

    def _clearDisplay(self):
        self.display = DisplayHTMLBuild(self)
        return self.display

    def getDBFilename(self, cur_time):
        # One database file per analysed month
        return os.path.join(conf.DB_ROOT, str(cur_time.tm_year), str(cur_time.tm_mon), conf.DB_FILENAME)

    def _serialize(self, obj, filename):
        base = os.path.dirname(filename)
        if not os.path.exists(base):
            os.makedirs(base)

        # Pickle to a temporary file, then compress it into place
        with open(filename + '.tmp', 'wb+') as f:
            pickle.dump(obj, f)
            f.seek(0)
            with gzip.open(filename, 'w') as fzip:
                fzip.write(f.read())
        os.remove(filename + '.tmp')

    def _deserialize(self, filename):
        if not os.path.exists(filename):
            return None

        with gzip.open(filename, 'r') as f:
            return pickle.load(f)

    def _callPlugins(self, target_root, *args):
        print '==> Call plugins (%s)' % target_root
        for (root, plugins) in self.plugins:
            if root != target_root: continue
            for p in plugins:
                mod = self.cache_plugins.get(root + '.' + p, None)
                if mod:
                    print '\t%s' % (p)
                    mod.hook(*args)

    def isPage(self, request):
        for e in conf.pages_extensions:
            if request.endswith(e):
                return True

        return False

    def _appendHit(self, hit):
        remote_addr = hit['remote_addr']

        if not remote_addr: return

        if remote_addr not in self.current_analysis['visits'].keys():
            self._createVisitor(hit)
            return

        super_hit = self.current_analysis['visits'][remote_addr]
        super_hit['requests'].append(hit)
        super_hit['bandwidth'] += int(hit['body_bytes_sent'])
        super_hit['last_access'] = self.meta_infos['last_time']

        request = hit['extract_request']

        uri = request.get('extract_uri', request['http_uri'])

        hit['is_page'] = self.isPage(uri)

        status = int(hit['status'])
        if status not in conf.viewed_http_codes:
            return

        # Non-viewed status codes were filtered out just above, so only the
        # robot flag can still route this hit to the not_viewed counters.
        if super_hit['robot']:
            page_key = 'not_viewed_pages'
            hit_key = 'not_viewed_hits'
        else:
            page_key = 'viewed_pages'
            hit_key = 'viewed_hits'

        if hit['is_page']:
            super_hit[page_key] += 1
        else:
            super_hit[hit_key] += 1

    def _createVisitor(self, hit):
        super_hit = self.current_analysis['visits'][hit['remote_addr']] = {}
        super_hit['remote_addr'] = hit['remote_addr']
        super_hit['remote_ip'] = hit['remote_addr']
        super_hit['viewed_pages'] = 0
        super_hit['viewed_hits'] = 0
        super_hit['not_viewed_pages'] = 0
        super_hit['not_viewed_hits'] = 0
        super_hit['bandwidth'] = 0
        super_hit['last_access'] = self.meta_infos['last_time']
        super_hit['requests'] = []
        super_hit['robot'] = False
        super_hit['hit_only'] = 0
        self._appendHit(hit)
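
    # A request line such as 'GET /index.php?p=1 HTTP/1.1' (hypothetical
    # example) is split below into http_method/http_uri/http_version, and
    # the URI itself into extract_uri='/index.php' and
    # extract_parameters='p=1'.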
    def _decodeHTTPRequest(self, hit):
        if 'request' not in hit.keys(): return False

        groups = self.http_request_extracted.match(hit['request'])

        if groups:
            hit['extract_request'] = groups.groupdict()
            uri_groups = self.uri_re.match(hit['extract_request']['http_uri'])
            if uri_groups:
                d = uri_groups.groupdict()
                hit['extract_request']['extract_uri'] = d['extract_uri']
                if 'extract_parameters' in d.keys():
                    hit['extract_request']['extract_parameters'] = d['extract_parameters']
        else:
            print "Bad request extraction " + hit['request']
            return False

        if hit['http_referer']:
            referer_groups = self.uri_re.match(hit['http_referer'])
            if referer_groups:
                hit['extract_referer'] = referer_groups.groupdict()

        return True

    def _decodeTime(self, hit):
        hit['time_decoded'] = time.strptime(hit['time_local'], conf.time_format)
        return hit['time_decoded']
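
    # _decodeTime() relies on conf.time_format. With the usual access-log
    # timestamp format '%d/%b/%Y:%H:%M:%S' (an assumption; the real value
    # comes from conf), '18/Nov/2014:20:18:53' decodes to a time.struct_time.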

    def getDisplayIndex(self):
        filename = self.getCurDisplayPath('index.html')

        return self.display.getPage(filename)

    def _generateDisplayDaysStats(self):
        cur_time = self.meta_infos['last_time']
        title = 'Stats %d/%d' % (cur_time.tm_mon, cur_time.tm_year)
        filename = self.getCurDisplayPath('index.html')
        print '==> Generate display (%s)' % (filename)
        page = DisplayHTMLPage(title, filename, conf.css_path)

        _, nb_month_days = monthrange(cur_time.tm_year, cur_time.tm_mon)
        days = DisplayHTMLBlockTableWithGraph('By day', ['Day', 'Visitors', 'Pages', 'Hits', 'Bandwidth', 'Not viewed Bandwidth'], nb_valid_rows=nb_month_days)
        days.setColsCSSClass(['', 'iwla_visitor', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth'])
        nb_visits = 0
        nb_days = 0
        for i in range(0, nb_month_days):
            day = '%d<br/>%s' % (i+1, time.strftime('%b', cur_time))
            full_day = '%d %s %d' % (i+1, time.strftime('%b', cur_time), cur_time.tm_year)
            # days_stats is keyed by day of month (1..31) while i is 0-based
            if (i+1) in self.current_analysis['days_stats'].keys():
                stats = self.current_analysis['days_stats'][i+1]
                row = [full_day, stats['nb_visitors'], stats['viewed_pages'], stats['viewed_hits'],
                       stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
                nb_visits += stats['nb_visitors']
                nb_days += 1
            else:
                row = [full_day, 0, 0, 0, 0, 0]
            days.appendRow(row)
            days.setCellValue(i, 4, bytesToStr(row[4]))
            days.setCellValue(i, 5, bytesToStr(row[5]))
            days.appendShortTitle(day)
            adate = date(cur_time.tm_year, cur_time.tm_mon, i+1)
            week_day = adate.weekday()
            if week_day == 5 or week_day == 6:
                days.setRowCSSClass(i, 'iwla_weekend')
            if adate == date.today():
                css = days.getCellCSSClass(i, 0)
                if css: css = '%s %s' % (css, 'iwla_curday')
                else: css = 'iwla_curday'
                days.setCellCSSClass(i, 0, css)

        stats = self.current_analysis['month_stats']

        row = [0, nb_visits, stats['viewed_pages'], stats['viewed_hits'], stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
        if nb_days:
            average_row = map(lambda v: int(v/nb_days), row)
        else:
            average_row = map(lambda v: 0, row)

        average_row[0] = 'Average'
        average_row[4] = bytesToStr(average_row[4])
        average_row[5] = bytesToStr(average_row[5])
        days.appendRow(average_row)

        row[0] = 'Total'
        row[4] = bytesToStr(row[4])
        row[5] = bytesToStr(row[5])
        days.appendRow(row)
        page.appendBlock(days)
        self.display.addPage(page)

    def _generateDisplayMonthStats(self, page, year, month_stats):
        title = 'Summary %d' % (year)
        cols = ['Month', 'Visitors', 'Pages', 'Hits', 'Bandwidth', 'Not viewed Bandwidth']
        months = DisplayHTMLBlockTableWithGraph(title, cols, nb_valid_rows=12)
        months.setColsCSSClass(['', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth'])
        total = [0] * len(cols)
        for i in range(0, 12):
            month_name = date(year, i+1, 1).strftime('%b')
            month = '%s<br/>%d' % (month_name, year)
            full_month = '%s %d' % (month_name, year)
            # month_stats is keyed by month number (1..12) while i is 0-based
            if (i+1) in month_stats.keys():
                stats = month_stats[i+1]
                row = [full_month, stats['nb_visitors'], stats['viewed_pages'], stats['viewed_hits'],
                       stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
                for j in range(1, len(row)):
                    total[j] += row[j]
            else:
                row = [full_month, 0, 0, 0, 0, 0]
            months.appendRow(row)
            months.setCellValue(i, 4, bytesToStr(row[4]))
            months.setCellValue(i, 5, bytesToStr(row[5]))
            months.appendShortTitle(month)

        total[0] = 'Total'
        total[4] = bytesToStr(total[4])
        total[5] = bytesToStr(total[5])
        months.appendRow(total)
        page.appendBlock(months)

    def _generateDisplayWholeMonthStats(self):
        title = 'Stats for %s' % (conf.domain_name)
        filename = 'index.html'
        print '==> Generate main page (%s)' % (filename)

        page = DisplayHTMLPage(title, filename, conf.css_path)

        for year in self.meta_infos['stats'].keys():
            self._generateDisplayMonthStats(page, year, self.meta_infos['stats'][year])

        self.display.addPage(page)

    def _generateDisplay(self):
        self._generateDisplayDaysStats()
        self._callPlugins(conf.DISPLAY_HOOK_DIRECTORY)
        self._generateDisplayWholeMonthStats()
        self.display.build(conf.DISPLAY_ROOT)

    def _generateStats(self, visits):
        stats = {}
        stats['viewed_bandwidth'] = 0
        stats['not_viewed_bandwidth'] = 0
        stats['viewed_pages'] = 0
        stats['viewed_hits'] = 0
        #stats['requests'] = set()
        stats['nb_visitors'] = 0

        for (k, super_hit) in visits.items():
            if super_hit['robot']:
                stats['not_viewed_bandwidth'] += super_hit['bandwidth']
                continue

            #print "[%s] =>\t%d/%d" % (k, super_hit['viewed_pages'], super_hit['viewed_hits'])

            # Hit-only visitors are only counted when the conf flag allows it
            if conf.count_hit_only_visitors or\
               super_hit['viewed_pages']:
                stats['nb_visitors'] += 1
            stats['viewed_bandwidth'] += super_hit['bandwidth']
            stats['viewed_pages'] += super_hit['viewed_pages']
            stats['viewed_hits'] += super_hit['viewed_hits']

            # for p in super_hit['requests']:
            #     if not p['is_page']: continue
            #     req = p['extract_request']
            #     stats['requests'].add(req['extract_uri'])

        return stats
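
    # The returned dict is flat, e.g. (illustrative values):
    #   {'nb_visitors': 10, 'viewed_pages': 120, 'viewed_hits': 500,
    #    'viewed_bandwidth': 500000, 'not_viewed_bandwidth': 100000}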

    def _generateMonthStats(self):
        self._clearDisplay()

        visits = self.current_analysis['visits']

        stats = self._generateStats(visits)
        duplicated_stats = {k:v for (k,v) in stats.items()}

        cur_time = self.meta_infos['last_time']
        print "== Stats for %d/%d ==" % (cur_time.tm_year, cur_time.tm_mon)
        print stats

        if 'month_stats' not in self.current_analysis.keys():
            self.current_analysis['month_stats'] = stats
        else:
            for (k,v) in stats.items():
                self.current_analysis['month_stats'][k] = v

        # Keep visitors that are neither robots nor, unless the conf flag
        # allows it, hit-only
        self.valid_visitors = {}
        for (k,v) in visits.items():
            if v['robot']: continue
            if not conf.count_hit_only_visitors and\
               not v['viewed_pages']:
                continue
            self.valid_visitors[k] = v

        duplicated_stats['nb_visitors'] = stats['nb_visitors'] = len(self.valid_visitors.keys())

        self._callPlugins(conf.POST_HOOK_DIRECTORY)

        path = self.getDBFilename(cur_time)
        if os.path.exists(path):
            os.remove(path)

        print "==> Serialize to %s" % path

        self._serialize(self.current_analysis, path)

        # Save month stats: meta_infos['stats'] maps year -> month -> stats
        year = cur_time.tm_year
        month = cur_time.tm_mon
        if 'stats' not in self.meta_infos.keys():
            self.meta_infos['stats'] = {}
        if year not in self.meta_infos['stats'].keys():
            self.meta_infos['stats'][year] = {}
        self.meta_infos['stats'][year][month] = duplicated_stats

        self._generateDisplay()

    def _generateDayStats(self):
        visits = self.current_analysis['visits']

        self._callPlugins(conf.PRE_HOOK_DIRECTORY)

        stats = self._generateStats(visits)

        cur_time = self.meta_infos['last_time']
        print "== Stats for %d/%d/%d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday)

        # Subtract the last stored day from the cumulated numbers to get
        # per-day values
        if cur_time.tm_mday > 1:
            last_day = cur_time.tm_mday - 1
            while last_day:
                if last_day in self.current_analysis['days_stats'].keys():
                    break
                last_day -= 1
            if last_day:
                for k in stats.keys():
                    stats[k] -= self.current_analysis['days_stats'][last_day][k]

        stats['nb_visitors'] = 0
        for (k,v) in visits.items():
            if v['robot']: continue
            if not conf.count_hit_only_visitors and\
               not v['viewed_pages']:
                continue
            if v['last_access'].tm_mday == cur_time.tm_mday:
                stats['nb_visitors'] += 1
        print stats

        self.current_analysis['days_stats'][cur_time.tm_mday] = stats

    def _newHit(self, hit):
        t = self._decodeTime(hit)

        cur_time = self.meta_infos['last_time']

        if cur_time is None:
            self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
            self.analyse_started = True
        else:
            if not self.analyse_started:
                # Skip hits that were already analysed in a previous run
                if time.mktime(t) < time.mktime(cur_time):
                    return False
                else:
                    self.analyse_started = True
            if cur_time.tm_mon != t.tm_mon:
                self._generateMonthStats()
                self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
            elif cur_time.tm_mday != t.tm_mday:
                self._generateDayStats()

        self.meta_infos['last_time'] = t

        if not self.meta_infos['start_analysis_time']:
            self.meta_infos['start_analysis_time'] = t

        if not self._decodeHTTPRequest(hit): return False

        # Normalize empty fields ('-' or '*') to empty strings
        for k in hit.keys():
            if hit[k] == '-' or hit[k] == '*':
                hit[k] = ''

        self._appendHit(hit)

        return True

    def start(self, _file):
        print '==> Load previous database'

        self.meta_infos = self._deserialize(conf.META_PATH) or self._clearMeta()
        if self.meta_infos['last_time']:
            self.current_analysis = self._deserialize(self.getDBFilename(self.meta_infos['last_time'])) or self._clearVisits()
        else:
            self._clearVisits()

        self.meta_infos['start_analysis_time'] = None

        self.cache_plugins = preloadPlugins(self.plugins, self)

        print '==> Analysing log'

        for l in _file:
            # print "line " + l

            groups = self.log_re.match(l)

            if groups:
                if not self._newHit(groups.groupdict()):
                    # Already analysed or unparsable: skip this line
                    continue
            else:
                print "No match for " + l
                #break

        if self.analyse_started:
            self._generateDayStats()
            self._generateMonthStats()
            del self.meta_infos['start_analysis_time']
            self._serialize(self.meta_infos, conf.META_PATH)
        else:
            print '==> Analysis not started: nothing to do'
            self._generateMonthStats()


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Intelligent Web Log Analyzer')

    parser.add_argument('-c', '--clean-output', dest='clean_output', action='store_true',
                        default=False,
                        help='Clean output before starting')

    parser.add_argument('-i', '--stdin', dest='stdin', action='store_true',
                        default=False,
                        help='Read data from stdin instead of conf.analyzed_filename')

    args = parser.parse_args()

    if args.clean_output:
        if os.path.exists(conf.DB_ROOT): shutil.rmtree(conf.DB_ROOT)
        if os.path.exists(conf.DISPLAY_ROOT): shutil.rmtree(conf.DISPLAY_ROOT)

    iwla = IWLA()

    required_conf = ['analyzed_filename', 'domain_name']
    if not validConfRequirements(required_conf, iwla, 'Main Conf'):
        sys.exit(0)

    if args.stdin:
        iwla.start(sys.stdin)
    else:
        if not os.path.exists(conf.analyzed_filename):
            print 'No such file \'%s\'' % (conf.analyzed_filename)
            sys.exit(-1)
        with open(conf.analyzed_filename) as f:
            iwla.start(f)
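
# Typical invocations (illustrative; the options are defined by the parser
# above):
#   ./iwla.py                        analyse conf.analyzed_filename
#   ./iwla.py -c                     wipe DB_ROOT and DISPLAY_ROOT first
#   cat access.log | ./iwla.py -i    read the log from stdin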