#!/usr/bin/env python

import os
import re
import time
import glob
import imp
import pickle
import gzip

from robots import awstats_robots

print '==> Start'

meta_visit = {'last_time': None}
analyse_started = False
current_visits = {}
cache_plugins = {}

log_format = '$server_name:$server_port $remote_addr - $remote_user [$time_local] ' +\
             '"$request" $status $body_bytes_sent ' +\
             '"$http_referer" "$http_user_agent"'
# Escape the literal parts of the format, then turn each $variable
# into a named group
log_format_extracted = re.sub(r'([^\$\w])', r'\\\g<1>', log_format)
log_format_extracted = re.sub(r'\$(\w+)', r'(?P<\g<1>>.+)', log_format_extracted)
http_request_extracted = re.compile(r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')
# 09/Nov/2014:06:35:16 +0100
time_format = '%d/%b/%Y:%H:%M:%S +0100'
#print "Log format : " + log_format_extracted
log_re = re.compile(log_format_extracted)
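
# For reference, a (hypothetical) access.log line matching the format above:
#   example.org:80 127.0.0.1 - - [09/Nov/2014:06:35:16 +0100] "GET /index.html HTTP/1.1" 200 512 "-" "Mozilla/5.0"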

# Split an URI between its path and an optional query string
uri_re = re.compile(r'(?P<extract_uri>[^\?]*)(?:\?(?P<extract_parameters>.*))?')
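
# e.g. (hypothetical URI):
#   uri_re.match('/page.php?foo=bar').groupdict()
#   => {'extract_uri': '/page.php', 'extract_parameters': 'foo=bar'}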

pages_extensions = ['/', 'html', 'xhtml', 'py', 'pl', 'rb', 'php']
viewed_http_codes = [200]

PRE_HOOK_DIRECTORY = './hooks/pre_analysis/*.py'
POST_HOOK_DIRECTORY = './hooks/post_analysis/*.py'
DB_ROOT = './output/'
META_PATH = DB_ROOT + 'meta.db'
DB_FILENAME = 'iwla.db'

print '==> Generating robot dictionary'

awstats_robots = map(lambda x: re.compile(x, re.IGNORECASE), awstats_robots)

def createEmptyVisits():
    visits = {'days_stats': {}, 'month_stats': {}, 'visits': {}}
    return visits

def createEmptyMeta():
    meta = {'last_time': None}
    return meta

def getDBFilename(cur_time):
    return (DB_ROOT + '%d/%d_%s') % (cur_time.tm_year, cur_time.tm_mon, DB_FILENAME)
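
# One database file per month, e.g. (hypothetical date):
#   getDBFilename(time.strptime('09/Nov/2014', '%d/%b/%Y'))
#   => './output/2014/11_iwla.db'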

def serialize(obj, filename):
    base = os.path.dirname(filename)
    if not os.path.exists(base):
        os.makedirs(base)

    # TODO : remove return
    return

    # Pickle into a temporary file, then gzip it to its final location
    with open(filename + '.tmp', 'wb+') as f:
        pickle.dump(obj, f)
        f.seek(0)
        with gzip.open(filename, 'w') as fzip:
            fzip.write(f.read())
    os.remove(filename + '.tmp')

def deserialize(filename):
    if not os.path.exists(filename):
        return None

    with gzip.open(filename, 'r') as f:
        return pickle.load(f)
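
# Round-trip sketch (note: serialize() is currently short-circuited by the
# TODO return above, so nothing is written yet):
#   serialize(current_visits, getDBFilename(t))
#   current_visits = deserialize(getDBFilename(t))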

def callPlugins(path, *args):
    print '==> Call plugins (%s)' % path
    plugins = glob.glob(path)
    plugins.sort()
    for p in plugins:
        print '\t%s' % (p)
        if p not in cache_plugins:
            mod = imp.load_source('hook', p)
            cache_plugins[p] = mod
        else:
            mod = cache_plugins[p]
        mod.hook(*args)
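
# Each plugin module must define a top-level hook() function. A minimal
# (hypothetical) ./hooks/pre_analysis/example.py:
#   def hook(visits):
#       for (k, v) in visits.items():
#           pass  # e.g. flag additional robots in v['robot']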

def isPage(request):
    for e in pages_extensions:
        if request.endswith(e):
            return True
    return False
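
# e.g. isPage('/index.html') => True, isPage('/logo.png') => False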

def appendHit(hit):
    remote_addr = hit['remote_addr']
    if remote_addr not in current_visits['visits'].keys():
        createUser(hit)
        return

    super_hit = current_visits['visits'][remote_addr]
    super_hit['pages'].append(hit)
    super_hit['bandwith'] += int(hit['body_bytes_sent'])
    super_hit['last_access'] = meta_visit['last_time']

    request = hit['extract_request']
    if 'extract_uri' in request.keys():
        uri = request['extract_uri']
    else:
        uri = request['http_uri']

    hit['is_page'] = isPage(uri)

    # Don't count 3xx status
    status = int(hit['status'])
    if status >= 300 and status < 400: return

    if super_hit['robot'] or\
       status not in viewed_http_codes:
        page_key = 'not_viewed_pages'
        hit_key = 'not_viewed_hits'
    else:
        page_key = 'viewed_pages'
        hit_key = 'viewed_hits'

    if hit['is_page']:
        super_hit[page_key] += 1
    else:
        super_hit[hit_key] += 1

def createUser(hit):
    super_hit = current_visits['visits'][hit['remote_addr']] = {}
    super_hit['viewed_pages'] = 0
    super_hit['viewed_hits'] = 0
    super_hit['not_viewed_pages'] = 0
    super_hit['not_viewed_hits'] = 0
    super_hit['bandwith'] = 0
    super_hit['last_access'] = meta_visit['last_time']
    super_hit['pages'] = []
    super_hit['robot'] = isRobot(hit)
    appendHit(hit)

def isRobot(hit):
    for r in awstats_robots:
        if r.match(hit['http_user_agent']):
            return True
    return False

def decodeHTTPRequest(hit):
    if not 'request' in hit.keys(): return False

    groups = http_request_extracted.match(hit['request'])
    if groups:
        hit['extract_request'] = groups.groupdict()
        uri_groups = uri_re.match(hit['extract_request']['http_uri'])
        if uri_groups:
            d = uri_groups.groupdict()
            hit['extract_request']['extract_uri'] = d['extract_uri']
            if 'extract_parameters' in d.keys():
                hit['extract_request']['extract_parameters'] = d['extract_parameters']
    else:
        print "Bad request extraction " + hit['request']
        return False

    referer_groups = uri_re.match(hit['http_referer'])
    if referer_groups:
        hit['extract_referer'] = referer_groups.groupdict()

    return True
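
# For a (hypothetical) hit with request 'GET /page.php?x=1 HTTP/1.1', this
# fills hit['extract_request'] with http_method/http_uri/http_version plus
# extract_uri='/page.php' and extract_parameters='x=1'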

def decodeTime(hit):
    t = hit['time_local']
    hit['time_decoded'] = time.strptime(t, time_format)

def generateStats(visits):
    stats = {}
    stats['viewed_bandwidth'] = 0
    stats['not_viewed_bandwidth'] = 0
    stats['viewed_pages'] = 0
    stats['viewed_hits'] = 0
    #stats['pages'] = set()
    stats['nb_visitors'] = 0

    for k in visits.keys():
        super_hit = visits[k]
        if super_hit['robot']:
            stats['not_viewed_bandwidth'] += super_hit['bandwith']
            continue

        print "[%s] =>\t%d/%d" % (k, super_hit['viewed_pages'], super_hit['viewed_hits'])

        stats['nb_visitors'] += 1
        stats['viewed_bandwidth'] += super_hit['bandwith']
        stats['viewed_pages'] += super_hit['viewed_pages']
        stats['viewed_hits'] += super_hit['viewed_hits']

        # for p in super_hit['pages']:
        #     if not p['is_page']: continue
        #     req = p['extract_request']
        #     stats['pages'].add(req['extract_uri'])

    return stats
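
# generateStats() returns one flat counters dict, initially:
#   {'viewed_bandwidth': 0, 'not_viewed_bandwidth': 0, 'viewed_pages': 0,
#    'viewed_hits': 0, 'nb_visitors': 0}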

def generateMonthStats():
    visits = current_visits['visits']
    stats = generateStats(visits)

    cur_time = meta_visit['last_time']
    print "== Stats for %d/%d ==" % (cur_time.tm_year, cur_time.tm_mon)
    print stats

    valid_visitors = {k: v for (k, v) in visits.items() if not v['robot']}
    callPlugins(POST_HOOK_DIRECTORY, valid_visitors)

    current_visits['month_stats'] = stats

    path = getDBFilename(cur_time)
    if os.path.exists(path):
        os.remove(path)

    print "==> Serialize to %s" % path
    serialize(current_visits, path)

def generateDayStats():
    visits = current_visits['visits']

    callPlugins(PRE_HOOK_DIRECTORY, visits)

    stats = generateStats(visits)

    cur_time = meta_visit['last_time']
    print "== Stats for %d/%d/%d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday)

    if cur_time.tm_mday > 1:
        # Stats are cumulative within a month: subtract the last known
        # day to keep only today's values
        last_day = cur_time.tm_mday - 1
        while last_day:
            if last_day in current_visits['days_stats'].keys():
                break
            last_day -= 1
        if last_day:
            for k in stats.keys():
                stats[k] -= current_visits['days_stats'][last_day][k]

    # nb_visitors can't be obtained by subtraction (a visitor may span
    # several days): recount the visitors whose last access was today
    stats['nb_visitors'] = 0
    for k in visits.keys():
        if visits[k]['robot']: continue
        if visits[k]['last_access'].tm_mday == cur_time.tm_mday:
            stats['nb_visitors'] += 1

    print stats
    current_visits['days_stats'][cur_time.tm_mday] = stats

def newHit(hit):
    global current_visits
    global analyse_started

    decodeTime(hit)

    t = hit['time_decoded']
    cur_time = meta_visit['last_time']

    if cur_time is None:
        current_visits = deserialize(getDBFilename(t))
        if not current_visits: current_visits = createEmptyVisits()
        analyse_started = True
    else:
        if not analyse_started:
            # Skip hits already analysed in a previous run, but keep
            # reading the log (returning a falsy value would abort it)
            if time.mktime(cur_time) >= time.mktime(t):
                return True
            else:
                analyse_started = True
                current_visits = deserialize(getDBFilename(t))
                if not current_visits: current_visits = createEmptyVisits()
        if cur_time.tm_mon != t.tm_mon:
            generateMonthStats()
            current_visits = deserialize(getDBFilename(t))
            if not current_visits: current_visits = createEmptyVisits()
        elif cur_time.tm_mday != t.tm_mday:
            generateDayStats()

    meta_visit['last_time'] = t

    if not decodeHTTPRequest(hit): return False

    # Normalize empty fields ('-' in the log) to empty strings
    for k in hit.keys():
        if hit[k] == '-': hit[k] = ''

    appendHit(hit)

    return True

print '==> Analysing log'

meta_visit = deserialize(META_PATH)
if not meta_visit:
    meta_visit = createEmptyMeta()
    current_visits = createEmptyVisits()

f = open("access.log")
for l in f:
    # print "line " + l
    groups = log_re.match(l)
    if groups:
        if not newHit(groups.groupdict()):
            break
    else:
        print "No match " + l
f.close()

if analyse_started:
    generateDayStats()
    generateMonthStats()
    serialize(meta_visit, META_PATH)
else:
    print '==> Analyse not started : nothing to do'