2014-11-18 20:18:53 +01:00
|
|
|
#!/usr/bin/env python
|
|
|
|
|
|
|
|
import os
|
|
|
|
import re
|
|
|
|
import time
|
|
|
|
import glob
|
|
|
|
import imp
|
2014-11-19 19:34:16 +01:00
|
|
|
import pickle
|
|
|
|
import gzip
|
2014-11-19 19:45:41 +01:00
|
|
|
|
2014-11-18 20:18:53 +01:00
|
|
|
from robots import awstats_robots;
|
|
|
|
|
|
|
|
# Parenthesized print: identical output on Python 2, valid on Python 3.
print('==> Start')

# Global analysis state, mutated as the log is replayed chronologically.
# meta_visit['last_time']: time.struct_time of the last analysed hit
# (None until the first hit is seen; reloaded from META_PATH below).
meta_visit = {'last_time': None}

# Becomes True once a hit newer than the previous run is encountered.
analyse_started = False

# Per-visitor state for the month currently being analysed,
# keyed by remote address.
current_visits = {}
|
2014-11-18 20:18:53 +01:00
|
|
|
|
|
|
|
# nginx-style access-log format; each $field placeholder is turned into a
# named regex group below.
log_format = '$server_name:$server_port $remote_addr - $remote_user [$time_local] ' +\
    '"$request" $status $body_bytes_sent ' +\
    '"$http_referer" "$http_user_agent"'

# Escape every regex metacharacter, then replace each $field with a
# named capture group (?P<field>.+).
log_format_extracted = re.sub(r'([^\$\w])', r'\\\g<1>', log_format)
# Raw string: '\g' is not a valid string escape and warns on modern Pythons.
log_format_extracted = re.sub(r'\$(\w+)', r'(?P<\g<1>>.+)', log_format_extracted)

# Splits the raw request line 'METHOD URI PROTOCOL'.
http_request_extracted = re.compile(r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')

#09/Nov/2014:06:35:16 +0100
time_format = '%d/%b/%Y:%H:%M:%S +0100'

#print "Log format : " + log_format_extracted

log_re = re.compile(log_format_extracted)

# Fixed: the original pattern put the named group inside a character class
# ('[\?(?P<extract_parameters>.*)]?'), where '(', '?', '<', etc. are plain
# literals — extract_parameters was never captured. Split the URI on the
# first '?' instead; the query-string group stays optional.
uri_re = re.compile(r'(?P<extract_uri>[^\?]*)(\?(?P<extract_parameters>.*))?')

# URI suffixes counted as "pages" (everything else is a plain hit).
pages_extensions = ['/', 'html', 'xhtml', 'py', 'pl', 'rb', 'php']
# HTTP statuses counted as viewed.
viewed_http_codes = [200]

PRE_HOOK_DIRECTORY = './hooks/pre_analysis/*.py'
POST_HOOK_DIRECTORY = './hooks/post_analysis/*.py'
DB_ROOT = './output/'
META_PATH = DB_ROOT + 'meta.db'
DB_FILENAME = 'iwla.db'
|
2014-11-19 08:01:12 +01:00
|
|
|
|
2014-11-18 20:18:53 +01:00
|
|
|
print('==> Generating robot dictionary')

# Pre-compile the robot User-Agent patterns once; matching happens per hit.
# List comprehension instead of map(lambda (x): ...): the parenthesized
# lambda parameter is Python-2-only syntax, and Python 3's map() is lazy —
# the comprehension yields a list on both.
awstats_robots = [re.compile(pattern, re.IGNORECASE) for pattern in awstats_robots]
|
|
|
|
|
2014-11-19 19:45:41 +01:00
|
|
|
def createEmptyVisits():
    """Return a fresh, empty visit-statistics skeleton."""
    empty = {}
    for section in ('days_stats', 'month_stats', 'visits'):
        empty[section] = {}
    return empty
|
|
|
|
|
|
|
|
def getDBFilename(db_time):
    """Return the per-month database path for *db_time*.

    db_time -- a time.struct_time; the path is
    DB_ROOT/<year>/<month>_<DB_FILENAME>.

    The parameter was renamed from 'time': it shadowed the imported
    time module (all callers in this file pass it positionally).
    """
    return (DB_ROOT + '%d/%d_%s') % (db_time.tm_year, db_time.tm_mon, DB_FILENAME)
|
2014-11-19 08:01:12 +01:00
|
|
|
|
2014-11-19 19:34:16 +01:00
|
|
|
def serialize(obj, filename):
    """Pickle *obj* and write it gzip-compressed to *filename*.

    Creates the parent directory when needed.
    """
    base = os.path.dirname(filename)
    # Guard 'base' too: os.makedirs('') raises when filename has no directory.
    if base and not os.path.exists(base):
        os.makedirs(base)

    # Pickle straight through gzip. The original wrote an intermediate
    # '<filename>.tmp' and deleted it afterwards, which leaked the tmp
    # file whenever an exception occurred half-way; the output bytes
    # (a gzipped pickle) are identical either way.
    with gzip.open(filename, 'wb') as fzip:
        pickle.dump(obj, fzip)
|
2014-11-19 08:01:12 +01:00
|
|
|
|
2014-11-19 19:34:16 +01:00
|
|
|
def deserialize(filename):
    """Load a gzip-compressed pickle from *filename*.

    Returns the unpickled object, or None when the file does not exist.
    """
    if not os.path.exists(filename):
        return None

    # 'rb' — pickles are binary data. The original also had an
    # unreachable 'return None' after the with-block; dropped.
    with gzip.open(filename, 'rb') as f:
        return pickle.load(f)
|
2014-11-19 08:01:12 +01:00
|
|
|
|
2014-11-19 19:45:41 +01:00
|
|
|
def createEmptyVisits():
    # NOTE(review): duplicate of the earlier createEmptyVisits(); being
    # defined later, this stub wins and its 'pass' body made the function
    # return None, breaking any caller expecting the stats skeleton.
    # Return the same structure as the original definition instead;
    # ideally one of the two definitions should be deleted.
    return {'days_stats': {}, 'month_stats': {}, 'visits': {}}
|
|
|
|
|
|
|
|
def callPlugins(path, *args):
    """Load every plugin matching glob pattern *path* (sorted by name)
    and invoke its hook() with *args.

    Renamed '*kwargs' -> '*args': the parameter collects positional
    arguments, not keyword ones.
    """
    print('==> Call plugins (%s)' % path)
    plugins = glob.glob(path)
    plugins.sort()
    for plugin_path in plugins:
        print('\t%s' % (plugin_path))
        # NOTE(review): imp is deprecated (removed in Python 3.12);
        # importlib is the long-term replacement.
        mod = imp.load_source('hook', plugin_path)
        mod.hook(*args)
|
|
|
|
|
2014-11-18 20:18:53 +01:00
|
|
|
def isPage(request):
    """Return True when the request URI ends with one of the suffixes in
    pages_extensions (i.e. looks like a page rather than a plain hit)."""
    return any(request.endswith(ext) for ext in pages_extensions)
|
|
|
|
|
|
|
|
def appendHit(hit):
    """Account one parsed hit to its visitor entry in current_visits.

    Updates the visitor's raw page list and bandwidth, classifies the hit
    as page/hit and viewed/not-viewed, and bumps the matching counter.
    Redirects (302) are recorded but not counted.
    """
    visitor = current_visits[hit['remote_addr']]
    visitor['pages'].append(hit)
    visitor['bandwith'] += int(hit['body_bytes_sent'])

    # Prefer the query-stripped URI when the request line parsed cleanly.
    request = hit['extract_request']
    uri = request.get('extract_uri', request['http_uri'])
    hit['is_page'] = isPage(uri)

    status = int(hit['status'])
    # Don't count redirect status
    if status == 302:
        return

    viewed = (not visitor['robot']) and (status in viewed_http_codes)
    if viewed:
        counter = 'viewed_pages' if hit['is_page'] else 'viewed_hits'
    else:
        counter = 'not_viewed_pages' if hit['is_page'] else 'not_viewed_hits'
    visitor[counter] += 1
|
|
|
|
|
2014-11-19 08:01:12 +01:00
|
|
|
def createUser(hit):
    """Create a fresh visitor entry for this hit's remote address and
    account the hit to it.

    Note: 'bandwith' (sic) is the key used throughout the file.
    """
    current_visits[hit['remote_addr']] = {
        'viewed_pages': 0,
        'viewed_hits': 0,
        'not_viewed_pages': 0,
        'not_viewed_hits': 0,
        'bandwith': 0,
        'pages': [],
        'robot': isRobot(hit),
    }
    appendHit(hit)
|
|
|
|
|
|
|
|
def isRobot(hit):
    """Return True when the hit's User-Agent matches any compiled
    awstats robot pattern."""
    return any(pattern.match(hit['http_user_agent']) for pattern in awstats_robots)
|
|
|
|
|
2014-11-19 19:45:41 +01:00
|
|
|
def decodeHTTPRequest(hit):
    """Parse the raw request line and referer of *hit* in place.

    Fills hit['extract_request'] (method/URI/version, plus extract_uri and
    extract_parameters when the URI splits) and hit['extract_referer'].
    Returns False when there is no request or it cannot be parsed.
    """
    if 'request' not in hit:
        return False

    match = http_request_extracted.match(hit['request'])
    if not match:
        print("Bad request extraction " + hit['request'])
        return False

    extract = hit['extract_request'] = match.groupdict()

    # Split the URI into path and (optional) query string.
    uri_match = uri_re.match(extract['http_uri'])
    if uri_match:
        parts = uri_match.groupdict()
        extract['extract_uri'] = parts['extract_uri']
        if 'extract_parameters' in parts:
            extract['extract_parameters'] = parts['extract_parameters']

    referer_match = uri_re.match(hit['http_referer'])
    if referer_match:
        hit['extract_referer'] = referer_match.groupdict()

    return True
|
|
|
|
|
2014-11-19 19:45:41 +01:00
|
|
|
def decodeTime(hit):
    """Parse the raw 'time_local' field (per time_format) into a
    time.struct_time stored under hit['time_decoded']."""
    hit['time_decoded'] = time.strptime(hit['time_local'], time_format)
|
|
|
|
|
|
|
|
|
2014-11-19 19:45:41 +01:00
|
|
|
def generateMonthStats():
    """Aggregate current_visits into month totals, run the analysis hooks,
    print the result and serialize the visits to the month's DB file."""
    # Pre-analysis hooks see the raw visits (robots included).
    callPlugins(PRE_HOOK_DIRECTORY, current_visits)

    # Post-analysis hooks only see human visitors.
    valid_visitors = {k: v for (k, v) in current_visits.items() if not current_visits[k]['robot']}
    callPlugins(POST_HOOK_DIRECTORY, valid_visitors)

    stats = {
        'viewed_bandwidth': 0,
        'not_viewed_bandwidth': 0,
        'viewed_pages': 0,
        'viewed_hits': 0,
        'pages': set(),
    }

    for visitor in current_visits.values():
        # Robot traffic only contributes to the not-viewed bandwidth.
        if visitor['robot']:
            stats['not_viewed_bandwidth'] += visitor['bandwith']
            continue

        stats['viewed_bandwidth'] += visitor['bandwith']
        stats['viewed_pages'] += visitor['viewed_pages']
        stats['viewed_hits'] += visitor['viewed_hits']

        for page in visitor['pages']:
            if not page['is_page']:
                continue
            stats['pages'].add(page['extract_request']['extract_uri'])

    cur_time = meta_visit['last_time']

    print("== Stats for %d/%d ==" % (cur_time.tm_year, cur_time.tm_mon))
    print(stats)

    path = getDBFilename(cur_time)
    if os.path.exists(path):
        os.remove(path)

    print("==> Serialize to %s" % path)
    serialize(current_visits, path)
|
2014-11-19 19:34:16 +01:00
|
|
|
|
|
|
|
def newHit(hit):
    # Ingest one parsed log line: keep the per-month visit DB in sync with
    # the hit's timestamp, then dispatch the hit to an existing or new
    # visitor entry. Returns True to keep reading the log, False when the
    # request line could not be decoded.
    global current_visits
    global analyse_started

    decodeTime(hit)

    t = hit['time_decoded']

    cur_time = meta_visit['last_time']

    if cur_time == None:
        # Very first hit of this run: load (or create) the DB for its month.
        current_visits = deserialize(getDBFilename(t))
        if not current_visits: current_visits = {}
        analyse_started = True
    else:
        if not analyse_started:
            # Skip hits at or before the last analysed timestamp.
            # NOTE(review): this bare 'return' yields None, and the caller
            # does 'if not newHit(...): break' — a single already-seen hit
            # aborts the whole loop. Confirm whether this should be a
            # truthy "skip" result instead.
            if time.mktime(cur_time) >= time.mktime(t):
                return
            else:
                analyse_started = True
                current_visits = deserialize(getDBFilename(t))
                if not current_visits: current_visits = {}
        if cur_time.tm_mon != t.tm_mon:
            # Month boundary crossed: flush the finished month's stats and
            # switch to the new month's DB.
            generateMonthStats()
            current_visits = deserialize(getDBFilename(t))
            if not current_visits: current_visits = {}

    meta_visit['last_time'] = t

    if not decodeHTTPRequest(hit): return False

    # awstats-style logs write '-' for empty fields; normalize to ''.
    for k in hit.keys():
        if hit[k] == '-': hit[k] = ''

    remote_addr = hit['remote_addr']

    if remote_addr in current_visits.keys():
        appendHit(hit)
    else:
        createUser(hit)

    return True
|
|
|
|
|
|
|
|
print('==> Analysing log')

# Resume from the previous run's position when a meta DB exists.
meta_visit = deserialize(META_PATH)
if not meta_visit:
    meta_visit = {'last_time': None}

# 'with' guarantees the log file is closed even if parsing raises
# (the original open()/close() pair leaked the descriptor on error).
with open("access.log") as f:
    for l in f:
        groups = log_re.match(l)

        if groups:
            # newHit() returns a falsy value to stop the analysis.
            if not newHit(groups.groupdict()):
                break
        else:
            print("No match " + l)

if analyse_started:
    generateMonthStats()
    serialize(meta_visit, META_PATH)
else:
    print('==> Analyse not started : nothing to do')
|