From 888b481b1dd2ee05d6bd087e3bb7122d26e33f77 Mon Sep 17 00:00:00 2001
From: Grégory Soutadé
Date: Wed, 19 Nov 2014 19:34:16 +0100
Subject: [PATCH] On r715

---
 hooks/pre_analysis/H001_robot.py   |  40 +++++++-
 hooks/pre_analysis/H002_soutade.py |  20 +++-
 hooks_pre/H001_robot.py            |   1 -
 hooks_pre/H002_soutade.py          |   1 -
 iwla.py                            | 149 +++++++++++++++++++----------
 5 files changed, 156 insertions(+), 55 deletions(-)
 mode change 120000 => 100644 hooks/pre_analysis/H001_robot.py
 mode change 120000 => 100644 hooks/pre_analysis/H002_soutade.py
 delete mode 120000 hooks_pre/H001_robot.py
 delete mode 120000 hooks_pre/H002_soutade.py

diff --git a/hooks/pre_analysis/H001_robot.py b/hooks/pre_analysis/H001_robot.py
deleted file mode 120000
index 7328242..0000000
--- a/hooks/pre_analysis/H001_robot.py
+++ /dev/null
@@ -1 +0,0 @@
-../../plugins/pre_analysis/H001_robot.py
\ No newline at end of file
diff --git a/hooks/pre_analysis/H001_robot.py b/hooks/pre_analysis/H001_robot.py
new file mode 100644
index 0000000..9ec45cb
--- /dev/null
+++ b/hooks/pre_analysis/H001_robot.py
@@ -0,0 +1,39 @@
+
+# Basic rule to detect robots
+
+def hook(hits):
+    for k in hits.keys():
+        super_hit = hits[k]
+
+        if super_hit['robot']: continue
+
+        isRobot = False
+        referers = 0
+
+# 1) no pages view --> robot
+        if not super_hit['viewed_pages']:
+            super_hit['robot'] = 1
+            continue
+
+# 2) pages without hit --> robot
+        if not super_hit['viewed_hits']:
+            super_hit['robot'] = 1
+            continue
+
+        for hit in super_hit['pages']:
+# 3) /robots.txt read
+            if hit['extract_request']['http_uri'] == '/robots.txt':
+                isRobot = True
+                break
+
+# 4) Any referer for hits
+            if not hit['is_page'] and hit['http_referer']:
+                referers += 1
+
+        if isRobot:
+            super_hit['robot'] = 1
+            continue
+
+        if super_hit['viewed_hits'] and not referers:
+            super_hit['robot'] = 1
+            continue
diff --git a/hooks/pre_analysis/H002_soutade.py b/hooks/pre_analysis/H002_soutade.py
deleted file mode 120000
index 091105f..0000000
--- a/hooks/pre_analysis/H002_soutade.py
+++ /dev/null
@@ -1 +0,0 @@
-../../plugins/pre_analysis/H002_soutade.py
\ No newline at end of file
diff --git a/hooks/pre_analysis/H002_soutade.py b/hooks/pre_analysis/H002_soutade.py
new file mode 100644
index 0000000..d6767aa
--- /dev/null
+++ b/hooks/pre_analysis/H002_soutade.py
@@ -0,0 +1,19 @@
+import re
+
+# Remove logo from indefero
+logo_re = re.compile(r'^.+/logo/$')
+
+# Basic rule to detect robots
+
+def hook(hits):
+    for k in hits.keys():
+        super_hit = hits[k]
+
+        if super_hit['robot']: continue
+
+        for p in super_hit['pages']:
+            if not p['is_page']: continue
+            if logo_re.match(p['extract_request']['extract_uri']):
+                p['is_page'] = False
+                super_hit['viewed_pages'] -= 1
+                super_hit['viewed_hits'] += 1
diff --git a/hooks_pre/H001_robot.py b/hooks_pre/H001_robot.py
deleted file mode 120000
index 5e6d168..0000000
--- a/hooks_pre/H001_robot.py
+++ /dev/null
@@ -1 +0,0 @@
-../plugins/hooks_pre/H001_robot.py
\ No newline at end of file
diff --git a/hooks_pre/H002_soutade.py b/hooks_pre/H002_soutade.py
deleted file mode 120000
index 345c147..0000000
--- a/hooks_pre/H002_soutade.py
+++ /dev/null
@@ -1 +0,0 @@
-../plugins/hooks_pre/H002_soutade.py
\ No newline at end of file
diff --git a/iwla.py b/iwla.py
index b336243..5279434 100755
--- a/iwla.py
+++ b/iwla.py
@@ -5,11 +5,14 @@ import re
 import time
 import glob
 import imp
+import pickle
+import gzip
 
 from robots import awstats_robots;
 
 print '==> Start'
-meta_visit = {}
+meta_visit = {'last_time':None}
+analyse_started = False
 current_visit = {}
 
 log_format = '$server_name:$server_port $remote_addr - $remote_user [$time_local] ' +\
@@ -28,39 +31,38 @@ uri_re = re.compile(r'(?P<extract_uri>[^\?]*)[\?(?P<extract_parameters>.*)]?')
 pages_extensions = ['/', 'html', 'xhtml', 'py', 'pl', 'rb', 'php']
 viewed_http_codes = [200]
 
-cur_time = None
-
-PRE_HOOK_DIRECTORY = './hooks_pre/*.py'
-POST_HOOK_DIRECTORY = './hooks_post/*.py'
+PRE_HOOK_DIRECTORY = './hooks/pre_analysis/*.py'
+POST_HOOK_DIRECTORY = './hooks/post_analysis/*.py'
+DB_ROOT = './output/'
+META_PATH = DB_ROOT + 'meta.db'
+DB_FILENAME = 'iwla.db'
 
 print '==> Generating robot dictionary'
 awstats_robots = map(lambda (x) : re.compile(x, re.IGNORECASE), awstats_robots)
 
 
-def generate_day_stats():
-    days_stats = {}
-    days_stats['viewed_bandwidth'] = 0
-    days_stats['not_viewed_bandwidth'] = 0
-    days_stats['viewed_pages'] = 0
-    days_stats['viewed_hits'] = 0
-    days_stats['pages'] = set()
+def get_db_filename(time):
+    return (DB_ROOT + '%d/%d_%s') % (time.tm_year, time.tm_mon, DB_FILENAME)
 
-    for k in current_visit.keys():
-        super_hit = current_visit[k]
-        if super_hit['robot']:
-            days_stats['not_viewed_bandwidth'] += super_hit['bandwith']
-            continue
+def serialize(obj, filename):
+    base = os.path.dirname(filename)
+    if not os.path.exists(base):
+        os.makedirs(base)
 
-        days_stats['viewed_bandwidth'] += super_hit['bandwith']
-        days_stats['viewed_pages'] += super_hit['viewed_pages']
-        days_stats['viewed_hits'] += super_hit['viewed_hits']
+    with open(filename + '.tmp', 'wb+') as f:
+        pickle.dump(obj, f)
+        f.seek(0)
+        with gzip.open(filename, 'w') as fzip:
+            fzip.write(f.read())
+    os.remove(filename + '.tmp')
 
-        for p in super_hit['pages']:
-            if not p['is_page']: continue
-            req = p['extract_request']
-            days_stats['pages'].add(req['extract_uri'])
+def deserialize(filename):
+    if not os.path.exists(filename):
+        return None
 
-    return days_stats
+    with gzip.open(filename, 'r') as f:
+        return pickle.load(f)
+    return None
 
 def call_plugins(path, *kwargs):
     print '==> Call plugins (%s)' % path
@@ -153,25 +155,79 @@ def decode_time(hit):
 
     hit['time_decoded'] = time.strptime(t, time_format)
 
+def generate_month_stats():
+    call_plugins(PRE_HOOK_DIRECTORY, current_visit)
+
+    valid_visitors = {k: v for (k,v) in current_visit.items() if not current_visit[k]['robot']}
+
+    call_plugins(POST_HOOK_DIRECTORY, valid_visitors)
+
+    stats = {}
+    stats['viewed_bandwidth'] = 0
+    stats['not_viewed_bandwidth'] = 0
+    stats['viewed_pages'] = 0
+    stats['viewed_hits'] = 0
+    stats['pages'] = set()
+
+    for k in current_visit.keys():
+        super_hit = current_visit[k]
+        if super_hit['robot']:
+            stats['not_viewed_bandwidth'] += super_hit['bandwith']
+            continue
+
+        stats['viewed_bandwidth'] += super_hit['bandwith']
+        stats['viewed_pages'] += super_hit['viewed_pages']
+        stats['viewed_hits'] += super_hit['viewed_hits']
+
+        for p in super_hit['pages']:
+            if not p['is_page']: continue
+            req = p['extract_request']
+            stats['pages'].add(req['extract_uri'])
+
+    cur_time = meta_visit['last_time']
+
+    print "== Stats for %d/%d ==" % (cur_time.tm_year, cur_time.tm_mon)
+    print stats
+
+    path = get_db_filename(cur_time)
+    if os.path.exists(path):
+        os.remove(path)
+
+    print "==> Serialize to %s" % path
+
+    serialize(current_visit, path)
+
 def newHit(hit):
-    global cur_time
-
-    if not decode_http_request(hit): return
-
-    for k in hit.keys():
-        if hit[k] == '-': hit[k] = ''
+    global current_visit
+    global analyse_started
 
     decode_time(hit)
 
     t = hit['time_decoded']
 
-    meta_visit['last_time'] = t
+    cur_time = meta_visit['last_time']
 
     if cur_time == None:
-        cur_time = t
+        current_visit = deserialize(get_db_filename(t))
+        if not current_visit: current_visit = {}
+        analyse_started = True
     else:
-        if cur_time.tm_mday != t.tm_mday:
-            return False
+        if not analyse_started:
+            if time.mktime(cur_time) >= time.mktime(t):
+                return
+            else:
+                analyse_started = True
+        if cur_time.tm_mon != t.tm_mon:
+            generate_month_stats()
+            current_visit = deserialize(get_db_filename(t))
+            if not current_visit: current_visit = {}
+
+    meta_visit['last_time'] = t
+
+    if not decode_http_request(hit): return False
+
+    for k in hit.keys():
+        if hit[k] == '-': hit[k] = ''
 
     remote_addr = hit['remote_addr']
     if remote_addr in current_visit.keys():
@@ -182,6 +238,11 @@
     return True
 
 print '==> Analysing log'
+
+meta_visit = deserialize(META_PATH)
+if not meta_visit:
+    meta_visit = {'last_time':None}
+
 f = open("access.log")
 for l in f:
     # print "line " + l;
@@ -195,19 +256,5 @@ for l in f:
         print "No match " + l
 
 f.close();
-call_plugins(PRE_HOOK_DIRECTORY, current_visit)
-
-stats = generate_day_stats()
-
-print stats
-valid_visitors = {k: v for (k,v) in current_visit.items() if not current_visit[k]['robot']}
-#print valid_visitors
-# for ip in current_visit.keys():
-#     hit = current_visit[ip]
-#     if hit['robot']: continue
-#     print "%s =>" % (ip)
-#     for k in hit.keys():
-#         if k != 'pages':
-#             print "\t%s : %s" % (k, current_visit[ip][k])
-
-call_plugins(POST_HOOK_DIRECTORY, valid_visitors)
+generate_month_stats()
+serialize(meta_visit, META_PATH)
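Some notes on the moving parts, with small runnable sketches (not part of the commit). The two new hook files follow iwla's plugin convention: each module exposes hook(hits), where hits maps a remote address to a per-visitor "super hit" dict whose keys ('robot', 'viewed_pages', 'viewed_hits', 'pages', 'bandwith') appear throughout the patch. Below is a minimal sketch of driving H001_robot.py by hand; the imp.load_source() call is an assumption for illustration, since call_plugins() itself is not shown in this diff:

    import imp

    # Hypothetical toy input: one visitor with no page viewed at all,
    # so H001's rule 1 (no pages view --> robot) should fire.
    hits = {
        '192.0.2.1': {
            'robot': 0,
            'viewed_pages': 0,
            'viewed_hits': 4,
            'pages': [],
            'bandwith': 1024,  # spelled as in the iwla code base
        },
    }

    # Assumption: load the plugin module directly, as glob + imp would.
    plugin = imp.load_source('H001_robot', 'hooks/pre_analysis/H001_robot.py')
    plugin.hook(hits)
    assert hits['192.0.2.1']['robot'] == 1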
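The new serialize()/deserialize() pair amounts to a gzip-compressed pickle, staged through a temporary file. A self-contained sketch of the same round-trip (Python 2, like the patch; the /tmp path is only for illustration):

    import gzip
    import os
    import pickle

    def serialize(obj, filename):
        # Pickle to a temporary file, then gzip-compress it into place.
        base = os.path.dirname(filename)
        if base and not os.path.exists(base):
            os.makedirs(base)
        with open(filename + '.tmp', 'wb+') as f:
            pickle.dump(obj, f)
            f.seek(0)
            with gzip.open(filename, 'w') as fzip:
                fzip.write(f.read())
        os.remove(filename + '.tmp')

    def deserialize(filename):
        if not os.path.exists(filename):
            return None
        with gzip.open(filename, 'r') as f:
            return pickle.load(f)

    db = {'192.0.2.1': {'viewed_pages': 3, 'robot': 0}}
    serialize(db, '/tmp/iwla_demo/2014/11_iwla.db')
    assert deserialize('/tmp/iwla_demo/2014/11_iwla.db') == db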
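get_db_filename() keys each visit database by the year and month of the hit being processed, which is what lets generate_month_stats() flush one month at a time instead of one day as before. For example:

    import time

    DB_ROOT = './output/'
    DB_FILENAME = 'iwla.db'

    def get_db_filename(t):
        # One database per month: <DB_ROOT>/<year>/<month>_iwla.db
        return (DB_ROOT + '%d/%d_%s') % (t.tm_year, t.tm_mon, DB_FILENAME)

    t = time.strptime('19/Nov/2014', '%d/%b/%Y')
    print get_db_filename(t)  # ./output/2014/11_iwla.db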
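Finally, the reworked newHit() makes the analysis restartable: meta_visit['last_time'] is persisted in meta.db, and on the next run every log line whose timestamp is not newer than that mark is skipped until fresh data appears. The skip test reduces to a struct_time comparison via time.mktime(), roughly:

    import time

    last_time = time.strptime('19/Nov/2014 19:00', '%d/%b/%Y %H:%M')
    hit_time = time.strptime('19/Nov/2014 18:59', '%d/%b/%Y %H:%M')

    # Same test as in newHit(): a line at or before the saved
    # timestamp was already analysed and is skipped.
    if time.mktime(last_time) >= time.mktime(hit_time):
        print 'already analysed, skip'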