From b1549ca8845a03423a562fdb3b72ac558a356936 Mon Sep 17 00:00:00 2001
From: Gregory Soutade
Date: Wed, 19 Nov 2014 08:01:12 +0100
Subject: [PATCH] Initial commit

---
 hooks_pre/H001_robot.py           |  1 +
 hooks_pre/H002_soutade.py         |  1 +
 iwla.py                           | 91 +++++++++++++++++++++----------
 plugins/hooks_pre/H001_robot.py   | 39 +++++++++++++
 plugins/hooks_pre/H002_soutade.py | 19 +++++++
 5 files changed, 123 insertions(+), 28 deletions(-)
 create mode 120000 hooks_pre/H001_robot.py
 create mode 120000 hooks_pre/H002_soutade.py
 create mode 100644 plugins/hooks_pre/H001_robot.py
 create mode 100644 plugins/hooks_pre/H002_soutade.py

diff --git a/hooks_pre/H001_robot.py b/hooks_pre/H001_robot.py
new file mode 120000
index 0000000..5e6d168
--- /dev/null
+++ b/hooks_pre/H001_robot.py
@@ -0,0 +1 @@
+../plugins/hooks_pre/H001_robot.py
\ No newline at end of file
diff --git a/hooks_pre/H002_soutade.py b/hooks_pre/H002_soutade.py
new file mode 120000
index 0000000..345c147
--- /dev/null
+++ b/hooks_pre/H002_soutade.py
@@ -0,0 +1 @@
+../plugins/hooks_pre/H002_soutade.py
\ No newline at end of file
diff --git a/iwla.py b/iwla.py
index 39fcafd..b336243 100755
--- a/iwla.py
+++ b/iwla.py
@@ -9,6 +9,7 @@ from robots import awstats_robots;
 
 print '==> Start'
 
+meta_visit = {}
 current_visit = {}
 
 log_format = '$server_name:$server_port $remote_addr - $remote_user [$time_local] ' +\
@@ -23,16 +24,53 @@ time_format = '%d/%b/%Y:%H:%M:%S +0100'
 
 #print "Log format : " + log_format_extracted
 log_re = re.compile(log_format_extracted)
-uri_re = re.compile(r'(?P<extract_uri>[^\?]*)\?(?P<extract_parameters>.*)')
+uri_re = re.compile(r'(?P<extract_uri>[^\?]*)[\?(?P<extract_parameters>.*)]?')
 pages_extensions = ['/', 'html', 'xhtml', 'py', 'pl', 'rb', 'php']
 viewed_http_codes = [200]
 
 cur_time = None
 
+PRE_HOOK_DIRECTORY = './hooks_pre/*.py'
+POST_HOOK_DIRECTORY = './hooks_post/*.py'
+
 print '==> Generating robot dictionary'
 
 awstats_robots = map(lambda (x) : re.compile(x, re.IGNORECASE), awstats_robots)
 
+def generate_day_stats():
+    days_stats = {}
+    days_stats['viewed_bandwidth'] = 0
+    days_stats['not_viewed_bandwidth'] = 0
+    days_stats['viewed_pages'] = 0
+    days_stats['viewed_hits'] = 0
+    days_stats['pages'] = set()
+
+    for k in current_visit.keys():
+        super_hit = current_visit[k]
+        if super_hit['robot']:
+            days_stats['not_viewed_bandwidth'] += super_hit['bandwith']
+            continue
+
+        days_stats['viewed_bandwidth'] += super_hit['bandwith']
+        days_stats['viewed_pages'] += super_hit['viewed_pages']
+        days_stats['viewed_hits'] += super_hit['viewed_hits']
+
+        for p in super_hit['pages']:
+            if not p['is_page']: continue
+            req = p['extract_request']
+            days_stats['pages'].add(req['extract_uri'])
+
+    return days_stats
+
+def call_plugins(path, *kwargs):
+    print '==> Call plugins (%s)' % path
+    plugins = glob.glob(path)
+    plugins.sort()
+    for p in plugins:
+        print '\t%s' % (p)
+        mod = imp.load_source('hook', p)
+        mod.hook(*kwargs)
+
 def isPage(request):
     for e in pages_extensions:
         if request.endswith(e):
@@ -70,7 +108,7 @@
     else:
         super_hit[hit_key] += 1
 
-def createGeneric(hit):
+def createUser(hit):
     super_hit = current_visit[hit['remote_addr']] = {}
     super_hit['viewed_pages'] = 0;
     super_hit['viewed_hits'] = 0;
@@ -78,12 +116,7 @@
     super_hit['not_viewed_hits'] = 0;
     super_hit['bandwith'] = 0;
     super_hit['pages'] = [];
-
-    return super_hit
-
-def createUser(hit, robot):
-    super_hit = createGeneric(hit)
-    super_hit['robot'] = robot;
+    super_hit['robot'] = isRobot(hit);
 
     appendHit(hit)
 
 def isRobot(hit):
@@ -101,16 +134,17 @@
     hit['extract_request'] = groups.groupdict()
     uri_groups = uri_re.match(hit['extract_request']['http_uri']);
     if uri_groups:
-        hit['extract_request']['extract_uri'] = uri_groups.group('extract_uri')
-        hit['extract_request']['extract_parameters'] = uri_groups.group('extract_parameters')
+        d = uri_groups.groupdict()
+        hit['extract_request']['extract_uri'] = d['extract_uri']
+        if 'extract_parameters' in d.keys():
+            hit['extract_request']['extract_parameters'] = d['extract_parameters']
     else:
         print "Bad request extraction " + hit['request']
         return False
     referer_groups = uri_re.match(hit['http_referer']);
     if referer_groups:
-        hit['extract_referer']['extract_uri'] = referer_groups.group('extract_uri')
-        hit['extract_referer']['extract_parameters'] = referer_groups.group('extract_parameters')
+        referer = hit['extract_referer'] = referer_groups.groupdict()
 
     return True
 
 def decode_time(hit):
@@ -131,7 +165,7 @@ def newHit(hit):
 
     t = hit['time_decoded']
 
-    current_visit['last_time'] = t
+    meta_visit['last_time'] = t
 
     if cur_time == None:
         cur_time = t
@@ -143,7 +177,7 @@
     if remote_addr in current_visit.keys():
         appendHit(hit)
     else:
-        createUser(hit, isRobot(hit))
+        createUser(hit)
 
     return True
 
@@ -161,18 +195,19 @@ for l in f:
         print "No match " + l
 f.close();
 
-print '==> Call plugins'
-plugins = glob.glob('./hooks_pre/*.py')
-plugins.sort()
-for p in plugins:
-    print '\t%s' % (p)
-    mod = imp.load_source('hook', p)
-    mod.hook(current_visit)
+call_plugins(PRE_HOOK_DIRECTORY, current_visit)
 
-for ip in current_visit.keys():
-    hit = current_visit[ip]
-    if hit['robot']: continue
-    print "%s =>" % (ip)
-    for k in hit.keys():
-        if k != 'pages':
-            print "\t%s : %s" % (k, current_visit[ip][k])
+stats = generate_day_stats()
+
+print stats
+valid_visitors = {k: v for (k,v) in current_visit.items() if not current_visit[k]['robot']}
+#print valid_visitors
+# for ip in current_visit.keys():
+#     hit = current_visit[ip]
+#     if hit['robot']: continue
+#     print "%s =>" % (ip)
+#     for k in hit.keys():
+#         if k != 'pages':
+#             print "\t%s : %s" % (k, current_visit[ip][k])
+
+call_plugins(POST_HOOK_DIRECTORY, valid_visitors)
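
The iwla.py changes above replace the inlined plugin loop with call_plugins(): every *.py file in a hook directory is loaded with imp.load_source() and its hook() function is called with the visit dictionary. The commit declares POST_HOOK_DIRECTORY but ships no post hook yet; a minimal one might look like the sketch below. This is hypothetical and not part of the patch: the file name H001_summary.py and the printed format are invented, while the dictionary keys come from createUser() above.

# plugins/hooks_post/H001_summary.py -- hypothetical post hook.
# hook() receives the dictionary passed to
# call_plugins(POST_HOOK_DIRECTORY, valid_visitors): one entry per
# non-robot visitor, keyed by remote address.

def hook(hits):
    for ip in hits.keys():
        super_hit = hits[ip]
        # 'bandwith' is spelled as in iwla.py.
        print '%s => %d pages, %d hits, %d bytes' % \
            (ip, super_hit['viewed_pages'], super_hit['viewed_hits'],
             super_hit['bandwith'])

The first consumers of the hook mechanism are the two pre-hook plugins below.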
diff --git a/plugins/hooks_pre/H001_robot.py b/plugins/hooks_pre/H001_robot.py
new file mode 100644
index 0000000..9ec45cb
--- /dev/null
+++ b/plugins/hooks_pre/H001_robot.py
@@ -0,0 +1,39 @@
+
+# Basic rule to detect robots
+
+def hook(hits):
+    for k in hits.keys():
+        super_hit = hits[k]
+
+        if super_hit['robot']: continue
+
+        isRobot = False
+        referers = 0
+
+# 1) no pages view --> robot
+        if not super_hit['viewed_pages']:
+            super_hit['robot'] = 1
+            continue
+
+# 2) pages without hit --> robot
+        if not super_hit['viewed_hits']:
+            super_hit['robot'] = 1
+            continue
+
+        for hit in super_hit['pages']:
+# 3) /robots.txt read
+            if hit['extract_request']['http_uri'] == '/robots.txt':
+                isRobot = True
+                break
+
+# 4) Any referer for hits
+            if not hit['is_page'] and hit['http_referer']:
+                referers += 1
+
+        if isRobot:
+            super_hit['robot'] = 1
+            continue
+
+        if super_hit['viewed_hits'] and not referers:
+            super_hit['robot'] = 1
+            continue
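
H001 applies four heuristics to each visitor. For reference, every element of super_hit['pages'] is one parsed log line; judging from decode_http_request() and appendHit() above, the fields these rules touch look roughly like the sketch below (the values are invented):

# One entry of super_hit['pages'] (hypothetical values).
hit = {
    'is_page': True,                     # page vs. plain hit, cf. isPage()
    'http_referer': 'http://example.org/',
    'extract_request': {
        'http_uri': '/blog/index.html',  # URI from the request line
        'extract_uri': '/blog/index.html',
        # 'extract_parameters' may be absent, hence the
        # "in d.keys()" test in decode_http_request().
    },
}

Rule 3 matches hit['extract_request']['http_uri'] against '/robots.txt'; rule 4 counts referers on non-page hits, so a client that fetches files without ever sending a referer ends up flagged as a robot.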
diff --git a/plugins/hooks_pre/H002_soutade.py b/plugins/hooks_pre/H002_soutade.py
new file mode 100644
index 0000000..d6767aa
--- /dev/null
+++ b/plugins/hooks_pre/H002_soutade.py
@@ -0,0 +1,19 @@
+import re
+
+# Remove logo from indefero
+logo_re = re.compile(r'^.+/logo/$')
+
+# Basic rule to detect robots
+
+def hook(hits):
+    for k in hits.keys():
+        super_hit = hits[k]
+
+        if super_hit['robot']: continue
+
+        for p in super_hit['pages']:
+            if not p['is_page']: continue
+            if logo_re.match(p['extract_request']['extract_uri']):
+                p['is_page'] = False
+                super_hit['viewed_pages'] -= 1
+                super_hit['viewed_hits'] += 1
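
Both rules live under plugins/hooks_pre/ and are activated by the 120000-mode symlinks created at the top of the patch, so a rule can be disabled by removing its link without deleting code. Since call_plugins() sorts the glob result, hooks run in file-name order: H001 flags robots before H002 adjusts their page counts. A further rule would follow the same pattern; the skeleton below is a hypothetical illustration (file name, regex and rule are invented), not part of the commit:

# plugins/hooks_pre/H003_feeds.py -- hypothetical pre hook, enabled by
# symlinking it into hooks_pre/ like the two rules above.
import re

# Example: count RSS/Atom feed fetches as plain hits, not page views.
feed_re = re.compile(r'^.+/(rss|atom)/?$')

def hook(hits):
    for k in hits.keys():
        super_hit = hits[k]

        if super_hit['robot']: continue

        for p in super_hit['pages']:
            if not p['is_page']: continue
            if feed_re.match(p['extract_request']['extract_uri']):
                # Same bookkeeping as H002_soutade.py.
                p['is_page'] = False
                super_hit['viewed_pages'] -= 1
                super_hit['viewed_hits'] += 1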