From b8027fe509f673f13df40268b9bf50815f0afab2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gr=C3=A9gory=20Soutad=C3=A9?= Date: Wed, 19 Nov 2014 21:37:37 +0100 Subject: [PATCH] Need to separate day and month stats --- .../{H002_soutade.py => H001_soutade.py} | 5 +- .../{H001_robot.py => H002_robot.py} | 0 iwla.py | 99 +++++++++++++------ 3 files changed, 74 insertions(+), 30 deletions(-) rename hooks/pre_analysis/{H002_soutade.py => H001_soutade.py} (73%) rename hooks/pre_analysis/{H001_robot.py => H002_robot.py} (100%) diff --git a/hooks/pre_analysis/H002_soutade.py b/hooks/pre_analysis/H001_soutade.py similarity index 73% rename from hooks/pre_analysis/H002_soutade.py rename to hooks/pre_analysis/H001_soutade.py index d6767aa..50a7932 100644 --- a/hooks/pre_analysis/H002_soutade.py +++ b/hooks/pre_analysis/H001_soutade.py @@ -15,5 +15,6 @@ def hook(hits): if not p['is_page']: continue if logo_re.match(p['extract_request']['extract_uri']): p['is_page'] = False - super_hit['viewed_pages'] -= 1 - super_hit['viewed_hits'] += 1 + if super_hit['viewed_pages']: + super_hit['viewed_pages'] -= 1 + super_hit['viewed_hits'] += 1 diff --git a/hooks/pre_analysis/H001_robot.py b/hooks/pre_analysis/H002_robot.py similarity index 100% rename from hooks/pre_analysis/H001_robot.py rename to hooks/pre_analysis/H002_robot.py diff --git a/iwla.py b/iwla.py index 753179b..2cd4c9e 100755 --- a/iwla.py +++ b/iwla.py @@ -15,6 +15,7 @@ print '==> Start' meta_visit = {'last_time':None} analyse_started = False current_visits = {} +cache_plugins = {} log_format = '$server_name:$server_port $remote_addr - $remote_user [$time_local] ' +\ '"$request" $status $body_bytes_sent ' +\ @@ -46,6 +47,10 @@ def createEmptyVisits(): visits = {'days_stats' : {}, 'month_stats' : {}, 'visits' : {}} return visits +def createEmptyMeta(): + meta = {'last_time':None} + return meta + def getDBFilename(time): return (DB_ROOT + '%d/%d_%s') % (time.tm_year, time.tm_mon, DB_FILENAME) @@ -69,16 +74,17 @@ def deserialize(filename): return pickle.load(f) return None -def createEmptyVisits(): - pass - def callPlugins(path, *kwargs): print '==> Call plugins (%s)' % path plugins = glob.glob(path) plugins.sort() for p in plugins: print '\t%s' % (p) - mod = imp.load_source('hook', p) + if not p in cache_plugins: + mod = imp.load_source('hook', p) + cache_plugins[p] = mod + else: + mod = cache_plugins[p] mod.hook(*kwargs) def isPage(request): @@ -89,7 +95,7 @@ def isPage(request): return False def appendHit(hit): - super_hit = current_visits[hit['remote_addr']] + super_hit = current_visits['visits'][hit['remote_addr']] super_hit['pages'].append(hit) super_hit['bandwith'] += int(hit['body_bytes_sent']) @@ -102,8 +108,9 @@ def appendHit(hit): hit['is_page'] = isPage(uri) - # Don't count redirect status - if int(hit['status']) == 302: return + # Don't count 3xx status + status = int(hit['status']) + if status >= 300 and status < 400: return if super_hit['robot'] or\ not int(hit['status']) in viewed_http_codes: @@ -119,7 +126,7 @@ def appendHit(hit): super_hit[hit_key] += 1 def createUser(hit): - super_hit = current_visits[hit['remote_addr']] = {} + super_hit = current_visits['visits'][hit['remote_addr']] = {} super_hit['viewed_pages'] = 0; super_hit['viewed_hits'] = 0; super_hit['not_viewed_pages'] = 0; @@ -163,40 +170,49 @@ def decodeTime(hit): hit['time_decoded'] = time.strptime(t, time_format) -def generateMonthStats(): - callPlugins(PRE_HOOK_DIRECTORY, current_visits) - - valid_visitors = {k: v for (k,v) in current_visits.items() if not current_visits[k]['robot']} - - callPlugins(POST_HOOK_DIRECTORY, valid_visitors) - +def generateStats(visits): stats = {} stats['viewed_bandwidth'] = 0 stats['not_viewed_bandwidth'] = 0 stats['viewed_pages'] = 0 stats['viewed_hits'] = 0 - stats['pages'] = set() + #stats['pages'] = set() + stats['nb_visitors'] = 0 - for k in current_visits.keys(): - super_hit = current_visits[k] + for k in visits.keys(): + super_hit = visits[k] if super_hit['robot']: stats['not_viewed_bandwidth'] += super_hit['bandwith'] continue + print "[%s] =>\t%d/%d" % (k, super_hit['viewed_pages'], super_hit['viewed_hits']) + + stats['nb_visitors'] += 1 stats['viewed_bandwidth'] += super_hit['bandwith'] stats['viewed_pages'] += super_hit['viewed_pages'] stats['viewed_hits'] += super_hit['viewed_hits'] - for p in super_hit['pages']: - if not p['is_page']: continue - req = p['extract_request'] - stats['pages'].add(req['extract_uri']) + # for p in super_hit['pages']: + # if not p['is_page']: continue + # req = p['extract_request'] + # stats['pages'].add(req['extract_uri']) + return stats + +def generateMonthStats(): + visits = current_visits['visits'] + + stats = generateStats(visits) + cur_time = meta_visit['last_time'] - print "== Stats for %d/%d ==" % (cur_time.tm_year, cur_time.tm_mon) print stats + valid_visitors = {k: v for (k,v) in visits.items() if not visits[k]['robot']} + callPlugins(POST_HOOK_DIRECTORY, valid_visitors) + + current_visits['month_stats'] = stats + path = getDBFilename(cur_time) if os.path.exists(path): os.remove(path) @@ -205,6 +221,29 @@ def generateMonthStats(): serialize(current_visits, path) +def generateDayStats(): + visits = current_visits['visits'] + + callPlugins(PRE_HOOK_DIRECTORY, visits) + + stats = generateStats(visits) + + cur_time = meta_visit['last_time'] + print "== Stats for %d/%d/%d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday) + + if cur_time.tm_mday > 1: + last_day = cur_time.tm_mday - 1 + while last_day: + if last_day in current_visits['days_stats'].keys(): + break + last_day -= 1 + if last_day: + for k in stats.keys(): + stats[k] -= current_visits['days_stats'][last_day][k] + print stats + + current_visits['days_stats'][cur_time.tm_mday] = stats + def newHit(hit): global current_visits global analyse_started @@ -217,7 +256,7 @@ def newHit(hit): if cur_time == None: current_visits = deserialize(getDBFilename(t)) - if not current_visits: current_visits = {} + if not current_visits: current_visits = createEmptyVisits() analyse_started = True else: if not analyse_started: @@ -226,11 +265,13 @@ def newHit(hit): else: analyse_started = True current_visits = deserialize(getDBFilename(t)) - if not current_visits: current_visits = {} + if not current_visits: current_visits = createEmptyVisits() if cur_time.tm_mon != t.tm_mon: generateMonthStats() current_visits = deserialize(getDBFilename(t)) - if not current_visits: current_visits = {} + if not current_visits: current_visits = createEmptyVisits() + elif cur_time.tm_mday != t.tm_mday: + generateDayStats() meta_visit['last_time'] = t @@ -240,7 +281,7 @@ def newHit(hit): if hit[k] == '-': hit[k] = '' remote_addr = hit['remote_addr'] - if remote_addr in current_visits.keys(): + if remote_addr in current_visits['visits'].keys(): appendHit(hit) else: createUser(hit) @@ -251,7 +292,9 @@ print '==> Analysing log' meta_visit = deserialize(META_PATH) if not meta_visit: - meta_visit = {'last_time':None} + meta_visit = createEmptyMeta() + +current_visits = createEmptyVisits() f = open("access.log") for l in f: