diff --git a/conf.py b/conf.py index 9a0f235..5a850e4 100644 --- a/conf.py +++ b/conf.py @@ -11,4 +11,10 @@ analyzed_filename = 'access.log' DB_ROOT = './output/' DISPLAY_ROOT = './output/' -pre_analysis_hooks = ['H002_soutade.py', 'H001_robot.py'] +pre_analysis_hooks = ['H002_soutade', 'H001_robot'] +post_analysis_hooks = ['top_visitors'] +display_hooks = ['top_visitors'] + +# pre_analysis_hooks = ['H002_soutade.py', 'H001_robot.py'] +# post_analysis_hooks = ['top_visitors.py'] +# display_hooks = ['top_visitors.py'] diff --git a/display.py b/display.py index aa11976..4de1bd6 100644 --- a/display.py +++ b/display.py @@ -1,3 +1,4 @@ + def createPage(display, filename, title): page = {} page['title'] = title; @@ -14,27 +15,37 @@ def createTable(title, cols): table['cols'] = cols table['rows'] = [] + return table + def appendRowToTable(table, row): table['rows'].append(row) -def buildPages(display): +def buildTable(block, f): + print 'Write table %s' % block['title'] + f.write('') + f.write('') + for title in block['cols']: + f.write('' % (title)) + f.write('') + for row in block['rows']: + f.write('') + for v in row: + f.write('' % (v)) + f.write('') + f.write('
%s
%s
') + +def buildPages(display_root, display): for filename in display.keys(): page = display[filename] - with open(DISPLAY_ROOT + filename, 'w') as f: + print "OPEN %s" % (display_root + filename) + with open(display_root + filename, 'w') as f: f.write('%s' % (page['title'])) for block in page['blocks']: + print "Bluid block" + print block + print "End block" if block['type'] == 'html': f.write(block['value']) elif block['type'] == 'table': - f.write('') - f.write('') - for title in block['cols']: - f.write('' % (title)) - f.write('') - for row in block['rows']: - f.write('') - for v in row: - f.write('' % (v)) - f.write('') - f.write('
%s
%s
') + buildTable(block, f) f.write('') diff --git a/iwla.py b/iwla.py index f8441f2..d14695b 100755 --- a/iwla.py +++ b/iwla.py @@ -17,7 +17,7 @@ DISPLAY_ROOT = './output/' log_format = '$server_name:$server_port $remote_addr - $remote_user [$time_local] ' +\ '"$request" $status $body_bytes_sent ' +\ - '"$http_referer" "$http_user_agent"'; + '"$http_referer" "$http_user_agent"' time_format = '%d/%b/%Y:%H:%M:%S +0100' @@ -35,7 +35,7 @@ current_visits = {} cache_plugins = {} display = {} -log_format_extracted = re.sub(r'([^\$\w])', r'\\\g<1>', log_format); +log_format_extracted = re.sub(r'([^\$\w])', r'\\\g<1>', log_format) log_format_extracted = re.sub(r'\$(\w+)', '(?P<\g<1>>.+)', log_format_extracted) http_request_extracted = re.compile(r'(?P\S+) (?P\S+) (?P\S+)') @@ -57,11 +57,18 @@ ANALYSIS_CLASS = 'HTTP' API_VERSION = 1 def preloadPlugins(): + ret = True for root in plugins.keys(): for plugin_name in plugins[root]: p = root + '/' + plugin_name try: - mod = cache_plugins[p] = imp.load_source('hook', p) + fp, pathname, description = imp.find_module(plugin_name, [root]) + cache_plugins[p] = imp.load_module(plugin_name, fp, pathname, description) + #cache_plugins[p] = imp.load_module(p,None,p,("py","r",imp.PKG_DIRECTORY)) + #cache_plugins[p] = imp.load_source(p, p) + mod = cache_plugins[p] + #print dir(mod) + #print "Register %s -> %s" % (p, mod) infos = mod.get_plugins_infos() if infos['class'] != ANALYSIS_CLASS or \ API_VERSION < infos['min_version'] or\ @@ -71,8 +78,8 @@ def preloadPlugins(): del cache_plugins[p] except Exception as e: print 'Error loading \'%s\' => %s' % (p, e) - return False - return True + ret = False + return ret def createEmptyVisits(): @@ -113,12 +120,12 @@ def deserialize(filename): return pickle.load(f) return None -def callPlugins(root, *kwargs): +def callPlugins(root, *args): print '==> Call plugins (%s)' % root for p in plugins[root]: print '\t%s' % (p) mod = cache_plugins[root + '/' + p] - mod.hook(*kwargs) + mod.hook(*args) def isPage(request): for e in pages_extensions: @@ -135,8 +142,8 @@ def appendHit(hit): return super_hit = current_visits['visits'][remote_addr] - super_hit['pages'].append(hit) - super_hit['bandwith'] += int(hit['body_bytes_sent']) + super_hit['requests'].append(hit) + super_hit['bandwidth'] += int(hit['body_bytes_sent']) super_hit['last_access'] = meta_visit['last_time'] request = hit['extract_request'] @@ -167,15 +174,16 @@ def appendHit(hit): def createUser(hit): super_hit = current_visits['visits'][hit['remote_addr']] = {} - super_hit['viewed_pages'] = 0; - super_hit['viewed_hits'] = 0; - super_hit['not_viewed_pages'] = 0; - super_hit['not_viewed_hits'] = 0; - super_hit['bandwith'] = 0; + super_hit['remote_addr'] = hit['remote_addr'] + super_hit['viewed_pages'] = 0 + super_hit['viewed_hits'] = 0 + super_hit['not_viewed_pages'] = 0 + super_hit['not_viewed_hits'] = 0 + super_hit['bandwidth'] = 0 super_hit['last_access'] = meta_visit['last_time'] - super_hit['pages'] = []; + super_hit['requests'] = [] super_hit['robot'] = False - super_hit['hit_only'] = 0; + super_hit['hit_only'] = 0 appendHit(hit) def decodeHTTPRequest(hit): @@ -185,7 +193,7 @@ def decodeHTTPRequest(hit): if groups: hit['extract_request'] = groups.groupdict() - uri_groups = uri_re.match(hit['extract_request']['http_uri']); + uri_groups = uri_re.match(hit['extract_request']['http_uri']) if uri_groups: d = uri_groups.groupdict() hit['extract_request']['extract_uri'] = d['extract_uri'] @@ -195,7 +203,7 @@ def decodeHTTPRequest(hit): print "Bad request extraction " + hit['request'] return False - referer_groups = uri_re.match(hit['http_referer']); + referer_groups = uri_re.match(hit['http_referer']) if referer_groups: referer = hit['extract_referer'] = referer_groups.groupdict() return True @@ -205,13 +213,19 @@ def decodeTime(hit): hit['time_decoded'] = time.strptime(t, time_format) +def getDisplayIndex(): + cur_time = meta_visit['last_time'] + filename = '%d/index_%d.html' % (cur_time.tm_year, cur_time.tm_mon) + + return display.get(filename, None) + def generateDisplayDaysStat(): cur_time = meta_visit['last_time'] title = 'Stats %d/%d' % (cur_time.tm_mon, cur_time.tm_year) filename = '%d/index_%d.html' % (cur_time.tm_year, cur_time.tm_mon) page = createPage(display, filename, title) - days = createTable('By day', ['Day', 'Visits', 'Pages', 'Hits', 'Bandwith', 'Robot Bandwith']) + days = createTable('By day', ['Day', 'Visits', 'Pages', 'Hits', 'Bandwidth', 'Robot Bandwidth']) keys = current_visits['days_stats'].keys() keys.sort() @@ -243,7 +257,7 @@ def generateDisplayDaysStat(): def generateDisplay(): generateDisplayDaysStat() callPlugins(DISPLAY_HOOK_DIRECTORY, current_visits, display) - buildPages() + buildPages(DISPLAY_ROOT, display) def generateStats(visits): stats = {} @@ -251,27 +265,27 @@ def generateStats(visits): stats['not_viewed_bandwidth'] = 0 stats['viewed_pages'] = 0 stats['viewed_hits'] = 0 - #stats['pages'] = set() + #stats['requests'] = set() stats['nb_visitors'] = 0 for k in visits.keys(): super_hit = visits[k] if super_hit['robot']: - stats['not_viewed_bandwidth'] += super_hit['bandwith'] + stats['not_viewed_bandwidth'] += super_hit['bandwidth'] continue - print "[%s] =>\t%d/%d" % (k, super_hit['viewed_pages'], super_hit['viewed_hits']) + #print "[%s] =>\t%d/%d" % (k, super_hit['viewed_pages'], super_hit['viewed_hits']) if not super_hit['hit_only']: stats['nb_visitors'] += 1 - stats['viewed_bandwidth'] += super_hit['bandwith'] + stats['viewed_bandwidth'] += super_hit['bandwidth'] stats['viewed_pages'] += super_hit['viewed_pages'] stats['viewed_hits'] += super_hit['viewed_hits'] - # for p in super_hit['pages']: + # for p in super_hit['requests']: # if not p['is_page']: continue # req = p['extract_request'] - # stats['pages'].add(req['extract_uri']) + # stats['requests'].add(req['extract_uri']) return stats @@ -287,7 +301,7 @@ def generateMonthStats(): print stats valid_visitors = {k: v for (k,v) in visits.items() if not visits[k]['robot']} - callPlugins(POST_HOOK_DIRECTORY, valid_visitors) + callPlugins(POST_HOOK_DIRECTORY, valid_visitors, stats) current_visits['month_stats'] = stats @@ -348,7 +362,6 @@ def newHit(hit): return else: analyse_started = True - current_visits = deserialize(getDBFilename(t)) or createEmptyVisits() if cur_time.tm_mon != t.tm_mon: generateMonthStats() current_visits = deserialize(getDBFilename(t)) or createEmptyVisits() @@ -371,12 +384,14 @@ preloadPlugins() print '==> Analysing log' meta_visit = deserialize(META_PATH) or createEmptyMeta() - -current_visits = createEmptyVisits() +if meta_visit['last_time']: + current_visits = deserialize(getDBFilename(meta_visit['last_time'])) or createEmptyVisits() +else: + current_visits = createEmptyVisits() f = open(analyzed_filename) for l in f: - # print "line " + l; + # print "line " + l groups = log_re.match(l) @@ -385,7 +400,7 @@ for l in f: break else: print "No match " + l -f.close(); +f.close() if analyse_started: generateDayStats() @@ -393,3 +408,4 @@ if analyse_started: serialize(meta_visit, META_PATH) else: print '==> Analyse not started : nothing to do' + generateMonthStats() diff --git a/plugins/pre_analysis/H001_robot.py b/plugins/pre_analysis/H001_robot.py index 91cd5fc..a096dc8 100644 --- a/plugins/pre_analysis/H001_robot.py +++ b/plugins/pre_analysis/H001_robot.py @@ -30,10 +30,12 @@ def hook(hits): isRobot = False referers = 0 - for r in awstats_robots: - if r.match(super_hit['pages'][0]['http_user_agent']): - super_hit['robot'] = 1 - continue + first_page = super_hit['requests'][0] + if first_page['time_decoded'].tm_mday == super_hit['last_access'].tm_mday: + for r in awstats_robots: + if r.match(first_page['http_user_agent']): + super_hit['robot'] = 1 + continue # 1) no pages view --> robot if not super_hit['viewed_pages']: @@ -45,7 +47,7 @@ def hook(hits): super_hit['robot'] = 1 continue - for hit in super_hit['pages']: + for hit in super_hit['requests']: # 3) /robots.txt read if hit['extract_request']['http_uri'] == '/robots.txt': isRobot = True diff --git a/plugins/pre_analysis/H002_soutade.py b/plugins/pre_analysis/H002_soutade.py index f546d76..5b70f64 100644 --- a/plugins/pre_analysis/H002_soutade.py +++ b/plugins/pre_analysis/H002_soutade.py @@ -7,9 +7,11 @@ PLUGIN_CLASS = 'HTTP' API_VERSION = 1 def get_plugins_infos(): - infos = {'class' : PLUGIN_CLASS, - 'min_version' : API_VERSION, - 'max_version' : -1} + infos = { + 'class' : PLUGIN_CLASS, + 'min_version' : API_VERSION, + 'max_version' : -1 + } return infos def load(): @@ -23,9 +25,10 @@ def hook(hits): if super_hit['robot']: continue - for p in super_hit['pages']: + for p in super_hit['requests']: if not p['is_page']: continue if int(p['status']) != 200: continue + if p['time_decoded'].tm_mday != super_hit['last_access'].tm_mday: continue if logo_re.match(p['extract_request']['extract_uri']): p['is_page'] = False super_hit['viewed_pages'] -= 1