diff --git a/TODO b/TODO new file mode 100644 index 0000000..f12a9cc --- /dev/null +++ b/TODO @@ -0,0 +1,14 @@ +reverse analysis +-f option to read a file instead of the one in conf +Other when pages truncated +translations +doc auto generation +doc enhancement +Limit hits/pages/downloads by rate +Automatic tests +Test separate directory for DB and display +quiet mode +Add 0 before month when < 10 +Add Licence +Free memory as soon as possible +Bug in bandwidth account (x10) \ No newline at end of file diff --git a/iwla.py b/iwla.py index c41f06b..93de2cd 100755 --- a/iwla.py +++ b/iwla.py @@ -174,7 +174,7 @@ class IWLA(object): def getCurDisplayPath(self, filename): cur_time = self.meta_infos['last_time'] - return os.path.join(str(cur_time.tm_year), str(cur_time.tm_mon), filename) + return os.path.join(str(cur_time.tm_year), '%02d' % (cur_time.tm_mon), filename) def getResourcesPath(self): return conf.resources_path @@ -194,7 +194,7 @@ class IWLA(object): return self.display def getDBFilename(self, time): - return os.path.join(conf.DB_ROOT, str(time.tm_year), str(time.tm_mon), conf.DB_FILENAME) + return os.path.join(conf.DB_ROOT, str(time.tm_year), '%02d' % (time.tm_mon), conf.DB_FILENAME) def _serialize(self, obj, filename): base = os.path.dirname(filename) @@ -255,12 +255,8 @@ class IWLA(object): hit['is_page'] = self.isPage(uri) - status = int(hit['status']) - if status not in conf.viewed_http_codes: - return - if super_hit['robot'] or\ - not status in conf.viewed_http_codes: + not self.hasBeenViewed(hit): page_key = 'not_viewed_pages' hit_key = 'not_viewed_hits' else: @@ -336,7 +332,7 @@ class IWLA(object): def _generateDisplayDaysStats(self): cur_time = self.meta_infos['last_time'] - title = 'Stats %d/%d' % (cur_time.tm_mon, cur_time.tm_year) + title = 'Stats %d/%02d' % (cur_time.tm_year, cur_time.tm_mon) filename = self.getCurDisplayPath('index.html') print '==> Generate display (%s)' % (filename) page = self.display.createPage(title, filename, conf.css_path) @@ -405,7 +401,7 @@ class IWLA(object): full_month = '%s %d' % (months_name[i], year) if i in month_stats.keys(): stats = month_stats[i] - link = 'Details' % (year, i) + link = 'Details' % (year, i) row = [full_month, stats['nb_visitors'], stats['viewed_pages'], stats['viewed_hits'], stats['viewed_bandwidth'], stats['not_viewed_bandwidth'], link] for j in graph_cols: @@ -449,34 +445,14 @@ class IWLA(object): self._generateDisplayWholeMonthStats() self.display.build(conf.DISPLAY_ROOT) - def _generateStats(self, visits): + def _createEmptyStats(self): stats = {} stats['viewed_bandwidth'] = 0 stats['not_viewed_bandwidth'] = 0 stats['viewed_pages'] = 0 stats['viewed_hits'] = 0 - #stats['requests'] = set() stats['nb_visitors'] = 0 - for (k, super_hit) in visits.items(): - if super_hit['robot']: - stats['not_viewed_bandwidth'] += super_hit['bandwidth'] - continue - - #print "[%s] =>\t%d/%d" % (k, super_hit['viewed_pages'], super_hit['viewed_hits']) - - if conf.count_hit_only_visitors or\ - super_hit['viewed_pages']: - stats['nb_visitors'] += 1 - stats['viewed_bandwidth'] += super_hit['bandwidth'] - stats['viewed_pages'] += super_hit['viewed_pages'] - stats['viewed_hits'] += super_hit['viewed_hits'] - - # for p in super_hit['requests']: - # if not p['is_page']: continue - # req = p['extract_request'] - # stats['requests'].add(req['extract_uri']) - return stats def _generateMonthStats(self): @@ -484,11 +460,15 @@ class IWLA(object): visits = self.current_analysis['visits'] - stats = self._generateStats(visits) + stats = self._createEmptyStats() + for (day, stat) in self.current_analysis['days_stats'].items(): + for k in stats.keys(): + stats[k] += stat[k] + duplicated_stats = {k:v for (k,v) in stats.items()} cur_time = self.meta_infos['last_time'] - print "== Stats for %d/%d ==" % (cur_time.tm_year, cur_time.tm_mon) + print "== Stats for %d/%02d ==" % (cur_time.tm_year, cur_time.tm_mon) print stats if not 'month_stats' in self.current_analysis.keys(): @@ -514,7 +494,6 @@ class IWLA(object): os.remove(path) print "==> Serialize to %s" % path - self._serialize(self.current_analysis, path) # Save month stats @@ -530,31 +509,35 @@ class IWLA(object): def _generateDayStats(self): visits = self.current_analysis['visits'] + cur_time = self.meta_infos['last_time'] self._callPlugins(conf.PRE_HOOK_DIRECTORY) - stats = self._generateStats(visits) + stats = self._createEmptyStats() - cur_time = self.meta_infos['last_time'] - print "== Stats for %d/%d/%d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday) - - if cur_time.tm_mday > 1: - last_day = cur_time.tm_mday - 1 - while last_day: - if last_day in self.current_analysis['days_stats'].keys(): + for (k, super_hit) in visits.items(): + if super_hit['last_access'].tm_mday != cur_time.tm_mday: + continue + viewed_page = False + for hit in super_hit['requests'][::-1]: + if hit['time_decoded'].tm_mday != cur_time.tm_mday: break - last_day -= 1 - if last_day: - for k in stats.keys(): - stats[k] -= self.current_analysis['days_stats'][last_day][k] - stats['nb_visitors'] = 0 - for (k,v) in visits.items(): - if v['robot']: continue - if conf.count_hit_only_visitors and\ - (not v['viewed_pages']): - continue - if v['last_access'].tm_mday == cur_time.tm_mday: - stats['nb_visitors'] += 1 + if super_hit['robot'] or\ + not self.hasBeenViewed(hit): + stats['not_viewed_bandwidth'] += int(hit['body_bytes_sent']) + continue + stats['viewed_bandwidth'] += int(hit['body_bytes_sent']) + if hit['is_page']: + stats['viewed_pages'] += 1 + viewed_pages = True + else: + stats['viewed_hits'] += 1 + if (conf.count_hit_only_visitors or\ + viewed_pages): + stats['nb_visitors'] += 1 + + print "== Stats for %d/%02d/%d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday) + print stats self.current_analysis['days_stats'][cur_time.tm_mday] = stats @@ -568,11 +551,10 @@ class IWLA(object): self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits() self.analyse_started = True else: + if time.mktime(t) < time.mktime(cur_time): + return False if not self.analyse_started: - if time.mktime(t) < time.mktime(cur_time): - return False - else: - self.analyse_started = True + self.analyse_started = True if cur_time.tm_mon != t.tm_mon: self._generateMonthStats() self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits() diff --git a/plugins/post_analysis/referers.py b/plugins/post_analysis/referers.py index 100ab6b..f4ef9e9 100644 --- a/plugins/post_analysis/referers.py +++ b/plugins/post_analysis/referers.py @@ -109,15 +109,14 @@ class IWLAPostAnalysisReferers(IPlugin): key_phrases = month_stats.get('key_phrases', {}) for (k, super_hit) in stats.items(): - for r in super_hit['requests']: - if not self.iwla.isValidForCurrentAnalysis(r): continue + for r in super_hit['requests'][::-1]: + if not self.iwla.isValidForCurrentAnalysis(r): break if not r['http_referer']: continue uri = r['extract_referer']['extract_uri'] - is_search_engine = False - if self.own_domain_re.match(uri): continue + is_search_engine = False for (name, engine) in self.search_engines.items(): for (hashid, hashid_re) in engine['hashid']: if not hashid_re.match(uri): continue diff --git a/plugins/post_analysis/top_downloads.py b/plugins/post_analysis/top_downloads.py index ab73ba0..9aa9c6a 100644 --- a/plugins/post_analysis/top_downloads.py +++ b/plugins/post_analysis/top_downloads.py @@ -46,14 +46,12 @@ class IWLAPostAnalysisTopDownloads(IPlugin): for (k, super_hit) in stats.items(): if super_hit['robot']: continue - for r in super_hit['requests']: - if not self.iwla.isValidForCurrentAnalysis(r) or\ - not self.iwla.hasBeenViewed(r): + for r in super_hit['requests'][::-1]: + if not self.iwla.isValidForCurrentAnalysis(r): + break + if not self.iwla.hasBeenViewed(r) or\ + r['is_page']: continue - if r['is_page']: continue - - - if not int(r['status']) in viewed_http_codes: continue uri = r['extract_request']['extract_uri'].lower() diff --git a/plugins/post_analysis/top_hits.py b/plugins/post_analysis/top_hits.py index a6c15e1..05f272a 100644 --- a/plugins/post_analysis/top_hits.py +++ b/plugins/post_analysis/top_hits.py @@ -40,15 +40,14 @@ class IWLAPostAnalysisTopHits(IPlugin): for (k, super_hit) in stats.items(): if super_hit['robot']: continue - for r in super_hit['requests']: - if r['is_page']: continue - - if not self.iwla.isValidForCurrentAnalysis(r) or\ - not self.iwla.hasBeenViewed(r): + for r in super_hit['requests'][::-1]: + if not self.iwla.isValidForCurrentAnalysis(r): + break + if not self.iwla.hasBeenViewed(r) or\ + r['is_page']: continue - uri = r['extract_request']['extract_uri'] - + uri = r['extract_request']['extract_uri'].lower() uri = "%s%s" % (r.get('server_name', ''), uri) if not uri in top_hits.keys(): diff --git a/plugins/post_analysis/top_pages.py b/plugins/post_analysis/top_pages.py index 9c85cd8..8d938dd 100644 --- a/plugins/post_analysis/top_pages.py +++ b/plugins/post_analysis/top_pages.py @@ -46,11 +46,11 @@ class IWLAPostAnalysisTopPages(IPlugin): for (k, super_hit) in stats.items(): if super_hit['robot']: continue - for r in super_hit['requests']: - if not r['is_page']: continue - - if not self.iwla.isValidForCurrentAnalysis(r) or\ - not self.iwla.hasBeenViewed(r): + for r in super_hit['requests'][::-1]: + if not self.iwla.isValidForCurrentAnalysis(r): + break + if not self.iwla.hasBeenViewed(r) or\ + not r['is_page']: continue uri = r['extract_request']['extract_uri'] diff --git a/plugins/pre_analysis/page_to_hit.py b/plugins/pre_analysis/page_to_hit.py index fd8ad87..c202efb 100644 --- a/plugins/pre_analysis/page_to_hit.py +++ b/plugins/pre_analysis/page_to_hit.py @@ -54,9 +54,11 @@ class IWLAPreAnalysisPageToHit(IPlugin): for (k, super_hit) in hits.items(): if super_hit['robot']: continue - for request in super_hit['requests']: - if not self.iwla.isValidForCurrentAnalysis(request) or\ - not self.iwla.hasBeenViewed(request): + for request in super_hit['requests'][::-1]: + if not self.iwla.isValidForCurrentAnalysis(request): + break + + if not self.iwla.hasBeenViewed(request): continue uri = request['extract_request']['extract_uri']