diff --git a/TODO b/TODO
new file mode 100644
index 0000000..f12a9cc
--- /dev/null
+++ b/TODO
@@ -0,0 +1,14 @@
+reverse analysis
+-f option to read a file instead of the one in conf
+"Other" entry when pages are truncated
+translations
+doc auto generation
+doc enhancement
+Limit hits/pages/downloads by rate
+Automatic tests
+Test separate directory for DB and display
+quiet mode
+Add 0 before month when < 10
+Add Licence
+Free memory as soon as possible
+Bug in bandwidth accounting (x10)
\ No newline at end of file
diff --git a/iwla.py b/iwla.py
index c41f06b..93de2cd 100755
--- a/iwla.py
+++ b/iwla.py
@@ -174,7 +174,7 @@ class IWLA(object):
def getCurDisplayPath(self, filename):
cur_time = self.meta_infos['last_time']
- return os.path.join(str(cur_time.tm_year), str(cur_time.tm_mon), filename)
+ return os.path.join(str(cur_time.tm_year), '%02d' % (cur_time.tm_mon), filename)
def getResourcesPath(self):
return conf.resources_path
@@ -194,7 +194,7 @@ class IWLA(object):
return self.display
def getDBFilename(self, time):
- return os.path.join(conf.DB_ROOT, str(time.tm_year), str(time.tm_mon), conf.DB_FILENAME)
+ return os.path.join(conf.DB_ROOT, str(time.tm_year), '%02d' % (time.tm_mon), conf.DB_FILENAME)
def _serialize(self, obj, filename):
base = os.path.dirname(filename)
@@ -255,12 +255,8 @@ class IWLA(object):
hit['is_page'] = self.isPage(uri)
- status = int(hit['status'])
- if status not in conf.viewed_http_codes:
- return
-
if super_hit['robot'] or\
- not status in conf.viewed_http_codes:
+ not self.hasBeenViewed(hit):
page_key = 'not_viewed_pages'
hit_key = 'not_viewed_hits'
else:
@@ -336,7 +332,7 @@ class IWLA(object):
def _generateDisplayDaysStats(self):
cur_time = self.meta_infos['last_time']
- title = 'Stats %d/%d' % (cur_time.tm_mon, cur_time.tm_year)
+ title = 'Stats %d/%02d' % (cur_time.tm_year, cur_time.tm_mon)
filename = self.getCurDisplayPath('index.html')
print '==> Generate display (%s)' % (filename)
page = self.display.createPage(title, filename, conf.css_path)
@@ -405,7 +401,7 @@ class IWLA(object):
full_month = '%s %d' % (months_name[i], year)
if i in month_stats.keys():
stats = month_stats[i]
- link = 'Details' % (year, i)
+ link = 'Details' % (year, i)
row = [full_month, stats['nb_visitors'], stats['viewed_pages'], stats['viewed_hits'],
stats['viewed_bandwidth'], stats['not_viewed_bandwidth'], link]
for j in graph_cols:
@@ -449,34 +445,14 @@ class IWLA(object):
self._generateDisplayWholeMonthStats()
self.display.build(conf.DISPLAY_ROOT)
- def _generateStats(self, visits):
+ def _createEmptyStats(self):
stats = {}
stats['viewed_bandwidth'] = 0
stats['not_viewed_bandwidth'] = 0
stats['viewed_pages'] = 0
stats['viewed_hits'] = 0
- #stats['requests'] = set()
stats['nb_visitors'] = 0
- for (k, super_hit) in visits.items():
- if super_hit['robot']:
- stats['not_viewed_bandwidth'] += super_hit['bandwidth']
- continue
-
- #print "[%s] =>\t%d/%d" % (k, super_hit['viewed_pages'], super_hit['viewed_hits'])
-
- if conf.count_hit_only_visitors or\
- super_hit['viewed_pages']:
- stats['nb_visitors'] += 1
- stats['viewed_bandwidth'] += super_hit['bandwidth']
- stats['viewed_pages'] += super_hit['viewed_pages']
- stats['viewed_hits'] += super_hit['viewed_hits']
-
- # for p in super_hit['requests']:
- # if not p['is_page']: continue
- # req = p['extract_request']
- # stats['requests'].add(req['extract_uri'])
-
return stats
def _generateMonthStats(self):
@@ -484,11 +460,15 @@ class IWLA(object):
visits = self.current_analysis['visits']
- stats = self._generateStats(visits)
+ stats = self._createEmptyStats()
+ for (day, stat) in self.current_analysis['days_stats'].items():
+ for k in stats.keys():
+ stats[k] += stat[k]
+
duplicated_stats = {k:v for (k,v) in stats.items()}
cur_time = self.meta_infos['last_time']
- print "== Stats for %d/%d ==" % (cur_time.tm_year, cur_time.tm_mon)
+ print "== Stats for %d/%02d ==" % (cur_time.tm_year, cur_time.tm_mon)
print stats
if not 'month_stats' in self.current_analysis.keys():
@@ -514,7 +494,6 @@ class IWLA(object):
os.remove(path)
print "==> Serialize to %s" % path
-
self._serialize(self.current_analysis, path)
# Save month stats
@@ -530,31 +509,35 @@ class IWLA(object):
def _generateDayStats(self):
visits = self.current_analysis['visits']
+ cur_time = self.meta_infos['last_time']
self._callPlugins(conf.PRE_HOOK_DIRECTORY)
- stats = self._generateStats(visits)
+ stats = self._createEmptyStats()
- cur_time = self.meta_infos['last_time']
- print "== Stats for %d/%d/%d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday)
-
- if cur_time.tm_mday > 1:
- last_day = cur_time.tm_mday - 1
- while last_day:
- if last_day in self.current_analysis['days_stats'].keys():
+ for (k, super_hit) in visits.items():
+ if super_hit['last_access'].tm_mday != cur_time.tm_mday:
+ continue
+ viewed_pages = False # per-visitor flag; must match the name set and tested below
+ for hit in super_hit['requests'][::-1]:
+ if hit['time_decoded'].tm_mday != cur_time.tm_mday:
break
- last_day -= 1
- if last_day:
- for k in stats.keys():
- stats[k] -= self.current_analysis['days_stats'][last_day][k]
- stats['nb_visitors'] = 0
- for (k,v) in visits.items():
- if v['robot']: continue
- if conf.count_hit_only_visitors and\
- (not v['viewed_pages']):
- continue
- if v['last_access'].tm_mday == cur_time.tm_mday:
- stats['nb_visitors'] += 1
+ if super_hit['robot'] or\
+ not self.hasBeenViewed(hit):
+ stats['not_viewed_bandwidth'] += int(hit['body_bytes_sent'])
+ continue
+ stats['viewed_bandwidth'] += int(hit['body_bytes_sent'])
+ if hit['is_page']:
+ stats['viewed_pages'] += 1
+ viewed_pages = True
+ else:
+ stats['viewed_hits'] += 1
+ if (conf.count_hit_only_visitors or\
+ viewed_pages):
+ stats['nb_visitors'] += 1
+
+ print "== Stats for %d/%02d/%d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday)
+
print stats
self.current_analysis['days_stats'][cur_time.tm_mday] = stats
@@ -568,11 +551,10 @@ class IWLA(object):
self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
self.analyse_started = True
else:
+ if time.mktime(t) < time.mktime(cur_time):
+ return False
if not self.analyse_started:
- if time.mktime(t) < time.mktime(cur_time):
- return False
- else:
- self.analyse_started = True
+ self.analyse_started = True
if cur_time.tm_mon != t.tm_mon:
self._generateMonthStats()
self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
diff --git a/plugins/post_analysis/referers.py b/plugins/post_analysis/referers.py
index 100ab6b..f4ef9e9 100644
--- a/plugins/post_analysis/referers.py
+++ b/plugins/post_analysis/referers.py
@@ -109,15 +109,14 @@ class IWLAPostAnalysisReferers(IPlugin):
key_phrases = month_stats.get('key_phrases', {})
for (k, super_hit) in stats.items():
- for r in super_hit['requests']:
- if not self.iwla.isValidForCurrentAnalysis(r): continue
+ for r in super_hit['requests'][::-1]:
+ if not self.iwla.isValidForCurrentAnalysis(r): break
if not r['http_referer']: continue
uri = r['extract_referer']['extract_uri']
- is_search_engine = False
-
if self.own_domain_re.match(uri): continue
+ is_search_engine = False
for (name, engine) in self.search_engines.items():
for (hashid, hashid_re) in engine['hashid']:
if not hashid_re.match(uri): continue
diff --git a/plugins/post_analysis/top_downloads.py b/plugins/post_analysis/top_downloads.py
index ab73ba0..9aa9c6a 100644
--- a/plugins/post_analysis/top_downloads.py
+++ b/plugins/post_analysis/top_downloads.py
@@ -46,14 +46,12 @@ class IWLAPostAnalysisTopDownloads(IPlugin):
for (k, super_hit) in stats.items():
if super_hit['robot']: continue
- for r in super_hit['requests']:
- if not self.iwla.isValidForCurrentAnalysis(r) or\
- not self.iwla.hasBeenViewed(r):
+ for r in super_hit['requests'][::-1]:
+ if not self.iwla.isValidForCurrentAnalysis(r):
+ break
+ if not self.iwla.hasBeenViewed(r) or\
+ r['is_page']:
continue
- if r['is_page']: continue
-
-
- if not int(r['status']) in viewed_http_codes: continue
uri = r['extract_request']['extract_uri'].lower()
diff --git a/plugins/post_analysis/top_hits.py b/plugins/post_analysis/top_hits.py
index a6c15e1..05f272a 100644
--- a/plugins/post_analysis/top_hits.py
+++ b/plugins/post_analysis/top_hits.py
@@ -40,15 +40,14 @@ class IWLAPostAnalysisTopHits(IPlugin):
for (k, super_hit) in stats.items():
if super_hit['robot']: continue
- for r in super_hit['requests']:
- if r['is_page']: continue
-
- if not self.iwla.isValidForCurrentAnalysis(r) or\
- not self.iwla.hasBeenViewed(r):
+ for r in super_hit['requests'][::-1]:
+ if not self.iwla.isValidForCurrentAnalysis(r):
+ break
+ if not self.iwla.hasBeenViewed(r) or\
+ r['is_page']:
continue
- uri = r['extract_request']['extract_uri']
-
+ uri = r['extract_request']['extract_uri'].lower()
uri = "%s%s" % (r.get('server_name', ''), uri)
if not uri in top_hits.keys():
diff --git a/plugins/post_analysis/top_pages.py b/plugins/post_analysis/top_pages.py
index 9c85cd8..8d938dd 100644
--- a/plugins/post_analysis/top_pages.py
+++ b/plugins/post_analysis/top_pages.py
@@ -46,11 +46,11 @@ class IWLAPostAnalysisTopPages(IPlugin):
for (k, super_hit) in stats.items():
if super_hit['robot']: continue
- for r in super_hit['requests']:
- if not r['is_page']: continue
-
- if not self.iwla.isValidForCurrentAnalysis(r) or\
- not self.iwla.hasBeenViewed(r):
+ for r in super_hit['requests'][::-1]:
+ if not self.iwla.isValidForCurrentAnalysis(r):
+ break
+ if not self.iwla.hasBeenViewed(r) or\
+ not r['is_page']:
continue
uri = r['extract_request']['extract_uri']
diff --git a/plugins/pre_analysis/page_to_hit.py b/plugins/pre_analysis/page_to_hit.py
index fd8ad87..c202efb 100644
--- a/plugins/pre_analysis/page_to_hit.py
+++ b/plugins/pre_analysis/page_to_hit.py
@@ -54,9 +54,11 @@ class IWLAPreAnalysisPageToHit(IPlugin):
for (k, super_hit) in hits.items():
if super_hit['robot']: continue
- for request in super_hit['requests']:
- if not self.iwla.isValidForCurrentAnalysis(request) or\
- not self.iwla.hasBeenViewed(request):
+ for request in super_hit['requests'][::-1]:
+ if not self.iwla.isValidForCurrentAnalysis(request):
+ break
+
+ if not self.iwla.hasBeenViewed(request):
continue
uri = request['extract_request']['extract_uri']