Merge branch 'master' of soutade.fr:iwla
Conflicts: conf.py
This commit is contained in:
commit
bd31b04e9b
14
TODO
Normal file
14
TODO
Normal file
|
@ -0,0 +1,14 @@
|
||||||
|
reverse analysis
|
||||||
|
-f option to read a file instead of the one in conf
|
||||||
|
Add an "Others" entry when the pages list is truncated
|
||||||
|
translations
|
||||||
|
doc auto generation
|
||||||
|
doc enhancement
|
||||||
|
Limit hits/pages/downloads by rate
|
||||||
|
Automatic tests
|
||||||
|
Test separate directory for DB and display
|
||||||
|
quiet mode
|
||||||
|
Add 0 before month when < 10
|
||||||
|
Add Licence
|
||||||
|
Free memory as soon as possible
|
||||||
|
Bug in bandwidth accounting (x10)
|
96
iwla.py
96
iwla.py
|
@ -174,7 +174,7 @@ class IWLA(object):
|
||||||
|
|
||||||
def getCurDisplayPath(self, filename):
|
def getCurDisplayPath(self, filename):
|
||||||
cur_time = self.meta_infos['last_time']
|
cur_time = self.meta_infos['last_time']
|
||||||
return os.path.join(str(cur_time.tm_year), str(cur_time.tm_mon), filename)
|
return os.path.join(str(cur_time.tm_year), '%02d' % (cur_time.tm_mon), filename)
|
||||||
|
|
||||||
def getResourcesPath(self):
|
def getResourcesPath(self):
|
||||||
return conf.resources_path
|
return conf.resources_path
|
||||||
|
@ -194,7 +194,7 @@ class IWLA(object):
|
||||||
return self.display
|
return self.display
|
||||||
|
|
||||||
def getDBFilename(self, time):
|
def getDBFilename(self, time):
|
||||||
return os.path.join(conf.DB_ROOT, str(time.tm_year), str(time.tm_mon), conf.DB_FILENAME)
|
return os.path.join(conf.DB_ROOT, str(time.tm_year), '%02d' % (time.tm_mon), conf.DB_FILENAME)
|
||||||
|
|
||||||
def _serialize(self, obj, filename):
|
def _serialize(self, obj, filename):
|
||||||
base = os.path.dirname(filename)
|
base = os.path.dirname(filename)
|
||||||
|
@ -255,12 +255,8 @@ class IWLA(object):
|
||||||
|
|
||||||
hit['is_page'] = self.isPage(uri)
|
hit['is_page'] = self.isPage(uri)
|
||||||
|
|
||||||
status = int(hit['status'])
|
|
||||||
if status not in conf.viewed_http_codes:
|
|
||||||
return
|
|
||||||
|
|
||||||
if super_hit['robot'] or\
|
if super_hit['robot'] or\
|
||||||
not status in conf.viewed_http_codes:
|
not self.hasBeenViewed(hit):
|
||||||
page_key = 'not_viewed_pages'
|
page_key = 'not_viewed_pages'
|
||||||
hit_key = 'not_viewed_hits'
|
hit_key = 'not_viewed_hits'
|
||||||
else:
|
else:
|
||||||
|
@ -336,7 +332,7 @@ class IWLA(object):
|
||||||
|
|
||||||
def _generateDisplayDaysStats(self):
|
def _generateDisplayDaysStats(self):
|
||||||
cur_time = self.meta_infos['last_time']
|
cur_time = self.meta_infos['last_time']
|
||||||
title = 'Stats %d/%d' % (cur_time.tm_mon, cur_time.tm_year)
|
title = 'Stats %d/%02d' % (cur_time.tm_year, cur_time.tm_mon)
|
||||||
filename = self.getCurDisplayPath('index.html')
|
filename = self.getCurDisplayPath('index.html')
|
||||||
print '==> Generate display (%s)' % (filename)
|
print '==> Generate display (%s)' % (filename)
|
||||||
page = self.display.createPage(title, filename, conf.css_path)
|
page = self.display.createPage(title, filename, conf.css_path)
|
||||||
|
@ -405,7 +401,7 @@ class IWLA(object):
|
||||||
full_month = '%s %d' % (months_name[i], year)
|
full_month = '%s %d' % (months_name[i], year)
|
||||||
if i in month_stats.keys():
|
if i in month_stats.keys():
|
||||||
stats = month_stats[i]
|
stats = month_stats[i]
|
||||||
link = '<a href="%d/%d/index.html">Details</a>' % (year, i)
|
link = '<a href="%d/%02d/index.html">Details</a>' % (year, i)
|
||||||
row = [full_month, stats['nb_visitors'], stats['viewed_pages'], stats['viewed_hits'],
|
row = [full_month, stats['nb_visitors'], stats['viewed_pages'], stats['viewed_hits'],
|
||||||
stats['viewed_bandwidth'], stats['not_viewed_bandwidth'], link]
|
stats['viewed_bandwidth'], stats['not_viewed_bandwidth'], link]
|
||||||
for j in graph_cols:
|
for j in graph_cols:
|
||||||
|
@ -449,34 +445,14 @@ class IWLA(object):
|
||||||
self._generateDisplayWholeMonthStats()
|
self._generateDisplayWholeMonthStats()
|
||||||
self.display.build(conf.DISPLAY_ROOT)
|
self.display.build(conf.DISPLAY_ROOT)
|
||||||
|
|
||||||
def _generateStats(self, visits):
|
def _createEmptyStats(self):
|
||||||
stats = {}
|
stats = {}
|
||||||
stats['viewed_bandwidth'] = 0
|
stats['viewed_bandwidth'] = 0
|
||||||
stats['not_viewed_bandwidth'] = 0
|
stats['not_viewed_bandwidth'] = 0
|
||||||
stats['viewed_pages'] = 0
|
stats['viewed_pages'] = 0
|
||||||
stats['viewed_hits'] = 0
|
stats['viewed_hits'] = 0
|
||||||
#stats['requests'] = set()
|
|
||||||
stats['nb_visitors'] = 0
|
stats['nb_visitors'] = 0
|
||||||
|
|
||||||
for (k, super_hit) in visits.items():
|
|
||||||
if super_hit['robot']:
|
|
||||||
stats['not_viewed_bandwidth'] += super_hit['bandwidth']
|
|
||||||
continue
|
|
||||||
|
|
||||||
#print "[%s] =>\t%d/%d" % (k, super_hit['viewed_pages'], super_hit['viewed_hits'])
|
|
||||||
|
|
||||||
if conf.count_hit_only_visitors or\
|
|
||||||
super_hit['viewed_pages']:
|
|
||||||
stats['nb_visitors'] += 1
|
|
||||||
stats['viewed_bandwidth'] += super_hit['bandwidth']
|
|
||||||
stats['viewed_pages'] += super_hit['viewed_pages']
|
|
||||||
stats['viewed_hits'] += super_hit['viewed_hits']
|
|
||||||
|
|
||||||
# for p in super_hit['requests']:
|
|
||||||
# if not p['is_page']: continue
|
|
||||||
# req = p['extract_request']
|
|
||||||
# stats['requests'].add(req['extract_uri'])
|
|
||||||
|
|
||||||
return stats
|
return stats
|
||||||
|
|
||||||
def _generateMonthStats(self):
|
def _generateMonthStats(self):
|
||||||
|
@ -484,11 +460,15 @@ class IWLA(object):
|
||||||
|
|
||||||
visits = self.current_analysis['visits']
|
visits = self.current_analysis['visits']
|
||||||
|
|
||||||
stats = self._generateStats(visits)
|
stats = self._createEmptyStats()
|
||||||
|
for (day, stat) in self.current_analysis['days_stats'].items():
|
||||||
|
for k in stats.keys():
|
||||||
|
stats[k] += stat[k]
|
||||||
|
|
||||||
duplicated_stats = {k:v for (k,v) in stats.items()}
|
duplicated_stats = {k:v for (k,v) in stats.items()}
|
||||||
|
|
||||||
cur_time = self.meta_infos['last_time']
|
cur_time = self.meta_infos['last_time']
|
||||||
print "== Stats for %d/%d ==" % (cur_time.tm_year, cur_time.tm_mon)
|
print "== Stats for %d/%02d ==" % (cur_time.tm_year, cur_time.tm_mon)
|
||||||
print stats
|
print stats
|
||||||
|
|
||||||
if not 'month_stats' in self.current_analysis.keys():
|
if not 'month_stats' in self.current_analysis.keys():
|
||||||
|
@ -514,7 +494,6 @@ class IWLA(object):
|
||||||
os.remove(path)
|
os.remove(path)
|
||||||
|
|
||||||
print "==> Serialize to %s" % path
|
print "==> Serialize to %s" % path
|
||||||
|
|
||||||
self._serialize(self.current_analysis, path)
|
self._serialize(self.current_analysis, path)
|
||||||
|
|
||||||
# Save month stats
|
# Save month stats
|
||||||
|
@ -530,31 +509,35 @@ class IWLA(object):
|
||||||
|
|
||||||
def _generateDayStats(self):
|
def _generateDayStats(self):
|
||||||
visits = self.current_analysis['visits']
|
visits = self.current_analysis['visits']
|
||||||
|
cur_time = self.meta_infos['last_time']
|
||||||
|
|
||||||
self._callPlugins(conf.PRE_HOOK_DIRECTORY)
|
self._callPlugins(conf.PRE_HOOK_DIRECTORY)
|
||||||
|
|
||||||
stats = self._generateStats(visits)
|
stats = self._createEmptyStats()
|
||||||
|
|
||||||
cur_time = self.meta_infos['last_time']
|
for (k, super_hit) in visits.items():
|
||||||
print "== Stats for %d/%d/%d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday)
|
if super_hit['last_access'].tm_mday != cur_time.tm_mday:
|
||||||
|
continue
|
||||||
if cur_time.tm_mday > 1:
|
# Per-visitor flag, reset for each visitor; it must carry the same name that the page branch sets and the visitor-count check reads.
viewed_pages = False
|
||||||
last_day = cur_time.tm_mday - 1
|
for hit in super_hit['requests'][::-1]:
|
||||||
while last_day:
|
if hit['time_decoded'].tm_mday != cur_time.tm_mday:
|
||||||
if last_day in self.current_analysis['days_stats'].keys():
|
|
||||||
break
|
break
|
||||||
last_day -= 1
|
if super_hit['robot'] or\
|
||||||
if last_day:
|
not self.hasBeenViewed(hit):
|
||||||
for k in stats.keys():
|
stats['not_viewed_bandwidth'] += int(hit['body_bytes_sent'])
|
||||||
stats[k] -= self.current_analysis['days_stats'][last_day][k]
|
continue
|
||||||
stats['nb_visitors'] = 0
|
stats['viewed_bandwidth'] += int(hit['body_bytes_sent'])
|
||||||
for (k,v) in visits.items():
|
if hit['is_page']:
|
||||||
if v['robot']: continue
|
stats['viewed_pages'] += 1
|
||||||
if conf.count_hit_only_visitors and\
|
viewed_pages = True
|
||||||
(not v['viewed_pages']):
|
else:
|
||||||
continue
|
stats['viewed_hits'] += 1
|
||||||
if v['last_access'].tm_mday == cur_time.tm_mday:
|
if (conf.count_hit_only_visitors or\
|
||||||
stats['nb_visitors'] += 1
|
viewed_pages):
|
||||||
|
stats['nb_visitors'] += 1
|
||||||
|
|
||||||
|
print "== Stats for %d/%02d/%d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday)
|
||||||
|
|
||||||
print stats
|
print stats
|
||||||
|
|
||||||
self.current_analysis['days_stats'][cur_time.tm_mday] = stats
|
self.current_analysis['days_stats'][cur_time.tm_mday] = stats
|
||||||
|
@ -568,11 +551,10 @@ class IWLA(object):
|
||||||
self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
|
self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
|
||||||
self.analyse_started = True
|
self.analyse_started = True
|
||||||
else:
|
else:
|
||||||
|
if time.mktime(t) < time.mktime(cur_time):
|
||||||
|
return False
|
||||||
if not self.analyse_started:
|
if not self.analyse_started:
|
||||||
if time.mktime(t) < time.mktime(cur_time):
|
self.analyse_started = True
|
||||||
return False
|
|
||||||
else:
|
|
||||||
self.analyse_started = True
|
|
||||||
if cur_time.tm_mon != t.tm_mon:
|
if cur_time.tm_mon != t.tm_mon:
|
||||||
self._generateMonthStats()
|
self._generateMonthStats()
|
||||||
self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
|
self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
|
||||||
|
|
|
@ -109,15 +109,14 @@ class IWLAPostAnalysisReferers(IPlugin):
|
||||||
key_phrases = month_stats.get('key_phrases', {})
|
key_phrases = month_stats.get('key_phrases', {})
|
||||||
|
|
||||||
for (k, super_hit) in stats.items():
|
for (k, super_hit) in stats.items():
|
||||||
for r in super_hit['requests']:
|
for r in super_hit['requests'][::-1]:
|
||||||
if not self.iwla.isValidForCurrentAnalysis(r): continue
|
if not self.iwla.isValidForCurrentAnalysis(r): break
|
||||||
if not r['http_referer']: continue
|
if not r['http_referer']: continue
|
||||||
|
|
||||||
uri = r['extract_referer']['extract_uri']
|
uri = r['extract_referer']['extract_uri']
|
||||||
is_search_engine = False
|
|
||||||
|
|
||||||
if self.own_domain_re.match(uri): continue
|
if self.own_domain_re.match(uri): continue
|
||||||
|
|
||||||
|
is_search_engine = False
|
||||||
for (name, engine) in self.search_engines.items():
|
for (name, engine) in self.search_engines.items():
|
||||||
for (hashid, hashid_re) in engine['hashid']:
|
for (hashid, hashid_re) in engine['hashid']:
|
||||||
if not hashid_re.match(uri): continue
|
if not hashid_re.match(uri): continue
|
||||||
|
|
|
@ -46,14 +46,12 @@ class IWLAPostAnalysisTopDownloads(IPlugin):
|
||||||
|
|
||||||
for (k, super_hit) in stats.items():
|
for (k, super_hit) in stats.items():
|
||||||
if super_hit['robot']: continue
|
if super_hit['robot']: continue
|
||||||
for r in super_hit['requests']:
|
for r in super_hit['requests'][::-1]:
|
||||||
if not self.iwla.isValidForCurrentAnalysis(r) or\
|
if not self.iwla.isValidForCurrentAnalysis(r):
|
||||||
not self.iwla.hasBeenViewed(r):
|
break
|
||||||
|
if not self.iwla.hasBeenViewed(r) or\
|
||||||
|
r['is_page']:
|
||||||
continue
|
continue
|
||||||
if r['is_page']: continue
|
|
||||||
|
|
||||||
|
|
||||||
if not int(r['status']) in viewed_http_codes: continue
|
|
||||||
|
|
||||||
uri = r['extract_request']['extract_uri'].lower()
|
uri = r['extract_request']['extract_uri'].lower()
|
||||||
|
|
||||||
|
|
|
@ -40,15 +40,14 @@ class IWLAPostAnalysisTopHits(IPlugin):
|
||||||
|
|
||||||
for (k, super_hit) in stats.items():
|
for (k, super_hit) in stats.items():
|
||||||
if super_hit['robot']: continue
|
if super_hit['robot']: continue
|
||||||
for r in super_hit['requests']:
|
for r in super_hit['requests'][::-1]:
|
||||||
if r['is_page']: continue
|
if not self.iwla.isValidForCurrentAnalysis(r):
|
||||||
|
break
|
||||||
if not self.iwla.isValidForCurrentAnalysis(r) or\
|
if not self.iwla.hasBeenViewed(r) or\
|
||||||
not self.iwla.hasBeenViewed(r):
|
r['is_page']:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
uri = r['extract_request']['extract_uri']
|
uri = r['extract_request']['extract_uri'].lower()
|
||||||
|
|
||||||
uri = "%s%s" % (r.get('server_name', ''), uri)
|
uri = "%s%s" % (r.get('server_name', ''), uri)
|
||||||
|
|
||||||
if not uri in top_hits.keys():
|
if not uri in top_hits.keys():
|
||||||
|
|
|
@ -46,11 +46,11 @@ class IWLAPostAnalysisTopPages(IPlugin):
|
||||||
|
|
||||||
for (k, super_hit) in stats.items():
|
for (k, super_hit) in stats.items():
|
||||||
if super_hit['robot']: continue
|
if super_hit['robot']: continue
|
||||||
for r in super_hit['requests']:
|
for r in super_hit['requests'][::-1]:
|
||||||
if not r['is_page']: continue
|
if not self.iwla.isValidForCurrentAnalysis(r):
|
||||||
|
break
|
||||||
if not self.iwla.isValidForCurrentAnalysis(r) or\
|
if not self.iwla.hasBeenViewed(r) or\
|
||||||
not self.iwla.hasBeenViewed(r):
|
not r['is_page']:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
uri = r['extract_request']['extract_uri']
|
uri = r['extract_request']['extract_uri']
|
||||||
|
|
|
@ -54,9 +54,11 @@ class IWLAPreAnalysisPageToHit(IPlugin):
|
||||||
for (k, super_hit) in hits.items():
|
for (k, super_hit) in hits.items():
|
||||||
if super_hit['robot']: continue
|
if super_hit['robot']: continue
|
||||||
|
|
||||||
for request in super_hit['requests']:
|
for request in super_hit['requests'][::-1]:
|
||||||
if not self.iwla.isValidForCurrentAnalysis(request) or\
|
if not self.iwla.isValidForCurrentAnalysis(request):
|
||||||
not self.iwla.hasBeenViewed(request):
|
break
|
||||||
|
|
||||||
|
if not self.iwla.hasBeenViewed(request):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
uri = request['extract_request']['extract_uri']
|
uri = request['extract_request']['extract_uri']
|
||||||
|
|
Loading…
Reference in New Issue
Block a user