New format for (not_)viewed pages/hits and bandwidth, which are now recorded per day (in a dictionary where only element 0 is initialized). Element 0 is the total. WARNING: not backward compatible with previous databases.

This commit is contained in:
Gregory Soutade 2017-08-24 07:55:53 +02:00
parent fffab335fa
commit 007be71ad6
8 changed files with 61 additions and 59 deletions

63
iwla.py
View File

@ -95,11 +95,11 @@ visits :
remote_addr => remote_addr =>
remote_addr remote_addr
remote_ip remote_ip
viewed_pages viewed_pages{0..31} # 0 contains total
viewed_hits viewed_hits{0..31} # 0 contains total
not_viewed_pages not_viewed_pages{0..31}
not_viewed_hits not_viewed_hits{0..31}
bandwidth bandwidth{0..31}
last_access last_access
requests => requests =>
[fields_from_format_log] [fields_from_format_log]
@ -298,8 +298,7 @@ class IWLA(object):
def isValidVisitor(self, hit): def isValidVisitor(self, hit):
if hit['robot']: return False if hit['robot']: return False
if not (conf.count_hit_only_visitors or\ if not conf.count_hit_only_visitors and not hit['viewed_pages'][0]:
hit['viewed_pages']):
return False return False
return True return True
@ -318,7 +317,11 @@ class IWLA(object):
# Don't keep all requests for robots # Don't keep all requests for robots
if not super_hit['robot']: if not super_hit['robot']:
super_hit['requests'].append(hit) super_hit['requests'].append(hit)
super_hit['bandwidth'] += int(hit['body_bytes_sent'])
day = self.meta_infos['last_time'].tm_mday
if self.hasBeenViewed(hit):
super_hit['bandwidth'][day] = super_hit['bandwidth'].get(day, 0) + int(hit['body_bytes_sent'])
super_hit['bandwidth'][0] += int(hit['body_bytes_sent'])
super_hit['last_access'] = self.meta_infos['last_time'] super_hit['last_access'] = self.meta_infos['last_time']
request = hit['extract_request'] request = hit['extract_request']
@ -336,19 +339,21 @@ class IWLA(object):
hit_key = 'viewed_hits' hit_key = 'viewed_hits'
if hit['is_page']: if hit['is_page']:
super_hit[page_key] += 1 super_hit[page_key][day] = super_hit[page_key].get(day, 0) + 1
super_hit[page_key][0] += 1
else: else:
super_hit[hit_key] += 1 super_hit[hit_key][day] = super_hit[hit_key].get(day, 0) + 1
super_hit[hit_key][0] += 1
def _createVisitor(self, hit): def _createVisitor(self, hit):
super_hit = self.current_analysis['visits'][hit['remote_addr']] = {} super_hit = self.current_analysis['visits'][hit['remote_addr']] = {}
super_hit['remote_addr'] = hit['remote_addr'] super_hit['remote_addr'] = hit['remote_addr']
super_hit['remote_ip'] = hit['remote_addr'] super_hit['remote_ip'] = hit['remote_addr']
super_hit['viewed_pages'] = 0 super_hit['viewed_pages'] = {0:0}
super_hit['viewed_hits'] = 0 super_hit['viewed_hits'] = {0:0}
super_hit['not_viewed_pages'] = 0 super_hit['not_viewed_pages'] = {0:0}
super_hit['not_viewed_hits'] = 0 super_hit['not_viewed_hits'] = {0:0}
super_hit['bandwidth'] = 0 super_hit['bandwidth'] = {0:0}
super_hit['last_access'] = self.meta_infos['last_time'] super_hit['last_access'] = self.meta_infos['last_time']
super_hit['requests'] = [] super_hit['requests'] = []
super_hit['robot'] = False super_hit['robot'] = False
@ -659,26 +664,18 @@ class IWLA(object):
stats = self._createEmptyStats() stats = self._createEmptyStats()
day = cur_time.tm_mday
for (k, super_hit) in visits.items(): for (k, super_hit) in visits.items():
if super_hit['last_access'].tm_mday != cur_time.tm_mday: if super_hit['last_access'].tm_mday != day:
continue continue
viewed_pages = False if super_hit['robot']:
for hit in super_hit['requests'][::-1]: stats['not_viewed_bandwidth'] += super_hit['bandwidth'].get(day, 0)
if hit['time_decoded'].tm_mday != cur_time.tm_mday: continue
break stats['viewed_bandwidth'] += super_hit['bandwidth'].get(day, 0)
if super_hit['robot'] or\ stats['viewed_hits'] += super_hit['viewed_hits'].get(day, 0)
not self.hasBeenViewed(hit): stats['viewed_pages'] += super_hit['viewed_pages'].get(day, 0)
stats['not_viewed_bandwidth'] += int(hit['body_bytes_sent']) if ((conf.count_hit_only_visitors and super_hit['viewed_hits'].get(day, 0)) or\
continue super_hit['viewed_pages'].get(day, 0)):
stats['viewed_bandwidth'] += int(hit['body_bytes_sent'])
if hit['is_page']:
stats['viewed_pages'] += 1
viewed_pages = True
else:
stats['viewed_hits'] += 1
if (conf.count_hit_only_visitors or\
viewed_pages) and\
not super_hit['robot']:
stats['nb_visits'] += 1 stats['nb_visits'] += 1
self.logger.info("== Stats for %d/%02d/%02d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday)) self.logger.info("== Stats for %d/%02d/%02d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday))

View File

@ -78,9 +78,9 @@ class IWLADisplayAllVisits(IPlugin):
row = [ row = [
address, address,
super_hit['viewed_pages'], super_hit['viewed_pages'][0],
super_hit['viewed_hits'], super_hit['viewed_hits'][0],
bytesToStr(super_hit['bandwidth']), bytesToStr(super_hit['bandwidth'][0]),
time.asctime(super_hit['last_access']) time.asctime(super_hit['last_access'])
] ]
table.appendRow(row) table.appendRow(row)

View File

@ -87,9 +87,9 @@ class IWLADisplayFeeds(IPlugin):
if super_hit['feed_parser'] == IWLAPostAnalysisFeeds.MERGED_FEED_PARSER: if super_hit['feed_parser'] == IWLAPostAnalysisFeeds.MERGED_FEED_PARSER:
address += '*' address += '*'
if super_hit['robot']: if super_hit['robot']:
table.appendRow([address, super_hit['not_viewed_pages'], super_hit['not_viewed_hits']]) table.appendRow([address, super_hit['not_viewed_pages'][0], super_hit['not_viewed_hits'][0]])
else: else:
table.appendRow([address, super_hit['viewed_pages'], super_hit['viewed_hits']]) table.appendRow([address, super_hit['viewed_pages'][0], super_hit['viewed_hits'][0]])
page.appendBlock(table) page.appendBlock(table)
note = DisplayHTMLRaw(self.iwla, ('<small>*%s</small>' % (self.iwla._(u'Merged feeds parsers')))) note = DisplayHTMLRaw(self.iwla, ('<small>*%s</small>' % (self.iwla._(u'Merged feeds parsers'))))
page.appendBlock(note) page.appendBlock(note)

View File

@ -68,7 +68,7 @@ class IWLADisplayRobotBandwidth(IPlugin):
for (k, super_hit) in hits.items(): for (k, super_hit) in hits.items():
if not self.iwla.isRobot(super_hit): if not self.iwla.isRobot(super_hit):
continue continue
bandwidths.append((super_hit, super_hit['bandwidth'])) bandwidths.append((super_hit, super_hit['bandwidth'][0]))
bandwidths.sort(key=lambda tup: tup[1], reverse=True) bandwidths.sort(key=lambda tup: tup[1], reverse=True)
# All in a page # All in a page

View File

@ -60,11 +60,11 @@ class IWLADisplayTopVisitors(IPlugin):
total = [0]*5 total = [0]*5
for super_hit in hits.values(): for super_hit in hits.values():
total[1] += super_hit['viewed_pages'] total[1] += super_hit['viewed_pages'][0]
total[2] += super_hit['viewed_hits'] total[2] += super_hit['viewed_hits'][0]
total[3] += super_hit['bandwidth'] total[3] += super_hit['bandwidth'][0]
top_bandwidth = [(k,v['bandwidth']) for (k,v) in hits.items()] top_bandwidth = [(k,v['bandwidth'][0]) for (k,v) in hits.items()]
top_bandwidth = sorted(top_bandwidth, key=lambda t: t[1], reverse=True) top_bandwidth = sorted(top_bandwidth, key=lambda t: t[1], reverse=True)
top_visitors = [hits[h[0]] for h in top_bandwidth[:10]] top_visitors = [hits[h[0]] for h in top_bandwidth[:10]]
@ -79,14 +79,14 @@ class IWLADisplayTopVisitors(IPlugin):
row = [ row = [
address, address,
super_hit['viewed_pages'], super_hit['viewed_pages'][0],
super_hit['viewed_hits'], super_hit['viewed_hits'][0],
bytesToStr(super_hit['bandwidth']), bytesToStr(super_hit['bandwidth'][0]),
time.asctime(super_hit['last_access']) time.asctime(super_hit['last_access'])
] ]
total[1] -= super_hit['viewed_pages'] total[1] -= super_hit['viewed_pages'][0]
total[2] -= super_hit['viewed_hits'] total[2] -= super_hit['viewed_hits'][0]
total[3] -= super_hit['bandwidth'] total[3] -= super_hit['bandwidth'][0]
table.appendRow(row) table.appendRow(row)
if total[1] or total[2] or total[3]: if total[1] or total[2] or total[3]:
total[0] = self.iwla._(u'Others') total[0] = self.iwla._(u'Others')

View File

@ -78,7 +78,7 @@ class IWLAPostAnalysisFeeds(IPlugin):
return True return True
def mergeOneHitOnlyFeedsParsers(self, isFeedParser, one_hit_only, hit): def mergeOneHitOnlyFeedsParsers(self, isFeedParser, one_hit_only, hit):
if isFeedParser and (hit['viewed_hits'] + hit['not_viewed_hits']) == 1: if isFeedParser and (hit['viewed_hits'][0] + hit['not_viewed_hits'][0]) == 1:
user_agent = hit['requests'][0]['http_user_agent'].lower() user_agent = hit['requests'][0]['http_user_agent'].lower()
if one_hit_only.get(user_agent, None) is None: if one_hit_only.get(user_agent, None) is None:
# Merged # Merged
@ -117,7 +117,7 @@ class IWLAPostAnalysisFeeds(IPlugin):
isFeedParser = self.FEED_PARSER isFeedParser = self.FEED_PARSER
# Robot that views pages -> bot # Robot that views pages -> bot
if hit['robot']: if hit['robot']:
if hit['not_viewed_pages']: if hit['not_viewed_pages'][0]:
isFeedParser = self.NOT_A_FEED_PARSER isFeedParser = self.NOT_A_FEED_PARSER
break break
if self.merge_one_hit_only_feeds_parsers: if self.merge_one_hit_only_feeds_parsers:

View File

@ -83,14 +83,17 @@ class IWLAPreAnalysisPageToHit(IPlugin):
uri = request['extract_request']['extract_uri'] uri = request['extract_request']['extract_uri']
day = request['time_decoded'].tm_mday
if request['is_page']: if request['is_page']:
# Page to hit # Page to hit
for regexp in self.ph_regexps: for regexp in self.ph_regexps:
if regexp.match(uri): if regexp.match(uri):
self.logger.debug('%s changed from page to hit' % (uri)) self.logger.debug('%s changed from page to hit' % (uri))
request['is_page'] = False request['is_page'] = False
super_hit['viewed_pages'] -= 1 super_hit['viewed_pages'][day] -= 1
super_hit['viewed_hits'] += 1 super_hit['viewed_hits'][day] = super_hit['viewed_hits'].get(day, 0) + 1
super_hit['viewed_pages'][0] -= 1
super_hit['viewed_hits'][0] += 1
break break
else: else:
# Hit to page # Hit to page
@ -98,6 +101,8 @@ class IWLAPreAnalysisPageToHit(IPlugin):
if regexp.match(uri): if regexp.match(uri):
self.logger.debug('%s changed from hit to page' % (uri)) self.logger.debug('%s changed from hit to page' % (uri))
request['is_page'] = True request['is_page'] = True
super_hit['viewed_pages'] += 1 super_hit['viewed_pages'][day] = super_hit['viewed_pages'].get(day, 0) + 1
super_hit['viewed_hits'] -= 1 super_hit['viewed_hits'][day] -= 1
super_hit['viewed_pages'][0] += 1
super_hit['viewed_hits'][0] -= 1
break break

View File

@ -104,12 +104,12 @@ class IWLAPreAnalysisRobots(IPlugin):
continue continue
# 1) no pages view --> robot # 1) no pages view --> robot
# if not super_hit['viewed_pages']: # if not super_hit['viewed_pages'][0]:
# super_hit['robot'] = 1 # super_hit['robot'] = 1
# continue # continue
# 2) pages without hit --> robot # 2) pages without hit --> robot
if not super_hit['viewed_hits']: if not super_hit['viewed_hits'][0] and super_hit['viewed_pages'][0]:
self.logger.debug(super_hit) self.logger.debug(super_hit)
self._setRobot(k, super_hit) self._setRobot(k, super_hit)
continue continue
@ -137,7 +137,7 @@ class IWLAPreAnalysisRobots(IPlugin):
self._setRobot(k, super_hit) self._setRobot(k, super_hit)
continue continue
if not super_hit['viewed_pages'] and \ if not super_hit['viewed_pages'][0] and \
(super_hit['viewed_hits'] and not referers): (super_hit['viewed_hits'][0] and not referers):
self._setRobot(k, super_hit) self._setRobot(k, super_hit)
continue continue