2020-10-30 14:42:56 +01:00
|
|
|
#!/usr/bin/env python3
|
2014-12-18 19:54:31 +01:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
#
|
|
|
|
# Copyright Grégory Soutadé 2015
|
|
|
|
|
|
|
|
# This file is part of iwla
|
|
|
|
|
|
|
|
# iwla is free software: you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License as published by
|
|
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
|
|
# (at your option) any later version.
|
|
|
|
#
|
|
|
|
# iwla is distributed in the hope that it will be useful,
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
# GNU General Public License for more details.
|
|
|
|
#
|
|
|
|
# You should have received a copy of the GNU General Public License
|
|
|
|
# along with iwla. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
#
|
2014-11-18 20:18:53 +01:00
|
|
|
|
|
|
|
import os
|
2014-12-03 10:55:32 +01:00
|
|
|
import shutil
|
|
|
|
import sys
|
2014-11-18 20:18:53 +01:00
|
|
|
import re
|
|
|
|
import time
|
2020-10-30 14:42:56 +01:00
|
|
|
import pickle
|
2014-11-19 19:34:16 +01:00
|
|
|
import gzip
|
2014-11-22 19:23:56 +01:00
|
|
|
import importlib
|
2014-12-03 10:55:32 +01:00
|
|
|
import argparse
|
2014-12-15 22:30:49 +01:00
|
|
|
import logging
|
2014-12-17 20:31:59 +01:00
|
|
|
import gettext
|
2014-11-28 16:26:11 +01:00
|
|
|
from calendar import monthrange
|
2015-01-02 19:27:57 +01:00
|
|
|
from datetime import date, datetime
|
2024-10-27 09:16:01 +01:00
|
|
|
import socket
|
2014-11-19 19:45:41 +01:00
|
|
|
|
2014-11-24 21:37:37 +01:00
|
|
|
import default_conf as conf
|
|
|
|
|
2014-11-24 17:13:59 +01:00
|
|
|
from iplugin import *
|
2014-11-20 16:31:00 +01:00
|
|
|
from display import *
|
|
|
|
|
2014-12-19 11:34:25 +01:00
|
|
|
"""
|
|
|
|
Main class IWLA
|
|
|
|
Parse logs, compute statistics, call plugins and produce output
|
|
|
|
For now, only HTTP logs are valid
|
|
|
|
|
|
|
|
Plugin requirements :
|
|
|
|
None
|
|
|
|
|
|
|
|
Conf values needed :
|
|
|
|
analyzed_filename
|
|
|
|
domain_name
|
|
|
|
locales_path
|
2020-04-15 14:44:11 +02:00
|
|
|
compress_output_files
|
|
|
|
excluded_ip
|
2023-06-14 09:21:11 +02:00
|
|
|
excluded_domain_name
|
2024-10-27 09:16:01 +01:00
|
|
|
reverse_dns_timeout*
|
2014-12-19 11:34:25 +01:00
|
|
|
|
|
|
|
Output files :
|
|
|
|
DB_ROOT/meta.db
|
|
|
|
DB_ROOT/year/month/iwla.db
|
|
|
|
OUTPUT_ROOT/index.html
|
2014-12-31 14:22:46 +01:00
|
|
|
OUTPUT_ROOT/year/_stats.html
|
2014-12-19 11:34:25 +01:00
|
|
|
OUTPUT_ROOT/year/month/index.html
|
|
|
|
|
|
|
|
Statistics creation :
|
|
|
|
|
|
|
|
meta :
|
|
|
|
last_time
|
|
|
|
start_analysis_time
|
|
|
|
stats =>
|
|
|
|
year =>
|
|
|
|
month =>
|
|
|
|
viewed_bandwidth
|
|
|
|
not_viewed_bandwidth
|
|
|
|
viewed_pages
|
|
|
|
viewed_hits
|
|
|
|
nb_visits
|
|
|
|
nb_visitors
|
|
|
|
|
|
|
|
month_stats :
|
|
|
|
viewed_bandwidth
|
|
|
|
not_viewed_bandwidth
|
|
|
|
viewed_pages
|
|
|
|
viewed_hits
|
|
|
|
nb_visits
|
|
|
|
|
|
|
|
days_stats :
|
|
|
|
day =>
|
|
|
|
viewed_bandwidth
|
|
|
|
not_viewed_bandwidth
|
|
|
|
viewed_pages
|
|
|
|
viewed_hits
|
|
|
|
nb_visits
|
|
|
|
nb_visitors
|
|
|
|
|
|
|
|
visits :
|
2023-05-21 11:04:40 +02:00
|
|
|
remote_ip =>
|
2014-12-19 11:34:25 +01:00
|
|
|
remote_addr
|
|
|
|
remote_ip
|
2017-08-24 07:55:53 +02:00
|
|
|
viewed_pages{0..31} # 0 contains total
|
|
|
|
viewed_hits{0..31} # 0 contains total
|
|
|
|
not_viewed_pages{0..31}
|
|
|
|
not_viewed_hits{0..31}
|
|
|
|
bandwidth{0..31}
|
2014-12-19 11:34:25 +01:00
|
|
|
last_access
|
|
|
|
requests =>
|
|
|
|
[fields_from_format_log]
|
|
|
|
extract_request =>
|
2015-03-15 10:31:28 +01:00
|
|
|
http_method
|
|
|
|
http_uri
|
|
|
|
http_version
|
2014-12-19 11:34:25 +01:00
|
|
|
extract_uri
|
|
|
|
extract_parameters*
|
|
|
|
extract_referer* =>
|
|
|
|
extract_uri
|
|
|
|
extract_parameters*
|
|
|
|
robot
|
|
|
|
hit_only
|
|
|
|
is_page
|
2022-06-23 21:16:30 +02:00
|
|
|
keep_requests
|
2014-12-19 11:34:25 +01:00
|
|
|
|
|
|
|
valid_visitors:
|
|
|
|
month_stats without robot and hit only visitors (if not conf.count_hit_only_visitors)
|
|
|
|
|
|
|
|
Statistics update :
|
|
|
|
None
|
|
|
|
|
|
|
|
Statistics deletion :
|
|
|
|
None
|
|
|
|
"""
|
|
|
|
|
2014-12-09 16:54:02 +01:00
|
|
|
|
2014-11-21 14:46:12 +01:00
|
|
|
class IWLA(object):
    """Main IWLA analyzer.

    Parses HTTP access logs, computes statistics, calls plugins and
    produces HTML output (see module docstring for the data layout).
    """

    # Kind of log handled by this analyzer
    ANALYSIS_CLASS = 'HTTP'
    # Version of the plugin API exposed to hooks
    API_VERSION = 1
    IWLA_VERSION = '0.8'
    # Default timeout (in seconds) for reverse DNS requests
    DEFAULT_DNS_TIMEOUT = 0.5
|
2014-11-21 14:46:12 +01:00
|
|
|
|
2023-03-11 20:51:44 +01:00
|
|
|
    def __init__(self, logLevel, args):
        """Build the analyzer: state, compiled regexps, plugin list, logging, locale.

        logLevel -- logging level passed to logging.basicConfig
        args -- parsed command line arguments (expects at least dry_run,
                dont_compress) -- assumed from usage below; confirm with caller
        """
        # Global meta data (last analysed time, per year/month stats, ...)
        self.meta_infos = {}
        self.analyse_started = False
        # Per-month analysis data (visits, day/month stats)
        self.current_analysis = {}
        self.start_time = 0
        # Loaded plugin modules, keyed by 'directory.plugin_name'
        self.cache_plugins = {}
        self.display = DisplayHTMLBuild(self)
        self.valid_visitors = None
        self.args = args

        # Timeout (seconds) applied to reverse DNS requests in reverseDNS()
        self.reverse_dns_timeout = self.getConfValue('reverse_dns_timeout',
                                                     IWLA.DEFAULT_DNS_TIMEOUT)

        # Turn the access log format description into a regexp:
        # escape every character that is neither '$', '?' nor a word char...
        self.log_format_extracted = re.sub(r'([^\$?\w])', r'\\\g<1>', conf.log_format)
        # ...then replace each $field with a named capture group of that name
        self.log_format_extracted = re.sub(r'\$(\w+)', '(?P<\g<1>>.+)', self.log_format_extracted)
        # 'METHOD URI VERSION' split of the raw request line
        self.http_request_extracted = re.compile(r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')
        self.log_re = re.compile(self.log_format_extracted)
        # Split an URI into path ('extract_uri'), query string and fragment
        self.uri_re = re.compile(r'(?P<extract_uri>[^\?#]+)(\?(?P<extract_parameters>[^#]+))?(#.*)?')
        self.domain_name_re = re.compile(r'.*%s' % conf.domain_name)
        self.slash_re = re.compile(r'//')
        self.protocol_re = re.compile(r'^.*://')
        # Pre-compile configured exclusion and multimedia patterns
        self.excluded_ip = []
        for ip in conf.excluded_ip:
            self.excluded_ip += [re.compile(ip)]
        self.excluded_domain_name = []
        for domain_name in conf.excluded_domain_name:
            self.excluded_domain_name += [re.compile(domain_name)]
        self.multimedia_files_re = []
        for file_re in conf.multimedia_files_re:
            self.multimedia_files_re += [re.compile(file_re)]
        # (hook directory, configured hook names) pairs scanned by _callPlugins()
        self.plugins = [(conf.PRE_HOOK_DIRECTORY , conf.pre_analysis_hooks),
                        (conf.POST_HOOK_DIRECTORY , conf.post_analysis_hooks),
                        (conf.DISPLAY_HOOK_DIRECTORY , conf.display_hooks)]

        logging.basicConfig(format='%(name)s %(message)s', level=logLevel)
        self.logger = logging.getLogger(self.__class__.__name__)
        if self.args.dry_run:
            self.logger.info('==> Start (DRY RUN)')
        else:
            self.logger.info('==> Start')

        # Install gettext translations; fall back to untranslated strings
        # when the locale catalog cannot be opened.
        try:
            t = gettext.translation('iwla', localedir=conf.locales_path, languages=[conf.locale])
            self.logger.info('\tUsing locale %s' % (conf.locale))
        except IOError:
            t = gettext.NullTranslations()
            self.logger.info('\tUsing default locale en_EN')
        self._ = t.gettext
|
2014-12-17 19:00:42 +01:00
|
|
|
|
2014-12-08 14:13:26 +01:00
|
|
|
    def getVersion(self):
        """Return the IWLA version string."""
        return IWLA.IWLA_VERSION
|
|
|
|
|
2014-11-27 12:34:42 +01:00
|
|
|
def getConfValue(self, key, default=None):
|
2014-11-24 21:42:57 +01:00
|
|
|
if not key in dir(conf):
|
|
|
|
return default
|
|
|
|
else:
|
|
|
|
return conf.__dict__[key]
|
|
|
|
|
2014-11-21 14:46:12 +01:00
|
|
|
def _clearVisits(self):
|
|
|
|
self.current_analysis = {
|
|
|
|
'days_stats' : {},
|
|
|
|
'month_stats' : {},
|
|
|
|
'visits' : {}
|
|
|
|
}
|
|
|
|
self.valid_visitors = None
|
|
|
|
return self.current_analysis
|
|
|
|
|
|
|
|
    def getDaysStats(self):
        """Return the per-day statistics dict of the current month."""
        return self.current_analysis['days_stats']
|
|
|
|
|
2014-11-21 16:56:58 +01:00
|
|
|
    def getMonthStats(self):
        """Return the aggregated statistics dict of the current month."""
        return self.current_analysis['month_stats']
|
|
|
|
|
2016-02-04 20:44:36 +01:00
|
|
|
    def getCurrentVisits(self):
        """Return the visits dict of the current analysis, keyed by remote address."""
        return self.current_analysis['visits']
|
|
|
|
|
2022-11-16 21:12:19 +01:00
|
|
|
def getSortedCurrentVisits(self):
|
|
|
|
visits = self.current_analysis['visits'].values()
|
|
|
|
return sorted(visits, key=lambda hit: hit['last_access'])
|
|
|
|
|
2014-11-21 14:46:12 +01:00
|
|
|
    def getValidVisitors(self):
        """Return the filtered visitors dict (see _createVisitor/isValidVisitor), or None before filtering."""
        return self.valid_visitors
|
|
|
|
|
|
|
|
    def getDisplay(self):
        """Return the DisplayHTMLBuild instance used to produce output pages."""
        return self.display
|
2014-11-21 14:46:12 +01:00
|
|
|
|
2014-11-25 16:59:29 +01:00
|
|
|
    def getCurTime(self):
        """Return the time (struct_time) of the last analysed hit."""
        return self.meta_infos['last_time']
|
|
|
|
|
2014-11-26 19:53:00 +01:00
|
|
|
    def getStartAnalysisTime(self):
        """Return the time (struct_time) at which the current analysis started, or None."""
        return self.meta_infos['start_analysis_time']
|
|
|
|
|
2014-11-27 09:01:51 +01:00
|
|
|
def isValidForCurrentAnalysis(self, request):
|
|
|
|
cur_time = self.meta_infos['start_analysis_time']
|
2014-11-27 14:11:47 +01:00
|
|
|
# Analyse not started
|
|
|
|
if not cur_time: return False
|
|
|
|
return (time.mktime(cur_time) < time.mktime(request['time_decoded']))
|
2014-11-27 09:01:51 +01:00
|
|
|
|
2014-11-27 13:07:14 +01:00
|
|
|
def hasBeenViewed(self, request):
|
|
|
|
return int(request['status']) in conf.viewed_http_codes
|
|
|
|
|
2014-11-27 14:29:25 +01:00
|
|
|
def getCurDisplayPath(self, filename):
|
2014-11-27 14:11:47 +01:00
|
|
|
cur_time = self.meta_infos['last_time']
|
2014-12-12 13:24:47 +01:00
|
|
|
return os.path.join(str(cur_time.tm_year), '%02d' % (cur_time.tm_mon), filename)
|
2014-11-27 14:11:47 +01:00
|
|
|
|
2014-11-30 19:05:17 +01:00
|
|
|
    def getResourcesPath(self):
        """Return the configured path of static resources."""
        return conf.resources_path
|
|
|
|
|
|
|
|
    def getCSSPath(self):
        """Return the configured CSS path used when building pages."""
        return conf.css_path
|
|
|
|
|
2024-10-27 09:16:01 +01:00
|
|
|
def reverseDNS(self, hit):
|
|
|
|
if hit.get('dns_name_replaced', False):
|
|
|
|
return hit['remote_addr']
|
|
|
|
|
|
|
|
try:
|
|
|
|
timeout = socket.getdefaulttimeout()
|
|
|
|
if timeout != self.reverse_dns_timeout:
|
|
|
|
socket.setdefaulttimeout(self.reverse_dns_timeout)
|
|
|
|
name, _, _ = socket.gethostbyaddr(hit['remote_ip'])
|
|
|
|
if timeout != self.reverse_dns_timeout:
|
|
|
|
socket.setdefaulttimeout(timeout)
|
|
|
|
hit['remote_addr'] = name.lower()
|
|
|
|
hit['dns_name_replaced'] = True
|
|
|
|
except socket.herror:
|
|
|
|
pass
|
|
|
|
finally:
|
|
|
|
hit['dns_analysed'] = True
|
|
|
|
|
|
|
|
return hit['remote_addr']
|
|
|
|
|
2014-11-21 14:46:12 +01:00
|
|
|
def _clearMeta(self):
|
|
|
|
self.meta_infos = {
|
2014-12-09 16:54:02 +01:00
|
|
|
'last_time' : None,
|
|
|
|
'start_analysis_time' : None
|
2014-11-21 14:46:12 +01:00
|
|
|
}
|
|
|
|
return self.meta_infos
|
|
|
|
|
|
|
|
    def _clearDisplay(self):
        """Clear all pages from the HTML display builder and return it."""
        self.display.clear()
        return self.display
|
2014-11-21 14:46:12 +01:00
|
|
|
|
|
|
|
    def getDBFilename(self, time):
        """Return the per-month database path: DB_ROOT/year/month/DB_FILENAME.

        NOTE: the 'time' parameter (a struct_time) shadows the global time
        module inside this method.
        """
        return os.path.join(conf.DB_ROOT, str(time.tm_year), '%02d' % (time.tm_mon), conf.DB_FILENAME)
|
2014-11-21 14:46:12 +01:00
|
|
|
|
2015-05-23 16:38:39 +02:00
|
|
|
def _openDB(self, filename, prot='r'):
|
|
|
|
if self.args.dont_compress:
|
|
|
|
return open(filename, prot)
|
|
|
|
else:
|
|
|
|
return gzip.open(filename, prot)
|
|
|
|
|
2014-11-21 14:46:12 +01:00
|
|
|
    def _serialize(self, obj, filename):
        """Pickle *obj* into *filename* (gzip-compressed unless --dont-compress).

        No-op in dry-run mode.  The write is made crash-resistant: the
        previous file is first copied to '<filename>.bak', the pickle is
        written to '<filename>.tmp' and then copied into the (possibly
        compressed) destination, which is fsync'ed before cleanup.
        """
        if self.args.dry_run: return
        self.logger.info("==> Serialize to %s" % (filename))
        base = os.path.dirname(filename)
        if not os.path.exists(base):
            os.makedirs(base)

        # Make a backup in case of something fails
        if os.path.exists(filename):
            shutil.copy(filename, filename + '.bak')

        # Dump to a plain temporary file first, then stream its bytes into
        # the final (possibly gzip) destination.
        with open(filename + '.tmp', 'wb+') as f, self._openDB(filename, 'w') as fzip:
            pickle.dump(obj, f)
            f.seek(0)
            fzip.write(f.read())
            # Force data to disk before removing the temporary/backup copies
            os.fsync(fzip)
        os.remove(filename + '.tmp')
        if os.path.exists(filename + '.bak'):
            os.remove(filename + '.bak')
|
2014-11-19 08:01:12 +01:00
|
|
|
|
2014-11-21 14:46:12 +01:00
|
|
|
def _deserialize(self, filename):
|
|
|
|
if not os.path.exists(filename):
|
|
|
|
return None
|
|
|
|
|
2017-05-07 16:55:05 +02:00
|
|
|
res = None
|
2015-05-23 16:38:39 +02:00
|
|
|
with self._openDB(filename) as f:
|
2020-10-30 14:42:56 +01:00
|
|
|
res = pickle.load(f)
|
2017-05-07 16:55:05 +02:00
|
|
|
return res
|
2014-11-19 08:01:12 +01:00
|
|
|
|
2014-11-27 12:34:42 +01:00
|
|
|
def _callPlugins(self, target_root, *args):
|
2014-12-16 20:23:33 +01:00
|
|
|
self.logger.info('==> Call plugins (%s)' % (target_root))
|
2014-11-27 12:34:42 +01:00
|
|
|
for (root, plugins) in self.plugins:
|
|
|
|
if root != target_root: continue
|
|
|
|
for p in plugins:
|
|
|
|
mod = self.cache_plugins.get(root + '.' + p, None)
|
|
|
|
if mod:
|
2014-12-16 20:23:33 +01:00
|
|
|
self.logger.info('\t%s' % (p))
|
2014-11-27 12:34:42 +01:00
|
|
|
mod.hook(*args)
|
2014-11-19 08:01:12 +01:00
|
|
|
|
2014-11-21 14:46:12 +01:00
|
|
|
def isPage(self, request):
|
2015-01-06 08:08:09 +01:00
|
|
|
self.logger.debug("Is page %s" % (request))
|
2014-11-24 21:37:37 +01:00
|
|
|
for e in conf.pages_extensions:
|
2014-11-21 14:46:12 +01:00
|
|
|
if request.endswith(e):
|
2015-01-06 08:08:09 +01:00
|
|
|
self.logger.debug("True")
|
2014-11-21 14:46:12 +01:00
|
|
|
return True
|
2024-01-30 11:27:03 +01:00
|
|
|
# No extension -> page
|
|
|
|
if not '.' in request.split('/')[-1]:
|
|
|
|
self.logger.debug("True")
|
|
|
|
return True
|
2015-01-06 08:08:09 +01:00
|
|
|
self.logger.debug("False")
|
2014-11-21 14:46:12 +01:00
|
|
|
return False
|
2014-11-18 20:18:53 +01:00
|
|
|
|
2024-07-28 09:24:33 +02:00
|
|
|
def isMultimediaFile(self, uri):
|
|
|
|
self.logger.debug("Is multimedia %s" % (uri))
|
2015-01-13 18:52:35 +01:00
|
|
|
for e in conf.multimedia_files:
|
2024-07-28 09:24:33 +02:00
|
|
|
if uri.lower().endswith(e):
|
2015-01-13 18:52:35 +01:00
|
|
|
self.logger.debug("True")
|
|
|
|
return True
|
|
|
|
self.logger.debug("False")
|
2024-07-28 09:24:33 +02:00
|
|
|
|
|
|
|
for file_re in self.multimedia_files_re:
|
|
|
|
if file_re.match(uri):
|
|
|
|
self.logger.debug("Is multimedia re True")
|
|
|
|
return True
|
2015-01-13 18:52:35 +01:00
|
|
|
return False
|
|
|
|
|
2016-01-18 07:33:48 +01:00
|
|
|
def isValidVisitor(self, hit):
|
|
|
|
if hit['robot']: return False
|
2017-08-24 07:55:53 +02:00
|
|
|
if not conf.count_hit_only_visitors and not hit['viewed_pages'][0]:
|
2016-01-18 07:33:48 +01:00
|
|
|
return False
|
|
|
|
return True
|
|
|
|
|
2016-09-25 20:36:51 +02:00
|
|
|
    def isRobot(self, hit):
        """Return True only for visitors positively identified as robots.

        'robot' is tri-state: None means "not yet determined" (the default
        set in _createVisitor), so an explicit comparison with True is used.
        """
        # By default robot is None
        return hit['robot'] == True
|
2016-09-25 20:36:51 +02:00
|
|
|
|
2014-11-21 14:46:12 +01:00
|
|
|
    def _appendHit(self, hit):
        """Account a decoded hit into its visitor's per-day counters.

        Skips hits with no remote IP, excluded IPs and HTTP redirections.
        Creates the visitor entry on first sight.  Day index 0 of each
        counter dict holds the monthly total (see module docstring).
        """
        remote_ip = hit['remote_ip']

        if not remote_ip: return

        # Drop hits from configured excluded addresses
        for ip in self.excluded_ip:
            if ip.match(remote_ip):
                return

        # Redirected page/hit
        if int(hit['status']) in (301, 302, 307, 308):
            return

        if not remote_ip in self.current_analysis['visits'].keys():
            self._createVisitor(hit)

        super_hit = self.current_analysis['visits'][remote_ip]
        # Don't keep all requests for robots
        if not super_hit['robot']:
            super_hit['requests'].append(hit)

        # Bandwidth is only accounted for viewed hits
        day = self.meta_infos['last_time'].tm_mday
        if self.hasBeenViewed(hit):
            super_hit['bandwidth'][day] = super_hit['bandwidth'].get(day, 0) + int(hit['body_bytes_sent'])
            super_hit['bandwidth'][0] += int(hit['body_bytes_sent'])
        super_hit['last_access'] = self.meta_infos['last_time']

        request = hit['extract_request']
        uri = request.get('extract_uri', request['http_uri'])

        hit['is_page'] = self.isPage(uri)

        # Robots and non-viewed statuses go into the 'not_viewed_*' counters
        if super_hit['robot'] or\
           not self.hasBeenViewed(hit):
            page_key = 'not_viewed_pages'
            hit_key = 'not_viewed_hits'
        else:
            page_key = 'viewed_pages'
            hit_key = 'viewed_hits'

        # Increment the per-day counter and the monthly total (index 0)
        if hit['is_page']:
            super_hit[page_key][day] = super_hit[page_key].get(day, 0) + 1
            super_hit[page_key][0] += 1
        else:
            super_hit[hit_key][day] = super_hit[hit_key].get(day, 0) + 1
            super_hit[hit_key][0] += 1
|
2014-11-21 14:46:12 +01:00
|
|
|
|
2014-11-25 16:22:07 +01:00
|
|
|
def _createVisitor(self, hit):
|
2014-11-21 14:46:12 +01:00
|
|
|
super_hit = self.current_analysis['visits'][hit['remote_addr']] = {}
|
|
|
|
super_hit['remote_addr'] = hit['remote_addr']
|
2014-11-26 16:17:16 +01:00
|
|
|
super_hit['remote_ip'] = hit['remote_addr']
|
2017-08-24 07:55:53 +02:00
|
|
|
super_hit['viewed_pages'] = {0:0}
|
|
|
|
super_hit['viewed_hits'] = {0:0}
|
|
|
|
super_hit['not_viewed_pages'] = {0:0}
|
|
|
|
super_hit['not_viewed_hits'] = {0:0}
|
|
|
|
super_hit['bandwidth'] = {0:0}
|
2014-11-21 14:46:12 +01:00
|
|
|
super_hit['last_access'] = self.meta_infos['last_time']
|
|
|
|
super_hit['requests'] = []
|
2023-03-11 20:51:44 +01:00
|
|
|
super_hit['robot'] = None
|
2014-11-21 14:46:12 +01:00
|
|
|
super_hit['hit_only'] = 0
|
|
|
|
|
2024-01-30 11:28:10 +01:00
|
|
|
def _normalizeURI(self, uri, removeFileSlash=True):
|
2016-01-02 11:50:31 +01:00
|
|
|
if uri == '/': return uri
|
2022-06-23 21:11:43 +02:00
|
|
|
# Remove protocol
|
|
|
|
uri = self.protocol_re.sub('', uri)
|
|
|
|
# Remove double /
|
|
|
|
uri = self.slash_re.sub('/', uri)
|
2024-01-30 11:28:10 +01:00
|
|
|
if removeFileSlash:
|
|
|
|
while len(uri) > 1 and uri[-1] == '/':
|
|
|
|
uri = uri[:-1]
|
2016-01-02 11:50:31 +01:00
|
|
|
return uri
|
|
|
|
|
|
|
|
def _normalizeParameters(self, parameters):
|
|
|
|
# No parameters
|
|
|
|
if parameters == '?': return None
|
|
|
|
return parameters
|
|
|
|
|
2014-11-21 14:46:12 +01:00
|
|
|
    def _decodeHTTPRequest(self, hit):
        """Split hit['request'] into method/URI/version and normalized URI parts.

        Fills hit['extract_request'] (and 'extract_referer' when a referer
        is present), copies remote_addr into remote_ip, and returns False
        when the request line cannot be parsed.
        """
        if not 'request' in hit.keys(): return False

        groups = self.http_request_extracted.match(hit['request'])

        if groups:
            # groupdict("") maps unmatched groups to ''
            hit['extract_request'] = groups.groupdict("")
            uri_groups = self.uri_re.match(hit['extract_request']['http_uri'])
            if uri_groups:
                d = uri_groups.groupdict("")
                hit['extract_request']['extract_uri'] = self._normalizeURI(d['extract_uri'])
                if 'extract_parameters' in d.keys():
                    parameters = self._normalizeParameters(d['extract_parameters'])
                    if parameters:
                        hit['extract_request']['extract_parameters'] = parameters
        else:
            self.logger.warning("Bad request extraction %s" % (hit['request']))
            return False

        # Decompose the referer the same way as the request URI
        if hit['http_referer']:
            referer_groups = self.uri_re.match(hit['http_referer'])
            if referer_groups:
                hit['extract_referer'] = referer_groups.groupdict("")
                hit['extract_referer']['extract_uri'] = self._normalizeURI(hit['extract_referer']['extract_uri'])
                hit['extract_referer']['extract_parameters'] = self._normalizeParameters(hit['extract_referer']['extract_parameters'])

        hit['remote_ip'] = hit['remote_addr']

        return True
|
|
|
|
|
|
|
|
    def _decodeTime(self, hit):
        """Parse hit['time_local'] into hit['time_decoded'] (struct_time) and return it.

        On Python < 3.2, strptime does not support '%z': the trailing GMT
        offset is stripped and the value re-parsed (the offset itself is
        deliberately not applied -- see the commented-out code below).
        """
        try:
            hit['time_decoded'] = time.strptime(hit['time_local'], conf.time_format)
        except ValueError as e:
            if sys.version_info < (3, 2):
                # Try without UTC value at the end (%z not recognized)
                gmt_offset_str = hit['time_local'][-5:]
                gmt_offset_hours = int(gmt_offset_str[1:3])*60*60
                gmt_offset_minutes = int(gmt_offset_str[3:5])*60
                gmt_offset = gmt_offset_hours + gmt_offset_minutes
                hit['time_decoded'] = time.strptime(hit['time_local'][:-6], conf.time_format[:-3])
                # if gmt_offset_str[0] == '-':
                #     hit['time_decoded'] = time.localtime(time.mktime(hit['time_decoded'])+gmt_offset)
                # else:
                #     hit['time_decoded'] = time.localtime(time.mktime(hit['time_decoded'])-gmt_offset)
            else:
                raise e
        return hit['time_decoded']
|
2014-11-21 14:46:12 +01:00
|
|
|
|
|
|
|
def getDisplayIndex(self):
|
|
|
|
cur_time = self.meta_infos['last_time']
|
2014-11-27 14:29:25 +01:00
|
|
|
filename = self.getCurDisplayPath('index.html')
|
2014-11-21 14:46:12 +01:00
|
|
|
|
2014-11-21 16:56:58 +01:00
|
|
|
return self.display.getPage(filename)
|
2014-11-21 14:46:12 +01:00
|
|
|
|
2014-12-03 21:58:55 +01:00
|
|
|
    def _generateDisplayDaysStats(self):
        """Build the current month's index.html page: a by-day statistics
        table (with graph), plus average and total rows, and an iframe
        embedding the yearly summary page.
        """
        cur_time = self.meta_infos['last_time']
        title = createCurTitle(self, self._('Statistics'))
        filename = self.getCurDisplayPath('index.html')
        self.logger.info('==> Generate display (%s)' % (filename))
        page = self.display.createPage(title, filename, conf.css_path)
        # Yearly summary is embedded from the parent directory
        link = DisplayHTMLRaw(self, '<iframe src="../_stats.html"></iframe>')
        page.appendBlock(link)

        months_name = ['', self._('Jan'), self._('Feb'), self._('Mar'), self._('Apr'), self._('May'), self._('June'), self._('Jul'), self._('Aug'), self._('Sep'), self._('Oct'), self._('Nov'), self._('Dec')]
        _, nb_month_days = monthrange(cur_time.tm_year, cur_time.tm_mon)
        days = self.display.createBlock(DisplayHTMLBlockTableWithGraph, self._('By day'), [self._('Day'), self._('Visits'), self._('Pages'), self._('Hits'), self._('Bandwidth'), self._('Not viewed Bandwidth')], None, nb_month_days, range(1,6), [4, 5])
        days.setColsCSSClass(['', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth'])
        nb_visits = 0
        nb_days = 0
        # One row per day of the month, zero-filled for days with no data
        for i in range(1, nb_month_days+1):
            month = months_name[int(time.strftime('%m', cur_time), 10)]
            day = '%d<br/>%s' % (i, month)
            full_day = '%02d %s %d' % (i, month, cur_time.tm_year)
            if i in self.current_analysis['days_stats'].keys():
                stats = self.current_analysis['days_stats'][i]
                row = [full_day, stats['nb_visits'], stats['viewed_pages'], stats['viewed_hits'],
                       stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
                nb_visits += stats['nb_visits']
                nb_days += 1
            else:
                row = [full_day, 0, 0, 0, 0, 0]
            days.appendRow(row)
            viewed_bandwidth = row[4]
            not_viewed_bandwidth = row[5]
            days.setCellValue(i-1, 4, viewed_bandwidth)
            days.setCellValue(i-1, 5, not_viewed_bandwidth)
            days.appendShortTitle(day)
            adate = date(cur_time.tm_year, cur_time.tm_mon, i)
            week_day = adate.weekday()
            # Highlight week-ends and the current day
            if week_day == 5 or week_day == 6:
                days.setRowCSSClass(i-1, 'iwla_weekend')
            if adate == date.today():
                css = days.getCellCSSClass(i-1, 0)
                if css: css = '%s %s' % (css, 'iwla_curday')
                else: css = 'iwla_curday'
                days.setCellCSSClass(i-1, 0, css)

        stats = self.current_analysis['month_stats']

        # Append average (over days with data) and total rows
        row = [0, nb_visits, stats['viewed_pages'], stats['viewed_hits'], stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
        if nb_days:
            average_row = list(map(lambda v: int(v/nb_days), row))
        else:
            average_row = list(map(lambda v: 0, row))

        average_row[0] = self._('Average')
        days.appendRow(average_row)

        row[0] = self._('Total')
        days.appendRow(row)
        page.appendBlock(days)
        self.display.addPage(page)
|
2014-11-20 08:18:31 +01:00
|
|
|
|
2014-12-03 21:58:55 +01:00
|
|
|
    def _generateDisplayMonthStats(self, page, year, month_stats):
        """Append a per-month summary table for *year* to *page*, and also
        emit it standalone as 'year/_stats.html' (embedded by the monthly
        pages through an iframe).

        page -- display page the table is appended to
        year -- year being summarized
        month_stats -- dict mapping month number (1-12) to its stats dict
        """
        cur_time = time.localtime()
        months_name = ['', self._('Jan'), self._('Feb'), self._('Mar'), self._('Apr'), self._('May'), self._('June'), self._('Jul'), self._('Aug'), self._('Sep'), self._('Oct'), self._('Nov'), self._('Dec')]
        title = '%s %d' % (self._('Summary'), year)
        cols = [self._('Month'), self._('Visitors'), self._('Visits'), self._('Pages'), self._('Hits'), self._('Bandwidth'), self._('Not viewed Bandwidth')]
        graph_cols=range(1,6)
        months = self.display.createBlock(DisplayHTMLBlockTableWithGraph, title, cols, None, 12, graph_cols, [5, 6])
        months.setColsCSSClass(['', 'iwla_visitor', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth'])
        total = [0] * len(cols)
        # One row per month, zero-filled when no data was recorded
        for i in range(1, 13):
            month = '%s<br/>%d' % (months_name[i], year)
            full_month = '%s %d' % (months_name[i], year)
            link_month = '<a target="_top" href="/%d/%02d/index.html">%s</a>' % (year, i, full_month)
            if i in month_stats.keys():
                stats = month_stats[i]
                row = [link_month, stats['nb_visitors'], stats['nb_visits'], stats['viewed_pages'], stats['viewed_hits'],
                       stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
                for j in range(1,7):
                    total[j] += row[j]
            else:
                row = [full_month, 0, 0, 0, 0, 0, 0]
            months.appendRow(row)
            months.appendShortTitle(month)
            # Highlight the current month
            if year == cur_time.tm_year and i == cur_time.tm_mon:
                css = months.getCellCSSClass(i-1, 0)
                if css: css = '%s %s' % (css, 'iwla_curday')
                else: css = 'iwla_curday'
                months.setCellCSSClass(i-1, 0, css)

        total[0] = self._('Total')
        months.appendRow(total)
        page.appendBlock(months)

        # Also build the standalone yearly stats page reused by iframes
        filename = '%d/_stats.html' % (year)
        page_ = self.display.createPage(u'', filename, conf.css_path)
        page_.appendBlock(months)
        page_.build(conf.DISPLAY_ROOT, False)
        months.resetHTML()
|
2014-12-31 14:22:46 +01:00
|
|
|
|
2014-12-03 21:58:55 +01:00
|
|
|
    def _generateDisplayWholeMonthStats(self):
        """Build the top-level index.html: last update time, analysis
        duration, and one per-month summary table per analysed year
        (most recent year first)."""
        title = '%s %s' % (self._('Statistics for'), conf.domain_name)
        filename = 'index.html'

        self.logger.info('==> Generate main page (%s)' % (filename))

        page = self.display.createPage(title, filename, conf.css_path)

        last_update = u'<b>%s</b> %s<br />' % (self._(u'Last update'), time.strftime('%02d %b %Y %H:%M', time.localtime()))
        page.appendBlock(self.display.createBlock(DisplayHTMLRaw, last_update))
        # Elapsed time since analysis start, rendered as h/m/s
        duration = datetime.now() - self.start_time
        duration = time.gmtime(duration.seconds)
        time_analysis = u'<b>%s</b> ' % (self._('Time analysis'))
        if duration.tm_hour:
            time_analysis += u'%d %s, ' % (duration.tm_hour, self._(u'hours'))
        time_analysis += u'%d %s and %d %s<br />' % (duration.tm_min, self._(u'minutes'), duration.tm_sec, self._(u'seconds'))
        page.appendBlock(self.display.createBlock(DisplayHTMLRaw, time_analysis))

        for year in sorted(self.meta_infos['stats'].keys(), reverse=True):
            self._generateDisplayMonthStats(page, year, self.meta_infos['stats'][year])

        self.display.addPage(page)
|
|
|
|
|
2020-05-01 09:57:24 +02:00
|
|
|
def _compressFile(self, root, filename):
|
2014-12-15 21:28:25 +01:00
|
|
|
path = os.path.join(root, filename)
|
|
|
|
gz_path = path + '.gz'
|
2014-12-16 20:23:33 +01:00
|
|
|
|
|
|
|
self.logger.debug('Compress %s => %s' % (path, gz_path))
|
|
|
|
|
2014-12-15 21:28:25 +01:00
|
|
|
if not os.path.exists(gz_path) or\
|
2020-05-01 09:57:24 +02:00
|
|
|
os.stat(pa |