#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Copyright Grégory Soutadé 2015

# This file is part of iwla

# iwla is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# iwla is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with iwla.  If not, see <http://www.gnu.org/licenses/>.
#

import os
import shutil
import sys
import re
import time
import pickle
import gzip
import importlib
import argparse
import logging
import gettext
from calendar import monthrange
from datetime import date, datetime

import default_conf as conf

from iplugin import *
from display import *

"""
Main class IWLA
Parse Log, compute them, call plugins and produce output
For now, only HTTP log are valid

Plugin requirements :
    None

Conf values needed :
    analyzed_filename
    domain_name
    locales_path
    compress_output_files
    excluded_ip
    excluded_domain_name

Output files :
    DB_ROOT/meta.db
    DB_ROOT/year/month/iwla.db
    OUTPUT_ROOT/index.html
    OUTPUT_ROOT/year/_stats.html
    OUTPUT_ROOT/year/month/index.html

Statistics creation :

meta :
    last_time
    start_analysis_time
    stats =>
        year =>
            month =>
                viewed_bandwidth
                not_viewed_bandwidth
                viewed_pages
                viewed_hits
                nb_visits
                nb_visitors

month_stats :
    viewed_bandwidth
    not_viewed_bandwidth
    viewed_pages
    viewed_hits
    nb_visits

days_stats :
    day =>
        viewed_bandwidth
        not_viewed_bandwidth
        viewed_pages
        viewed_hits
        nb_visits
        nb_visitors

visits :
    remote_ip =>
        remote_addr
        remote_ip
        viewed_pages{0..31} # 0 contains total
        viewed_hits{0..31} # 0 contains total
        not_viewed_pages{0..31}
        not_viewed_hits{0..31}
        bandwidth{0..31}
        last_access
        requests =>
            [fields_from_format_log]
            extract_request =>
                http_method
                http_uri
                http_version
                extract_uri
                extract_parameters*
            extract_referer* =>
                extract_uri
                extract_parameters*
        robot
        hit_only
        is_page
        keep_requests

valid_visitors:
    month_stats without robot and hit only visitors (if not conf.count_hit_only_visitors)

Statistics update :
    None

Statistics deletion :
    None
"""


class IWLA(object):
    """Parse HTTP log lines, aggregate statistics, run plugins and build the display."""

    ANALYSIS_CLASS = 'HTTP'
    API_VERSION = 1
    IWLA_VERSION = '0.7'

    def __init__(self, logLevel, args):
        """Compile the regular expressions from conf and set up logging and locale.

        logLevel is passed to logging.basicConfig(); args is the parsed
        command-line namespace (dry_run, dont_compress, reset, display_only
        and disable_display are read by this class).
        """
        self.meta_infos = {}
        self.analyse_started = False
        self.current_analysis = {}
        self.start_time = 0
        self.cache_plugins = {}
        self.display = DisplayHTMLBuild(self)
        self.valid_visitors = None
        self.args = args

        # Escape every character of the log format that is neither '$', '?'
        # nor a word character, then turn each $name into a named group.
        self.log_format_extracted = re.sub(r'([^\$?\w])', r'\\\g<1>', conf.log_format)
        # Raw string: '\g' is a regex template reference, not a Python escape.
        self.log_format_extracted = re.sub(r'\$(\w+)', r'(?P<\g<1>>.+)', self.log_format_extracted)
        # Group names are the keys read back by _decodeHTTPRequest/_newHit.
        self.http_request_extracted = re.compile(
            r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')
        self.log_re = re.compile(self.log_format_extracted)
        self.uri_re = re.compile(r'(?P<extract_uri>[^\?#]+)(\?(?P<extract_parameters>[^#]+))?(#.*)?')
        self.domain_name_re = re.compile(r'.*%s' % conf.domain_name)
        self.slash_re = re.compile(r'//')
        self.protocol_re = re.compile(r'^.*://')
        self.excluded_ip = [re.compile(ip) for ip in conf.excluded_ip]
        self.excluded_domain_name = [re.compile(domain_name)
                                     for domain_name in conf.excluded_domain_name]
        self.multimedia_files_re = [re.compile(file_re)
                                    for file_re in conf.multimedia_files_re]
        self.plugins = [(conf.PRE_HOOK_DIRECTORY, conf.pre_analysis_hooks),
                        (conf.POST_HOOK_DIRECTORY, conf.post_analysis_hooks),
                        (conf.DISPLAY_HOOK_DIRECTORY, conf.display_hooks)]

        logging.basicConfig(format='%(name)s %(message)s', level=logLevel)
        self.logger = logging.getLogger(self.__class__.__name__)
        if self.args.dry_run:
            self.logger.info('==> Start (DRY RUN)')
        else:
            self.logger.info('==> Start')
        try:
            t = gettext.translation('iwla', localedir=conf.locales_path, languages=[conf.locale])
            self.logger.info('\tUsing locale %s' % (conf.locale))
        except IOError:
            # No translation catalog found: fall back to untranslated strings
            t = gettext.NullTranslations()
            self.logger.info('\tUsing default locale en_EN')
        self._ = t.gettext

    def getVersion(self):
        """Return the IWLA version string."""
        return IWLA.IWLA_VERSION

    def getConfValue(self, key, default=None):
        """Return conf.<key>, or default when conf has no such attribute."""
        return getattr(conf, key, default)

    def _clearVisits(self):
        """Reset the current month analysis and return the new empty structure."""
        self.current_analysis = {
            'days_stats': {},
            'month_stats': {},
            'visits': {}
        }
        self.valid_visitors = None
        return self.current_analysis

    def getDaysStats(self):
        """Return the per-day statistics of the current analysis."""
        return self.current_analysis['days_stats']

    def getMonthStats(self):
        """Return the month statistics of the current analysis."""
        return self.current_analysis['month_stats']

    def getCurrentVisits(self):
        """Return the visits dictionary of the current analysis."""
        return self.current_analysis['visits']

    def getSortedCurrentVisits(self):
        """Return the current visits as a list sorted by 'last_access'."""
        visits = self.current_analysis['visits'].values()
        return sorted(visits, key=lambda hit: hit['last_access'])

    def getValidVisitors(self):
        """Return the valid visitors computed by _generateMonthStats (or None)."""
        return self.valid_visitors

    def getDisplay(self):
        """Return the display builder."""
        return self.display

    def getCurTime(self):
        """Return meta 'last_time' (time of the last accepted hit)."""
        return self.meta_infos['last_time']

    def getStartAnalysisTime(self):
        """Return meta 'start_analysis_time'."""
        return self.meta_infos['start_analysis_time']

    def isValidForCurrentAnalysis(self, request):
        """Tell whether request is strictly newer than the analysis start time."""
        cur_time = self.meta_infos['start_analysis_time']
        # Analyse not started
        if not cur_time:
            return False
        return (time.mktime(cur_time) < time.mktime(request['time_decoded']))

    def hasBeenViewed(self, request):
        """Tell whether the request status is one of conf.viewed_http_codes."""
        return int(request['status']) in conf.viewed_http_codes

    def getCurDisplayPath(self, filename):
        """Return 'year/month/filename' for the current analysis time."""
        cur_time = self.meta_infos['last_time']
        return os.path.join(str(cur_time.tm_year), '%02d' % (cur_time.tm_mon), filename)

    def getResourcesPath(self):
        """Return conf.resources_path."""
        return conf.resources_path

    def getCSSPath(self):
        """Return conf.css_path."""
        return conf.css_path

    def _clearMeta(self):
        """Reset meta information and return the new empty structure."""
        self.meta_infos = {
            'last_time': None,
            'start_analysis_time': None
        }
        return self.meta_infos

    def _clearDisplay(self):
        """Clear the display builder and return it."""
        self.display.clear()
        return self.display

    def getDBFilename(self, time):
        """Return the month database path (DB_ROOT/year/month/DB_FILENAME) for time."""
        return os.path.join(conf.DB_ROOT, str(time.tm_year), '%02d' % (time.tm_mon), conf.DB_FILENAME)

    def _openDB(self, filename, prot='r'):
        """Open a database file, gzip-compressed unless args.dont_compress is set.

        Databases are pickles, so the file is always opened in binary mode:
        a bare 'r'/'w' is completed with 'b' (plain open() would otherwise
        default to text mode, which pickle cannot use under Python 3).
        """
        if 'b' not in prot and 't' not in prot:
            prot += 'b'
        if self.args.dont_compress:
            return open(filename, prot)
        return gzip.open(filename, prot)

    def _serialize(self, obj, filename):
        """Pickle obj into filename (no-op in dry-run mode).

        The previous file, if any, is copied to filename + '.bak' while the
        new one is written and removed once the write has succeeded.
        """
        if self.args.dry_run:
            return
        self.logger.info("==> Serialize to %s" % (filename))
        base = os.path.dirname(filename)
        if base and not os.path.exists(base):
            os.makedirs(base)

        tmp_filename = filename + '.tmp'
        bak_filename = filename + '.bak'

        # Make a backup in case of something fails
        if os.path.exists(filename):
            shutil.copy(filename, bak_filename)

        try:
            with open(tmp_filename, 'wb+') as f, self._openDB(filename, 'w') as fzip:
                pickle.dump(obj, f)
                f.seek(0)
                shutil.copyfileobj(f, fzip)
                # Push buffered data down to the OS before asking for a sync
                fzip.flush()
                os.fsync(fzip.fileno())
        finally:
            # The temporary pickle is never useful afterwards
            if os.path.exists(tmp_filename):
                os.remove(tmp_filename)
        if os.path.exists(bak_filename):
            os.remove(bak_filename)

    def _deserialize(self, filename):
        """Load and return the pickled object in filename, or None if it is missing.

        NOTE: pickle must only be used on trusted files; the databases read
        here are the ones written by _serialize.
        """
        if not os.path.exists(filename):
            return None
        with self._openDB(filename) as f:
            return pickle.load(f)

    def _callPlugins(self, target_root, *args):
        """Call hook(*args) on every preloaded plugin registered under target_root."""
        self.logger.info('==> Call plugins (%s)' % (target_root))
        for (root, plugins) in self.plugins:
            if root != target_root:
                continue
            for p in plugins:
                mod = self.cache_plugins.get(root + '.' + p, None)
                if mod:
                    self.logger.info('\t%s' % (p))
                    mod.hook(*args)

    def isPage(self, request):
        """Tell whether a URI is a page.

        A URI is a page when it ends with one of conf.pages_extensions or
        when its last path component contains no '.'.
        """
        self.logger.debug("Is page %s" % (request))
        for e in conf.pages_extensions:
            if request.endswith(e):
                self.logger.debug("True")
                return True
        # No extension -> page
        if '.' not in request.split('/')[-1]:
            self.logger.debug("True")
            return True
        self.logger.debug("False")
        return False

    def isMultimediaFile(self, uri):
        """Tell whether uri matches conf.multimedia_files or conf.multimedia_files_re."""
        self.logger.debug("Is multimedia %s" % (uri))
        lowered = uri.lower()
        for e in conf.multimedia_files:
            if lowered.endswith(e):
                self.logger.debug("True")
                return True
        self.logger.debug("False")
        for file_re in self.multimedia_files_re:
            if file_re.match(uri):
                self.logger.debug("Is multimedia re True")
                return True
        return False

    def isValidVisitor(self, hit):
        """Tell whether a visit counts as a visitor.

        Robots never count; visitors without any viewed page only count when
        conf.count_hit_only_visitors is set.
        """
        if hit['robot']:
            return False
        if not conf.count_hit_only_visitors and not hit['viewed_pages'][0]:
            return False
        return True

    def isRobot(self, hit):
        """Tell whether the visit has been flagged as a robot."""
        # By default robot is None
        return hit['robot'] == True

    def _appendHit(self, hit):
        """Add a decoded hit to its visitor and update the visitor counters."""
        remote_ip = hit['remote_ip']

        if not remote_ip:
            return

        for ip in self.excluded_ip:
            if ip.match(remote_ip):
                return

        # Redirected page/hit
        if int(hit['status']) in (301, 302, 307, 308):
            return

        visits = self.current_analysis['visits']
        if remote_ip not in visits:
            self._createVisitor(hit)

        super_hit = visits[remote_ip]
        # Don't keep all requests for robots
        if not super_hit['robot']:
            super_hit['requests'].append(hit)

        day = self.meta_infos['last_time'].tm_mday
        viewed = self.hasBeenViewed(hit)
        if viewed:
            sent = int(hit['body_bytes_sent'])
            super_hit['bandwidth'][day] = super_hit['bandwidth'].get(day, 0) + sent
            super_hit['bandwidth'][0] += sent
        super_hit['last_access'] = self.meta_infos['last_time']

        request = hit['extract_request']
        uri = request.get('extract_uri', request['http_uri'])

        hit['is_page'] = self.isPage(uri)

        if super_hit['robot'] or not viewed:
            page_key = 'not_viewed_pages'
            hit_key = 'not_viewed_hits'
        else:
            page_key = 'viewed_pages'
            hit_key = 'viewed_hits'

        # Index 0 holds the month total, index <day> the daily count
        key = page_key if hit['is_page'] else hit_key
        super_hit[key][day] = super_hit[key].get(day, 0) + 1
        super_hit[key][0] += 1

    def _createVisitor(self, hit):
        """Create an empty visitor entry keyed by hit['remote_addr']."""
        super_hit = self.current_analysis['visits'][hit['remote_addr']] = {}
        super_hit['remote_addr'] = hit['remote_addr']
        super_hit['remote_ip'] = hit['remote_addr']
        super_hit['viewed_pages'] = {0: 0}
        super_hit['viewed_hits'] = {0: 0}
        super_hit['not_viewed_pages'] = {0: 0}
        super_hit['not_viewed_hits'] = {0: 0}
        super_hit['bandwidth'] = {0: 0}
        super_hit['last_access'] = self.meta_infos['last_time']
        super_hit['requests'] = []
        super_hit['robot'] = None
        super_hit['hit_only'] = 0

    def _normalizeURI(self, uri, removeFileSlash=True):
        """Strip the protocol, collapse '//' and optionally remove trailing slashes."""
        if uri == '/':
            return uri
        # Remove protocol
        uri = self.protocol_re.sub('', uri)
        # Remove double /
        uri = self.slash_re.sub('/', uri)
        if removeFileSlash:
            while len(uri) > 1 and uri[-1] == '/':
                uri = uri[:-1]
        return uri

    def _normalizeParameters(self, parameters):
        """Return parameters, or None when they are reduced to a lone '?'."""
        # No parameters
        if parameters == '?':
            return None
        return parameters

    def _decodeHTTPRequest(self, hit):
        """Split hit['request'] and hit['http_referer'] into their components.

        Fills hit['extract_request'], hit['extract_referer'] (when a referer
        is present and parsable) and hit['remote_ip'].  Returns False when
        the request line is missing or cannot be parsed, True otherwise.
        """
        if 'request' not in hit:
            return False

        groups = self.http_request_extracted.match(hit['request'])
        if not groups:
            self.logger.warning("Bad request extraction %s" % (hit['request']))
            return False

        hit['extract_request'] = groups.groupdict("")
        uri_groups = self.uri_re.match(hit['extract_request']['http_uri'])
        if uri_groups:
            d = uri_groups.groupdict("")
            hit['extract_request']['extract_uri'] = self._normalizeURI(d['extract_uri'])
            if 'extract_parameters' in d:
                parameters = self._normalizeParameters(d['extract_parameters'])
                if parameters:
                    hit['extract_request']['extract_parameters'] = parameters

        if hit['http_referer']:
            referer_groups = self.uri_re.match(hit['http_referer'])
            if referer_groups:
                referer = hit['extract_referer'] = referer_groups.groupdict("")
                referer['extract_uri'] = self._normalizeURI(referer['extract_uri'])
                referer['extract_parameters'] = self._normalizeParameters(referer['extract_parameters'])

        hit['remote_ip'] = hit['remote_addr']

        return True

    def _decodeTime(self, hit):
        """Parse hit['time_local'] with conf.time_format into hit['time_decoded'].

        Raises ValueError when the time cannot be parsed.
        """
        try:
            hit['time_decoded'] = time.strptime(hit['time_local'], conf.time_format)
        except ValueError:
            if sys.version_info >= (3, 2):
                raise
            # Try without UTC value at the end (%z not recognized).
            # The offset itself is dropped, not applied.
            hit['time_decoded'] = time.strptime(hit['time_local'][:-6], conf.time_format[:-3])
        return hit['time_decoded']

    def getDisplayIndex(self):
        """Return the display page 'index.html' of the current month."""
        filename = self.getCurDisplayPath('index.html')
        return self.display.getPage(filename)

    def _getMonthsName(self):
        """Return translated short month names, 1-indexed (index 0 is empty)."""
        return ['', self._('Jan'), self._('Feb'), self._('Mar'), self._('Apr'),
                self._('May'), self._('June'), self._('Jul'), self._('Aug'),
                self._('Sep'), self._('Oct'), self._('Nov'), self._('Dec')]

    def _generateDisplayDaysStats(self):
        """Build the current month page with its per-day statistics table."""
        cur_time = self.meta_infos['last_time']
        title = createCurTitle(self, self._('Statistics'))
        filename = self.getCurDisplayPath('index.html')
        self.logger.info('==> Generate display (%s)' % (filename))
        page = self.display.createPage(title, filename, conf.css_path)
        # NOTE(review): this literal is empty in the reviewed copy; it may
        # originally have carried markup -- confirm against upstream.
        link = DisplayHTMLRaw(self, '')
        page.appendBlock(link)

        _, nb_month_days = monthrange(cur_time.tm_year, cur_time.tm_mon)
        days = self.display.createBlock(DisplayHTMLBlockTableWithGraph, self._('By day'),
                                        [self._('Day'), self._('Visits'), self._('Pages'), self._('Hits'),
                                         self._('Bandwidth'), self._('Not viewed Bandwidth')],
                                        None, nb_month_days, range(1, 6), [4, 5])
        days.setColsCSSClass(['', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth'])
        nb_visits = 0
        nb_days = 0
        month = self._getMonthsName()[cur_time.tm_mon]
        days_stats = self.current_analysis['days_stats']
        today = date.today()
        for i in range(1, nb_month_days + 1):
            # NOTE(review): separator rebuilt as <br/>; the reviewed copy had
            # a raw line break inside the literal -- confirm.
            day = '%d<br/>%s' % (i, month)
            full_day = '%02d %s %d' % (i, month, cur_time.tm_year)
            if i in days_stats:
                stats = days_stats[i]
                row = [full_day, stats['nb_visits'], stats['viewed_pages'], stats['viewed_hits'],
                       stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
                nb_visits += stats['nb_visits']
                nb_days += 1
            else:
                row = [full_day, 0, 0, 0, 0, 0]
            days.appendRow(row)
            viewed_bandwidth = row[4]
            not_viewed_bandwidth = row[5]
            days.setCellValue(i - 1, 4, viewed_bandwidth)
            days.setCellValue(i - 1, 5, not_viewed_bandwidth)
            days.appendShortTitle(day)
            adate = date(cur_time.tm_year, cur_time.tm_mon, i)
            # weekday(): 5 is Saturday, 6 is Sunday
            if adate.weekday() in (5, 6):
                days.setRowCSSClass(i - 1, 'iwla_weekend')
            if adate == today:
                css = days.getCellCSSClass(i - 1, 0)
                if css:
                    css = '%s %s' % (css, 'iwla_curday')
                else:
                    css = 'iwla_curday'
                days.setCellCSSClass(i - 1, 0, css)

        stats = self.current_analysis['month_stats']

        row = [0, nb_visits, stats['viewed_pages'], stats['viewed_hits'],
               stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
        # Average over the days that actually have statistics
        if nb_days:
            average_row = [int(v / nb_days) for v in row]
        else:
            average_row = [0] * len(row)

        average_row[0] = self._('Average')
        days.appendRow(average_row)

        row[0] = self._('Total')
        days.appendRow(row)
        page.appendBlock(days)
        self.display.addPage(page)

    def _generateDisplayMonthStats(self, page, year, month_stats):
        """Append the summary table of year to page and build year/_stats.html."""
        cur_time = time.localtime()
        months_name = self._getMonthsName()
        title = '%s %d' % (self._('Summary'), year)
        cols = [self._('Month'), self._('Visitors'), self._('Visits'), self._('Pages'),
                self._('Hits'), self._('Bandwidth'), self._('Not viewed Bandwidth')]
        graph_cols = range(1, 6)
        months = self.display.createBlock(DisplayHTMLBlockTableWithGraph, title, cols,
                                          None, 12, graph_cols, [5, 6])
        months.setColsCSSClass(['', 'iwla_visitor', 'iwla_visit', 'iwla_page', 'iwla_hit',
                                'iwla_bandwidth', 'iwla_bandwidth'])
        total = [0] * len(cols)
        for i in range(1, 13):
            # NOTE(review): separator rebuilt as <br/>; the reviewed copy had
            # a raw line break inside the literal -- confirm.
            month = '%s<br/>%d' % (months_name[i], year)
            full_month = '%s %d' % (months_name[i], year)
            # NOTE(review): the link markup was missing from the reviewed copy
            # (a bare '%s' with three arguments); it is rebuilt from the
            # year/month/index.html output layout -- confirm href/target.
            link_month = '<a target="_top" href="/%d/%02d/index.html">%s</a>' % (year, i, full_month)
            if i in month_stats:
                stats = month_stats[i]
                row = [link_month, stats['nb_visitors'], stats['nb_visits'], stats['viewed_pages'],
                       stats['viewed_hits'], stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
                for j in range(1, 7):
                    total[j] += row[j]
            else:
                row = [full_month, 0, 0, 0, 0, 0, 0]
            months.appendRow(row)
            months.appendShortTitle(month)
            if year == cur_time.tm_year and i == cur_time.tm_mon:
                css = months.getCellCSSClass(i - 1, 0)
                if css:
                    css = '%s %s' % (css, 'iwla_curday')
                else:
                    css = 'iwla_curday'
                months.setCellCSSClass(i - 1, 0, css)

        total[0] = self._('Total')
        months.appendRow(total)
        page.appendBlock(months)

        # Also build the same table as a standalone page for this year
        filename = '%d/_stats.html' % (year)
        page_ = self.display.createPage(u'', filename, conf.css_path)
        page_.appendBlock(months)
        page_.build(conf.DISPLAY_ROOT, False)
        months.resetHTML()

    def _generateDisplayWholeMonthStats(self):
        """Build the main index.html page with one summary table per year."""
        title = '%s %s' % (self._('Statistics for'), conf.domain_name)
        filename = 'index.html'

        self.logger.info('==> Generate main page (%s)' % (filename))

        page = self.display.createPage(title, filename, conf.css_path)

        # NOTE(review): the trailing <br /> of the two literals below is
        # rebuilt; the reviewed copy had a raw line break there -- confirm.
        # '%d' is already zero padded, '%02d' is a non-portable extension.
        last_update = u'%s %s<br />' % (self._(u'Last update'),
                                        time.strftime('%d %b %Y %H:%M', time.localtime()))
        page.appendBlock(self.display.createBlock(DisplayHTMLRaw, last_update))
        duration = datetime.now() - self.start_time
        duration = time.gmtime(duration.seconds)
        time_analysis = u'%s ' % (self._('Time analysis'))
        if duration.tm_hour:
            time_analysis += u'%d %s, ' % (duration.tm_hour, self._(u'hours'))
        time_analysis += u'%d %s and %d %s<br />' % (duration.tm_min, self._(u'minutes'),
                                                     duration.tm_sec, self._(u'seconds'))
        page.appendBlock(self.display.createBlock(DisplayHTMLRaw, time_analysis))

        for year in sorted(self.meta_infos['stats'].keys(), reverse=True):
            self._generateDisplayMonthStats(page, year, self.meta_infos['stats'][year])

        self.display.addPage(page)

    def _compressFile(self, root, filename):
        """Gzip root/filename next to itself when the .gz is missing or older."""
        path = os.path.join(root, filename)
        gz_path = path + '.gz'

        self.logger.debug('Compress %s => %s' % (path, gz_path))

        if os.path.exists(gz_path) and \
           os.stat(path).st_mtime <= os.stat(gz_path).st_mtime:
            # Compressed copy is up to date
            return
        if self.args.dry_run:
            return
        with open(path, 'rb') as f_in, gzip.open(gz_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

    def _compressFiles(self, root):
        """Compress every file under root ending with one of conf.compress_output_files."""
        if not conf.compress_output_files:
            return
        for rootdir, subdirs, files in os.walk(root, followlinks=True):
            for f in files:
                if any(f.endswith(ext) for ext in conf.compress_output_files):
                    self._compressFile(rootdir, f)

    def _generateDisplay(self):
        """Generate all pages, call display plugins and write the output."""
        if self.args.disable_display:
            return
        self._generateDisplayDaysStats()
        self._callPlugins(conf.DISPLAY_HOOK_DIRECTORY)
        self._generateDisplayWholeMonthStats()
        if self.args.dry_run:
            return
        self.display.build(conf.DISPLAY_ROOT)
        self._compressFiles(conf.DISPLAY_ROOT)

    def _createEmptyStats(self):
        """Return a statistics dictionary with every counter set to 0."""
        return {
            'viewed_bandwidth': 0,
            'not_viewed_bandwidth': 0,
            'viewed_pages': 0,
            'viewed_hits': 0,
            'nb_visits': 0,
        }

    def _generateMonthStats(self):
        """Aggregate day statistics into month statistics, save them and display.

        Post-analysis plugins are called once valid visitors are known.  In
        display-only mode nothing is written to the databases.
        """
        self._clearDisplay()

        visits = self.current_analysis['visits']

        stats = self._createEmptyStats()
        for stat in self.current_analysis['days_stats'].values():
            for k in stats:
                stats[k] += stat[k]

        # Independent copy stored in meta information
        duplicated_stats = dict(stats)

        cur_time = self.meta_infos['last_time']
        self.logger.info("== Stats for %d/%02d ==" % (cur_time.tm_year, cur_time.tm_mon))
        self.logger.info(stats)

        if 'month_stats' not in self.current_analysis:
            self.current_analysis['month_stats'] = stats
        else:
            # Update in place: keys set by others in month_stats are kept
            for (k, v) in stats.items():
                self.current_analysis['month_stats'][k] = v

        self.valid_visitors = {k: v for (k, v) in visits.items() if self.isValidVisitor(v)}

        duplicated_stats['nb_visitors'] = stats['nb_visitors'] = len(self.valid_visitors)

        self._callPlugins(conf.POST_HOOK_DIRECTORY)

        if self.args.display_only:
            if 'stats' not in self.meta_infos:
                self.meta_infos['stats'] = {}
            self._generateDisplay()
            return

        for v in visits.values():
            # Keep at least one request (for referers...)
            if not v.get('keep_requests', conf.keep_requests):
                if len(v['requests']) > 1:
                    v['requests'] = [v['requests'][0]]

        path = self.getDBFilename(cur_time)

        self._serialize(self.current_analysis, path)

        # Save month stats
        year = cur_time.tm_year
        month = cur_time.tm_mon
        all_stats = self.meta_infos.setdefault('stats', {})
        all_stats.setdefault(year, {})[month] = duplicated_stats

        meta_path = os.path.join(conf.DB_ROOT, conf.META_FILENAME)
        self._serialize(self.meta_infos, meta_path)

        self._generateDisplay()

    def _generateDayStats(self):
        """Call pre-analysis plugins and compute statistics of the current day."""
        self._callPlugins(conf.PRE_HOOK_DIRECTORY)

        visits = self.current_analysis['visits']
        cur_time = self.meta_infos['last_time']

        stats = self._createEmptyStats()

        day = cur_time.tm_mday
        for super_hit in visits.values():
            # Only visitors whose last access is on this day of month
            if super_hit['last_access'].tm_mday != day:
                continue
            if super_hit['robot']:
                stats['not_viewed_bandwidth'] += super_hit['bandwidth'].get(day, 0)
                continue
            stats['viewed_bandwidth'] += super_hit['bandwidth'].get(day, 0)
            stats['viewed_hits'] += super_hit['viewed_hits'].get(day, 0)
            stats['viewed_pages'] += super_hit['viewed_pages'].get(day, 0)
            if ((conf.count_hit_only_visitors and super_hit['viewed_hits'].get(day, 0)) or
                    super_hit['viewed_pages'].get(day, 0)):
                stats['nb_visits'] += 1

        self.logger.info("== Stats for %d/%02d/%02d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday))
        self.logger.info(stats)

        self.current_analysis['days_stats'][cur_time.tm_mday] = stats

    def _newHit(self, hit):
        """Filter one parsed log line and add it to the analysis.

        Day/month statistics are generated when the hit starts a new day or
        a new month.  Returns True when the hit has been handed to
        _appendHit, False when it has been rejected.
        """
        if not self.domain_name_re.match(hit['server_name']):
            self.logger.debug("Not in domain %s" % (hit))
            return False

        for domain_name in self.excluded_domain_name:
            if domain_name.match(hit['server_name']):
                self.logger.debug("Domain name %s excluded" % (hit['server_name']))
                return False

        t = self._decodeTime(hit)

        cur_time = self.meta_infos['last_time']

        if cur_time is None:
            self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
            self.analyse_started = True
        else:
            if not self.analyse_started and \
               time.mktime(t) <= time.mktime(cur_time):
                self.logger.debug("Not in time")
                return False
            self.analyse_started = True
            if t < cur_time:  # Don't accept past hits
                return False
            if cur_time.tm_mon != t.tm_mon:
                # Month changed: close it and load (or create) the new one
                self._generateDayStats()
                self._generateMonthStats()
                self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
            elif cur_time.tm_mday != t.tm_mday:
                self._generateDayStats()

        self.meta_infos['last_time'] = t

        if not self.meta_infos['start_analysis_time']:
            self.meta_infos['start_analysis_time'] = t

        if not self._decodeHTTPRequest(hit):
            return False

        if hit['extract_request']['http_method'] not in ['GET', 'POST']:
            return False

        # '-' and '*' are placeholders for empty values
        for k in hit.keys():
            if hit[k] == '-' or hit[k] == '*':
                hit[k] = ''

        self._appendHit(hit)

        return True

    def _removeTree(self, path):
        """Recursively delete path when it exists."""
        if os.path.exists(path):
            shutil.rmtree(path)

    def _reset(self):
        """Reset the analysis to args.reset ('month/year') and delete later data.

        Every year directory from the reset year up to (excluding) the current
        year is removed entirely, then the months from the reset month up to
        the current month are removed for the current year.
        """
        reset_time = time.strptime(self.args.reset, '%m/%Y')

        self.logger.info('Reset time')
        self.logger.info(reset_time)

        self.meta_infos['last_time'] = reset_time

        cur_time = time.localtime()
        year = reset_time.tm_year
        while year < cur_time.tm_year:
            self._removeTree(os.path.join(conf.DB_ROOT, str(year)))
            self._removeTree(os.path.join(conf.DISPLAY_ROOT, str(year)))
            year += 1
        month = reset_time.tm_mon
        while month <= cur_time.tm_mon:
            self._removeTree(os.path.join(conf.DB_ROOT, str(year), '%02d' % (month)))
            self._removeTree(os.path.join(conf.DISPLAY_ROOT, str(year), '%02d' % (month)))
            month += 1

    def start(self, _file):
        """Analyse every line of _file (any iterable of log lines)."""
        self.start_time = datetime.now()

        meta_path = os.path.join(conf.DB_ROOT, conf.META_FILENAME)
        if os.path.exists(meta_path):
            self.logger.info('==> Load previous database')

        self.meta_infos = self._deserialize(meta_path) or self._clearMeta()
        if self.meta_infos['last_time']:
            # Use the instance arguments, not a module-level global
            if self.args.reset:
                self._reset()
            self.logger.info('Last time')
            self.logger.info(self.meta_infos['last_time'])
            self.current_analysis = self._deserialize(self.getDBFilename(self.meta_infos['last_time'])) or self._clearVisits()
        else:
            self._clearVisits()

        self.meta_infos['start_analysis_time'] = None

        self.cache_plugins = preloadPlugins(self.plugins, self)

        self.logger.info('==> Analysing log')

        for line in _file:
            groups = self.log_re.match(line)

            if groups:
                self._newHit(groups.groupdict(""))
            else:
                self.logger.warning("No match for %s" % (line))

        if self.analyse_started:
            self._generateDayStats()
            self._generateMonthStats()
            del self.meta_infos['start_analysis_time']
        else:
            self.logger.info('==> Analyse not started : nothing new')

    def displayOnly(self, start_time):
        """Regenerate the display of month start_time ('month/year') from its database."""
        self.start_time = datetime.now()

        meta_path = os.path.join(conf.DB_ROOT, conf.META_FILENAME)
        if os.path.exists(meta_path):
            self.logger.info('==> Load previous database')

        self.meta_infos = self._deserialize(meta_path) or self._clearMeta()

        self.meta_infos['last_time'] = time.strptime(start_time, '%m/%Y')

        self.logger.info('Last time')
        self.logger.info(self.meta_infos['last_time'])
        self.current_analysis = self._deserialize(self.getDBFilename(self.meta_infos['last_time'])) or self._clearVisits()

        self.meta_infos['start_analysis_time'] = None

        self.cache_plugins = preloadPlugins(self.plugins, self)

        self.logger.info('==> Analysing log')

        self._generateDayStats()
        self._generateMonthStats()


class FileIter(object):
    """Iterate over the lines of several log files (plain or gzip), in order."""

    def __init__(self, filenames):
        """filenames is a comma separated list; exits when a file is missing."""
        self.filenames = [f for f in filenames.split(',') if f]
        for f in self.filenames:
            if not os.path.exists(f):
                print('No such file \'%s\'' % (f))
                sys.exit(-1)
        self.cur_file = None
        # With no file at all, iteration simply yields nothing
        if self.filenames:
            self._openNextFile()

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def _openNextFile(self):
        """Close the current file and open the next one.

        Raises StopIteration when no file is left.
        """
        if self.cur_file:
            self.cur_file.close()
            self.cur_file = None
        if not self.filenames:
            raise StopIteration()
        filename = self.filenames.pop(0)
        if filename.endswith('gz'):
            self.cur_file = gzip.open(filename, 'rt')
        else:
            self.cur_file = open(filename, 'rt')

    def next(self):
        """Return the next line without its end of line character."""
        line = self.cur_file.readline() if self.cur_file else ''
        # Skip exhausted (or empty) files; StopIteration ends the iteration
        while not line:
            self._openNextFile()
            line = self.cur_file.readline()
        # Only strip a real newline: the last line of a file may lack one
        if line.endswith('\n'):
            line = line[:-1]
        return line


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Intelligent Web Log Analyzer')

    parser.add_argument('-c', '--config-file', dest='config_file',
                        default='conf.py', type=str,
                        help='Config file to use (default conf.py)')

    parser.add_argument('-C', '--clean-output', dest='clean_output', action='store_true',
                        default=False,
                        help='Clean output before starting')

    parser.add_argument('-i', '--stdin', dest='stdin', action='store_true',
                        default=False,
                        help='Read data from stdin instead of conf.analyzed_filename')

    parser.add_argument('-f', '--file', dest='file',
                        help='Analyse this log file, multiple files can be specified (comma separated). gz files are accepted')

    parser.add_argument('-d', '--log-level', dest='loglevel',
                        default='INFO', type=str,
                        help='Loglevel in %s, default : %s' % (['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], 'INFO'))

    parser.add_argument('-r', '--reset', dest='reset',
                        default=False,
                        help='Reset analysis to a specific date (month/year)')

    parser.add_argument('-z', '--dont-compress', dest='dont_compress', action='store_true',
                        default=False,
                        help='Don\'t compress databases (bigger but faster, not compatible with compressed databases)')

    parser.add_argument('-p', '--display-only', dest='display_only',
                        default='', type=str,
                        help='Only generate display for a specific date (month/year)')

    parser.add_argument('-P', '--disable-display', dest='disable_display', action='store_true',
                        default=False,
                        help='Don\'t generate display')

    parser.add_argument('-D', '--dry-run', dest='dry_run', action='store_true',
                        default=False,
                        help='Process log but don\'t write files (database and HTML) to disk')

    args = parser.parse_args()

    if args.config_file.endswith('.py'):
        args.config_file = args.config_file[:-3]
    user_conf = importlib.import_module(args.config_file)

    # Load user conf: 'xxx_append' extends the default list 'xxx',
    # any other name overrides the default value
    for (k, v) in user_conf.__dict__.items():
        if k.endswith('_append'):
            new_k = k[:-7]
            if new_k in dir(conf):
                if isinstance(conf.__dict__[new_k], list):
                    if isinstance(v, list):
                        conf.__dict__[new_k] += v
                    else:
                        conf.__dict__[new_k].append(v)
                else:
                    print("Error %s is not a list" % (new_k))
            else:
                print("Error %s doesn't exists in default conf" % (new_k))
        else:
            conf.__dict__.update({k: v})

    if args.clean_output and not args.dry_run:
        if os.path.exists(conf.DB_ROOT):
            shutil.rmtree(conf.DB_ROOT)
        if os.path.exists(conf.DISPLAY_ROOT):
            shutil.rmtree(conf.DISPLAY_ROOT)

    loglevel = getattr(logging, args.loglevel.upper(), None)
    if not isinstance(loglevel, int):
        raise ValueError('Invalid log level: %s' % (args.loglevel))

    iwla = IWLA(loglevel, args)

    required_conf = ['analyzed_filename', 'domain_name']
    if not validConfRequirements(required_conf, iwla, 'Main Conf'):
        sys.exit(0)

    if args.display_only:
        iwla.displayOnly(args.display_only)
    else:
        if args.stdin:
            iwla.start(sys.stdin)
        else:
            filename = args.file or conf.analyzed_filename
            iwla.start(FileIter(filename))