From bb268114b2123210e70221e8e7a4198bf5e285fd Mon Sep 17 00:00:00 2001 From: Gregory Soutade Date: Fri, 30 Aug 2019 07:50:54 +0200 Subject: [PATCH] Make backup before compressing (low memory servers) Fix error : Call post hook plugins even in display only mode Don't compute unordered hits (remove pasts if they are found after current) Remove tags in stats diff Don't do geolocalisation if visitor is not valid Don't try to find search engine on robots Update robot check rules Add top_pages_diff plugin --- iwla.py | 16 +++++--- plugins/display/istats_diff.py | 26 ++++++++++--- plugins/display/top_pages_diff.py | 61 ++++++++++++++++++++++++++++++ plugins/post_analysis/ip_to_geo.py | 2 + plugins/post_analysis/referers.py | 44 +++++++++++---------- plugins/pre_analysis/robots.py | 18 +++++++-- 6 files changed, 131 insertions(+), 36 deletions(-) create mode 100644 plugins/display/top_pages_diff.py diff --git a/iwla.py b/iwla.py index e0c4eff..a6983ed 100755 --- a/iwla.py +++ b/iwla.py @@ -252,12 +252,18 @@ class IWLA(object): if not os.path.exists(base): os.makedirs(base) + # Make a backup in case of something fails + if os.path.exists(filename): + shutil.copy(filename, filename + '.bak') + with open(filename + '.tmp', 'wb+') as f, self._openDB(filename, 'w') as fzip: cPickle.dump(obj, f) - os.fsync(f) f.seek(0) fzip.write(f.read()) + os.fsync(fzip) os.remove(filename + '.tmp') + if os.path.exists(filename + '.bak'): + os.remove(filename + '.bak') def _deserialize(self, filename): if not os.path.exists(filename): @@ -626,15 +632,13 @@ class IWLA(object): duplicated_stats['nb_visitors'] = stats['nb_visitors'] = len(self.valid_visitors.keys()) + self._callPlugins(conf.POST_HOOK_DIRECTORY) + if args.display_only: self._generateDisplay() return - self._callPlugins(conf.POST_HOOK_DIRECTORY) - path = self.getDBFilename(cur_time) - if os.path.exists(path) and not self.dry_run: - os.remove(path) self.logger.info("==> Serialize to %s" % (path)) 
self._serialize(self.current_analysis, path) @@ -701,6 +705,8 @@ class IWLA(object): self.logger.debug("Not in time") return False self.analyse_started = True + if t < cur_time: # Don't accept past hits + return False if cur_time.tm_mon != t.tm_mon: self._generateDayStats() self._generateMonthStats() diff --git a/plugins/display/istats_diff.py b/plugins/display/istats_diff.py index c5d50ca..ffb1aae 100644 --- a/plugins/display/istats_diff.py +++ b/plugins/display/istats_diff.py @@ -22,6 +22,7 @@ from iwla import IWLA from iplugin import IPlugin from display import * import logging +import re """ Display hook interface @@ -54,9 +55,11 @@ class IWLADisplayStatsDiff(IPlugin): self.month_stats_key = None # Set >= if month_stats[self.month_stats_key] is a list or a tuple self.stats_index = -1 + self.display_index = 1 self.filename = None self.block_name = None self.logger = logging.getLogger(__name__) + self.tag_re = re.compile(r'<[^>]+>') def load(self): if not self.month_stats_key or not self.filename or\ @@ -67,6 +70,10 @@ class IWLADisplayStatsDiff(IPlugin): self.cur_stats = {k:v for (k,v) in month_stats.get(self.month_stats_key, {}).items()} return True + # from https://tutorialedge.net/python/removing-html-from-string/ + def remove_tags(self, text): + return self.tag_re.sub('', text) + def hook(self): display = self.iwla.getDisplay() month_stats = self.iwla.getMonthStats() @@ -88,14 +95,21 @@ class IWLADisplayStatsDiff(IPlugin): if new_value: if self.stats_index != -1: if new_value[self.stats_index] != v[self.stats_index]: - stats_diff[k] = 'iwla_update' + diff_value = v[self.stats_index] - new_value[self.stats_index] + stats_diff[k] = ['iwla_update', diff_value] else: if new_value != v: - stats_diff[k] = 'iwla_update' + diff_value = v - new_value + stats_diff[k] = ['iwla_update', diff_value] else: - stats_diff[k] = 'iwla_new' + stats_diff[k] = ['iwla_new', 0] for (idx, row) in enumerate(block.rows): - for k in stats_diff.keys(): - if k in row[0]: - 
block.setCellCSSClass(idx, 0, stats_diff[k]) + clear_text = self.remove_tags(row[0]) + if clear_text in stats_diff.keys(): + (cls, diff) = stats_diff[clear_text] + block.setCellCSSClass(idx, 0, cls) + if diff: + value = block.getCellValue(idx, self.display_index) + value += ' (+%d)' % diff + block.setCellValue(idx, self.display_index, value) diff --git a/plugins/display/top_pages_diff.py b/plugins/display/top_pages_diff.py new file mode 100644 index 0000000..83ebbbb --- /dev/null +++ b/plugins/display/top_pages_diff.py @@ -0,0 +1,61 @@ +# -*- coding: utf-8 -*- +# +# Copyright Grégory Soutadé 2018 + +# This file is part of iwla + +# iwla is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# iwla is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with iwla. If not, see <http://www.gnu.org/licenses/>. 
+# + +from iwla import IWLA +from istats_diff import IWLADisplayStatsDiff +from display import * + +""" +Display hook + +Highlight new and updated pages in top_pages.html + +Plugin requirements : + display/top_pages + +Conf values needed : + None + +Output files : + None + +Statistics creation : + None + +Statistics update : + None + +Statistics deletion : + None +""" + +class IWLADisplayTopPagesDiff(IWLADisplayStatsDiff): + def __init__(self, iwla): + super(IWLADisplayTopPagesDiff, self).__init__(iwla) + self.API_VERSION = 1 + self.requires = ['IWLADisplayTopPages'] + self.month_stats_key = u'top_pages' + self.filename = u'top_pages.html' + self.block_name = self.iwla._(u'All Pages') + + def load(self): + if not self.iwla.getConfValue('create_all_pages_page', True): + return False + return super(IWLADisplayTopPagesDiff, self).load() diff --git a/plugins/post_analysis/ip_to_geo.py b/plugins/post_analysis/ip_to_geo.py index 4b0cf86..e99e5b3 100644 --- a/plugins/post_analysis/ip_to_geo.py +++ b/plugins/post_analysis/ip_to_geo.py @@ -82,6 +82,8 @@ class IWLAPostAnalysisIPToGeo(IPlugin): (_, cc) = self.iptogeo.ip_to_geo(ip) cc = cc and cc or 'ip' visitor['country_code'] = cc + if not self.iwla.isValidVisitor(visitor): + continue if cc in geo.keys(): geo[cc] += 1 else: diff --git a/plugins/post_analysis/referers.py b/plugins/post_analysis/referers.py index 37b470a..eed573e 100644 --- a/plugins/post_analysis/referers.py +++ b/plugins/post_analysis/referers.py @@ -140,30 +140,32 @@ class IWLAPostAnalysisReferers(IPlugin): uri = r['extract_referer']['extract_uri'] if self.own_domain_re.match(uri): continue - is_search_engine = False - for (name, engine) in self.search_engines.items(): - for (hashid, hashid_re) in engine['hashid']: - if not hashid_re.match(uri): continue - - not_engine = engine.get('not_search_engine', None) - # Try not engine - if not_engine and not_engine.match(uri): break - is_search_engine = True - uri = name - - parameters = 
r['extract_referer'].get('extract_parameters', None) - key_phrase_re = engine.get('known_url', None) - - self._extractKeyPhrase(key_phrase_re, parameters, key_phrases) - break - - if is_search_engine: - dictionary = search_engine_referers - elif super_hit['robot']: + if super_hit['robot']: dictionary = robots_referers # print '%s => %s' % (uri, super_hit['remote_ip']) else: - dictionary = referers + is_search_engine = False + for (name, engine) in self.search_engines.items(): + for (hashid, hashid_re) in engine['hashid']: + if not hashid_re.match(uri): continue + + not_engine = engine.get('not_search_engine', None) + # Try not engine + if not_engine and not_engine.match(uri): break + is_search_engine = True + uri = name + + parameters = r['extract_referer'].get('extract_parameters', None) + key_phrase_re = engine.get('known_url', None) + + self._extractKeyPhrase(key_phrase_re, parameters, key_phrases) + break + + if is_search_engine: + dictionary = search_engine_referers + else: + dictionary = referers + if r['is_page']: key = 'pages' else: diff --git a/plugins/pre_analysis/robots.py b/plugins/pre_analysis/robots.py index 375590e..9e16f68 100644 --- a/plugins/pre_analysis/robots.py +++ b/plugins/pre_analysis/robots.py @@ -109,6 +109,16 @@ class IWLAPreAnalysisRobots(IPlugin): # continue # 2) pages without hit --> robot + if not super_hit['viewed_hits'][0] and super_hit['viewed_pages'][0]: + self._setRobot(k, super_hit) + continue + +# 3) no pages and not hit --> robot + if not super_hit['viewed_hits'][0] and not super_hit['viewed_pages'][0]: + self._setRobot(k, super_hit) + continue + +# 4) pages without hit --> robot if not super_hit['viewed_hits'][0] and super_hit['viewed_pages'][0]: self.logger.debug(super_hit) self._setRobot(k, super_hit) @@ -116,15 +126,15 @@ class IWLAPreAnalysisRobots(IPlugin): not_found_pages = 0 for hit in super_hit['requests']: -# 3) /robots.txt read +# 5) /robots.txt read if hit['extract_request']['http_uri'].endswith('/robots.txt'): 
self._setRobot(k, super_hit) break - if int(hit['status']) == 404: + if int(hit['status']) == 404 or int(hit['status']) == 403: not_found_pages += 1 -# 4) Any referer for hits +# 6) Any referer for hits if not hit['is_page'] and hit['http_referer']: referers += 1 @@ -132,7 +142,7 @@ class IWLAPreAnalysisRobots(IPlugin): self._setRobot(k, super_hit) continue -# 5) more than 10 404 pages +# 7) more than 10 404/403 pages if not_found_pages > 10: self._setRobot(k, super_hit) continue