diff --git a/iwla.py b/iwla.py index e0c4eff..a6983ed 100755 --- a/iwla.py +++ b/iwla.py @@ -252,12 +252,18 @@ class IWLA(object): if not os.path.exists(base): os.makedirs(base) + # Make a backup in case of something fails + if os.path.exists(filename): + shutil.copy(filename, filename + '.bak') + with open(filename + '.tmp', 'wb+') as f, self._openDB(filename, 'w') as fzip: cPickle.dump(obj, f) - os.fsync(f) f.seek(0) fzip.write(f.read()) + os.fsync(fzip) os.remove(filename + '.tmp') + if os.path.exists(filename + '.bak'): + os.remove(filename + '.bak') def _deserialize(self, filename): if not os.path.exists(filename): @@ -626,15 +632,13 @@ class IWLA(object): duplicated_stats['nb_visitors'] = stats['nb_visitors'] = len(self.valid_visitors.keys()) + self._callPlugins(conf.POST_HOOK_DIRECTORY) + if args.display_only: self._generateDisplay() return - self._callPlugins(conf.POST_HOOK_DIRECTORY) - path = self.getDBFilename(cur_time) - if os.path.exists(path) and not self.dry_run: - os.remove(path) self.logger.info("==> Serialize to %s" % (path)) self._serialize(self.current_analysis, path) @@ -701,6 +705,8 @@ class IWLA(object): self.logger.debug("Not in time") return False self.analyse_started = True + if t < cur_time: # Don't accept past hits + return False if cur_time.tm_mon != t.tm_mon: self._generateDayStats() self._generateMonthStats() diff --git a/plugins/display/istats_diff.py b/plugins/display/istats_diff.py index c5d50ca..ffb1aae 100644 --- a/plugins/display/istats_diff.py +++ b/plugins/display/istats_diff.py @@ -22,6 +22,7 @@ from iwla import IWLA from iplugin import IPlugin from display import * import logging +import re """ Display hook interface @@ -54,9 +55,11 @@ class IWLADisplayStatsDiff(IPlugin): self.month_stats_key = None # Set >= if month_stats[self.month_stats_key] is a list or a tuple self.stats_index = -1 + self.display_index = 1 self.filename = None self.block_name = None self.logger = logging.getLogger(__name__) + self.tag_re = 
re.compile(r'<[^>]+>') def load(self): if not self.month_stats_key or not self.filename or\ @@ -67,6 +70,10 @@ class IWLADisplayStatsDiff(IPlugin): self.cur_stats = {k:v for (k,v) in month_stats.get(self.month_stats_key, {}).items()} return True + # from https://tutorialedge.net/python/removing-html-from-string/ + def remove_tags(self, text): + return self.tag_re.sub('', text) + def hook(self): display = self.iwla.getDisplay() month_stats = self.iwla.getMonthStats() @@ -88,14 +95,21 @@ class IWLADisplayStatsDiff(IPlugin): if new_value: if self.stats_index != -1: if new_value[self.stats_index] != v[self.stats_index]: - stats_diff[k] = 'iwla_update' + diff_value = v[self.stats_index] - new_value[self.stats_index] + stats_diff[k] = ['iwla_update', diff_value] else: if new_value != v: - stats_diff[k] = 'iwla_update' + diff_value = v - new_value + stats_diff[k] = ['iwla_update', diff_value] else: - stats_diff[k] = 'iwla_new' + stats_diff[k] = ['iwla_new', 0] for (idx, row) in enumerate(block.rows): - for k in stats_diff.keys(): - if k in row[0]: - block.setCellCSSClass(idx, 0, stats_diff[k]) + clear_text = self.remove_tags(row[0]) + if clear_text in stats_diff.keys(): + (cls, diff) = stats_diff[clear_text] + block.setCellCSSClass(idx, 0, cls) + if diff: + value = block.getCellValue(idx, self.display_index) + value += ' (+%d)' % diff + block.setCellValue(idx, self.display_index, value) diff --git a/plugins/display/top_pages_diff.py b/plugins/display/top_pages_diff.py new file mode 100644 index 0000000..83ebbbb --- /dev/null +++ b/plugins/display/top_pages_diff.py @@ -0,0 +1,61 @@ +# -*- coding: utf-8 -*- +# +# Copyright Grégory Soutadé 2018 + +# This file is part of iwla + +# iwla is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# iwla is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with iwla. If not, see <http://www.gnu.org/licenses/>. +# + +from iwla import IWLA +from istats_diff import IWLADisplayStatsDiff +from display import * + +""" +Display hook + +Highlight new and updated pages in top_pages.html + +Plugin requirements : + display/top_pages + +Conf values needed : + None + +Output files : + None + +Statistics creation : + None + +Statistics update : + None + +Statistics deletion : + None +""" + +class IWLADisplayTopPagesDiff(IWLADisplayStatsDiff): + def __init__(self, iwla): + super(IWLADisplayTopPagesDiff, self).__init__(iwla) + self.API_VERSION = 1 + self.requires = ['IWLADisplayTopPages'] + self.month_stats_key = u'top_pages' + self.filename = u'top_pages.html' + self.block_name = self.iwla._(u'All Pages') + + def load(self): + if not self.iwla.getConfValue('create_all_pages_page', True): + return False + return super(IWLADisplayTopPagesDiff, self).load() diff --git a/plugins/post_analysis/ip_to_geo.py b/plugins/post_analysis/ip_to_geo.py index 4b0cf86..e99e5b3 100644 --- a/plugins/post_analysis/ip_to_geo.py +++ b/plugins/post_analysis/ip_to_geo.py @@ -82,6 +82,8 @@ class IWLAPostAnalysisIPToGeo(IPlugin): (_, cc) = self.iptogeo.ip_to_geo(ip) cc = cc and cc or 'ip' visitor['country_code'] = cc + if not self.iwla.isValidVisitor(visitor): + continue if cc in geo.keys(): geo[cc] += 1 else: diff --git a/plugins/post_analysis/referers.py b/plugins/post_analysis/referers.py index 37b470a..eed573e 100644 --- a/plugins/post_analysis/referers.py +++ b/plugins/post_analysis/referers.py @@ -140,30 +140,32 @@ class IWLAPostAnalysisReferers(IPlugin): uri = r['extract_referer']['extract_uri'] if self.own_domain_re.match(uri): continue - is_search_engine = False 
- for (name, engine) in self.search_engines.items(): - for (hashid, hashid_re) in engine['hashid']: - if not hashid_re.match(uri): continue - - not_engine = engine.get('not_search_engine', None) - # Try not engine - if not_engine and not_engine.match(uri): break - is_search_engine = True - uri = name - - parameters = r['extract_referer'].get('extract_parameters', None) - key_phrase_re = engine.get('known_url', None) - - self._extractKeyPhrase(key_phrase_re, parameters, key_phrases) - break - - if is_search_engine: - dictionary = search_engine_referers - elif super_hit['robot']: + if super_hit['robot']: dictionary = robots_referers # print '%s => %s' % (uri, super_hit['remote_ip']) else: - dictionary = referers + is_search_engine = False + for (name, engine) in self.search_engines.items(): + for (hashid, hashid_re) in engine['hashid']: + if not hashid_re.match(uri): continue + + not_engine = engine.get('not_search_engine', None) + # Try not engine + if not_engine and not_engine.match(uri): break + is_search_engine = True + uri = name + + parameters = r['extract_referer'].get('extract_parameters', None) + key_phrase_re = engine.get('known_url', None) + + self._extractKeyPhrase(key_phrase_re, parameters, key_phrases) + break + + if is_search_engine: + dictionary = search_engine_referers + else: + dictionary = referers + if r['is_page']: key = 'pages' else: diff --git a/plugins/pre_analysis/robots.py b/plugins/pre_analysis/robots.py index 375590e..9e16f68 100644 --- a/plugins/pre_analysis/robots.py +++ b/plugins/pre_analysis/robots.py @@ -109,6 +109,16 @@ class IWLAPreAnalysisRobots(IPlugin): # continue # 2) pages without hit --> robot + if not super_hit['viewed_hits'][0] and super_hit['viewed_pages'][0]: + self._setRobot(k, super_hit) + continue + +# 3) no pages and not hit --> robot + if not super_hit['viewed_hits'][0] and not super_hit['viewed_pages'][0]: + self._setRobot(k, super_hit) + continue + +# 4) pages without hit --> robot if not super_hit['viewed_hits'][0] 
and super_hit['viewed_pages'][0]: self.logger.debug(super_hit) self._setRobot(k, super_hit) @@ -116,15 +126,15 @@ class IWLAPreAnalysisRobots(IPlugin): not_found_pages = 0 for hit in super_hit['requests']: -# 3) /robots.txt read +# 5) /robots.txt read if hit['extract_request']['http_uri'].endswith('/robots.txt'): self._setRobot(k, super_hit) break - if int(hit['status']) == 404: + if int(hit['status']) == 404 or int(hit['status']) == 403: not_found_pages += 1 -# 4) Any referer for hits +# 6) Any referer for hits if not hit['is_page'] and hit['http_referer']: referers += 1 @@ -132,7 +142,7 @@ class IWLAPreAnalysisRobots(IPlugin): self._setRobot(k, super_hit) continue -# 5) more than 10 404 pages +# 7) more than 10 404/403 pages if not_found_pages > 10: self._setRobot(k, super_hit) continue