Make backup before compressing (low memory servers)
Fix error: call post-hook plugins even in display-only mode. Don't compute unordered hits (reject past hits found after the current time). Remove HTML tags in stats diff. Don't do geolocalisation if the visitor is not valid. Don't try to find a search engine for robots. Update robot check rules. Add top_pages_diff plugin.
This commit is contained in:
parent
ed6ed68706
commit
bb268114b2
16
iwla.py
16
iwla.py
|
@ -252,12 +252,18 @@ class IWLA(object):
|
|||
if not os.path.exists(base):
|
||||
os.makedirs(base)
|
||||
|
||||
# Make a backup in case something fails
|
||||
if os.path.exists(filename):
|
||||
shutil.copy(filename, filename + '.bak')
|
||||
|
||||
with open(filename + '.tmp', 'wb+') as f, self._openDB(filename, 'w') as fzip:
|
||||
cPickle.dump(obj, f)
|
||||
os.fsync(f)
|
||||
f.seek(0)
|
||||
fzip.write(f.read())
|
||||
os.fsync(fzip)
|
||||
os.remove(filename + '.tmp')
|
||||
if os.path.exists(filename + '.bak'):
|
||||
os.remove(filename + '.bak')
|
||||
|
||||
def _deserialize(self, filename):
|
||||
if not os.path.exists(filename):
|
||||
|
@ -626,15 +632,13 @@ class IWLA(object):
|
|||
|
||||
duplicated_stats['nb_visitors'] = stats['nb_visitors'] = len(self.valid_visitors.keys())
|
||||
|
||||
self._callPlugins(conf.POST_HOOK_DIRECTORY)
|
||||
|
||||
if args.display_only:
|
||||
self._generateDisplay()
|
||||
return
|
||||
|
||||
self._callPlugins(conf.POST_HOOK_DIRECTORY)
|
||||
|
||||
path = self.getDBFilename(cur_time)
|
||||
if os.path.exists(path) and not self.dry_run:
|
||||
os.remove(path)
|
||||
|
||||
self.logger.info("==> Serialize to %s" % (path))
|
||||
self._serialize(self.current_analysis, path)
|
||||
|
@ -701,6 +705,8 @@ class IWLA(object):
|
|||
self.logger.debug("Not in time")
|
||||
return False
|
||||
self.analyse_started = True
|
||||
if t < cur_time: # Don't accept past hits
|
||||
return False
|
||||
if cur_time.tm_mon != t.tm_mon:
|
||||
self._generateDayStats()
|
||||
self._generateMonthStats()
|
||||
|
|
|
@ -22,6 +22,7 @@ from iwla import IWLA
|
|||
from iplugin import IPlugin
|
||||
from display import *
|
||||
import logging
|
||||
import re
|
||||
|
||||
"""
|
||||
Display hook interface
|
||||
|
@ -54,9 +55,11 @@ class IWLADisplayStatsDiff(IPlugin):
|
|||
self.month_stats_key = None
|
||||
# Set >= if month_stats[self.month_stats_key] is a list or a tuple
|
||||
self.stats_index = -1
|
||||
self.display_index = 1
|
||||
self.filename = None
|
||||
self.block_name = None
|
||||
self.logger = logging.getLogger(__name__)
|
||||
self.tag_re = re.compile(r'<[^>]+>')
|
||||
|
||||
def load(self):
|
||||
if not self.month_stats_key or not self.filename or\
|
||||
|
@ -67,6 +70,10 @@ class IWLADisplayStatsDiff(IPlugin):
|
|||
self.cur_stats = {k:v for (k,v) in month_stats.get(self.month_stats_key, {}).items()}
|
||||
return True
|
||||
|
||||
# from https://tutorialedge.net/python/removing-html-from-string/
|
||||
def remove_tags(self, text):
|
||||
return self.tag_re.sub('', text)
|
||||
|
||||
def hook(self):
|
||||
display = self.iwla.getDisplay()
|
||||
month_stats = self.iwla.getMonthStats()
|
||||
|
@ -88,14 +95,21 @@ class IWLADisplayStatsDiff(IPlugin):
|
|||
if new_value:
|
||||
if self.stats_index != -1:
|
||||
if new_value[self.stats_index] != v[self.stats_index]:
|
||||
stats_diff[k] = 'iwla_update'
|
||||
diff_value = v[self.stats_index] - new_value[self.stats_index]
|
||||
stats_diff[k] = ['iwla_update', diff_value]
|
||||
else:
|
||||
if new_value != v:
|
||||
stats_diff[k] = 'iwla_update'
|
||||
diff_value = v - new_value
|
||||
stats_diff[k] = ['iwla_update', diff_value]
|
||||
else:
|
||||
stats_diff[k] = 'iwla_new'
|
||||
stats_diff[k] = ['iwla_new', 0]
|
||||
|
||||
for (idx, row) in enumerate(block.rows):
|
||||
for k in stats_diff.keys():
|
||||
if k in row[0]:
|
||||
block.setCellCSSClass(idx, 0, stats_diff[k])
|
||||
clear_text = self.remove_tags(row[0])
|
||||
if clear_text in stats_diff.keys():
|
||||
(cls, diff) = stats_diff[clear_text]
|
||||
block.setCellCSSClass(idx, 0, cls)
|
||||
if diff:
|
||||
value = block.getCellValue(idx, self.display_index)
|
||||
value += ' (+%d)' % diff
|
||||
block.setCellValue(idx, self.display_index, value)
|
||||
|
|
61
plugins/display/top_pages_diff.py
Normal file
61
plugins/display/top_pages_diff.py
Normal file
|
@ -0,0 +1,61 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
#
|
||||
# Copyright Grégory Soutadé 2018
|
||||
|
||||
# This file is part of iwla
|
||||
|
||||
# iwla is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# iwla is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with iwla. If not, see <http://www.gnu.org/licenses/>.
|
||||
#
|
||||
|
||||
from iwla import IWLA
|
||||
from istats_diff import IWLADisplayStatsDiff
|
||||
from display import *
|
||||
|
||||
"""
|
||||
Display hook
|
||||
|
||||
Highlight new and updated pages in top_pages.html
|
||||
|
||||
Plugin requirements :
|
||||
display/top_pages
|
||||
|
||||
Conf values needed :
|
||||
None
|
||||
|
||||
Output files :
|
||||
None
|
||||
|
||||
Statistics creation :
|
||||
None
|
||||
|
||||
Statistics update :
|
||||
None
|
||||
|
||||
Statistics deletion :
|
||||
None
|
||||
"""
|
||||
|
||||
class IWLADisplayTopPagesDiff(IWLADisplayStatsDiff):
    """Highlight new and updated entries in the top pages report.

    Thin specialization of IWLADisplayStatsDiff, configured to diff the
    'All Pages' block of top_pages.html against the previous month's
    'top_pages' statistics.
    """

    def __init__(self, iwla):
        # Let the generic stats-diff plugin do the heavy lifting; we
        # only point it at the right statistics key and display block.
        super(IWLADisplayTopPagesDiff, self).__init__(iwla)
        self.API_VERSION = 1
        self.requires = ['IWLADisplayTopPages']
        # Month statistics entry to diff and the page it is rendered on.
        self.month_stats_key = u'top_pages'
        self.filename = u'top_pages.html'
        self.block_name = self.iwla._(u'All Pages')

    def load(self):
        # Nothing to diff when the all-pages page is disabled in conf.
        create_page = self.iwla.getConfValue('create_all_pages_page', True)
        if not create_page:
            return False
        return super(IWLADisplayTopPagesDiff, self).load()
|
|
@ -82,6 +82,8 @@ class IWLAPostAnalysisIPToGeo(IPlugin):
|
|||
(_, cc) = self.iptogeo.ip_to_geo(ip)
|
||||
cc = cc and cc or 'ip'
|
||||
visitor['country_code'] = cc
|
||||
if not self.iwla.isValidVisitor(visitor):
|
||||
continue
|
||||
if cc in geo.keys():
|
||||
geo[cc] += 1
|
||||
else:
|
||||
|
|
|
@ -140,30 +140,32 @@ class IWLAPostAnalysisReferers(IPlugin):
|
|||
uri = r['extract_referer']['extract_uri']
|
||||
if self.own_domain_re.match(uri): continue
|
||||
|
||||
is_search_engine = False
|
||||
for (name, engine) in self.search_engines.items():
|
||||
for (hashid, hashid_re) in engine['hashid']:
|
||||
if not hashid_re.match(uri): continue
|
||||
|
||||
not_engine = engine.get('not_search_engine', None)
|
||||
# Try not engine
|
||||
if not_engine and not_engine.match(uri): break
|
||||
is_search_engine = True
|
||||
uri = name
|
||||
|
||||
parameters = r['extract_referer'].get('extract_parameters', None)
|
||||
key_phrase_re = engine.get('known_url', None)
|
||||
|
||||
self._extractKeyPhrase(key_phrase_re, parameters, key_phrases)
|
||||
break
|
||||
|
||||
if is_search_engine:
|
||||
dictionary = search_engine_referers
|
||||
elif super_hit['robot']:
|
||||
if super_hit['robot']:
|
||||
dictionary = robots_referers
|
||||
# print '%s => %s' % (uri, super_hit['remote_ip'])
|
||||
else:
|
||||
dictionary = referers
|
||||
is_search_engine = False
|
||||
for (name, engine) in self.search_engines.items():
|
||||
for (hashid, hashid_re) in engine['hashid']:
|
||||
if not hashid_re.match(uri): continue
|
||||
|
||||
not_engine = engine.get('not_search_engine', None)
|
||||
# Try not engine
|
||||
if not_engine and not_engine.match(uri): break
|
||||
is_search_engine = True
|
||||
uri = name
|
||||
|
||||
parameters = r['extract_referer'].get('extract_parameters', None)
|
||||
key_phrase_re = engine.get('known_url', None)
|
||||
|
||||
self._extractKeyPhrase(key_phrase_re, parameters, key_phrases)
|
||||
break
|
||||
|
||||
if is_search_engine:
|
||||
dictionary = search_engine_referers
|
||||
else:
|
||||
dictionary = referers
|
||||
|
||||
if r['is_page']:
|
||||
key = 'pages'
|
||||
else:
|
||||
|
|
|
@ -109,6 +109,16 @@ class IWLAPreAnalysisRobots(IPlugin):
|
|||
# continue
|
||||
|
||||
# 2) pages without hit --> robot
|
||||
if not super_hit['viewed_hits'][0] and super_hit['viewed_pages'][0]:
|
||||
self._setRobot(k, super_hit)
|
||||
continue
|
||||
|
||||
# 3) no pages and not hit --> robot
|
||||
if not super_hit['viewed_hits'][0] and not super_hit['viewed_pages'][0]:
|
||||
self._setRobot(k, super_hit)
|
||||
continue
|
||||
|
||||
# 4) pages without hit --> robot
|
||||
if not super_hit['viewed_hits'][0] and super_hit['viewed_pages'][0]:
|
||||
self.logger.debug(super_hit)
|
||||
self._setRobot(k, super_hit)
|
||||
|
@ -116,15 +126,15 @@ class IWLAPreAnalysisRobots(IPlugin):
|
|||
|
||||
not_found_pages = 0
|
||||
for hit in super_hit['requests']:
|
||||
# 3) /robots.txt read
|
||||
# 5) /robots.txt read
|
||||
if hit['extract_request']['http_uri'].endswith('/robots.txt'):
|
||||
self._setRobot(k, super_hit)
|
||||
break
|
||||
|
||||
if int(hit['status']) == 404:
|
||||
if int(hit['status']) == 404 or int(hit['status']) == 403:
|
||||
not_found_pages += 1
|
||||
|
||||
# 4) Any referer for hits
|
||||
# 6) Any referer for hits
|
||||
if not hit['is_page'] and hit['http_referer']:
|
||||
referers += 1
|
||||
|
||||
|
@ -132,7 +142,7 @@ class IWLAPreAnalysisRobots(IPlugin):
|
|||
self._setRobot(k, super_hit)
|
||||
continue
|
||||
|
||||
# 5) more than 10 404 pages
|
||||
# 7) more than 10 404/403 pages
|
||||
if not_found_pages > 10:
|
||||
self._setRobot(k, super_hit)
|
||||
continue
|
||||
|
|
Loading…
Reference in New Issue
Block a user