Merge branch 'dev' of soutade.fr:iwla into dev

This commit is contained in:
Gregory Soutade 2015-05-22 08:16:15 +02:00
commit ce1cee986e
4 changed files with 62 additions and 18 deletions

File diff suppressed because one or more lines are too long

42
iwla.py
View File

@ -683,13 +683,40 @@ class IWLA(object):
return True return True
def _reset(self):
    """Rewind the analysis to the month given by ``self.args.reset``.

    ``self.args.reset`` is parsed with the ``'%m/%Y'`` format.  The parsed
    time becomes ``self.meta_infos['last_time']`` and the database and
    display directories of every month from the reset month up to the
    current month (both included) are removed.  Months that precede the
    reset month are left untouched.

    Raises:
        ValueError: if ``self.args.reset`` does not match ``'%m/%Y'``
            (raised by ``time.strptime``).
    """
    reset_time = time.strptime(self.args.reset, '%m/%Y')

    self.logger.info('Reset time')
    self.logger.info(reset_time)

    self.meta_infos['last_time'] = reset_time

    cur_time = time.localtime()
    stale = []  # paths relative to conf.DB_ROOT / conf.DISPLAY_ROOT
    for year in range(reset_time.tm_year, cur_time.tm_year + 1):
        # The first year starts at the reset month, later years in January;
        # the current year stops at the current month.
        first_month = reset_time.tm_mon if year == reset_time.tm_year else 1
        last_month = cur_time.tm_mon if year == cur_time.tm_year else 12

        if first_month == 1 and year < cur_time.tm_year:
            # A past year that is reset entirely: drop the whole year directory.
            stale.append(str(year))
            continue

        for month in range(first_month, last_month + 1):
            stale.append(os.path.join(str(year), '%02d' % (month)))

    for rel_path in stale:
        for root in (conf.DB_ROOT, conf.DISPLAY_ROOT):
            path = os.path.join(root, rel_path)
            if os.path.exists(path):
                shutil.rmtree(path)
def start(self, _file, args):
self.args = args
self.start_time = datetime.now() self.start_time = datetime.now()
self.logger.info('==> Load previous database') self.logger.info('==> Load previous database')
self.meta_infos = self._deserialize(conf.META_PATH) or self._clearMeta() self.meta_infos = self._deserialize(conf.META_PATH) or self._clearMeta()
if self.meta_infos['last_time']: if self.meta_infos['last_time']:
if args.reset:
self._reset()
self.logger.info('Last time') self.logger.info('Last time')
self.logger.info(self.meta_infos['last_time']) self.logger.info(self.meta_infos['last_time'])
self.current_analysis = self._deserialize(self.getDBFilename(self.meta_infos['last_time'])) or self._clearVisits() self.current_analysis = self._deserialize(self.getDBFilename(self.meta_infos['last_time'])) or self._clearVisits()
@ -743,7 +770,11 @@ class FileIter(object):
self.cur_file = None self.cur_file = None
if not self.filenames: if not self.filenames:
raise StopIteration() raise StopIteration()
self.cur_file = open(self.filenames.pop(0)) filename = self.filenames.pop(0)
if filename.endswith('gz'):
self.cur_file = gzip.open(filename, 'r')
else:
self.cur_file = open(filename)
def next(self): def next(self):
l = self.cur_file.readline() l = self.cur_file.readline()
@ -770,6 +801,9 @@ if __name__ == '__main__':
default='INFO', type=str, default='INFO', type=str,
help='Loglevel in %s, default : %s' % (['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], 'INFO')) help='Loglevel in %s, default : %s' % (['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], 'INFO'))
parser.add_argument('-r', '--reset', dest='reset',
help='Reset analysis to a specific date (month/year)')
args = parser.parse_args() args = parser.parse_args()
# Load user conf # Load user conf
@ -804,7 +838,7 @@ if __name__ == '__main__':
sys.exit(0) sys.exit(0)
if args.stdin: if args.stdin:
iwla.start(sys.stdin) iwla.start(sys.stdin, args)
else: else:
filename = args.file or conf.analyzed_filename filename = args.file or conf.analyzed_filename
iwla.start(FileIter(filename)) iwla.start(FileIter(filename), args)

View File

@ -19,6 +19,7 @@
# #
import re import re
import logging
from iwla import IWLA from iwla import IWLA
from iplugin import IPlugin from iplugin import IPlugin
@ -64,6 +65,7 @@ class IWLAPreAnalysisPageToHit(IPlugin):
self.hp_regexps = self.iwla.getConfValue('hit_to_page_conf', []) self.hp_regexps = self.iwla.getConfValue('hit_to_page_conf', [])
self.hp_regexps = map(lambda(r): re.compile(r), self.hp_regexps) self.hp_regexps = map(lambda(r): re.compile(r), self.hp_regexps)
self.logger = logging.getLogger(self.__class__.__name__)
return True return True
def hook(self): def hook(self):
@ -85,7 +87,7 @@ class IWLAPreAnalysisPageToHit(IPlugin):
# Page to hit # Page to hit
for regexp in self.ph_regexps: for regexp in self.ph_regexps:
if regexp.match(uri): if regexp.match(uri):
#print '%s is a hit' % (uri ) self.logger.debug('%s changed from page to hit' % (uri))
request['is_page'] = False request['is_page'] = False
super_hit['viewed_pages'] -= 1 super_hit['viewed_pages'] -= 1
super_hit['viewed_hits'] += 1 super_hit['viewed_hits'] += 1
@ -94,7 +96,7 @@ class IWLAPreAnalysisPageToHit(IPlugin):
# Hit to page # Hit to page
for regexp in self.hp_regexps: for regexp in self.hp_regexps:
if regexp.match(uri): if regexp.match(uri):
#print '%s is a page' % (uri ) self.logger.debug('%s changed from hit to page' % (uri))
request['is_page'] = True request['is_page'] = True
super_hit['viewed_pages'] += 1 super_hit['viewed_pages'] += 1
super_hit['viewed_hits'] -= 1 super_hit['viewed_hits'] -= 1

View File

@ -20,6 +20,7 @@
import re import re
import logging import logging
import inspect
from iwla import IWLA from iwla import IWLA
from iplugin import IPlugin from iplugin import IPlugin
@ -66,7 +67,11 @@ class IWLAPreAnalysisRobots(IPlugin):
return True return True
def _setRobot(self, k, super_hit):
    """Flag a visitor as a robot and log which caller made the decision.

    Args:
        k: Visitor key; only used in the debug message.
        super_hit: Visitor record; its ``'robot'`` entry is set to 1.
    """
    # Look at the call stack only when the message will really be emitted:
    # inspect.stack() builds a record (with source context) for every frame,
    # which is wasted work on each detected robot when DEBUG is disabled.
    # Assumes self.logger is a standard logging.Logger -- TODO confirm.
    if self.logger.isEnabledFor(logging.DEBUG):
        # The immediate caller is all that is reported, so read that single
        # frame instead of materialising the whole stack.
        caller = inspect.currentframe().f_back
        self.logger.debug('%s is a robot (caller %s:%d)' % (k, caller.f_code.co_name, caller.f_lineno))

    super_hit['robot'] = 1
# Basic rule to detect robots # Basic rule to detect robots
@ -84,6 +89,7 @@ class IWLAPreAnalysisRobots(IPlugin):
if self.robot_re.match(first_page['http_user_agent']) or\ if self.robot_re.match(first_page['http_user_agent']) or\
self.crawl_re.match(first_page['http_user_agent']): self.crawl_re.match(first_page['http_user_agent']):
self.logger.debug(first_page['http_user_agent'])
self._setRobot(k, super_hit) self._setRobot(k, super_hit)
continue continue
@ -93,6 +99,7 @@ class IWLAPreAnalysisRobots(IPlugin):
break break
if isRobot: if isRobot:
self.logger.debug(first_page['http_user_agent'])
self._setRobot(k, super_hit) self._setRobot(k, super_hit)
continue continue
@ -103,6 +110,7 @@ class IWLAPreAnalysisRobots(IPlugin):
# 2) pages without hit --> robot # 2) pages without hit --> robot
if not super_hit['viewed_hits']: if not super_hit['viewed_hits']:
self.logger.debug(super_hit)
self._setRobot(k, super_hit) self._setRobot(k, super_hit)
continue continue