Add reset feature

Allow opening .gz files transparently
Improve debug output in robots.py
Gregory Soutade 2015-05-22 07:51:11 +02:00
parent 86fc5f2189
commit 4cb3b21ca5
3 changed files with 51 additions and 7 deletions

iwla.py (42 changes)

@@ -683,13 +683,40 @@ class IWLA(object):
         return True
-    def start(self, _file):
+    def _reset(self):
+        reset_time = time.strptime(self.args.reset, '%m/%Y')
+        self.logger.info('Reset time')
+        self.logger.info(reset_time)
+        self.meta_infos['last_time'] = reset_time
+        cur_time = time.localtime()
+        year = reset_time.tm_year
+        while year < cur_time.tm_year:
+            db_path = os.path.join(conf.DB_ROOT, str(year))
+            if os.path.exists(db_path): shutil.rmtree(db_path)
+            output_path = os.path.join(conf.DISPLAY_ROOT, str(year))
+            if os.path.exists(output_path): shutil.rmtree(output_path)
+            year += 1
+        month = reset_time.tm_mon
+        while month <= cur_time.tm_mon:
+            db_path = os.path.join(conf.DB_ROOT, str(year), '%02d' % (month))
+            if os.path.exists(db_path): shutil.rmtree(db_path)
+            output_path = os.path.join(conf.DISPLAY_ROOT, str(year), '%02d' % (month))
+            if os.path.exists(output_path): shutil.rmtree(output_path)
+            month += 1
+    def start(self, _file, args):
+        self.args = args
         self.start_time = datetime.now()
         self.logger.info('==> Load previous database')
         self.meta_infos = self._deserialize(conf.META_PATH) or self._clearMeta()
         if self.meta_infos['last_time']:
+            if args.reset:
+                self._reset()
             self.logger.info('Last time')
             self.logger.info(self.meta_infos['last_time'])
             self.current_analysis = self._deserialize(self.getDBFilename(self.meta_infos['last_time'])) or self._clearVisits()
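The reset path hinges on time.strptime() with a '%m/%Y' format: '05/2015' yields a struct_time with tm_mon=5 and tm_year=2015, which seeds the two pruning loops (whole years up to the current one, then individual months of the current year). A minimal standalone sketch of the same walk, with prune_since as a hypothetical helper and root standing in for conf.DB_ROOT (the commit runs the same walk a second time over conf.DISPLAY_ROOT so the serialized databases and the generated pages stay in sync):

    import os
    import shutil
    import time

    def prune_since(reset_spec, root):
        # '05/2015' -> struct_time(tm_year=2015, tm_mon=5, tm_mday=1, ...)
        reset_time = time.strptime(reset_spec, '%m/%Y')
        cur_time = time.localtime()
        # Whole years between the reset year and the current year
        for year in range(reset_time.tm_year, cur_time.tm_year):
            path = os.path.join(root, str(year))
            if os.path.exists(path):
                shutil.rmtree(path)
        # Months of the current year, starting (as in the commit) from
        # the reset month
        for month in range(reset_time.tm_mon, cur_time.tm_mon + 1):
            path = os.path.join(root, str(cur_time.tm_year), '%02d' % month)
            if os.path.exists(path):
                shutil.rmtree(path)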
@@ -743,7 +770,11 @@ class FileIter(object)
             self.cur_file = None
         if not self.filenames:
             raise StopIteration()
-        self.cur_file = open(self.filenames.pop(0))
+        filename = self.filenames.pop(0)
+        if filename.endswith('gz'):
+            self.cur_file = gzip.open(filename, 'r')
+        else:
+            self.cur_file = open(filename)
     def next(self):
         l = self.cur_file.readline()
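The endswith('gz') test is what makes the support transparent: gzip.open() returns a file-like object whose readline() matches a plain file's, so next() needs no change (note the hunks shown here don't include the matching `import gzip` at the top of iwla.py, presumably added elsewhere in the commit). A minimal sketch of the same dispatch, with open_log as a hypothetical helper:

    import gzip

    def open_log(filename):
        # gzip.GzipFile exposes readline() and close() like a regular
        # file object, so callers stay compression-agnostic; under
        # Python 2, which this codebase targets, 'r' mode yields str
        # lines just like open() does
        if filename.endswith('gz'):
            return gzip.open(filename, 'r')
        return open(filename)

    # open_log('access.log.1.gz').readline() and
    # open_log('access.log').readline() both return one line of text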
@@ -770,6 +801,9 @@ if __name__ == '__main__':
                         default='INFO', type=str,
                         help='Loglevel in %s, default : %s' % (['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], 'INFO'))
+    parser.add_argument('-r', '--reset', dest='reset',
+                        help='Reset analysis to a specific date (month/year)')
     args = parser.parse_args()
     # Load user conf
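With the option wired into argparse, a reset run is a normal invocation plus -r, the value being a plain month/year string:

    python iwla.py -r 05/2015

argparse stores the string unvalidated; it is only parsed later in _reset(), where a value not matching '%m/%Y' makes time.strptime() raise ValueError.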
@@ -804,7 +838,7 @@ if __name__ == '__main__':
         sys.exit(0)
     if args.stdin:
-        iwla.start(sys.stdin)
+        iwla.start(sys.stdin, args)
     else:
         filename = args.file or conf.analyzed_filename
-        iwla.start(FileIter(filename))
+        iwla.start(FileIter(filename), args)

page_to_hit.py

@@ -19,6 +19,7 @@
 #
 import re
+import logging
 from iwla import IWLA
 from iplugin import IPlugin
@@ -64,6 +65,7 @@ class IWLAPreAnalysisPageToHit(IPlugin):
         self.hp_regexps = self.iwla.getConfValue('hit_to_page_conf', [])
         self.hp_regexps = map(lambda(r): re.compile(r), self.hp_regexps)
+        self.logger = logging.getLogger(self.__class__.__name__)
         return True
     def hook(self):
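Using self.__class__.__name__ as the logger name gives each plugin its own channel, so the loglevel machinery can tell plugins apart without any per-plugin configuration. A small self-contained illustration of the naming:

    import logging

    logging.basicConfig(level=logging.DEBUG)

    class IWLAPreAnalysisPageToHit(object):
        def __init__(self):
            # Logger name == class name, e.g. 'IWLAPreAnalysisPageToHit'
            self.logger = logging.getLogger(self.__class__.__name__)

    IWLAPreAnalysisPageToHit().logger.debug('hello')
    # -> DEBUG:IWLAPreAnalysisPageToHit:hello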
@@ -85,7 +87,7 @@ class IWLAPreAnalysisPageToHit(IPlugin):
             # Page to hit
             for regexp in self.ph_regexps:
                 if regexp.match(uri):
-                    #print '%s is a hit' % (uri )
+                    self.logger.debug('%s changed from page to hit' % (uri))
                     request['is_page'] = False
                     super_hit['viewed_pages'] -= 1
                     super_hit['viewed_hits'] += 1
@@ -94,7 +96,7 @@ class IWLAPreAnalysisPageToHit(IPlugin):
             # Hit to page
             for regexp in self.hp_regexps:
                 if regexp.match(uri):
-                    #print '%s is a page' % (uri )
+                    self.logger.debug('%s changed from hit to page' % (uri))
                     request['is_page'] = True
                     super_hit['viewed_pages'] += 1
                     super_hit['viewed_hits'] -= 1
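The two branches are symmetric: a URI matching a page_to_hit_conf regexp is demoted and the visitor's counters move from viewed_pages to viewed_hits, and vice versa for hit_to_page_conf. A condensed sketch of the rule, assuming the surrounding hook checks request['is_page'] before applying each list; reclassify and the sample regexps are made up for illustration:

    import re

    # Sample rules standing in for page_to_hit_conf / hit_to_page_conf
    ph_regexps = [re.compile(r) for r in (r'.*/feed/?$',)]
    hp_regexps = [re.compile(r) for r in (r'.*/docs/.*\.html$',)]

    def reclassify(uri, request, super_hit):
        if request['is_page']:
            for regexp in ph_regexps:
                if regexp.match(uri):
                    # Demote: page counted as hit from now on
                    request['is_page'] = False
                    super_hit['viewed_pages'] -= 1
                    super_hit['viewed_hits'] += 1
                    return
        else:
            for regexp in hp_regexps:
                if regexp.match(uri):
                    # Promote: hit counted as page from now on
                    request['is_page'] = True
                    super_hit['viewed_pages'] += 1
                    super_hit['viewed_hits'] -= 1
                    return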

robots.py

@@ -20,6 +20,7 @@
 import re
 import logging
+import inspect
 from iwla import IWLA
 from iplugin import IPlugin
@@ -66,7 +67,11 @@ class IWLAPreAnalysisRobots(IPlugin):
         return True
     def _setRobot(self, k, super_hit):
-        self.logger.debug('%s is a robot' % (k))
+        callerframerecord = inspect.stack()[1]
+        frame = callerframerecord[0]
+        info = inspect.getframeinfo(frame)
+        self.logger.debug('%s is a robot (caller %s:%d)' % (k, info.function, info.lineno))
         super_hit['robot'] = 1
     # Basic rule to detect robots
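inspect.stack()[1] is the immediate caller's frame record, so every "is a robot" line now reports which detection rule fired (function name and line number). A standalone sketch of the same trick, with set_robot and rule_user_agent as hypothetical stand-ins:

    import inspect
    import logging

    logging.basicConfig(level=logging.DEBUG)
    logger = logging.getLogger('robots')

    def set_robot(k):
        # Frame record 0 is this function; record 1 is whoever called us
        callerframerecord = inspect.stack()[1]
        info = inspect.getframeinfo(callerframerecord[0])
        logger.debug('%s is a robot (caller %s:%d)' % (k, info.function, info.lineno))

    def rule_user_agent():
        set_robot('1.2.3.4')  # logs "... (caller rule_user_agent:<line>)"

    rule_user_agent()

One caveat: the commit gathers the frame before logger.debug() filters the message, so the comparatively expensive inspect.stack() call is paid on every _setRobot() invocation, not only when DEBUG is enabled.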
@@ -84,6 +89,7 @@ class IWLAPreAnalysisRobots(IPlugin):
             if self.robot_re.match(first_page['http_user_agent']) or\
                self.crawl_re.match(first_page['http_user_agent']):
+                self.logger.debug(first_page['http_user_agent'])
                 self._setRobot(k, super_hit)
                 continue
@@ -93,6 +99,7 @@ class IWLAPreAnalysisRobots(IPlugin):
                     break
             if isRobot:
+                self.logger.debug(first_page['http_user_agent'])
                 self._setRobot(k, super_hit)
                 continue
@@ -103,6 +110,7 @@ class IWLAPreAnalysisRobots(IPlugin):
             # 2) pages without hit --> robot
             if not super_hit['viewed_hits']:
+                self.logger.debug(super_hit)
+                self._setRobot(k, super_hit)
                 continue
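The three debug additions mirror the detection rules visible in these hunks: a user-agent match against the robot/crawl regexps, a membership test against a known-robots list (the isRobot flag), and the heuristic that a visit with pages but no hits is a robot. A compact sketch of the first and last rules; looks_like_robot, the sample patterns, and the 'requests' layout are assumptions for illustration (the known-robots list check is omitted):

    import re

    robot_re = re.compile(r'.*bot.*', re.IGNORECASE)    # sample pattern
    crawl_re = re.compile(r'.*crawl.*', re.IGNORECASE)  # sample pattern

    def looks_like_robot(super_hit):
        first_page = super_hit['requests'][0]  # hypothetical layout
        agent = first_page['http_user_agent']
        # Rule 1: user agent matches a robot/crawler pattern
        if robot_re.match(agent) or crawl_re.match(agent):
            return True
        # Rule 2: pages viewed but no hits at all
        if super_hit['viewed_pages'] and not super_hit['viewed_hits']:
            return True
        return False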