diff --git a/iwla.py b/iwla.py
index fd5b2ab..5e00640 100755
--- a/iwla.py
+++ b/iwla.py
@@ -683,13 +683,43 @@ class IWLA(object):
 
         return True
 
-    def start(self, _file):
+    def _reset(self):
+        """
+        Rewind the analysis to the month given by --reset ('%m/%Y'):
+        set it as last analysed time and delete database and output
+        directories from that month onwards (earlier months are kept).
+        """
+        reset_time = time.strptime(self.args.reset, '%m/%Y')
+
+        self.logger.info('Reset time')
+        self.logger.info(reset_time)
+
+        self.meta_infos['last_time'] = reset_time
+
+        cur_time = time.localtime()
+        for root in (conf.DB_ROOT, conf.DISPLAY_ROOT):
+            for year in range(reset_time.tm_year, cur_time.tm_year + 1):
+                if year > reset_time.tm_year:
+                    # Whole year is after the reset point
+                    paths = [os.path.join(root, str(year))]
+                else:
+                    # Reset year: only drop months >= reset month
+                    paths = [os.path.join(root, str(year), '%02d' % (month))
+                             for month in range(reset_time.tm_mon, 13)]
+                for path in paths:
+                    if os.path.exists(path):
+                        shutil.rmtree(path)
+
+    def start(self, _file, args):
+        self.args = args
         self.start_time = datetime.now()
 
         self.logger.info('==> Load previous database')
 
         self.meta_infos = self._deserialize(conf.META_PATH) or self._clearMeta()
         if self.meta_infos['last_time']:
+            if args.reset:
+                self._reset()
             self.logger.info('Last time')
             self.logger.info(self.meta_infos['last_time'])
             self.current_analysis = self._deserialize(self.getDBFilename(self.meta_infos['last_time'])) or self._clearVisits()
@@ -743,7 +773,11 @@ class FileIter(object):
             self.cur_file = None
         if not self.filenames:
             raise StopIteration()
-        self.cur_file = open(self.filenames.pop(0))
+        filename = self.filenames.pop(0)
+        if filename.endswith('.gz'):
+            self.cur_file = gzip.open(filename, 'r')
+        else:
+            self.cur_file = open(filename)
 
     def next(self):
         l = self.cur_file.readline()
@@ -770,6 +804,9 @@ if __name__ == '__main__':
                         default='INFO', type=str,
                         help='Loglevel in %s, default : %s' % (['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], 'INFO'))
 
+    parser.add_argument('-r', '--reset', dest='reset',
+                        help='Reset analysis to a specific date (month/year)')
+
     args = parser.parse_args()
 
     # Load user conf
@@ -804,7 +841,7 @@ if __name__ == '__main__':
         sys.exit(0)
 
     if args.stdin:
-        iwla.start(sys.stdin)
+        iwla.start(sys.stdin, args)
     else:
         filename = args.file or conf.analyzed_filename
-        iwla.start(FileIter(filename))
+        iwla.start(FileIter(filename), args)
diff --git a/plugins/pre_analysis/page_to_hit.py b/plugins/pre_analysis/page_to_hit.py
index 77772bb..282f53f 100644
--- a/plugins/pre_analysis/page_to_hit.py
+++ b/plugins/pre_analysis/page_to_hit.py
@@ -19,6 +19,7 @@
 #
 
 import re
+import logging
 
 from iwla import IWLA
 from iplugin import IPlugin
@@ -64,6 +65,7 @@ class IWLAPreAnalysisPageToHit(IPlugin):
         self.hp_regexps = self.iwla.getConfValue('hit_to_page_conf', [])
         self.hp_regexps = map(lambda(r): re.compile(r), self.hp_regexps)
 
+        self.logger = logging.getLogger(self.__class__.__name__)
         return True
 
     def hook(self):
@@ -85,7 +87,7 @@ class IWLAPreAnalysisPageToHit(IPlugin):
                     # Page to hit
                     for regexp in self.ph_regexps:
                         if regexp.match(uri):
-                            #print '%s is a hit' % (uri )
+                            self.logger.debug('%s changed from page to hit', uri)
                             request['is_page'] = False
                             super_hit['viewed_pages'] -= 1
                             super_hit['viewed_hits'] += 1
@@ -94,7 +96,7 @@ class IWLAPreAnalysisPageToHit(IPlugin):
                     # Hit to page
                     for regexp in self.hp_regexps:
                         if regexp.match(uri):
-                            #print '%s is a page' % (uri )
+                            self.logger.debug('%s changed from hit to page', uri)
                             request['is_page'] = True
                             super_hit['viewed_pages'] += 1
                             super_hit['viewed_hits'] -= 1
diff --git a/plugins/pre_analysis/robots.py b/plugins/pre_analysis/robots.py
index 41c744e..d84087d 100644
--- a/plugins/pre_analysis/robots.py
+++ b/plugins/pre_analysis/robots.py
@@ -20,6 +20,7 @@
 
 import re
 import logging
+import inspect
 
 from iwla import IWLA
 from iplugin import IPlugin
@@ -66,7 +67,11 @@ class IWLAPreAnalysisRobots(IPlugin):
         return True
 
     def _setRobot(self, k, super_hit):
-        self.logger.debug('%s is a robot' % (k))
+        # Looking up the caller is costly: only do it when DEBUG is enabled
+        if self.logger.isEnabledFor(logging.DEBUG):
+            info = inspect.getframeinfo(inspect.currentframe().f_back)
+            self.logger.debug('%s is a robot (caller %s:%d)', k, info.function, info.lineno)
+
         super_hit['robot'] = 1
 
     # Basic rule to detect robots
@@ -84,6 +89,7 @@ class IWLAPreAnalysisRobots(IPlugin):
 
             if self.robot_re.match(first_page['http_user_agent']) or\
                self.crawl_re.match(first_page['http_user_agent']):
+                self.logger.debug(first_page['http_user_agent'])
                 self._setRobot(k, super_hit)
                 continue
 
@@ -93,6 +99,7 @@ class IWLAPreAnalysisRobots(IPlugin):
                     break
 
             if isRobot:
+                self.logger.debug(first_page['http_user_agent'])
                 self._setRobot(k, super_hit)
                 continue
 
@@ -103,6 +110,7 @@ class IWLAPreAnalysisRobots(IPlugin):
 
             # 2) pages without hit --> robot
             if not super_hit['viewed_hits']:
+                self.logger.debug(super_hit)
                 self._setRobot(k, super_hit)
                 continue
 