Add reset feature
Allow opening .gz files transparently. Improve debug logging in robots.py.
This commit is contained in:
parent
86fc5f2189
commit
4cb3b21ca5
42
iwla.py
42
iwla.py
|
@ -683,13 +683,40 @@ class IWLA(object):
|
|||
|
||||
return True
|
||||
|
||||
def start(self, _file):
|
||||
def _reset(self):
    """Roll the analysis back to the month given by ``--reset``.

    ``self.args.reset`` is parsed as ``'%m/%Y'`` (``time.strptime`` raises
    ``ValueError`` if it does not match).  The parsed time is stored as
    ``self.meta_infos['last_time']``, then the per-period directories under
    ``conf.DB_ROOT`` and ``conf.DISPLAY_ROOT`` are removed for every month
    from the reset month up to and including the current month.

    Months *before* the reset month are kept, and in later years deletion
    starts again from January.  A reset date in a future year deletes
    nothing.
    """
    reset_time = time.strptime(self.args.reset, '%m/%Y')

    self.logger.info('Reset time')
    self.logger.info(reset_time)

    self.meta_infos['last_time'] = reset_time

    def remove(*parts):
        # Drop both the stored database and the generated output for one
        # period (a whole year or a single month).
        for root in (conf.DB_ROOT, conf.DISPLAY_ROOT):
            path = os.path.join(root, *parts)
            if os.path.exists(path):
                shutil.rmtree(path)

    cur_time = time.localtime()
    for year in range(reset_time.tm_year, cur_time.tm_year + 1):
        # Only the reset year starts part-way through; any later year is
        # affected from January onwards.
        if year == reset_time.tm_year:
            first_month = reset_time.tm_mon
        else:
            first_month = 1

        if year < cur_time.tm_year:
            if first_month == 1:
                # The entire year is being reset: remove its directory.
                remove(str(year))
                continue
            last_month = 12
        else:
            last_month = cur_time.tm_mon

        for month in range(first_month, last_month + 1):
            remove(str(year), '%02d' % (month))
|
||||
|
||||
def start(self, _file, args):
|
||||
self.args = args
|
||||
self.start_time = datetime.now()
|
||||
|
||||
self.logger.info('==> Load previous database')
|
||||
|
||||
self.meta_infos = self._deserialize(conf.META_PATH) or self._clearMeta()
|
||||
if self.meta_infos['last_time']:
|
||||
if args.reset:
|
||||
self._reset()
|
||||
self.logger.info('Last time')
|
||||
self.logger.info(self.meta_infos['last_time'])
|
||||
self.current_analysis = self._deserialize(self.getDBFilename(self.meta_infos['last_time'])) or self._clearVisits()
|
||||
|
@ -743,7 +770,11 @@ class FileIter(object):
|
|||
self.cur_file = None
|
||||
if not self.filenames:
|
||||
raise StopIteration()
|
||||
self.cur_file = open(self.filenames.pop(0))
|
||||
filename = self.filenames.pop(0)
|
||||
if filename.endswith('gz'):
|
||||
self.cur_file = gzip.open(filename, 'r')
|
||||
else:
|
||||
self.cur_file = open(filename)
|
||||
|
||||
def next(self):
|
||||
l = self.cur_file.readline()
|
||||
|
@ -770,6 +801,9 @@ if __name__ == '__main__':
|
|||
default='INFO', type=str,
|
||||
help='Loglevel in %s, default : %s' % (['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], 'INFO'))
|
||||
|
||||
parser.add_argument('-r', '--reset', dest='reset',
|
||||
help='Reset analysis to a specific date (month/year)')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Load user conf
|
||||
|
@ -804,7 +838,7 @@ if __name__ == '__main__':
|
|||
sys.exit(0)
|
||||
|
||||
if args.stdin:
|
||||
iwla.start(sys.stdin)
|
||||
iwla.start(sys.stdin, args)
|
||||
else:
|
||||
filename = args.file or conf.analyzed_filename
|
||||
iwla.start(FileIter(filename))
|
||||
iwla.start(FileIter(filename), args)
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
#
|
||||
|
||||
import re
|
||||
import logging
|
||||
|
||||
from iwla import IWLA
|
||||
from iplugin import IPlugin
|
||||
|
@ -64,6 +65,7 @@ class IWLAPreAnalysisPageToHit(IPlugin):
|
|||
self.hp_regexps = self.iwla.getConfValue('hit_to_page_conf', [])
|
||||
self.hp_regexps = map(lambda(r): re.compile(r), self.hp_regexps)
|
||||
|
||||
self.logger = logging.getLogger(self.__class__.__name__)
|
||||
return True
|
||||
|
||||
def hook(self):
|
||||
|
@ -85,7 +87,7 @@ class IWLAPreAnalysisPageToHit(IPlugin):
|
|||
# Page to hit
|
||||
for regexp in self.ph_regexps:
|
||||
if regexp.match(uri):
|
||||
#print '%s is a hit' % (uri )
|
||||
self.logger.debug('%s changed from page to hit' % (uri))
|
||||
request['is_page'] = False
|
||||
super_hit['viewed_pages'] -= 1
|
||||
super_hit['viewed_hits'] += 1
|
||||
|
@ -94,7 +96,7 @@ class IWLAPreAnalysisPageToHit(IPlugin):
|
|||
# Hit to page
|
||||
for regexp in self.hp_regexps:
|
||||
if regexp.match(uri):
|
||||
#print '%s is a page' % (uri )
|
||||
self.logger.debug('%s changed from hit to page' % (uri))
|
||||
request['is_page'] = True
|
||||
super_hit['viewed_pages'] += 1
|
||||
super_hit['viewed_hits'] -= 1
|
||||
|
|
|
@ -20,6 +20,7 @@
|
|||
|
||||
import re
|
||||
import logging
|
||||
import inspect
|
||||
|
||||
from iwla import IWLA
|
||||
from iplugin import IPlugin
|
||||
|
@ -66,7 +67,11 @@ class IWLAPreAnalysisRobots(IPlugin):
|
|||
return True
|
||||
|
||||
def _setRobot(self, k, super_hit):
|
||||
self.logger.debug('%s is a robot' % (k))
|
||||
callerframerecord = inspect.stack()[1]
|
||||
frame = callerframerecord[0]
|
||||
info = inspect.getframeinfo(frame)
|
||||
|
||||
self.logger.debug('%s is a robot (caller %s:%d)' % (k, info.function, info.lineno))
|
||||
super_hit['robot'] = 1
|
||||
|
||||
# Basic rule to detect robots
|
||||
|
@ -84,6 +89,7 @@ class IWLAPreAnalysisRobots(IPlugin):
|
|||
|
||||
if self.robot_re.match(first_page['http_user_agent']) or\
|
||||
self.crawl_re.match(first_page['http_user_agent']):
|
||||
self.logger.debug(first_page['http_user_agent'])
|
||||
self._setRobot(k, super_hit)
|
||||
continue
|
||||
|
||||
|
@ -93,6 +99,7 @@ class IWLAPreAnalysisRobots(IPlugin):
|
|||
break
|
||||
|
||||
if isRobot:
|
||||
self.logger.debug(first_page['http_user_agent'])
|
||||
self._setRobot(k, super_hit)
|
||||
continue
|
||||
|
||||
|
@ -103,6 +110,7 @@ class IWLAPreAnalysisRobots(IPlugin):
|
|||
|
||||
# 2) pages without hit --> robot
|
||||
if not super_hit['viewed_hits']:
|
||||
self.logger.debug(super_hit)
|
||||
self._setRobot(k, super_hit)
|
||||
continue
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user