Add reset feature
Allow opening .gz files transparently. Import debug in robots.py.
This commit is contained in:
parent
86fc5f2189
commit
4cb3b21ca5
42
iwla.py
42
iwla.py
|
@ -683,13 +683,40 @@ class IWLA(object):
|
||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def start(self, _file):
|
def _reset(self):
    """Roll the analysis back to the month given by ``self.args.reset``.

    ``self.args.reset`` is parsed with the ``'%m/%Y'`` format (for example
    ``'06/2014'``).  The parsed time is stored in
    ``self.meta_infos['last_time']`` and every database / display directory
    belonging to the reset month or to a later period is removed:

    * in the reset year, only the month directories from the reset month
      onward are deleted, so earlier months of that year are preserved;
    * every later year, up to and including the current one, is deleted
      as a whole.

    Raises:
        ValueError: if ``self.args.reset`` does not match ``'%m/%Y'``
            (raised by ``time.strptime``).
    """
    reset_time = time.strptime(self.args.reset, '%m/%Y')

    self.logger.info('Reset time')
    self.logger.info(reset_time)

    self.meta_infos['last_time'] = reset_time

    def _remove(*parts):
        # Delete the matching directory under the database root first,
        # then under the display root; missing directories are ignored.
        for root in (conf.DB_ROOT, conf.DISPLAY_ROOT):
            path = os.path.join(root, *parts)
            if os.path.exists(path):
                shutil.rmtree(path)

    cur_time = time.localtime()
    reset_year = reset_time.tm_year

    # Reset year: keep the months that precede the reset point, they are
    # older than 'last_time' and would not be rebuilt.
    for month in range(reset_time.tm_mon, 13):
        _remove(str(reset_year), '%02d' % (month))

    # Every following year (current one included) is entirely newer than
    # the reset point, so the whole year directory can go.
    for year in range(reset_year + 1, cur_time.tm_year + 1):
        _remove(str(year))
|
||||||
|
|
||||||
|
def start(self, _file, args):
|
||||||
|
self.args = args
|
||||||
self.start_time = datetime.now()
|
self.start_time = datetime.now()
|
||||||
|
|
||||||
self.logger.info('==> Load previous database')
|
self.logger.info('==> Load previous database')
|
||||||
|
|
||||||
self.meta_infos = self._deserialize(conf.META_PATH) or self._clearMeta()
|
self.meta_infos = self._deserialize(conf.META_PATH) or self._clearMeta()
|
||||||
if self.meta_infos['last_time']:
|
if self.meta_infos['last_time']:
|
||||||
|
if args.reset:
|
||||||
|
self._reset()
|
||||||
self.logger.info('Last time')
|
self.logger.info('Last time')
|
||||||
self.logger.info(self.meta_infos['last_time'])
|
self.logger.info(self.meta_infos['last_time'])
|
||||||
self.current_analysis = self._deserialize(self.getDBFilename(self.meta_infos['last_time'])) or self._clearVisits()
|
self.current_analysis = self._deserialize(self.getDBFilename(self.meta_infos['last_time'])) or self._clearVisits()
|
||||||
|
@ -743,7 +770,11 @@ class FileIter(object):
|
||||||
self.cur_file = None
|
self.cur_file = None
|
||||||
if not self.filenames:
|
if not self.filenames:
|
||||||
raise StopIteration()
|
raise StopIteration()
|
||||||
self.cur_file = open(self.filenames.pop(0))
|
filename = self.filenames.pop(0)
|
||||||
|
if filename.endswith('gz'):
|
||||||
|
self.cur_file = gzip.open(filename, 'r')
|
||||||
|
else:
|
||||||
|
self.cur_file = open(filename)
|
||||||
|
|
||||||
def next(self):
|
def next(self):
|
||||||
l = self.cur_file.readline()
|
l = self.cur_file.readline()
|
||||||
|
@ -770,6 +801,9 @@ if __name__ == '__main__':
|
||||||
default='INFO', type=str,
|
default='INFO', type=str,
|
||||||
help='Loglevel in %s, default : %s' % (['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], 'INFO'))
|
help='Loglevel in %s, default : %s' % (['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], 'INFO'))
|
||||||
|
|
||||||
|
parser.add_argument('-r', '--reset', dest='reset',
|
||||||
|
help='Reset analysis to a specific date (month/year)')
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# Load user conf
|
# Load user conf
|
||||||
|
@ -804,7 +838,7 @@ if __name__ == '__main__':
|
||||||
sys.exit(0)
|
sys.exit(0)
|
||||||
|
|
||||||
if args.stdin:
|
if args.stdin:
|
||||||
iwla.start(sys.stdin)
|
iwla.start(sys.stdin, args)
|
||||||
else:
|
else:
|
||||||
filename = args.file or conf.analyzed_filename
|
filename = args.file or conf.analyzed_filename
|
||||||
iwla.start(FileIter(filename))
|
iwla.start(FileIter(filename), args)
|
||||||
|
|
|
@ -19,6 +19,7 @@
|
||||||
#
|
#
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
import logging
|
||||||
|
|
||||||
from iwla import IWLA
|
from iwla import IWLA
|
||||||
from iplugin import IPlugin
|
from iplugin import IPlugin
|
||||||
|
@ -64,6 +65,7 @@ class IWLAPreAnalysisPageToHit(IPlugin):
|
||||||
self.hp_regexps = self.iwla.getConfValue('hit_to_page_conf', [])
|
self.hp_regexps = self.iwla.getConfValue('hit_to_page_conf', [])
|
||||||
self.hp_regexps = map(lambda(r): re.compile(r), self.hp_regexps)
|
self.hp_regexps = map(lambda(r): re.compile(r), self.hp_regexps)
|
||||||
|
|
||||||
|
self.logger = logging.getLogger(self.__class__.__name__)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def hook(self):
|
def hook(self):
|
||||||
|
@ -85,7 +87,7 @@ class IWLAPreAnalysisPageToHit(IPlugin):
|
||||||
# Page to hit
|
# Page to hit
|
||||||
for regexp in self.ph_regexps:
|
for regexp in self.ph_regexps:
|
||||||
if regexp.match(uri):
|
if regexp.match(uri):
|
||||||
#print '%s is a hit' % (uri )
|
self.logger.debug('%s changed from page to hit' % (uri))
|
||||||
request['is_page'] = False
|
request['is_page'] = False
|
||||||
super_hit['viewed_pages'] -= 1
|
super_hit['viewed_pages'] -= 1
|
||||||
super_hit['viewed_hits'] += 1
|
super_hit['viewed_hits'] += 1
|
||||||
|
@ -94,7 +96,7 @@ class IWLAPreAnalysisPageToHit(IPlugin):
|
||||||
# Hit to page
|
# Hit to page
|
||||||
for regexp in self.hp_regexps:
|
for regexp in self.hp_regexps:
|
||||||
if regexp.match(uri):
|
if regexp.match(uri):
|
||||||
#print '%s is a page' % (uri )
|
self.logger.debug('%s changed from hit to page' % (uri))
|
||||||
request['is_page'] = True
|
request['is_page'] = True
|
||||||
super_hit['viewed_pages'] += 1
|
super_hit['viewed_pages'] += 1
|
||||||
super_hit['viewed_hits'] -= 1
|
super_hit['viewed_hits'] -= 1
|
||||||
|
|
|
@ -20,6 +20,7 @@
|
||||||
|
|
||||||
import re
|
import re
|
||||||
import logging
|
import logging
|
||||||
|
import inspect
|
||||||
|
|
||||||
from iwla import IWLA
|
from iwla import IWLA
|
||||||
from iplugin import IPlugin
|
from iplugin import IPlugin
|
||||||
|
@ -66,7 +67,11 @@ class IWLAPreAnalysisRobots(IPlugin):
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def _setRobot(self, k, super_hit):
    """Flag ``super_hit`` as a robot and log which caller decided so.

    Args:
        k: identifier of the visitor, only used in the debug message.
        super_hit: dict describing the visitor; its ``'robot'`` key is
            set to ``1``.
    """
    # Stack introspection is only useful for the debug message, so skip it
    # entirely when DEBUG records would be discarded anyway.
    if self.logger.isEnabledFor(logging.DEBUG):
        frame = inspect.currentframe()
        # currentframe() may return None on interpreters without frame support.
        caller = frame.f_back if frame is not None else None
        try:
            if caller is not None:
                self.logger.debug('%s is a robot (caller %s:%d)' % (k, caller.f_code.co_name, caller.f_lineno))
            else:
                self.logger.debug('%s is a robot' % (k))
        finally:
            # Drop frame references explicitly to avoid reference cycles.
            del frame, caller
    super_hit['robot'] = 1
|
||||||
|
|
||||||
# Basic rule to detect robots
|
# Basic rule to detect robots
|
||||||
|
@ -84,6 +89,7 @@ class IWLAPreAnalysisRobots(IPlugin):
|
||||||
|
|
||||||
if self.robot_re.match(first_page['http_user_agent']) or\
|
if self.robot_re.match(first_page['http_user_agent']) or\
|
||||||
self.crawl_re.match(first_page['http_user_agent']):
|
self.crawl_re.match(first_page['http_user_agent']):
|
||||||
|
self.logger.debug(first_page['http_user_agent'])
|
||||||
self._setRobot(k, super_hit)
|
self._setRobot(k, super_hit)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
@ -93,6 +99,7 @@ class IWLAPreAnalysisRobots(IPlugin):
|
||||||
break
|
break
|
||||||
|
|
||||||
if isRobot:
|
if isRobot:
|
||||||
|
self.logger.debug(first_page['http_user_agent'])
|
||||||
self._setRobot(k, super_hit)
|
self._setRobot(k, super_hit)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
@ -103,6 +110,7 @@ class IWLAPreAnalysisRobots(IPlugin):
|
||||||
|
|
||||||
# 2) pages without hit --> robot
|
# 2) pages without hit --> robot
|
||||||
if not super_hit['viewed_hits']:
|
if not super_hit['viewed_hits']:
|
||||||
|
self.logger.debug(super_hit)
|
||||||
self._setRobot(k, super_hit)
|
self._setRobot(k, super_hit)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user