import re
import time

from iwla import IWLA
from iplugin import IPlugin


class IWLAPreAnalysisPageToHit(IPlugin):
    """Pre-analysis plugin that downgrades matching pages to plain hits.

    Every request currently flagged as a page whose URI matches one of the
    regular expressions listed in the ``page_to_hit_conf`` configuration
    value is re-flagged as a non-page, and the owning visit's
    ``viewed_pages`` / ``viewed_hits`` counters are adjusted accordingly.
    """

    def __init__(self, iwla):
        super(IWLAPreAnalysisPageToHit, self).__init__(iwla)
        self.API_VERSION = 1

    def load(self):
        """Read and compile the patterns from ``page_to_hit_conf``.

        Returns:
            False when no pattern is configured (``self.regexps`` is then
            left as the raw, empty configuration value); True once
            ``self.regexps`` holds the compiled patterns.
            NOTE(review): False presumably tells the plugin loader to skip
            this plugin -- confirm against the loader.
        """
        self.regexps = self.iwla.getConfValue('page_to_hit_conf', [])
        if not self.regexps:
            return False

        # Build a real list: a Python 3 ``map`` object is a one-shot iterator
        # and would be exhausted after the first URI tested in hook().  The
        # former ``lambda(r): ...`` form is also a SyntaxError on Python 3.
        self.regexps = [re.compile(pattern) for pattern in self.regexps]
        return True

    def hook(self, iwla):
        """Turn matching page requests of the current visits into hits.

        Robot visits are skipped entirely.  Within the remaining visits only
        requests that are flagged as pages, carry a status listed in
        ``viewed_http_codes`` (default ``[200, 304]``) and are not older than
        the analysis start time are considered.

        Args:
            iwla: object providing ``getCurrentVisists()``.
                NOTE(review): ``self.iwla`` is used for the other lookups;
                presumably both refer to the same instance -- confirm.
        """
        # assumes getStartAnalysisTime() and p['time_decoded'] are values
        # accepted by time.mktime (struct_time / 9-tuple) -- TODO confirm
        start_time = time.mktime(self.iwla.getStartAnalysisTime())

        hits = iwla.getCurrentVisists()
        viewed_http_codes = self.iwla.getConfValue('viewed_http_codes',
                                                   [200, 304])

        for super_hit in hits.values():
            if super_hit['robot']:
                continue

            for p in super_hit['requests']:
                if not p['is_page']:
                    continue
                if int(p['status']) not in viewed_http_codes:
                    continue
                if time.mktime(p['time_decoded']) < start_time:
                    continue

                uri = p['extract_request']['extract_uri']
                # re.match anchors at the start of the URI; the first
                # matching pattern is enough to reclassify the request.
                if any(r.match(uri) for r in self.regexps):
                    p['is_page'] = False
                    super_hit['viewed_pages'] -= 1
                    super_hit['viewed_hits'] += 1