Do a more generic plugin : page_to_hit
This commit is contained in:
		
							
								
								
									
										38
									
								
								plugins/pre_analysis/page_to_hit.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										38
									
								
								plugins/pre_analysis/page_to_hit.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,38 @@ | ||||
| import re | ||||
|  | ||||
| from iwla import IWLA | ||||
| from iplugin import IPlugin | ||||
|  | ||||
| # Reclassify page requests whose URI matches a configured regexp as plain hits | ||||
|  | ||||
class IWLAPreAnalysisPageToHit(IPlugin):
    """Pre-analysis plugin that reclassifies matching page requests as hits.

    The URI patterns come from the ``page_to_hit_conf`` configuration value,
    read as a list of regular expression strings. Every counted page whose
    URI matches one of them is flagged as a non-page and moved from the
    visit's ``viewed_pages`` counter to its ``viewed_hits`` counter.
    """

    def __init__(self, iwla):
        super(IWLAPreAnalysisPageToHit, self).__init__(iwla)
        self.API_VERSION = 1

    def load(self):
        """Read and compile the configured URI patterns.

        Returns:
            False when ``page_to_hit_conf`` is missing or empty, True once
            the patterns have been compiled into ``self.regexps``.
        """
        # NOTE(review): self.iwla is presumably set by IPlugin.__init__ -- confirm.
        self.regexps = self.iwla.getConfValue('page_to_hit_conf', [])
        if not self.regexps:
            return False

        # A list comprehension instead of map(lambda(r): ...): the tuple
        # parameter form is a syntax error on Python 3, and a lazy map object
        # would be exhausted after the first request checked in hook().
        self.regexps = [re.compile(pattern) for pattern in self.regexps]

        return True

    def hook(self, iwla):
        """Demote matching page requests of the current visits to hits.

        Visits flagged as ``robot`` are skipped. Within a visit, only requests
        that are currently pages, have status 200 and share their day of month
        with the visit's ``last_access`` are considered.

        Args:
            iwla: object providing ``getCurrentVisists()``.
        """
        hits = iwla.getCurrentVisists()

        for super_hit in hits.values():
            if super_hit['robot']:
                continue

            for request in super_hit['requests']:
                if not request['is_page']:
                    continue
                if int(request['status']) != 200:
                    continue
                # NOTE(review): presumably the counters only cover the day of
                # the last access, so older requests are left alone -- confirm.
                if request['time_decoded'].tm_mday != super_hit['last_access'].tm_mday:
                    continue

                uri = request['extract_request']['extract_uri']
                # any() stops at the first matching pattern, so each request
                # is moved between the counters at most once.
                if any(regexp.match(uri) for regexp in self.regexps):
                    request['is_page'] = False
                    super_hit['viewed_pages'] -= 1
                    super_hit['viewed_hits'] += 1
| @@ -1,35 +0,0 @@ | ||||
| import re | ||||
|  | ||||
| from iwla import IWLA | ||||
| from iplugin import IPlugin | ||||
|  | ||||
| # Count requests for the indefero logo as hits instead of pages | ||||
|  | ||||
class IWLAPreAnalysisSoutade(IPlugin):
    """Pre-analysis plugin that counts indefero logo requests as hits.

    Page requests whose URI matches ``^.+/logo/$`` are flagged as non-pages
    and moved from ``viewed_pages`` to ``viewed_hits`` on their visit.
    """

    def __init__(self, iwla):
        super(IWLAPreAnalysisSoutade, self).__init__(iwla)
        self.API_VERSION = 1

    def load(self):
        """Compile the logo URI pattern; always returns True."""
        # Pattern for the indefero logo URI.
        self.logo_re = re.compile(r'^.+/logo/$')
        return True

    def hook(self, iwla):
        """Demote logo page requests of every non-robot current visit."""
        visits = iwla.getCurrentVisists()

        for visit in visits.values():
            if visit['robot']:
                continue

            for request in visit['requests']:
                # The conditions short-circuit in this order: still a page,
                # status 200, same day of month as the visit's last access,
                # and finally the logo pattern.
                is_logo_page = (
                    request['is_page']
                    and int(request['status']) == 200
                    and request['time_decoded'].tm_mday == visit['last_access'].tm_mday
                    and self.logo_re.match(request['extract_request']['extract_uri'])
                )
                if is_logo_page:
                    request['is_page'] = False
                    visit['viewed_pages'] -= 1
                    visit['viewed_hits'] += 1
		Reference in New Issue
	
	Block a user