40 lines
		
	
	
		
			935 B
		
	
	
	
		
			Python
		
	
	
	
	
	
		
		
			
		
	
	
			40 lines
		
	
	
		
			935 B
		
	
	
	
		
			Python
		
	
	
	
	
	
|   | 
 | ||
|  | # Basic rule to detect robots | ||
|  | 
 | ||
def _is_robot(super_hit):
    """Return True when one not-yet-flagged entry matches a robot rule.

    The rules are checked in this order and the first match wins:

    1. ``super_hit['viewed_pages']`` is falsy.
    2. ``super_hit['viewed_hits']`` is falsy.
    3. An item of ``super_hit['pages']`` requested ``/robots.txt``.
    4. No item of ``super_hit['pages']`` with a falsy ``is_page`` carries a
       truthy ``http_referer``.
    """
    if not super_hit['viewed_pages']:
        return True
    if not super_hit['viewed_hits']:
        return True

    has_referer = False
    for hit in super_hit['pages']:
        # Rule 3 is decisive on its own, so stop scanning immediately.
        if hit['extract_request']['http_uri'] == '/robots.txt':
            return True
        # Rule 4 only looks at non-page requests; one referer is enough.
        if not hit['is_page'] and hit['http_referer']:
            has_referer = True

    return not has_referer


def hook(hits):
    """Flag entries of *hits* that look like robots.

    Args:
        hits: mapping whose values are dicts providing the keys ``'robot'``,
            ``'viewed_pages'``, ``'viewed_hits'`` and ``'pages'``. Each item
            of ``'pages'`` is a dict providing ``'extract_request'`` (itself
            containing ``'http_uri'``), ``'is_page'`` and ``'http_referer'``.

    Returns:
        None. Matching entries are updated in place with ``robot = 1``.
        Entries whose ``'robot'`` value is already truthy are skipped
        without reading any of their other keys.

    Raises:
        KeyError: if an entry that has to be inspected lacks one of the
            keys listed above.
    """
    for super_hit in hits.values():
        if not super_hit['robot'] and _is_robot(super_hit):
            super_hit['robot'] = 1