43 lines
		
	
	
		
			1.0 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			43 lines
		
	
	
		
			1.0 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
# Basic rule to detect robots

def hook(hits):
    """Flag likely robot visitors in *hits* (mutates each entry in place).

    Each value of *hits* is a per-visitor dict expected to hold at least
    the keys 'robot', 'viewed_hits', 'viewed_pages' and 'pages' (a list
    of request dicts with 'extract_request', 'is_page', 'http_referer').

    A visitor is marked as a robot ('robot' set to 1) when any of:
      - it has page views but produced no hits,
      - it requested /robots.txt,
      - none of its plain (non-page) hits carried an HTTP referer.
    Visitors with hits but no page view are tagged 'hit_only' = 1 and
    still analysed by the remaining rules.
    """
    for super_hit in hits.values():
        # Already classified on a previous pass: nothing more to do.
        if super_hit['robot']:
            continue

        is_robot = False
        referers = 0

        # 1) Pages recorded without any hit --> robot.
        if not super_hit['viewed_hits']:
            super_hit['robot'] = 1
            continue
        elif not super_hit['viewed_pages']:
            # Hits only (no page view): tag, but keep analysing.
            super_hit['hit_only'] = 1

        for hit in super_hit['pages']:
            # 2) /robots.txt requested --> robot, no need to look further.
            if hit['extract_request']['http_uri'] == '/robots.txt':
                is_robot = True
                break

            # 3) Count plain hits that carry an HTTP referer; real
            # browsers normally send one when fetching page assets.
            if not hit['is_page'] and hit['http_referer']:
                referers += 1

        if is_robot:
            super_hit['robot'] = 1
            continue

        # 4) Hits present but none with a referer --> robot.
        # ('viewed_hits' is necessarily truthy here: the falsy case
        # already hit `continue` in rule 1 above.)
        if not referers:
            super_hit['robot'] = 1
            continue