Compare commits

..

3 Commits

Author SHA1 Message Date
Gregory Soutade
bde91ca936 Move reverse DNS core management into iwla.py + Add robot_domains configuration 2024-10-27 09:16:01 +01:00
Gregory Soutade
70de0d3aca Add no_merge_feeds_parsers_list conf value 2024-10-27 09:15:39 +01:00
Gregory Soutade
9939922c31 Move feeds and reverse_dns plugins from post_analysis to pre_analysis 2024-10-02 08:27:53 +02:00
4 changed files with 72 additions and 43 deletions

28
iwla.py
View File

@@ -32,6 +32,7 @@ import logging
import gettext import gettext
from calendar import monthrange from calendar import monthrange
from datetime import date, datetime from datetime import date, datetime
import socket
import default_conf as conf import default_conf as conf
@@ -53,6 +54,7 @@ Conf values needed :
compress_output_files compress_output_files
excluded_ip excluded_ip
excluded_domain_name excluded_domain_name
reverse_dns_timeout*
Output files : Output files :
DB_ROOT/meta.db DB_ROOT/meta.db
@@ -133,7 +135,8 @@ class IWLA(object):
ANALYSIS_CLASS = 'HTTP' ANALYSIS_CLASS = 'HTTP'
API_VERSION = 1 API_VERSION = 1
IWLA_VERSION = '0.7' IWLA_VERSION = '0.8'
DEFAULT_DNS_TIMEOUT = 0.5
def __init__(self, logLevel, args): def __init__(self, logLevel, args):
self.meta_infos = {} self.meta_infos = {}
@@ -145,6 +148,9 @@ class IWLA(object):
self.valid_visitors = None self.valid_visitors = None
self.args = args self.args = args
self.reverse_dns_timeout = self.getConfValue('reverse_dns_timeout',
IWLA.DEFAULT_DNS_TIMEOUT)
self.log_format_extracted = re.sub(r'([^\$?\w])', r'\\\g<1>', conf.log_format) self.log_format_extracted = re.sub(r'([^\$?\w])', r'\\\g<1>', conf.log_format)
self.log_format_extracted = re.sub(r'\$(\w+)', '(?P<\g<1>>.+)', self.log_format_extracted) self.log_format_extracted = re.sub(r'\$(\w+)', '(?P<\g<1>>.+)', self.log_format_extracted)
self.http_request_extracted = re.compile(r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)') self.http_request_extracted = re.compile(r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')
@@ -242,6 +248,26 @@ class IWLA(object):
def getCSSPath(self): def getCSSPath(self):
return conf.css_path return conf.css_path
def reverseDNS(self, hit):
    """Resolve hit['remote_ip'] to a host name and store it in hit['remote_addr'].

    The process-wide socket default timeout is temporarily lowered to
    self.reverse_dns_timeout for the lookup and is now ALWAYS restored
    (the previous version restored it only when the lookup succeeded,
    leaking the lowered timeout to the rest of the process on failure).
    On lookup failure the hit keeps its raw IP address.  In every case
    hit['dns_analysed'] is set so callers do not retry the lookup.

    Returns the (possibly updated) hit['remote_addr'].
    """
    # A previous call already replaced the address: nothing to do.
    if hit.get('dns_name_replaced', False):
        return hit['remote_addr']
    saved_timeout = socket.getdefaulttimeout()
    try:
        # NOTE(review): setdefaulttimeout may not bound the libc resolver
        # call itself on all platforms -- TODO confirm this actually caps
        # the reverse-lookup latency as intended.
        if saved_timeout != self.reverse_dns_timeout:
            socket.setdefaulttimeout(self.reverse_dns_timeout)
        name, _, _ = socket.gethostbyaddr(hit['remote_ip'])
        hit['remote_addr'] = name.lower()
        hit['dns_name_replaced'] = True
    except OSError:
        # Covers socket.herror AND socket.gaierror (both OSError
        # subclasses): no PTR record, unresolvable input, timeout...
        # -> keep the raw IP address.
        pass
    finally:
        # Restore the global timeout even when the lookup raised,
        # and mark the hit as analysed so it is not retried.
        if saved_timeout != self.reverse_dns_timeout:
            socket.setdefaulttimeout(saved_timeout)
        hit['dns_analysed'] = True
    return hit['remote_addr']
def _clearMeta(self): def _clearMeta(self):
self.meta_infos = { self.meta_infos = {
'last_time' : None, 'last_time' : None,

View File

@@ -59,7 +59,7 @@ class IWLADisplayFeeds(IPlugin):
return True return True
def hook(self): def hook(self):
from plugins.post_analysis.feeds import IWLAPostAnalysisFeeds from plugins.pre_analysis.feeds import IWLAPostAnalysisFeeds
display = self.iwla.getDisplay() display = self.iwla.getDisplay()
hits = self.iwla.getCurrentVisits() hits = self.iwla.getCurrentVisits()

View File

@@ -25,7 +25,7 @@ from iwla import IWLA
from iplugin import IPlugin from iplugin import IPlugin
""" """
Post analysis hook Pre analysis hook
Find feeds parsers (first hit in feeds conf value and no viewed pages if it's a robot) Find feeds parsers (first hit in feeds conf value and no viewed pages if it's a robot)
If merge_feeds_parsers is set to True, merge feeds parsers with the same user agent If merge_feeds_parsers is set to True, merge feeds parsers with the same user agent
@@ -77,6 +77,7 @@ class IWLAPostAnalysisFeeds(IPlugin):
feeds_agents = self.iwla.getConfValue('feeds_agents', []) feeds_agents = self.iwla.getConfValue('feeds_agents', [])
self.merge_feeds_parsers = self.iwla.getConfValue('merge_feeds_parsers', False) self.merge_feeds_parsers = self.iwla.getConfValue('merge_feeds_parsers', False)
_merge_feeds_parsers_list = self.iwla.getConfValue('merge_feeds_parsers_list', []) _merge_feeds_parsers_list = self.iwla.getConfValue('merge_feeds_parsers_list', [])
_no_merge_feeds_parsers_list = self.iwla.getConfValue('no_merge_feeds_parsers_list', [])
if feeds is None: return False if feeds is None: return False
@@ -104,6 +105,10 @@ class IWLAPostAnalysisFeeds(IPlugin):
for f in _merge_feeds_parsers_list: for f in _merge_feeds_parsers_list:
self.merge_feeds_parsers_list.append(re.compile(f)) self.merge_feeds_parsers_list.append(re.compile(f))
self.no_merge_feeds_parsers_list = []
for f in _no_merge_feeds_parsers_list:
self.no_merge_feeds_parsers_list.append(re.compile(f))
self.merged_feeds = {} self.merged_feeds = {}
return True return True
@@ -134,8 +139,11 @@ class IWLAPostAnalysisFeeds(IPlugin):
def mergeFeedsParsers(self, isFeedParser, hit): def mergeFeedsParsers(self, isFeedParser, hit):
if isFeedParser in (self.FEED_PARSER, self.MERGED_FEED_PARSER): if isFeedParser in (self.FEED_PARSER, self.MERGED_FEED_PARSER):
for r in self.no_merge_feeds_parsers_list:
if r.match(hit['remote_addr']) or r.match(hit['remote_ip']) or r.match(hit['requests'][0]['http_user_agent']):
return
for r in self.merge_feeds_parsers_list: for r in self.merge_feeds_parsers_list:
if r.match(hit['remote_addr']) or r.match(hit['remote_ip']): if r.match(hit['remote_addr']) or r.match(hit['remote_ip']) or r.match(hit['requests'][0]['http_user_agent']):
# One group can view multiple different feeds # One group can view multiple different feeds
key = r.pattern + hit.get('feed_domain', '') + hit.get('feed_uri', '') key = r.pattern + hit.get('feed_domain', '') + hit.get('feed_uri', '')
self._appendToMergeCache(isFeedParser, key, hit) self._appendToMergeCache(isFeedParser, key, hit)
@@ -159,15 +167,6 @@ class IWLAPostAnalysisFeeds(IPlugin):
if hit['last_access'] > hit.get('feed_parser_last_access', time.gmtime(0)): if hit['last_access'] > hit.get('feed_parser_last_access', time.gmtime(0)):
hit['feed_parser_last_access'] = hit['last_access'] hit['feed_parser_last_access'] = hit['last_access']
if not hit.get('feed_name_analyzed', False) and\
hit.get('dns_name_replaced', False):
hit['feed_name_analyzed'] = True
addr = hit.get('remote_addr', None)
for r in self.bad_feeds_re:
if r.match(addr):
hit['feed_parser'] = self.NOT_A_FEED_PARSER
break
# Register already tagged feed parser in merged_feeds # Register already tagged feed parser in merged_feeds
if self.merge_feeds_parsers: if self.merge_feeds_parsers:
self.mergeFeedsParsers(isFeedParser, hit) self.mergeFeedsParsers(isFeedParser, hit)
@@ -177,12 +176,12 @@ class IWLAPostAnalysisFeeds(IPlugin):
isFeedParser = self.NOT_A_FEED_PARSER isFeedParser = self.NOT_A_FEED_PARSER
uri = request['extract_request']['extract_uri'].lower() uri = request['extract_request']['extract_uri'].lower()
for regexp in self.feeds_re: for regexp in self.feeds_re:
if regexp.match(uri): if regexp.match(uri) and self.iwla.hasBeenViewed(request):
isFeedParser = self.FEED_PARSER isFeedParser = self.FEED_PARSER
# Robot that views pages -> bot # # Robot that views pages -> bot
if hit['robot']: # if hit['robot']:
if hit['not_viewed_pages'][0]: # if hit['not_viewed_pages'][0]:
isFeedParser = self.NOT_A_FEED_PARSER # isFeedParser = self.NOT_A_FEED_PARSER
break break
user_agent = request['http_user_agent'].lower() user_agent = request['http_user_agent'].lower()
@@ -199,13 +198,16 @@ class IWLAPostAnalysisFeeds(IPlugin):
isFeedParser = self.NOT_A_FEED_PARSER isFeedParser = self.NOT_A_FEED_PARSER
break break
if not hit.get('feed_name_analyzed', False) and\ if isFeedParser == self.FEED_PARSER:
hit.get('dns_name_replaced', False): if not hit.get('dns_name_replaced', False):
self.iwla.reverseDNS(hit)
if not hit.get('feed_name_analyzed', False):
hit['feed_name_analyzed'] = True hit['feed_name_analyzed'] = True
addr = hit.get('remote_addr', None) addr = hit.get('remote_addr', None)
for r in self.bad_feeds_re: for r in self.bad_feeds_re:
if r.match(addr): if r.match(addr):
isFeedParser = hit['feed_parser'] = self.NOT_A_FEED_PARSER isFeedParser = self.NOT_A_FEED_PARSER
break break
if isFeedParser == self.FEED_PARSER: if isFeedParser == self.FEED_PARSER:
@@ -217,6 +219,7 @@ class IWLAPostAnalysisFeeds(IPlugin):
if subscribers: if subscribers:
hit['feed_subscribers'] = int(subscribers.groups()[0]) hit['feed_subscribers'] = int(subscribers.groups()[0])
hit['robot'] = True
hit['feed_parser'] = isFeedParser hit['feed_parser'] = isFeedParser
if self.merge_feeds_parsers: if self.merge_feeds_parsers:
self.mergeFeedsParsers(isFeedParser, hit) self.mergeFeedsParsers(isFeedParser, hit)

View File

@@ -19,12 +19,13 @@
# #
import socket import socket
import re
from iwla import IWLA from iwla import IWLA
from iplugin import IPlugin from iplugin import IPlugin
""" """
Post analysis hook Pre analysis hook
Replace IP by reverse DNS names Replace IP by reverse DNS names
@@ -32,7 +33,7 @@ Plugin requirements :
None None
Conf values needed : Conf values needed :
reverse_dns_timeout* robot_domains*
Output files : Output files :
None None
@@ -51,12 +52,13 @@ Statistics deletion :
""" """
class IWLAPostAnalysisReverseDNS(IPlugin): class IWLAPostAnalysisReverseDNS(IPlugin):
DEFAULT_DNS_TIMEOUT = 0.5
def load(self): def load(self):
timeout = self.iwla.getConfValue('reverse_dns_timeout', self.robot_domains_re = []
IWLAPostAnalysisReverseDNS.DEFAULT_DNS_TIMEOUT) robot_domains = self.iwla.getConfValue('robot_domains', [])
socket.setdefaulttimeout(timeout) for domain in robot_domains:
self.robot_domains_re.append(re.compile(domain))
return True return True
def hook(self): def hook(self):
@@ -65,15 +67,13 @@ class IWLAPostAnalysisReverseDNS(IPlugin):
if hit.get('dns_analysed', False): continue if hit.get('dns_analysed', False): continue
# Do reverse for feed parser even if they're not # Do reverse for feed parser even if they're not
# valid visitors # valid visitors
if not hit.get('feed_parser', False) and\ if hit.get('robot', False) and not hit.get('feed_parser', False):
not self.iwla.isValidVisitor(hit):
continue continue
try:
name, _, _ = socket.gethostbyaddr(k) res = self.iwla.reverseDNS(hit)
hit['remote_addr'] = name.lower()
hit['dns_name_replaced'] = True for r in self.robot_domains_re:
except: if r.match(hit['remote_addr']):
pass hit['robot'] = True
finally: break
hit['dns_analysed'] = True