Compare commits

..

No commits in common. "bde91ca9369002d6ec335146349693705c6af70e" and "6d46ac4461019ed345038fef21a8891d8ba9bdae" have entirely different histories.

4 changed files with 43 additions and 72 deletions

28
iwla.py
View File

@ -32,7 +32,6 @@ import logging
import gettext import gettext
from calendar import monthrange from calendar import monthrange
from datetime import date, datetime from datetime import date, datetime
import socket
import default_conf as conf import default_conf as conf
@ -54,7 +53,6 @@ Conf values needed :
compress_output_files compress_output_files
excluded_ip excluded_ip
excluded_domain_name excluded_domain_name
reverse_dns_timeout*
Output files : Output files :
DB_ROOT/meta.db DB_ROOT/meta.db
@ -135,8 +133,7 @@ class IWLA(object):
ANALYSIS_CLASS = 'HTTP' ANALYSIS_CLASS = 'HTTP'
API_VERSION = 1 API_VERSION = 1
IWLA_VERSION = '0.8' IWLA_VERSION = '0.7'
DEFAULT_DNS_TIMEOUT = 0.5
def __init__(self, logLevel, args): def __init__(self, logLevel, args):
self.meta_infos = {} self.meta_infos = {}
@ -148,9 +145,6 @@ class IWLA(object):
self.valid_visitors = None self.valid_visitors = None
self.args = args self.args = args
self.reverse_dns_timeout = self.getConfValue('reverse_dns_timeout',
IWLA.DEFAULT_DNS_TIMEOUT)
self.log_format_extracted = re.sub(r'([^\$?\w])', r'\\\g<1>', conf.log_format) self.log_format_extracted = re.sub(r'([^\$?\w])', r'\\\g<1>', conf.log_format)
self.log_format_extracted = re.sub(r'\$(\w+)', '(?P<\g<1>>.+)', self.log_format_extracted) self.log_format_extracted = re.sub(r'\$(\w+)', '(?P<\g<1>>.+)', self.log_format_extracted)
self.http_request_extracted = re.compile(r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)') self.http_request_extracted = re.compile(r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')
@ -248,26 +242,6 @@ class IWLA(object):
def getCSSPath(self): def getCSSPath(self):
return conf.css_path return conf.css_path
def reverseDNS(self, hit):
if hit.get('dns_name_replaced', False):
return hit['remote_addr']
try:
timeout = socket.getdefaulttimeout()
if timeout != self.reverse_dns_timeout:
socket.setdefaulttimeout(self.reverse_dns_timeout)
name, _, _ = socket.gethostbyaddr(hit['remote_ip'])
if timeout != self.reverse_dns_timeout:
socket.setdefaulttimeout(timeout)
hit['remote_addr'] = name.lower()
hit['dns_name_replaced'] = True
except socket.herror:
pass
finally:
hit['dns_analysed'] = True
return hit['remote_addr']
def _clearMeta(self): def _clearMeta(self):
self.meta_infos = { self.meta_infos = {
'last_time' : None, 'last_time' : None,

View File

@ -59,7 +59,7 @@ class IWLADisplayFeeds(IPlugin):
return True return True
def hook(self): def hook(self):
from plugins.pre_analysis.feeds import IWLAPostAnalysisFeeds from plugins.post_analysis.feeds import IWLAPostAnalysisFeeds
display = self.iwla.getDisplay() display = self.iwla.getDisplay()
hits = self.iwla.getCurrentVisits() hits = self.iwla.getCurrentVisits()

View File

@ -25,7 +25,7 @@ from iwla import IWLA
from iplugin import IPlugin from iplugin import IPlugin
""" """
Pre analysis hook Post analysis hook
Find feeds parsers (first hit in feeds conf value and no viewed pages if it's a robot) Find feeds parsers (first hit in feeds conf value and no viewed pages if it's a robot)
If merge_feeds_parsers is set to True, merge feeds parsers with the same user agent If merge_feeds_parsers is set to True, merge feeds parsers with the same user agent
@ -77,7 +77,6 @@ class IWLAPostAnalysisFeeds(IPlugin):
feeds_agents = self.iwla.getConfValue('feeds_agents', []) feeds_agents = self.iwla.getConfValue('feeds_agents', [])
self.merge_feeds_parsers = self.iwla.getConfValue('merge_feeds_parsers', False) self.merge_feeds_parsers = self.iwla.getConfValue('merge_feeds_parsers', False)
_merge_feeds_parsers_list = self.iwla.getConfValue('merge_feeds_parsers_list', []) _merge_feeds_parsers_list = self.iwla.getConfValue('merge_feeds_parsers_list', [])
_no_merge_feeds_parsers_list = self.iwla.getConfValue('no_merge_feeds_parsers_list', [])
if feeds is None: return False if feeds is None: return False
@ -105,10 +104,6 @@ class IWLAPostAnalysisFeeds(IPlugin):
for f in _merge_feeds_parsers_list: for f in _merge_feeds_parsers_list:
self.merge_feeds_parsers_list.append(re.compile(f)) self.merge_feeds_parsers_list.append(re.compile(f))
self.no_merge_feeds_parsers_list = []
for f in _no_merge_feeds_parsers_list:
self.no_merge_feeds_parsers_list.append(re.compile(f))
self.merged_feeds = {} self.merged_feeds = {}
return True return True
@ -139,11 +134,8 @@ class IWLAPostAnalysisFeeds(IPlugin):
def mergeFeedsParsers(self, isFeedParser, hit): def mergeFeedsParsers(self, isFeedParser, hit):
if isFeedParser in (self.FEED_PARSER, self.MERGED_FEED_PARSER): if isFeedParser in (self.FEED_PARSER, self.MERGED_FEED_PARSER):
for r in self.no_merge_feeds_parsers_list:
if r.match(hit['remote_addr']) or r.match(hit['remote_ip']) or r.match(hit['requests'][0]['http_user_agent']):
return
for r in self.merge_feeds_parsers_list: for r in self.merge_feeds_parsers_list:
if r.match(hit['remote_addr']) or r.match(hit['remote_ip']) or r.match(hit['requests'][0]['http_user_agent']): if r.match(hit['remote_addr']) or r.match(hit['remote_ip']):
# One group can view multiple different feeds # One group can view multiple different feeds
key = r.pattern + hit.get('feed_domain', '') + hit.get('feed_uri', '') key = r.pattern + hit.get('feed_domain', '') + hit.get('feed_uri', '')
self._appendToMergeCache(isFeedParser, key, hit) self._appendToMergeCache(isFeedParser, key, hit)
@ -166,7 +158,16 @@ class IWLAPostAnalysisFeeds(IPlugin):
# Update last access time # Update last access time
if hit['last_access'] > hit.get('feed_parser_last_access', time.gmtime(0)): if hit['last_access'] > hit.get('feed_parser_last_access', time.gmtime(0)):
hit['feed_parser_last_access'] = hit['last_access'] hit['feed_parser_last_access'] = hit['last_access']
if not hit.get('feed_name_analyzed', False) and\
hit.get('dns_name_replaced', False):
hit['feed_name_analyzed'] = True
addr = hit.get('remote_addr', None)
for r in self.bad_feeds_re:
if r.match(addr):
hit['feed_parser'] = self.NOT_A_FEED_PARSER
break
# Register already tagged feed parser in merged_feeds # Register already tagged feed parser in merged_feeds
if self.merge_feeds_parsers: if self.merge_feeds_parsers:
self.mergeFeedsParsers(isFeedParser, hit) self.mergeFeedsParsers(isFeedParser, hit)
@ -176,12 +177,12 @@ class IWLAPostAnalysisFeeds(IPlugin):
isFeedParser = self.NOT_A_FEED_PARSER isFeedParser = self.NOT_A_FEED_PARSER
uri = request['extract_request']['extract_uri'].lower() uri = request['extract_request']['extract_uri'].lower()
for regexp in self.feeds_re: for regexp in self.feeds_re:
if regexp.match(uri) and self.iwla.hasBeenViewed(request): if regexp.match(uri):
isFeedParser = self.FEED_PARSER isFeedParser = self.FEED_PARSER
# # Robot that views pages -> bot # Robot that views pages -> bot
# if hit['robot']: if hit['robot']:
# if hit['not_viewed_pages'][0]: if hit['not_viewed_pages'][0]:
# isFeedParser = self.NOT_A_FEED_PARSER isFeedParser = self.NOT_A_FEED_PARSER
break break
user_agent = request['http_user_agent'].lower() user_agent = request['http_user_agent'].lower()
@ -198,17 +199,14 @@ class IWLAPostAnalysisFeeds(IPlugin):
isFeedParser = self.NOT_A_FEED_PARSER isFeedParser = self.NOT_A_FEED_PARSER
break break
if isFeedParser == self.FEED_PARSER: if not hit.get('feed_name_analyzed', False) and\
if not hit.get('dns_name_replaced', False): hit.get('dns_name_replaced', False):
self.iwla.reverseDNS(hit) hit['feed_name_analyzed'] = True
addr = hit.get('remote_addr', None)
if not hit.get('feed_name_analyzed', False): for r in self.bad_feeds_re:
hit['feed_name_analyzed'] = True if r.match(addr):
addr = hit.get('remote_addr', None) isFeedParser = hit['feed_parser'] = self.NOT_A_FEED_PARSER
for r in self.bad_feeds_re: break
if r.match(addr):
isFeedParser = self.NOT_A_FEED_PARSER
break
if isFeedParser == self.FEED_PARSER: if isFeedParser == self.FEED_PARSER:
hit['feed_domain'] = request['server_name'] hit['feed_domain'] = request['server_name']
@ -219,7 +217,6 @@ class IWLAPostAnalysisFeeds(IPlugin):
if subscribers: if subscribers:
hit['feed_subscribers'] = int(subscribers.groups()[0]) hit['feed_subscribers'] = int(subscribers.groups()[0])
hit['robot'] = True
hit['feed_parser'] = isFeedParser hit['feed_parser'] = isFeedParser
if self.merge_feeds_parsers: if self.merge_feeds_parsers:
self.mergeFeedsParsers(isFeedParser, hit) self.mergeFeedsParsers(isFeedParser, hit)

View File

@ -19,13 +19,12 @@
# #
import socket import socket
import re
from iwla import IWLA from iwla import IWLA
from iplugin import IPlugin from iplugin import IPlugin
""" """
Pre analysis hook Post analysis hook
Replace IP by reverse DNS names Replace IP by reverse DNS names
@ -33,7 +32,7 @@ Plugin requirements :
None None
Conf values needed : Conf values needed :
robot_domains* reverse_dns_timeout*
Output files : Output files :
None None
@ -52,13 +51,12 @@ Statistics deletion :
""" """
class IWLAPostAnalysisReverseDNS(IPlugin): class IWLAPostAnalysisReverseDNS(IPlugin):
DEFAULT_DNS_TIMEOUT = 0.5
def load(self): def load(self):
self.robot_domains_re = [] timeout = self.iwla.getConfValue('reverse_dns_timeout',
robot_domains = self.iwla.getConfValue('robot_domains', []) IWLAPostAnalysisReverseDNS.DEFAULT_DNS_TIMEOUT)
for domain in robot_domains: socket.setdefaulttimeout(timeout)
self.robot_domains_re.append(re.compile(domain))
return True return True
def hook(self): def hook(self):
@ -67,13 +65,15 @@ class IWLAPostAnalysisReverseDNS(IPlugin):
if hit.get('dns_analysed', False): continue if hit.get('dns_analysed', False): continue
# Do reverse for feed parser even if they're not # Do reverse for feed parser even if they're not
# valid visitors # valid visitors
if hit.get('robot', False) and not hit.get('feed_parser', False): if not hit.get('feed_parser', False) and\
not self.iwla.isValidVisitor(hit):
continue continue
try:
res = self.iwla.reverseDNS(hit) name, _, _ = socket.gethostbyaddr(k)
hit['remote_addr'] = name.lower()
for r in self.robot_domains_re: hit['dns_name_replaced'] = True
if r.match(hit['remote_addr']): except:
hit['robot'] = True pass
break finally:
hit['dns_analysed'] = True