Compare commits

..

No commits in common. "bde91ca9369002d6ec335146349693705c6af70e" and "6d46ac4461019ed345038fef21a8891d8ba9bdae" have entirely different histories.

4 changed files with 43 additions and 72 deletions

28
iwla.py
View File

@ -32,7 +32,6 @@ import logging
import gettext
from calendar import monthrange
from datetime import date, datetime
import socket
import default_conf as conf
@ -54,7 +53,6 @@ Conf values needed :
compress_output_files
excluded_ip
excluded_domain_name
reverse_dns_timeout*
Output files :
DB_ROOT/meta.db
@ -135,8 +133,7 @@ class IWLA(object):
ANALYSIS_CLASS = 'HTTP'
API_VERSION = 1
IWLA_VERSION = '0.8'
DEFAULT_DNS_TIMEOUT = 0.5
IWLA_VERSION = '0.7'
def __init__(self, logLevel, args):
self.meta_infos = {}
@ -148,9 +145,6 @@ class IWLA(object):
self.valid_visitors = None
self.args = args
self.reverse_dns_timeout = self.getConfValue('reverse_dns_timeout',
IWLA.DEFAULT_DNS_TIMEOUT)
self.log_format_extracted = re.sub(r'([^\$?\w])', r'\\\g<1>', conf.log_format)
self.log_format_extracted = re.sub(r'\$(\w+)', '(?P<\g<1>>.+)', self.log_format_extracted)
self.http_request_extracted = re.compile(r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')
@ -248,26 +242,6 @@ class IWLA(object):
def getCSSPath(self):
return conf.css_path
def reverseDNS(self, hit):
    """Replace the hit's address by its reverse DNS name when one exists.

    The lookup is skipped when the hit is already flagged with
    'dns_name_replaced'. On success, hit['remote_addr'] receives the
    lower-cased host name and 'dns_name_replaced' is set to True. Whatever
    the outcome of the lookup, 'dns_analysed' is set to True.

    hit -- visit dictionary; reads 'remote_ip' and 'remote_addr'.

    Returns hit['remote_addr'] (resolved name, or the previous value when
    the lookup failed with socket.herror).
    """
    if hit.get('dns_name_replaced', False):
        return hit['remote_addr']

    previous_timeout = socket.getdefaulttimeout()
    override_timeout = previous_timeout != self.reverse_dns_timeout
    try:
        if override_timeout:
            # NOTE(review): the socket documentation describes this value
            # as the default for *new socket objects*; whether it really
            # bounds gethostbyaddr() depends on the resolver -- confirm.
            socket.setdefaulttimeout(self.reverse_dns_timeout)
        name = socket.gethostbyaddr(hit['remote_ip'])[0]
        hit['remote_addr'] = name.lower()
        hit['dns_name_replaced'] = True
    except socket.herror:
        # No reverse entry for this address: keep the current remote_addr.
        pass
    finally:
        # Restore the process-wide default on every exit path, not only
        # after a successful lookup, so a failed resolution cannot leak
        # the shortened timeout to unrelated sockets.
        if override_timeout:
            socket.setdefaulttimeout(previous_timeout)
        hit['dns_analysed'] = True

    return hit['remote_addr']
def _clearMeta(self):
self.meta_infos = {
'last_time' : None,

View File

@ -59,7 +59,7 @@ class IWLADisplayFeeds(IPlugin):
return True
def hook(self):
from plugins.pre_analysis.feeds import IWLAPostAnalysisFeeds
from plugins.post_analysis.feeds import IWLAPostAnalysisFeeds
display = self.iwla.getDisplay()
hits = self.iwla.getCurrentVisits()

View File

@ -25,7 +25,7 @@ from iwla import IWLA
from iplugin import IPlugin
"""
Pre analysis hook
Post analysis hook
Find feeds parsers (first hit in feeds conf value and no viewed pages if it's a robot)
If merge_feeds_parsers is set to True, merge feeds parsers with the same user agent
@ -77,7 +77,6 @@ class IWLAPostAnalysisFeeds(IPlugin):
feeds_agents = self.iwla.getConfValue('feeds_agents', [])
self.merge_feeds_parsers = self.iwla.getConfValue('merge_feeds_parsers', False)
_merge_feeds_parsers_list = self.iwla.getConfValue('merge_feeds_parsers_list', [])
_no_merge_feeds_parsers_list = self.iwla.getConfValue('no_merge_feeds_parsers_list', [])
if feeds is None: return False
@ -105,10 +104,6 @@ class IWLAPostAnalysisFeeds(IPlugin):
for f in _merge_feeds_parsers_list:
self.merge_feeds_parsers_list.append(re.compile(f))
self.no_merge_feeds_parsers_list = []
for f in _no_merge_feeds_parsers_list:
self.no_merge_feeds_parsers_list.append(re.compile(f))
self.merged_feeds = {}
return True
@ -139,11 +134,8 @@ class IWLAPostAnalysisFeeds(IPlugin):
def mergeFeedsParsers(self, isFeedParser, hit):
if isFeedParser in (self.FEED_PARSER, self.MERGED_FEED_PARSER):
for r in self.no_merge_feeds_parsers_list:
if r.match(hit['remote_addr']) or r.match(hit['remote_ip']) or r.match(hit['requests'][0]['http_user_agent']):
return
for r in self.merge_feeds_parsers_list:
if r.match(hit['remote_addr']) or r.match(hit['remote_ip']) or r.match(hit['requests'][0]['http_user_agent']):
if r.match(hit['remote_addr']) or r.match(hit['remote_ip']):
# One group can view multiple different feeds
key = r.pattern + hit.get('feed_domain', '') + hit.get('feed_uri', '')
self._appendToMergeCache(isFeedParser, key, hit)
@ -167,6 +159,15 @@ class IWLAPostAnalysisFeeds(IPlugin):
if hit['last_access'] > hit.get('feed_parser_last_access', time.gmtime(0)):
hit['feed_parser_last_access'] = hit['last_access']
if not hit.get('feed_name_analyzed', False) and\
hit.get('dns_name_replaced', False):
hit['feed_name_analyzed'] = True
addr = hit.get('remote_addr', None)
for r in self.bad_feeds_re:
if r.match(addr):
hit['feed_parser'] = self.NOT_A_FEED_PARSER
break
# Register already tagged feed parser in merged_feeds
if self.merge_feeds_parsers:
self.mergeFeedsParsers(isFeedParser, hit)
@ -176,12 +177,12 @@ class IWLAPostAnalysisFeeds(IPlugin):
isFeedParser = self.NOT_A_FEED_PARSER
uri = request['extract_request']['extract_uri'].lower()
for regexp in self.feeds_re:
if regexp.match(uri) and self.iwla.hasBeenViewed(request):
if regexp.match(uri):
isFeedParser = self.FEED_PARSER
# # Robot that views pages -> bot
# if hit['robot']:
# if hit['not_viewed_pages'][0]:
# isFeedParser = self.NOT_A_FEED_PARSER
# Robot that views pages -> bot
if hit['robot']:
if hit['not_viewed_pages'][0]:
isFeedParser = self.NOT_A_FEED_PARSER
break
user_agent = request['http_user_agent'].lower()
@ -198,17 +199,14 @@ class IWLAPostAnalysisFeeds(IPlugin):
isFeedParser = self.NOT_A_FEED_PARSER
break
if isFeedParser == self.FEED_PARSER:
if not hit.get('dns_name_replaced', False):
self.iwla.reverseDNS(hit)
if not hit.get('feed_name_analyzed', False):
hit['feed_name_analyzed'] = True
addr = hit.get('remote_addr', None)
for r in self.bad_feeds_re:
if r.match(addr):
isFeedParser = self.NOT_A_FEED_PARSER
break
if not hit.get('feed_name_analyzed', False) and\
hit.get('dns_name_replaced', False):
hit['feed_name_analyzed'] = True
addr = hit.get('remote_addr', None)
for r in self.bad_feeds_re:
if r.match(addr):
isFeedParser = hit['feed_parser'] = self.NOT_A_FEED_PARSER
break
if isFeedParser == self.FEED_PARSER:
hit['feed_domain'] = request['server_name']
@ -219,7 +217,6 @@ class IWLAPostAnalysisFeeds(IPlugin):
if subscribers:
hit['feed_subscribers'] = int(subscribers.groups()[0])
hit['robot'] = True
hit['feed_parser'] = isFeedParser
if self.merge_feeds_parsers:
self.mergeFeedsParsers(isFeedParser, hit)

View File

@ -19,13 +19,12 @@
#
import socket
import re
from iwla import IWLA
from iplugin import IPlugin
"""
Pre analysis hook
Post analysis hook
Replace IP by reverse DNS names
@ -33,7 +32,7 @@ Plugin requirements :
None
Conf values needed :
robot_domains*
reverse_dns_timeout*
Output files :
None
@ -52,13 +51,12 @@ Statistics deletion :
"""
class IWLAPostAnalysisReverseDNS(IPlugin):
DEFAULT_DNS_TIMEOUT = 0.5
def load(self):
self.robot_domains_re = []
robot_domains = self.iwla.getConfValue('robot_domains', [])
for domain in robot_domains:
self.robot_domains_re.append(re.compile(domain))
timeout = self.iwla.getConfValue('reverse_dns_timeout',
IWLAPostAnalysisReverseDNS.DEFAULT_DNS_TIMEOUT)
socket.setdefaulttimeout(timeout)
return True
def hook(self):
@ -67,13 +65,15 @@ class IWLAPostAnalysisReverseDNS(IPlugin):
if hit.get('dns_analysed', False): continue
# Do reverse for feed parser even if they're not
# valid visitors
if hit.get('robot', False) and not hit.get('feed_parser', False):
if not hit.get('feed_parser', False) and\
not self.iwla.isValidVisitor(hit):
continue
res = self.iwla.reverseDNS(hit)
for r in self.robot_domains_re:
if r.match(hit['remote_addr']):
hit['robot'] = True
break
try:
name, _, _ = socket.gethostbyaddr(k)
hit['remote_addr'] = name.lower()
hit['dns_name_replaced'] = True
except:
pass
finally:
hit['dns_analysed'] = True