Compare commits
6d46ac4461...bde91ca936

3 commits:
    bde91ca936
    70de0d3aca
    9939922c31
iwla.py: 28 changed lines
@@ -32,6 +32,7 @@ import logging
 import gettext
 from calendar import monthrange
 from datetime import date, datetime
+import socket
 
 import default_conf as conf
 
@@ -53,6 +54,7 @@ Conf values needed :
     compress_output_files
     excluded_ip
     excluded_domain_name
+    reverse_dns_timeout*
 
 Output files :
     DB_ROOT/meta.db
@@ -133,7 +135,8 @@ class IWLA(object):
 
     ANALYSIS_CLASS = 'HTTP'
     API_VERSION = 1
-    IWLA_VERSION = '0.7'
+    IWLA_VERSION = '0.8'
+    DEFAULT_DNS_TIMEOUT = 0.5
 
     def __init__(self, logLevel, args):
         self.meta_infos = {}
@@ -145,6 +148,9 @@ class IWLA(object):
         self.valid_visitors = None
         self.args = args
 
+        self.reverse_dns_timeout = self.getConfValue('reverse_dns_timeout',
+                                                     IWLA.DEFAULT_DNS_TIMEOUT)
+
         self.log_format_extracted = re.sub(r'([^\$?\w])', r'\\\g<1>', conf.log_format)
         self.log_format_extracted = re.sub(r'\$(\w+)', '(?P<\g<1>>.+)', self.log_format_extracted)
         self.http_request_extracted = re.compile(r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')
@@ -242,6 +248,26 @@ class IWLA(object):
     def getCSSPath(self):
         return conf.css_path
 
+    def reverseDNS(self, hit):
+        if hit.get('dns_name_replaced', False):
+            return hit['remote_addr']
+
+        try:
+            timeout = socket.getdefaulttimeout()
+            if timeout != self.reverse_dns_timeout:
+                socket.setdefaulttimeout(self.reverse_dns_timeout)
+            name, _, _ = socket.gethostbyaddr(hit['remote_ip'])
+            if timeout != self.reverse_dns_timeout:
+                socket.setdefaulttimeout(timeout)
+            hit['remote_addr'] = name.lower()
+            hit['dns_name_replaced'] = True
+        except socket.herror:
+            pass
+        finally:
+            hit['dns_analysed'] = True
+
+        return hit['remote_addr']
+
     def _clearMeta(self):
         self.meta_infos = {
             'last_time' : None,
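For reference, a minimal standalone sketch of the timeout handling introduced by the new IWLA.reverseDNS() method above. The helper name and its 0.5 s default are illustrative only, not part of iwla:

    import socket

    def resolve_with_timeout(ip, timeout=0.5):
        # Hypothetical helper mirroring the pattern above: temporarily lower the
        # process-wide socket timeout, do the reverse lookup, restore the old value.
        previous = socket.getdefaulttimeout()
        try:
            if previous != timeout:
                socket.setdefaulttimeout(timeout)
            name, _, _ = socket.gethostbyaddr(ip)   # (hostname, aliases, ip_list)
            return name.lower()
        except socket.herror:
            return ip   # no PTR record: fall back to the raw address
        finally:
            if socket.getdefaulttimeout() != previous:
                socket.setdefaulttimeout(previous)

socket.setdefaulttimeout() is process-wide, which is why the previous value is saved and restored around the single gethostbyaddr() call instead of being set once at load time, as the reverse_dns plugin did before this change.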
IWLADisplayFeeds plugin:

@@ -59,7 +59,7 @@ class IWLADisplayFeeds(IPlugin):
         return True
 
     def hook(self):
-        from plugins.post_analysis.feeds import IWLAPostAnalysisFeeds
+        from plugins.pre_analysis.feeds import IWLAPostAnalysisFeeds
 
         display = self.iwla.getDisplay()
         hits = self.iwla.getCurrentVisits()
IWLAPostAnalysisFeeds plugin:

@@ -25,7 +25,7 @@ from iwla import IWLA
 from iplugin import IPlugin
 
 """
-Post analysis hook
+Pre analysis hook
 
 Find feeds parsers (first hit in feeds conf value and no viewed pages if it's a robot)
 If merge_feeds_parsers is set to True, merge feeds parsers with the same user agent
@@ -77,6 +77,7 @@ class IWLAPostAnalysisFeeds(IPlugin):
         feeds_agents = self.iwla.getConfValue('feeds_agents', [])
         self.merge_feeds_parsers = self.iwla.getConfValue('merge_feeds_parsers', False)
         _merge_feeds_parsers_list = self.iwla.getConfValue('merge_feeds_parsers_list', [])
+        _no_merge_feeds_parsers_list = self.iwla.getConfValue('no_merge_feeds_parsers_list', [])
 
         if feeds is None: return False
 
@@ -104,6 +105,10 @@ class IWLAPostAnalysisFeeds(IPlugin):
         for f in _merge_feeds_parsers_list:
             self.merge_feeds_parsers_list.append(re.compile(f))
 
+        self.no_merge_feeds_parsers_list = []
+        for f in _no_merge_feeds_parsers_list:
+            self.no_merge_feeds_parsers_list.append(re.compile(f))
+
         self.merged_feeds = {}
 
         return True
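Both the merge list and the new no-merge list are read through getConfValue() and compiled with re.compile(), so they are expected to be lists of regular-expression strings in the configuration. A hypothetical excerpt (only the key names come from the calls above; the pattern values are invented for illustration):

    # Hypothetical configuration excerpt (e.g. overriding default_conf values)
    merge_feeds_parsers = True
    # Parsers matching one of these patterns are merged into a single entry:
    merge_feeds_parsers_list = [r'.*\.amazonaws\.com', r'.*feedly.*']
    # Parsers matching one of these patterns are never merged:
    no_merge_feeds_parsers_list = [r'my-reader\.example\.org']

Since the plugin uses re.match(), each pattern is anchored at the start of the tested string (remote address, IP or user agent, see mergeFeedsParsers() below), hence the leading .* where a substring match is wanted.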
@@ -134,8 +139,11 @@ class IWLAPostAnalysisFeeds(IPlugin):
 
     def mergeFeedsParsers(self, isFeedParser, hit):
         if isFeedParser in (self.FEED_PARSER, self.MERGED_FEED_PARSER):
+            for r in self.no_merge_feeds_parsers_list:
+                if r.match(hit['remote_addr']) or r.match(hit['remote_ip']) or r.match(hit['requests'][0]['http_user_agent']):
+                    return
             for r in self.merge_feeds_parsers_list:
-                if r.match(hit['remote_addr']) or r.match(hit['remote_ip']):
+                if r.match(hit['remote_addr']) or r.match(hit['remote_ip']) or r.match(hit['requests'][0]['http_user_agent']):
                     # One group can view multiple different feeds
                     key = r.pattern + hit.get('feed_domain', '') + hit.get('feed_uri', '')
                     self._appendToMergeCache(isFeedParser, key, hit)
@@ -159,15 +167,6 @@ class IWLAPostAnalysisFeeds(IPlugin):
            if hit['last_access'] > hit.get('feed_parser_last_access', time.gmtime(0)):
                hit['feed_parser_last_access'] = hit['last_access']
 
-           if not hit.get('feed_name_analyzed', False) and\
-              hit.get('dns_name_replaced', False):
-               hit['feed_name_analyzed'] = True
-               addr = hit.get('remote_addr', None)
-               for r in self.bad_feeds_re:
-                   if r.match(addr):
-                       hit['feed_parser'] = self.NOT_A_FEED_PARSER
-                       break
-
            # Register already tagged feed parser in merged_feeds
            if self.merge_feeds_parsers:
                self.mergeFeedsParsers(isFeedParser, hit)
@@ -177,12 +176,12 @@ class IWLAPostAnalysisFeeds(IPlugin):
             isFeedParser = self.NOT_A_FEED_PARSER
             uri = request['extract_request']['extract_uri'].lower()
             for regexp in self.feeds_re:
-                if regexp.match(uri):
+                if regexp.match(uri) and self.iwla.hasBeenViewed(request):
                     isFeedParser = self.FEED_PARSER
-                    # Robot that views pages -> bot
-                    if hit['robot']:
-                        if hit['not_viewed_pages'][0]:
-                            isFeedParser = self.NOT_A_FEED_PARSER
+                    # # Robot that views pages -> bot
+                    # if hit['robot']:
+                    #     if hit['not_viewed_pages'][0]:
+                    #         isFeedParser = self.NOT_A_FEED_PARSER
                     break
 
             user_agent = request['http_user_agent'].lower()
@@ -199,14 +198,17 @@ class IWLAPostAnalysisFeeds(IPlugin):
                     isFeedParser = self.NOT_A_FEED_PARSER
                     break
 
-            if not hit.get('feed_name_analyzed', False) and\
-               hit.get('dns_name_replaced', False):
-                hit['feed_name_analyzed'] = True
-                addr = hit.get('remote_addr', None)
-                for r in self.bad_feeds_re:
-                    if r.match(addr):
-                        isFeedParser = hit['feed_parser'] = self.NOT_A_FEED_PARSER
-                        break
+            if isFeedParser == self.FEED_PARSER:
+                if not hit.get('dns_name_replaced', False):
+                    self.iwla.reverseDNS(hit)
+
+                if not hit.get('feed_name_analyzed', False):
+                    hit['feed_name_analyzed'] = True
+                    addr = hit.get('remote_addr', None)
+                    for r in self.bad_feeds_re:
+                        if r.match(addr):
+                            isFeedParser = self.NOT_A_FEED_PARSER
+                            break
 
             if isFeedParser == self.FEED_PARSER:
                 hit['feed_domain'] = request['server_name']
@@ -217,6 +219,7 @@ class IWLAPostAnalysisFeeds(IPlugin):
                 if subscribers:
                     hit['feed_subscribers'] = int(subscribers.groups()[0])
 
+            hit['robot'] = True
             hit['feed_parser'] = isFeedParser
             if self.merge_feeds_parsers:
                 self.mergeFeedsParsers(isFeedParser, hit)
IWLAPostAnalysisReverseDNS plugin:

@@ -19,12 +19,13 @@
 #
 
 import socket
+import re
 
 from iwla import IWLA
 from iplugin import IPlugin
 
 """
-Post analysis hook
+Pre analysis hook
 
 Replace IP by reverse DNS names
 
@@ -32,7 +33,7 @@ Plugin requirements :
     None
 
 Conf values needed :
-    reverse_dns_timeout*
+    robot_domains*
 
 Output files :
     None
@@ -51,12 +52,13 @@ Statistics deletion :
 """
 
 class IWLAPostAnalysisReverseDNS(IPlugin):
-    DEFAULT_DNS_TIMEOUT = 0.5
 
     def load(self):
-        timeout = self.iwla.getConfValue('reverse_dns_timeout',
-                                         IWLAPostAnalysisReverseDNS.DEFAULT_DNS_TIMEOUT)
-        socket.setdefaulttimeout(timeout)
+        self.robot_domains_re = []
+        robot_domains = self.iwla.getConfValue('robot_domains', [])
+        for domain in robot_domains:
+            self.robot_domains_re.append(re.compile(domain))
+
        return True
 
     def hook(self):
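robot_domains is likewise a list of regular-expression strings, compiled once in load() and matched in hook() (next hunk) against the reverse-resolved remote_addr. A hypothetical excerpt (only the key name comes from the getConfValue() call; the domain patterns are examples):

    # Hypothetical configuration excerpt: visitors whose reverse DNS name
    # matches one of these patterns are flagged as robots.
    robot_domains = [
        r'.*\.googlebot\.com',
        r'.*\.search\.msn\.com',
        r'.*\.crawl\.baidu\.com',
    ]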
|
@ -65,15 +67,13 @@ class IWLAPostAnalysisReverseDNS(IPlugin):
|
||||||
if hit.get('dns_analysed', False): continue
|
if hit.get('dns_analysed', False): continue
|
||||||
# Do reverse for feed parser even if they're not
|
# Do reverse for feed parser even if they're not
|
||||||
# valid visitors
|
# valid visitors
|
||||||
if not hit.get('feed_parser', False) and\
|
if hit.get('robot', False) and not hit.get('feed_parser', False):
|
||||||
not self.iwla.isValidVisitor(hit):
|
|
||||||
continue
|
continue
|
||||||
try:
|
|
||||||
name, _, _ = socket.gethostbyaddr(k)
|
res = self.iwla.reverseDNS(hit)
|
||||||
hit['remote_addr'] = name.lower()
|
|
||||||
hit['dns_name_replaced'] = True
|
for r in self.robot_domains_re:
|
||||||
except:
|
if r.match(hit['remote_addr']):
|
||||||
pass
|
hit['robot'] = True
|
||||||
finally:
|
break
|
||||||
hit['dns_analysed'] = True
|
|
||||||
|
|
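Illustrative only: the shape of a hit after the new hook() has run for a visitor whose PTR record matches one of the robot_domains patterns (keys taken from the diffs above, values made up):

    hit = {
        'remote_ip': '66.249.66.1',
        'remote_addr': 'crawl-66-249-66-1.googlebot.com',  # set by reverseDNS()
        'dns_name_replaced': True,
        'dns_analysed': True,
        'robot': True,   # set because remote_addr matched a robot_domains pattern
    }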