Compare commits


14 Commits
v0.7 ... master

Author SHA1 Message Date
b2c9879412 Update ChangeLog 2025-02-03 10:06:16 +01:00
8691302741 Update documentation 2025-02-03 10:06:08 +01:00
Gregory Soutade 53e7390b77 Update AWStats data to version 8.0 2025-02-03 09:49:01 +01:00
Gregory Soutade 9b32a81ddb Add "ignore_url" parameter to iwla 2025-02-03 08:04:57 +01:00
Gregory Soutade 7b0ca661a1 Add rule for robot : forbid only "1 page and 1 hit" 2025-02-03 08:00:25 +01:00
Gregory Soutade 4d0b993aec Update default conf 2024-10-27 09:18:04 +01:00
Gregory Soutade 0211596508 Fix potential division by 0 2024-10-27 09:17:53 +01:00
Gregory Soutade bde91ca936 Move reverse DNS core management into iwla.py + Add robot_domains configuration 2024-10-27 09:16:01 +01:00
Gregory Soutade 70de0d3aca Add no_merge_feeds_parsers_list conf value 2024-10-27 09:15:39 +01:00
Gregory Soutade 9939922c31 Move feeds and reverse_dns plugins from post_analysis to pre_analysis 2024-10-02 08:27:53 +02:00
Gregory Soutade 6d46ac4461 Robots: Improve compatible keyword detection for robots 2024-07-28 09:25:40 +02:00
Gregory Soutade 46c9ae4f15 Feeds: Add domain and number of subscribers for feed parser. Set correct date for merged feed parsers. Remove bad BAD_FEED_PARSER state 2024-07-28 09:25:06 +02:00
Gregory Soutade 122ee875fa Sanitize requests before analyze 2024-07-28 09:24:52 +02:00
Gregory Soutade a03b1dfc4f Core: Add multimedia_re filter 2024-07-28 09:24:33 +02:00
14 changed files with 381 additions and 227 deletions

View File

@@ -1,3 +1,22 @@
v0.8 (03/02/2025)
** User **
Add multimedia_re filter to detect multimedia files by regular expression
Add domain and number of subscribers for feed parser
Add "no_merge_feeds_parsers"_list conf value
Add "robot_domains" configuration value
Add rule for robots: forbid only "1 page and 1 hit"
Add "ignore_url" conf value
** Dev **
Sanitize HTTP requests before analysis
Try to detect robots by "compatible" strings
Move feeds and reverse_dns plugins from post_analysis to pre_analysis
Move reverse DNS core management into iwla.py
** Bugs **
Fix potential division by 0
v0.7 (17/03/2024)
** User **
Awstats data updated (7.9)

File diff suppressed because one or more lines are too long

View File

@@ -38,12 +38,16 @@ pages_extensions = ['/', 'htm', 'html', 'xhtml', 'py', 'pl', 'rb', 'php']
# HTTP codes that are considered OK
viewed_http_codes = [200, 304]
# URLs to ignore
ignore_url = []
# If False, don't count visitors that never GET a page but only resources (images, rss...)
count_hit_only_visitors = False
# Multimedia extensions (not accounted as downloaded files)
multimedia_files = ['png', 'jpg', 'jpeg', 'gif', 'ico', 'svg',
'css', 'js']
# Same as above, but as regular expressions matched against the URI
multimedia_files_re = []
# Default resources path (will be symlinked in DISPLAY_OUTPUT)
resources_path = ['resources']
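
A minimal sketch of how the two new filters might be overridden in a local configuration (hypothetical patterns; both lists are compiled with re.compile() and applied with match(), i.e. anchored at the start of the URI):

# Hypothetical local overrides of the new default_conf values
ignore_url = [r'/piwik\.php', r'/internal/']
multimedia_files_re = [r'.*\.(webp|avif)$']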
@@ -69,3 +73,9 @@ excluded_domain_name = []
# Domains that set no-referer as Referer-Policy
no_referrer_domains = []
# Domains used by robots (matched against the visitor's reverse DNS name)
robot_domains = []
# Feed agent identifiers (matched against the user agent)
feeds_agents = [r'.*NextCloud-News']
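
And a hedged example for the two new visitor-classification values (hypothetical patterns; robot_domains is matched against the visitor's reverse DNS name, feeds_agents against the user agent):

# Hypothetical local overrides
robot_domains = [r'.*\.search\.msn\.com$']
feeds_agents = [r'.*NextCloud-News', r'.*FreshRSS.*']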

View File

@@ -207,7 +207,7 @@ class DisplayHTMLBlockTable(DisplayHTMLBlock):
self.insertCol(column_insertion, self.iwla._('Ratio'), u'iwla_hit')
for (index, r) in enumerate(self.rows):
val = r[column] and int(r[column]) or 0
self.setCellValue(index, column_insertion, '%.1f%%' % (float(val*100)/float(total)))
self.setCellValue(index, column_insertion, '%.1f%%' % (total and float(val*100)/float(total) or 0))
def _filter(self, function, column, args):
target_col = None
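
The fix relies on Python's short-circuit and/or idiom: when total is 0 (falsy), the expression evaluates to 0 before the division is attempted. A standalone sketch:

# and/or guard: no ZeroDivisionError when total == 0
val, total = 5, 0
ratio = total and float(val * 100) / float(total) or 0
print(ratio)  # 0

(The usual caveat applies: a legitimate ratio of exactly 0.0 also falls through to the or branch, which is harmless here.)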

View File

@@ -134,19 +134,19 @@ Optional configuration values end with *.
* plugins/display/visitor_ip.py
* plugins/post_analysis/anonymize_ip.py
* plugins/post_analysis/browsers.py
* plugins/post_analysis/feeds.py
* plugins/post_analysis/filter_users.py
* plugins/post_analysis/hours_stats.py
* plugins/post_analysis/ip_to_geo.py
* plugins/post_analysis/ip_type.py
* plugins/post_analysis/operating_systems.py
* plugins/post_analysis/referers.py
* plugins/post_analysis/reverse_dns.py
* plugins/post_analysis/subdomains.py
* plugins/post_analysis/top_downloads.py
* plugins/post_analysis/top_hits.py
* plugins/post_analysis/top_pages.py
* plugins/pre_analysis/feeds.py
* plugins/pre_analysis/page_to_hit.py
* plugins/pre_analysis/reverse_dns.py
* plugins/pre_analysis/robots.py
@@ -164,9 +164,13 @@ iwla
analyzed_filename
domain_name
locales_path
locale
keep_requests*
compress_output_files
excluded_ip
excluded_domain_name
reverse_dns_timeout*
ignore_url*
Output files :
DB_ROOT/meta.db
@@ -866,42 +870,6 @@ plugins.post_analysis.browsers
None
plugins.post_analysis.feeds
---------------------------
Post analysis hook
Find feed parsers (first hit matches a feeds conf value; no viewed pages if it's a robot)
If merge_feeds_parsers is set to True, merge feed parsers that share a user agent,
as they are assumed to be the same person behind different IP addresses.
Warning: when merge_feeds_parsers is activated, the displayed last access date is the most
recent date of all merged parsers found
Plugin requirements :
None
Conf values needed :
feeds
feeds_agents*
merge_feeds_parsers*
Output files :
None
Statistics creation :
remote_ip =>
feed_parser
feed_name_analysed
feed_parser_last_access (for merged parser)
Statistics update :
None
Statistics deletion :
None
plugins.post_analysis.filter_users
----------------------------------
@@ -1128,35 +1096,6 @@ plugins.post_analysis.referers
None
plugins.post_analysis.reverse_dns
---------------------------------
Post analysis hook
Replace IP addresses by reverse DNS names
Plugin requirements :
None
Conf values needed :
reverse_dns_timeout*
Output files :
None
Statistics creation :
None
Statistics update :
valid_visitors:
remote_addr
dns_name_replaced
dns_analysed
Statistics deletion :
None
plugins.post_analysis.subdomains
--------------------------------
@@ -1269,6 +1208,45 @@ plugins.post_analysis.top_pages
None
plugins.pre_analysis.feeds
--------------------------
Pre analysis hook
Find feed parsers (first hit matches a feeds conf value; no viewed pages if it's a robot)
If merge_feeds_parsers is set to True, merge feed parsers that share a user agent,
as they are assumed to be the same person behind different IP addresses.
Warning: when merge_feeds_parsers is activated, the displayed last access date is the most
recent date of all merged parsers found
Plugin requirements :
None
Conf values needed :
feeds
feeds_agents*
merge_feeds_parsers*
Output files :
None
Statistics creation :
remote_ip =>
feed_parser
feed_name_analyzed
feed_parser_last_access (for merged parser)
feed_domain
feed_uri
feed_subscribers
Statistics update :
None
Statistics deletion :
None
plugins.pre_analysis.page_to_hit
--------------------------------
@@ -1297,6 +1275,35 @@ plugins.pre_analysis.page_to_hit
None
plugins.pre_analysis.reverse_dns
--------------------------------
Pre analysis hook
Replace IP addresses by reverse DNS names
Plugin requirements :
None
Conf values needed :
robot_domains*
Output files :
None
Statistics creation :
None
Statistics update :
valid_visitors:
remote_addr
dns_name_replaced
dns_analysed
Statistics deletion :
None
plugins.pre_analysis.robots
---------------------------

View File

@@ -22,19 +22,19 @@
* plugins/display/visitor_ip.py
* plugins/post_analysis/anonymize_ip.py
* plugins/post_analysis/browsers.py
* plugins/post_analysis/feeds.py
* plugins/post_analysis/filter_users.py
* plugins/post_analysis/hours_stats.py
* plugins/post_analysis/ip_to_geo.py
* plugins/post_analysis/ip_type.py
* plugins/post_analysis/operating_systems.py
* plugins/post_analysis/referers.py
* plugins/post_analysis/reverse_dns.py
* plugins/post_analysis/subdomains.py
* plugins/post_analysis/top_downloads.py
* plugins/post_analysis/top_hits.py
* plugins/post_analysis/top_pages.py
* plugins/pre_analysis/feeds.py
* plugins/pre_analysis/page_to_hit.py
* plugins/pre_analysis/reverse_dns.py
* plugins/pre_analysis/robots.py
@@ -52,9 +52,13 @@ iwla
analyzed_filename
domain_name
locales_path
locale
keep_requests*
compress_output_files
excluded_ip
excluded_domain_name
reverse_dns_timeout*
ignore_url*
Output files :
DB_ROOT/meta.db
@@ -754,42 +758,6 @@ plugins.post_analysis.browsers
None
plugins.post_analysis.feeds
---------------------------
Post analysis hook
Find feed parsers (first hit matches a feeds conf value; no viewed pages if it's a robot)
If merge_feeds_parsers is set to True, merge feed parsers that share a user agent,
as they are assumed to be the same person behind different IP addresses.
Warning: when merge_feeds_parsers is activated, the displayed last access date is the most
recent date of all merged parsers found
Plugin requirements :
None
Conf values needed :
feeds
feeds_agents*
merge_feeds_parsers*
Output files :
None
Statistics creation :
remote_ip =>
feed_parser
feed_name_analysed
feed_parser_last_access (for merged parser)
Statistics update :
None
Statistics deletion :
None
plugins.post_analysis.filter_users
----------------------------------
@@ -1016,35 +984,6 @@ plugins.post_analysis.referers
None
plugins.post_analysis.reverse_dns
---------------------------------
Post analysis hook
Replace IP addresses by reverse DNS names
Plugin requirements :
None
Conf values needed :
reverse_dns_timeout*
Output files :
None
Statistics creation :
None
Statistics update :
valid_visitors:
remote_addr
dns_name_replaced
dns_analysed
Statistics deletion :
None
plugins.post_analysis.subdomains
--------------------------------
@@ -1157,6 +1096,45 @@ plugins.post_analysis.top_pages
None
plugins.pre_analysis.feeds
--------------------------
Pre analysis hook
Find feed parsers (first hit matches a feeds conf value; no viewed pages if it's a robot)
If merge_feeds_parsers is set to True, merge feed parsers that share a user agent,
as they are assumed to be the same person behind different IP addresses.
Warning: when merge_feeds_parsers is activated, the displayed last access date is the most
recent date of all merged parsers found
Plugin requirements :
None
Conf values needed :
feeds
feeds_agents*
merge_feeds_parsers*
Output files :
None
Statistics creation :
remote_ip =>
feed_parser
feed_name_analyzed
feed_parser_last_access (for merged parser)
feed_domain
feed_uri
feed_subscribers
Statistics update :
None
Statistics deletion :
None
plugins.pre_analysis.page_to_hit
--------------------------------
@@ -1185,6 +1163,35 @@ plugins.pre_analysis.page_to_hit
None
plugins.pre_analysis.reverse_dns
--------------------------------
Pre analysis hook
Replace IP addresses by reverse DNS names
Plugin requirements :
None
Conf values needed :
robot_domains*
Output files :
None
Statistics creation :
None
Statistics update :
valid_visitors:
remote_addr
dns_name_replaced
dns_analysed
Statistics deletion :
None
plugins.pre_analysis.robots
---------------------------

iwla.py
View File

@@ -32,6 +32,7 @@ import logging
import gettext
from calendar import monthrange
from datetime import date, datetime
import socket
import default_conf as conf
@@ -50,9 +51,13 @@ Conf values needed :
analyzed_filename
domain_name
locales_path
locale
keep_requests*
compress_output_files
excluded_ip
excluded_domain_name
reverse_dns_timeout*
ignore_url*
Output files :
DB_ROOT/meta.db
@@ -133,7 +138,8 @@ class IWLA(object):
ANALYSIS_CLASS = 'HTTP'
API_VERSION = 1
IWLA_VERSION = '0.7'
IWLA_VERSION = '0.8'
DEFAULT_DNS_TIMEOUT = 0.5
def __init__(self, logLevel, args):
self.meta_infos = {}
@@ -145,6 +151,9 @@ class IWLA(object):
self.valid_visitors = None
self.args = args
self.reverse_dns_timeout = self.getConfValue('reverse_dns_timeout',
IWLA.DEFAULT_DNS_TIMEOUT)
self.log_format_extracted = re.sub(r'([^\$?\w])', r'\\\g<1>', conf.log_format)
self.log_format_extracted = re.sub(r'\$(\w+)', '(?P<\g<1>>.+)', self.log_format_extracted)
self.http_request_extracted = re.compile(r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')
@@ -159,6 +168,12 @@ class IWLA(object):
self.excluded_domain_name = []
for domain_name in conf.excluded_domain_name:
self.excluded_domain_name += [re.compile(domain_name)]
self.ignore_url = []
for url in conf.ignore_url:
self.ignore_url += [re.compile(url)]
self.multimedia_files_re = []
for file_re in conf.multimedia_files_re:
self.multimedia_files_re += [re.compile(file_re)]
self.plugins = [(conf.PRE_HOOK_DIRECTORY , conf.pre_analysis_hooks),
(conf.POST_HOOK_DIRECTORY , conf.post_analysis_hooks),
(conf.DISPLAY_HOOK_DIRECTORY , conf.display_hooks)]
@@ -239,6 +254,26 @@ class IWLA(object):
def getCSSPath(self):
return conf.css_path
def reverseDNS(self, hit):
if hit.get('dns_name_replaced', False):
return hit['remote_addr']
try:
timeout = socket.getdefaulttimeout()
if timeout != self.reverse_dns_timeout:
socket.setdefaulttimeout(self.reverse_dns_timeout)
name, _, _ = socket.gethostbyaddr(hit['remote_ip'])
if timeout != self.reverse_dns_timeout:
socket.setdefaulttimeout(timeout)
hit['remote_addr'] = name.lower()
hit['dns_name_replaced'] = True
except socket.herror:
pass
finally:
hit['dns_analysed'] = True
return hit['remote_addr']
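
A hypothetical call illustrating the flags this method maintains (the hit dict fields follow the names used above; the IWLA instance is assumed):

# Minimal sketch: hit dicts carry remote_ip/remote_addr plus DNS flags
hit = {'remote_ip': '8.8.8.8', 'remote_addr': '8.8.8.8'}
addr = iwla.reverseDNS(hit)  # e.g. 'dns.google' if the lookup succeeds
# hit['dns_analysed'] is now True; 'dns_name_replaced' is set only on success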
def _clearMeta(self):
self.meta_infos = {
'last_time' : None,
@@ -311,13 +346,18 @@ class IWLA(object):
self.logger.debug("False")
return False
def isMultimediaFile(self, request):
self.logger.debug("Is multimedia %s" % (request))
def isMultimediaFile(self, uri):
self.logger.debug("Is multimedia %s" % (uri))
for e in conf.multimedia_files:
if request.lower().endswith(e):
if uri.lower().endswith(e):
self.logger.debug("True")
return True
self.logger.debug("False")
for file_re in self.multimedia_files_re:
if file_re.match(uri):
self.logger.debug("Is multimedia re True")
return True
return False
def isValidVisitor(self, hit):
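
With the hypothetical multimedia_files_re override sketched earlier, the two checks combine as follows:

# Assumed: multimedia_files_re = [r'.*\.(webp|avif)$']
iwla.isMultimediaFile('/img/logo.png')    # True, by extension
iwla.isMultimediaFile('/img/photo.webp')  # True, via multimedia_files_re
iwla.isMultimediaFile('/doc/report.pdf')  # False, counted as a download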
@@ -331,18 +371,24 @@ class IWLA(object):
return hit['robot'] == True
def _appendHit(self, hit):
remote_ip = hit['remote_ip']
# Redirected page/hit
if int(hit['status']) in (301, 302, 307, 308):
return
remote_ip = hit['remote_ip']
if not remote_ip: return
for ip in self.excluded_ip:
if ip.match(remote_ip):
return
# Redirected page/hit
if int(hit['status']) in (301, 302, 307, 308):
return
request = hit['extract_request']
uri = request.get('extract_uri', request['http_uri'])
for url in self.ignore_url:
if url.match(uri):
return
if not remote_ip in self.current_analysis['visits'].keys():
self._createVisitor(hit)
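
Because the patterns are applied with match(), they are anchored at the start of the extracted URI; a hedged illustration:

import re

pattern = re.compile(r'/piwik\.php')  # hypothetical ignore_url entry
print(bool(pattern.match('/piwik.php?rec=1')))  # True: hit is dropped
print(bool(pattern.match('/blog/piwik.php')))   # False: match() anchors at the start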
@@ -357,9 +403,6 @@ class IWLA(object):
super_hit['bandwidth'][0] += int(hit['body_bytes_sent'])
super_hit['last_access'] = self.meta_infos['last_time']
request = hit['extract_request']
uri = request.get('extract_uri', request['http_uri'])
hit['is_page'] = self.isPage(uri)
if super_hit['robot'] or\
@@ -810,12 +853,15 @@ class IWLA(object):
for l in _file:
# print "line " + l
groups = self.log_re.match(l)
sanitized = l.replace('<', '')
sanitized = sanitized.replace('>', '')
groups = self.log_re.match(sanitized)
if groups:
self._newHit(groups.groupdict(""))
else:
self.logger.warning("No match for %s" % (l))
self.logger.warning("No match for %s" % (sanitized))
#break
if self.analyse_started:
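
The sanitization strips angle brackets before the log line is matched, presumably so that injected markup cannot reach the generated HTML reports; a one-line sketch:

line = '1.2.3.4 - - "GET /<script>x</script> HTTP/1.1" 200 42'
sanitized = line.replace('<', '').replace('>', '')  # "GET /scriptx/script HTTP/1.1"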

Binary file not shown.

View File

@@ -6,7 +6,7 @@ msgid ""
msgstr ""
"Project-Id-Version: iwla\n"
"POT-Creation-Date: 2024-03-16 08:52+0100\n"
"PO-Revision-Date: 2024-03-16 08:53+0100\n"
"PO-Revision-Date: 2025-02-03 09:57+0100\n"
"Last-Translator: Soutadé <soutade@gmail.com>\n"
"Language-Team: iwla\n"
"Language: fr\n"
@@ -15,7 +15,7 @@ msgstr ""
"Content-Transfer-Encoding: 8bit\n"
"Plural-Forms: nplurals=2; plural=(n > 1);\n"
"Generated-By: pygettext.py 1.5\n"
"X-Generator: Poedit 3.4.2\n"
"X-Generator: Poedit 3.5\n"
"X-Poedit-SourceCharset: UTF-8\n"
#: display.py:32
@@ -424,7 +424,6 @@ msgid "All key phrases"
msgstr "Toutes les phrases clé"
#: plugins/display/robot_bandwidth.py:90
#, fuzzy
msgid "Name"
msgstr "Nom"

View File

@@ -22,8 +22,6 @@ from iwla import IWLA
from iplugin import IPlugin
from display import *
import awstats_data
"""
Display hook

View File

@@ -59,7 +59,7 @@ class IWLADisplayFeeds(IPlugin):
return True
def hook(self):
from plugins.post_analysis.feeds import IWLAPostAnalysisFeeds
from plugins.pre_analysis.feeds import IWLAPostAnalysisFeeds
display = self.iwla.getDisplay()
hits = self.iwla.getCurrentVisits()
@@ -72,11 +72,13 @@ class IWLADisplayFeeds(IPlugin):
path = self.iwla.getCurDisplayPath(filename)
page = display.createPage(title, path, self.iwla.getConfValue('css_path', []))
table = display.createBlock(DisplayHTMLBlockTable, self.iwla._(u'All feeds parsers'), [self.iwla._(u'Host'), self.iwla._(u'Pages'), self.iwla._(u'Hits'), self.iwla._(u'Last Access')])
table.setColsCSSClass(['', 'iwla_page', 'iwla_hit', ''])
table = display.createBlock(DisplayHTMLBlockTable, self.iwla._(u'All feeds parsers'), [self.iwla._(u'Host'), self.iwla._(u'Pages'), self.iwla._(u'Hits')
, self.iwla._(u'Domain'), self.iwla._(u'Subscribers'), self.iwla._(u'Last Access')])
table.setColsCSSClass(['', 'iwla_page', 'iwla_hit', '', '', ''])
rows = []
for super_hit in hits.values():
if not super_hit.get('feed_parser', False): continue
if super_hit['feed_parser'] == IWLAPostAnalysisFeeds.BAD_FEED_PARSER:
if super_hit.get('feed_parser', None) not in (IWLAPostAnalysisFeeds.FEED_PARSER,\
IWLAPostAnalysisFeeds.MERGED_FEED_PARSER):
continue
nb_feeds_parsers += 1
address = super_hit['remote_addr']
@@ -84,11 +86,21 @@ class IWLADisplayFeeds(IPlugin):
address += ' *'
pages = super_hit['not_viewed_pages'][0] + super_hit['viewed_pages'][0]
hits = super_hit['not_viewed_hits'][0] + super_hit['viewed_hits'][0]
last_access = super_hit.get('feed_parser_last_access', None)
if not last_access:
last_access = super_hit['last_access']
row = [address, pages, hits, time.asctime(last_access)]
table.appendRow(row, super_hit['remote_ip'])
last_access = super_hit.get('feed_parser_last_access', super_hit['last_access'])
feed_domain = super_hit.get('feed_domain', '')
if feed_domain:
link = '<a href=\'https://%s/%s\'>%s</a>' % (feed_domain, super_hit.get('feed_uri', ''), feed_domain)
else:
link = ''
subscribers = super_hit.get('feed_subscribers', '')
# Don't overload interface
if subscribers <= 1: subscribers = ''
row = [address, pages, hits, link, subscribers, time.asctime(last_access),
super_hit['remote_ip'], last_access]
rows.append(row)
rows = sorted(rows, key=lambda t: t[7], reverse=True)
for row in rows:
table.appendRow(row[:6], row[6])
page.appendBlock(table)
note = DisplayHTMLRaw(self.iwla, ('<small>*%s</small>' % (self.iwla._(u'Merged feeds parsers'))))
page.appendBlock(note)

View File

@@ -19,12 +19,13 @@
#
import re
import time
from iwla import IWLA
from iplugin import IPlugin
"""
Post analysis hook
Pre analysis hook
Find feed parsers (first hit matches a feeds conf value; no viewed pages if it's a robot)
If merge_feeds_parsers is set to True, merge feed parsers that share a user agent,
@@ -47,8 +48,11 @@ Output files :
Statistics creation :
remote_ip =>
feed_parser
feed_name_analysed
feed_name_analyzed
feed_parser_last_access (for merged parser)
feed_domain
feed_uri
feed_subscribers
Statistics update :
None
@@ -73,6 +77,7 @@ class IWLAPostAnalysisFeeds(IPlugin):
feeds_agents = self.iwla.getConfValue('feeds_agents', [])
self.merge_feeds_parsers = self.iwla.getConfValue('merge_feeds_parsers', False)
_merge_feeds_parsers_list = self.iwla.getConfValue('merge_feeds_parsers_list', [])
_no_merge_feeds_parsers_list = self.iwla.getConfValue('no_merge_feeds_parsers_list', [])
if feeds is None: return False
@@ -91,15 +96,25 @@ class IWLAPostAnalysisFeeds(IPlugin):
for f in feeds_agents:
self.user_agents_re.append(re.compile(f))
self.bad_user_agents_re = []
self.bad_user_agents_re.append(re.compile(r'.*feedback.*'))
self.subscribers_re = re.compile(r'.* ([0-9]+) subscriber.*')
self.merge_feeds_parsers_list = []
for f in _merge_feeds_parsers_list:
self.merge_feeds_parsers_list.append(re.compile(f))
self.no_merge_feeds_parsers_list = []
for f in _no_merge_feeds_parsers_list:
self.no_merge_feeds_parsers_list.append(re.compile(f))
self.merged_feeds = {}
return True
def _appendToMergeCache(self, isFeedParser, key, hit):
hit['feed_parser'] = isFeedParser
# First time, register into dict
if self.merged_feeds.get(key, None) is None:
# Merged
@@ -108,21 +123,30 @@ class IWLAPostAnalysisFeeds(IPlugin):
# Next time
# Current must be ignored
hit['feed_parser'] = self.NOT_A_FEED_PARSER
merged_hit = hit
last_access = hit['last_access']
# Previous matched hit must be set as merged
isFeedParser = self.MERGED_FEED_PARSER
hit = self.merged_feeds[key]
if hit['last_access'] < last_access:
hit['feed_parser_last_access'] = last_access
hit['feed_parser'] = self.MERGED_FEED_PARSER
hit['viewed_pages'][0] += merged_hit['viewed_pages'][0]
hit['viewed_hits'][0] += merged_hit['viewed_hits'][0]
hit['not_viewed_pages'][0] += merged_hit['not_viewed_pages'][0]
hit['not_viewed_hits'][0] += merged_hit['not_viewed_hits'][0]
if hit['last_access'] < merged_hit['last_access']:
hit['feed_parser_last_access'] = merged_hit['last_access']
else:
hit['feed_parser_last_access'] = hit['last_access']
hit['feed_parser'] = isFeedParser
def mergeFeedsParsers(self, isFeedParser, hit):
if isFeedParser:
if isFeedParser in (self.FEED_PARSER, self.MERGED_FEED_PARSER):
for r in self.no_merge_feeds_parsers_list:
if r.match(hit['remote_addr']) or r.match(hit['remote_ip']) or r.match(hit['requests'][0]['http_user_agent']):
return
for r in self.merge_feeds_parsers_list:
if r.match(hit['remote_addr']) or r.match(hit['remote_ip']):
self._appendToMergeCache(isFeedParser, r, hit)
if r.match(hit['remote_addr']) or r.match(hit['remote_ip']) or r.match(hit['requests'][0]['http_user_agent']):
# One group can view multiple different feeds
key = r.pattern + hit.get('feed_domain', '') + hit.get('feed_uri', '')
self._appendToMergeCache(isFeedParser, key, hit)
return
#print("No match for %s : %d" % (hit['remote_addr'], hit['viewed_hits'][0] + hit['not_viewed_hits'][0]))
# Other cases, look for user agent
@@ -134,44 +158,68 @@ class IWLAPostAnalysisFeeds(IPlugin):
for hit in hits.values():
isFeedParser = hit.get('feed_parser', None)
# Register already tagged feed parser in merged_feeds
if self.merge_feeds_parsers and\
not isFeedParser in (None, self.BAD_FEED_PARSER):
self.mergeFeedsParsers(isFeedParser, hit)
if isFeedParser == self.NOT_A_FEED_PARSER:
continue
# Second time
if isFeedParser:
if hit['feed_parser'] == self.BAD_FEED_PARSER: continue
if not hit.get('feed_name_analysed', False) and\
hit.get('dns_name_replaced', False):
hit['feed_name_analysed'] = True
addr = hit.get('remote_addr', None)
for r in self.bad_feeds_re:
if r.match(addr):
hit['feed_parser'] = self.BAD_FEED_PARSER
break
# Update last access time
if hit['last_access'] > hit.get('feed_parser_last_access', time.gmtime(0)):
hit['feed_parser_last_access'] = hit['last_access']
# Register already tagged feed parser in merged_feeds
if self.merge_feeds_parsers:
self.mergeFeedsParsers(isFeedParser, hit)
continue
request = hit['requests'][0]
isFeedParser = self.NOT_A_FEED_PARSER
uri = request['extract_request']['extract_uri'].lower()
for regexp in self.feeds_re:
if regexp.match(uri):
if regexp.match(uri) and self.iwla.hasBeenViewed(request):
isFeedParser = self.FEED_PARSER
# Robot that views pages -> bot
if hit['robot']:
if hit['not_viewed_pages'][0]:
isFeedParser = self.NOT_A_FEED_PARSER
# # Robot that views pages -> bot
# if hit['robot']:
# if hit['not_viewed_pages'][0]:
# isFeedParser = self.NOT_A_FEED_PARSER
break
user_agent = request['http_user_agent'].lower()
if isFeedParser == self.NOT_A_FEED_PARSER:
user_agent = request['http_user_agent'].lower()
for regexp in self.user_agents_re:
if regexp.match(user_agent):
isFeedParser = self.FEED_PARSER
break
if isFeedParser == self.FEED_PARSER:
for regexp in self.bad_user_agents_re:
if regexp.match(user_agent):
isFeedParser = self.NOT_A_FEED_PARSER
break
if isFeedParser == self.FEED_PARSER:
if not hit.get('dns_name_replaced', False):
self.iwla.reverseDNS(hit)
if not hit.get('feed_name_analyzed', False):
hit['feed_name_analyzed'] = True
addr = hit.get('remote_addr', None)
for r in self.bad_feeds_re:
if r.match(addr):
isFeedParser = self.NOT_A_FEED_PARSER
break
if isFeedParser == self.FEED_PARSER:
hit['feed_domain'] = request['server_name']
hit['feed_uri'] = uri
hit['feed_subscribers'] = 0
subscribers = self.subscribers_re.match(user_agent)
if subscribers:
hit['feed_subscribers'] = int(subscribers.groups()[0])
hit['robot'] = True
hit['feed_parser'] = isFeedParser
if self.merge_feeds_parsers:
self.mergeFeedsParsers(isFeedParser, hit)
else:
hit['feed_parser'] = isFeedParser
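
The subscribers_re pattern above extracts the subscriber count that some feed readers advertise in their user agent; a quick check against a hypothetical UA string:

import re

subscribers_re = re.compile(r'.* ([0-9]+) subscriber.*')
ua = 'tiny-reader/1.0 (+https://reader.example; 12 subscribers)'.lower()
m = subscribers_re.match(ua)
print(m and int(m.groups()[0]))  # 12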

View File

@@ -19,12 +19,13 @@
#
import socket
import re
from iwla import IWLA
from iplugin import IPlugin
"""
Post analysis hook
Pre analysis hook
Replace IP addresses by reverse DNS names
@@ -32,7 +33,7 @@ Plugin requirements :
None
Conf values needed :
reverse_dns_timeout*
robot_domains*
Output files :
None
@@ -51,12 +52,13 @@ Statistics deletion :
"""
class IWLAPostAnalysisReverseDNS(IPlugin):
DEFAULT_DNS_TIMEOUT = 0.5
def load(self):
timeout = self.iwla.getConfValue('reverse_dns_timeout',
IWLAPostAnalysisReverseDNS.DEFAULT_DNS_TIMEOUT)
socket.setdefaulttimeout(timeout)
self.robot_domains_re = []
robot_domains = self.iwla.getConfValue('robot_domains', [])
for domain in robot_domains:
self.robot_domains_re.append(re.compile(domain))
return True
def hook(self):
@@ -65,15 +67,13 @@ class IWLAPostAnalysisReverseDNS(IPlugin):
if hit.get('dns_analysed', False): continue
# Do reverse DNS for feed parsers even if they're
# not valid visitors
if not hit.get('feed_parser', False) and\
not self.iwla.isValidVisitor(hit):
if hit.get('robot', False) and not hit.get('feed_parser', False):
continue
try:
name, _, _ = socket.gethostbyaddr(k)
hit['remote_addr'] = name.lower()
hit['dns_name_replaced'] = True
except:
pass
finally:
hit['dns_analysed'] = True
res = self.iwla.reverseDNS(hit)
for r in self.robot_domains_re:
if r.match(hit['remote_addr']):
hit['robot'] = True
break

View File

@@ -61,7 +61,11 @@ class IWLAPreAnalysisRobots(IPlugin):
self.awstats_robots = list(map(lambda x : re.compile(('.*%s.*') % (x), re.IGNORECASE), awstats_data.robots))
self.robot_re = re.compile(r'.*bot.*', re.IGNORECASE)
self.crawl_re = re.compile(r'.*crawl.*', re.IGNORECASE)
self.compatible_re = re.compile(r'.*\(.*compatible; (.*); \+.*\)*')
self.compatible_re = []
self.compatible_re.append(re.compile(r'.*\(.*compatible; ([^;]+);.*\).*'))
self.compatible_re.append(re.compile(r'.*\(.*compatible; (.*)\).*'))
self.compatible_re.append(re.compile(r'.*\(([^;]+); \+.*\).*'))
self.compatible_re.append(re.compile(r'(.*); \(\+.*\)*'))
self.logger = logging.getLogger(self.__class__.__name__)
self.one_hit_only = self.iwla.getConfValue('count_hit_only_visitors', False)
self.no_referrer_domains = self.iwla.getConfValue('no_referrer_domains', [])
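
A quick check of the first new pattern against a typical "compatible" user agent (hypothetical bot name):

import re

compatible_re = re.compile(r'.*\(.*compatible; ([^;]+);.*\).*')
ua = 'Mozilla/5.0 (compatible; ExampleBot/2.1; +http://example.com/bot)'
m = compatible_re.match(ua)
print(m and m.group(1))  # ExampleBot/2.1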
@@ -76,12 +80,14 @@ class IWLAPreAnalysisRobots(IPlugin):
self.logger.debug('%s is a robot (caller %s:%d)' % (k, info.function, info.lineno))
super_hit['robot'] = True
super_hit['keep_requests'] = False
for hit in super_hit['requests']:
robot_name = self.compatible_re.match(hit['http_user_agent'])
agent = super_hit['requests'][0]['http_user_agent']
for compatible_re in self.compatible_re:
robot_name = compatible_re.match(agent)
if robot_name:
super_hit['robot_name'] = robot_name[1]
break
# Basic rule to detect robots
def hook(self):
hits = self.iwla.getCurrentVisits()
@@ -101,10 +107,9 @@ class IWLAPreAnalysisRobots(IPlugin):
referers = 0
first_page = super_hit['requests'][0]
if self.robot_re.match(first_page['http_user_agent']) or\
self.crawl_re.match(first_page['http_user_agent']) or\
self.compatible_re.match(first_page['http_user_agent']):
self.crawl_re.match(first_page['http_user_agent']):
self.logger.debug(first_page['http_user_agent'])
self._setRobot(k, super_hit)
continue
@@ -127,7 +132,10 @@ class IWLAPreAnalysisRobots(IPlugin):
# 2) Less than 1 hit per page
if super_hit['viewed_pages'][0] and (super_hit['viewed_hits'][0] < super_hit['viewed_pages'][0]):
isRobot = True
# 2.5) 1 page, 1 hit
elif super_hit['viewed_pages'][0] == 1 and super_hit['viewed_hits'][0] == 1:
isRobot = True
if isRobot:
self._setRobot(k, super_hit)
continue
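
Combined with rule 2, the detection heuristic now reads roughly as follows (a sketch of the logic above, not the verbatim code):

# Rule 2 and the new rule 2.5 together
pages, hits = super_hit['viewed_pages'][0], super_hit['viewed_hits'][0]
is_robot = (pages and hits < pages) or (pages == 1 and hits == 1)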