Compare commits

...

50 Commits
v0.6 ... master

Author SHA1 Message Date
c9500e2e99 Update Changelog 2024-03-16 09:08:24 +01:00
ca3c0eefdf Update documentation 2024-03-16 09:02:06 +01:00
1e09852d18 Update locales 2024-03-16 08:53:44 +01:00
Gregory Soutade
db9009bb28 Update AWStats data (v7.9) 2024-03-05 16:41:31 +01:00
Gregory Soutade
e2210f3eab Update geo ip misc plugin 2024-02-15 10:55:59 +01:00
Gregory Soutade
9db72f41fd Don't analyze referer for non viewed hits/pages 2024-02-15 10:55:38 +01:00
Gregory Soutade
0464a3d8e7 Generate HTML part in dry run mode (but don't write it to disk) 2024-02-15 10:55:04 +01:00
Gregory Soutade
b9566beb80 Set lang value in generated HTML page 2024-02-15 10:54:52 +01:00
Gregory Soutade
d78739157b Remove all trailing slashs of URL before starting analyze 2024-02-03 09:02:55 +01:00
Gregory Soutade
d6d216db4d Improve page detection : check if . is present in last part 2024-01-30 11:27:03 +01:00
Gregory Soutade
974d355dd4 Add no_referrer_domains list to defaut_conf for website that defines this policy 2024-01-30 11:24:52 +01:00
Gregory Soutade
f1ffbe40d8 --display-only switch now takes an argument (month/year), analyze is not yet necessary 2023-08-06 13:25:42 +02:00
Gregory Soutade
83275a8db4 Rework filtered_users output to have full location in a column 2023-08-06 13:25:42 +02:00
Gregory Soutade
07eb919837 Add excluded_domain_name to default_conf 2023-07-14 09:24:47 +02:00
Gregory Soutade
16cd817fec Increase not modified page threshold for robot detection 2023-07-05 09:15:48 +02:00
Gregory Soutade
d32b2440ee Bugfix: flags management for feeds display 2023-06-14 09:21:51 +02:00
Gregory Soutade
9d3ff8b3b7 Add excluded domain option 2023-06-14 09:21:11 +02:00
Gregory Soutade
9c688e1545 Display visitor IP is now a filter 2023-05-21 11:06:16 +02:00
Gregory Soutade
7ef0911fa7 Main key for visits is now remote_ip and not remote_addr 2023-05-21 11:04:40 +02:00
Gregory Soutade
7507b8e77f WIP 2023-04-28 16:17:47 +02:00
b1b92412e0 Update documentation 2023-04-18 20:37:33 +02:00
b1e6f973a6 Update locales 2023-04-18 20:37:24 +02:00
Gregory Soutade
de79f526dd Add IP type plugin 2023-04-18 20:34:45 +02:00
Gregory Soutade
4b58048198 Update browsers with msie and Opera 2023-04-18 20:33:09 +02:00
Gregory Soutade
71d8ee2113 Forgot Firefox icon 2023-03-25 08:11:57 +01:00
Gregory Soutade
440f51ddd1 Remove robot rule 1 page for phones 2023-03-23 21:17:52 +01:00
Gregory Soutade
cad3467c25 Remove detection from awstats dataset for browser 2023-03-23 21:16:54 +01:00
Gregory Soutade
44c76007cd Remove .*bot.* and .*crawl.* from awstats_data 2023-03-11 20:56:18 +01:00
Gregory Soutade
adc04bf753 Update iwla :
* Rework arg variable management
  * Manage dry run at top level
  * 'robot' property is now None by default (allow to do analysis only once)
  * Add --disable-display option
2023-03-11 20:51:44 +01:00
Gregory Soutade
6500d98bdd Do not manage dry run inside display part, but directly in iwla 2023-03-11 20:49:28 +01:00
Gregory Soutade
a0a1f42df4 Update robot detection plugin :
* Do analyze only one time by month
  * Reactivate rule : no page view if count_hit_only_visitors is False
  * Add exception for "Less than 1 hit per page" rule if a phone is used
  * Check for all error codes in 400..499, not only 403 and 404
  * Referer '-' now counted as null
2023-03-11 20:48:17 +01:00
Gregory Soutade
31bc67ceba Replace feed referers by feed user agent 2023-03-11 20:42:56 +01:00
Gregory Soutade
3fdbc282c8 Remove feed parser detection by referer 2023-03-11 20:42:37 +01:00
Gregory Soutade
5f96c44edf Set count_hit_only_visitors to False by default 2023-03-11 20:40:31 +01:00
Gregory Soutade
58d31d842a Merge branch 'master' of soutade.fr:iwla 2023-02-18 08:51:15 +01:00
f871f4975c Update translation 2023-02-18 08:51:05 +01:00
Gregory Soutade
16b0619f19 Fix error : total of not viewed bandwidth not displayed 2023-02-18 08:49:27 +01:00
Gregory Soutade
c8dfdd17f7 Add "compatible" as a criteria for robot 2023-02-18 08:49:14 +01:00
Gregory Soutade
a5bef4ece6 Search for "compatible" in all requests, not only the first one 2023-02-18 08:48:57 +01:00
Gregory Soutade
b29765dda9 Update data with AWStats 7.9 2023-02-04 08:42:26 +01:00
Gregory Soutade
cb18cf928e New way to display global statistics : with links in months names instead of "Details" button
Fix Months name not translated in "By Day" corner
2023-02-04 08:40:36 +01:00
Gregory Soutade
21a21cd68f Add a new rule for robots : 1 page and 1 hit, but not from the same source 2023-02-04 08:40:04 +01:00
72db40d593 Update translations 2023-01-28 09:48:25 +01:00
Gregory Soutade
c6ce5cfc6f Increment IWLA version 2023-01-28 09:45:13 +01:00
Gregory Soutade
185664850d Add subdomains plugin 2023-01-28 09:44:43 +01:00
Gregory Soutade
fef9c783f6 Skip redirected pages/hit at analysis level 2023-01-28 09:42:12 +01:00
Gregory Soutade
6a4fd4e9c8 New rule for robot : more than 10 not modified pages in a row 2023-01-28 09:40:26 +01:00
Gregory Soutade
ac246eabe2 Find robot name in 'compatible' string and group them 2023-01-28 09:38:59 +01:00
Gregory Soutade
9c57ad3ece Feeds : display last access date for merged feed parsers 2023-01-28 09:36:48 +01:00
Gregory Soutade
3a8c667fdc Feeds display: Add "*" after a space in order to have flags 2023-01-28 09:35:48 +01:00
135 changed files with 1348 additions and 459 deletions

View File

@ -1,6 +1,33 @@
v0.7 (17/03/2024)
** User **
Awstats data updated (7.9)
Improve page/hit detection
--display-only switch now takes an argument (month/year), analysis is no longer necessary
Add --disable-display option
Geo IP plugin updated (use of [ip-api.com](https://ip-api.com/))
Add _subdomains_ plugin
New way to display global statistics : with links in months names instead of "Details" button
Add excluded domain option
** Dev **
Remove detection from awstats dataset for browser
Don't analyze referer for non viewed hits/pages
Remove all trailing slashes from URLs before starting analysis
Main key for visits is now "remote\_ip" and not "remote\_addr"
Add IP type plugin to support IPv4 and IPv6
Update robot detection
Display visitor IP is now a filter
Generate HTML part in dry run mode (but don't write it to disk)
Set lang value in generated HTML page
Add no\_referrer\_domains list to default\_conf for websites that define this policy
Set count\_hit\_only\_visitors to False by default
** Bugs **
Flags management for feeds display
v0.6 (20/11/2022)
** User **
Replace track_users by filter_users plugins which can itnerpret conditional filters from configuration
Replace track_users by filter_users plugins which can interpret conditional filters from configuration
Don't save all visitors requests into database (save space and computing). Can be changed in default_conf.py with keep_requests value
Replace -c argument by config file. Now clean output is -C
Add favicon

File diff suppressed because one or more lines are too long

90
conf.py
View File

@ -1,6 +1,8 @@
#DB_ROOT = './output_db'
#DISPLAY_ROOT = './output_dev'
# Web server log
analyzed_filename = '/var/log/apache2/access.log.1,/var/log/apache2/access.log'
analyzed_filename = '/var/log/apache2/soutade.fr_access.log.1,/var/log/apache2/soutade.fr_access.log'
# Domain name to analyze
domain_name = 'soutade.fr'
@ -10,49 +12,99 @@ display_visitor_ip = True
# Hooks used
pre_analysis_hooks = ['page_to_hit', 'robots']
post_analysis_hooks = ['referers', 'top_pages', 'top_downloads', 'operating_systems', 'browsers', 'feeds', 'hours_stats', 'reverse_dns', 'ip_to_geo']
display_hooks = ['filter_users', 'top_visitors', 'all_visits', 'referers', 'top_pages', 'top_downloads', 'referers_diff', 'ip_to_geo', 'operating_systems', 'browsers', 'feeds', 'hours_stats', 'top_downloads_diff', 'robot_bandwidth', 'top_pages_diff']
post_analysis_hooks = ['reverse_dns', 'referers', 'top_pages', 'subdomains', 'top_downloads', 'operating_systems', 'browsers', 'hours_stats', 'feeds', 'ip_to_geo', 'filter_users']
display_hooks = ['filter_users', 'top_visitors', 'all_visits', 'referers', 'top_pages', 'subdomains', 'top_downloads', 'referers_diff', 'ip_to_geo', 'operating_systems', 'browsers', 'feeds', 'hours_stats', 'top_downloads_diff', 'robot_bandwidth', 'top_pages_diff', 'all_visits_enlight']
# Reverse DNS timeout
reverse_dns_timeout = 0.2
# Count this addresses as hit
page_to_hit_conf = [r'^.+/logo[/]?$']
page_to_hit_conf = [r'.+/logo[/]?', r'.+/.+\.py']
# Count this addresses as page
hit_to_page_conf = [r'^.+/category/.+$', r'^.+/tag/.+$', r'^.+/archive/.+$', r'^.+/ljdc[/]?$', r'^.+/source/tree/.*$', r'^.+/source/file/.*$', r'^.+/search/.+$']
hit_to_page_conf = [
# Blog
r'.+/category/.+', r'.+/tag/.+', r'.+/archive/.+', r'.+/ljdc[/]?', r'.*/search/.+',
# Indefero
r'.+/source/tree/.*', r'.+/source/file/.*', r'.*/index$',
# Denote
r'.*/edit$', r'.*/add$', r'.+/[0-9]+$', r'.*/preferences$', r'.*/search$', r'.*/public_notes$', r'.*/template.*', r'.*/templates$',
# Music
r'.*/music/.*',
]
# Because building HTML takes too long when there are too many entries
max_hits_displayed = 100
max_downloads_displayed = 100
# Compressed files
compress_output_files = ['html', 'css', 'js']
# Locale in French
#locale = 'fr'
# Tracked IP
tracked_ip = ['192.168.1.1']
locale = 'fr'
# Filtered IP
filtered_ip = [
# r'192.168.*', # Local
]
filtered_ip = ['82.232.68.211', '78.153.243.190', '176.152.215.133',
'83.199.87.88', # Lanion
'193.136.115.1' # Lisbon
]
import re
# google_re = re.compile('.*google.*')
# duck_re = re.compile('.*duckduckgo.*')
# Matches any referer that mentions the site's own domain.
soutade_re = re.compile('.*soutade.fr.*')


def my_filter(iwla, visitor):
    """Custom filter_users callback: flag visitors and mark "enlight" ones.

    A visitor is reported as filtered when it was already flagged, or when
    it is located in France and its first request targets one of the site
    landing pages with an empty referer or a referer outside the site.

    As a side effect, ``visitor['enlight']`` is set (only when it is not
    set yet and the visitor is not a feed parser) to True if one of the
    requests hits the blog home page, unless the very first request was
    made on the indefero sub domain.

    :param iwla: analyzer instance handed over by the caller (unused here)
    :param visitor: visitor dictionary; must contain a non empty
                    'requests' list unless it is already flagged
    :return: True if the visitor must be filtered, False otherwise
    """
    # Manage filtered users: an already flagged visitor stays flagged
    if visitor.get('filtered', False):
        return True

    filtered = False
    req = visitor['requests'][0]
    if visitor.get('country_code', '') == 'fr' and \
       req['server_name'] in ('blog.soutade.fr', 'www.soutade.fr', 'soutade.fr') and \
       req['extract_request']['extract_uri'] in ('/', '/index.html', '/about.html'):
        referer = req['extract_referer']['extract_uri']
        # No referer at all, or a referer that is not the site itself
        if referer in ('', '-') or not soutade_re.match(referer):
            filtered = True

    # Manage enlight users
    if visitor.get('enlight', None) is None and not visitor.get('feed_parser', False):
        enlight = False
        for i, req in enumerate(visitor['requests']):
            # Server names are compared with == : the previous
            # "in ('name')" form was a substring test on a plain string
            # (a one element tuple needs a trailing comma), so
            # 'soutade.fr' was wrongly accepted as 'blog.soutade.fr'.
            if i == 0 and req['server_name'] == 'indefero.soutade.fr':
                break
            if req['server_name'] == 'blog.soutade.fr' and \
               req['extract_request']['extract_uri'] in ('/', '/index.html'):
                enlight = True
                break
        visitor['enlight'] = enlight

    return filtered
filtered_users = [
# [['country_code', '=', 'cn'], ['viewed_pages', '>=', '100']],
#[['country_code', '=', 'fr'], ['viewed_pages', '>=', '5'], ['viewed_hits', '>=', '5']],
[my_filter],
# [['country_code', '=', 'fr'], my_filter],
]
# Excluded IP
excluded_ip = [
r'192.168.*', # Local
r'117.78.58.*', # China ecs-117-78-58-25.compute.hwclouds-dns.com
#'79.141.15.51', # Elsys
#'165.225.20.107', # ST
#'165.225.76.184', # ST #2
'147.161.180.110', # Schneider
'147.161.182.108', # Schneider 2
'147.161.182.86', # Schneider 3
]
# Feeds url
feeds = [r'/atom.xml', r'/rss.xml']
# Feeds referers url
feeds_referers = ['https://feedly.com']
# Feeds agent url
# feeds_agents = [r'.*feedly.com.*']
merge_feeds_parsers = True
merge_feeds_parsers_list = [r'ec2-.*.compute-1.amazonaws.com']
# Consider xml files as multimedia (append to current list)
multimedia_files_append = ['xml']
@ -62,3 +114,5 @@ count_hit_only_visitors = False
# Not all robots bandwidth (too big)
create_all_robot_bandwidth_page = False
#keep_requests = True

View File

@ -39,7 +39,7 @@ pages_extensions = ['/', 'htm', 'html', 'xhtml', 'py', 'pl', 'rb', 'php']
viewed_http_codes = [200, 304]
# If False, don't count visitors that don't GET a page but only resources (images, rss...)
count_hit_only_visitors = True
count_hit_only_visitors = False
# Multimedia extensions (not accounted as downloaded files)
multimedia_files = ['png', 'jpg', 'jpeg', 'gif', 'ico', 'svg',
@ -59,7 +59,13 @@ compress_output_files = ['html', 'css', 'js']
locales_path = './locales'
# Default locale (english)
locale = 'en_EN'
locale = 'en'
# Don't keep requests of all visitors into database
keep_requests = False
# Domain names that should be ignored
excluded_domain_name = []
# Domains that set no-referer as Referer-Policy
no_referrer_domains = []

View File

@ -39,6 +39,9 @@ class DisplayHTMLRaw(object):
self.iwla = iwla
self.html = html
def resetHTML(self):
self.html = ''
def setRawHTML(self, html):
self.html = html
@ -106,10 +109,12 @@ class DisplayHTMLBlockTable(DisplayHTMLBlock):
self.rows_cssclasses = []
self.table_css = u'iwla_table'
self.human_readable_cols = human_readable_cols or []
def appendRow(self, row):
self.objects = []
def appendRow(self, row, _object=None):
self.rows.append(listToStr(row))
self.rows_cssclasses.append([u''] * len(row))
self.objects.append(_object)
def insertCol(self, col_number, col_title='', col_css_class=''):
self.cols.insert(col_number, col_title)
@ -139,6 +144,12 @@ class DisplayHTMLBlockTable(DisplayHTMLBlock):
return self.rows[row][col]
def getRowObject(self, row):
    """Return the object stored alongside the table row at index ``row``.

    :param row: zero based row index
    :return: the entry of ``self.objects`` at that index
    :raises ValueError: if ``row`` is negative or not below the number of rows
    """
    if 0 <= row < len(self.rows):
        return self.objects[row]

    raise ValueError('Invalid indices %d' % (row))
def setCellValue(self, row, col, value):
if row < 0 or col < 0 or\
row >= len(self.rows) or col >= len(self.cols):
@ -205,9 +216,9 @@ class DisplayHTMLBlockTable(DisplayHTMLBlock):
target_col = col
break
if target_col is None: return
for row in self.rows:
res = function(row[target_col], **args)
if res:
for idx, row in enumerate(self.rows):
res = function(row[target_col], self.objects[idx], **args)
if res is not None:
row[target_col] = res
def _buildHTML(self):
@ -353,23 +364,21 @@ class DisplayHTMLPage(object):
self.logger.debug('Write %s' % (filename))
if self.iwla.dry_run: return
f = codecs.open(filename, 'w', 'utf-8')
f.write(u'<!DOCTYPE html>')
f.write(u'<html>')
f.write(u'<head>')
f.write(u'<meta http-equiv="Content-type" content="text/html; charset=UTF-8" />')
f.write(u'<link rel="icon" type="image/png" href="/resources/icon/favicon.png"/>')
f.write(u'<!DOCTYPE html>\n')
f.write(u'<html lang="{}">\n'.format(self.iwla.getConfValue('locale', 'en')))
f.write(u'<head>\n')
f.write(u'<meta http-equiv="Content-type" content="text/html; charset=UTF-8"/>\n')
f.write(u'<link rel="icon" type="image/png" href="/resources/icon/favicon.png"/>\n')
for css in self.css_path:
f.write(u'<link rel="stylesheet" href="/%s"/>' % (css))
f.write(u'<link rel="stylesheet" href="/%s"/>\n' % (css))
if self.title:
f.write(u'<title>%s</title>' % (self.title))
f.write(u'</head><body>')
f.write(u'<title>%s</title>\n' % (self.title))
f.write(u'</head><body>\n')
for block in self.blocks:
block.build(f, filters=filters)
if displayVersion:
f.write(u'<div style="text-align:center;width:100%%">Generated by <a href="%s">IWLA %s</a></div>' %
f.write(u'<div style="text-align:center;width:100%%">Generated by <a href="%s">IWLA %s</a></div>\n' %
("http://indefero.soutade.fr/p/iwla", self.iwla.getVersion()))
f.write(u'</body></html>')
f.close()
@ -403,15 +412,14 @@ class DisplayHTMLBuild(object):
self.pages.append(page)
def build(self, root):
if not self.iwla.dry_run:
display_root = self.iwla.getConfValue('DISPLAY_ROOT', '')
if not os.path.exists(display_root):
os.makedirs(display_root)
for res_path in self.iwla.getResourcesPath():
target = os.path.abspath(res_path)
link_name = os.path.join(display_root, res_path)
if not os.path.exists(link_name):
os.symlink(target, link_name)
display_root = self.iwla.getConfValue('DISPLAY_ROOT', '')
if not os.path.exists(display_root):
os.makedirs(display_root)
for res_path in self.iwla.getResourcesPath():
target = os.path.abspath(res_path)
link_name = os.path.join(display_root, res_path)
if not os.path.exists(link_name):
os.symlink(target, link_name)
for page in self.pages:
page.build(root, filters=self.filters)
@ -419,6 +427,21 @@ class DisplayHTMLBuild(object):
def addColumnFilter(self, column, function, args):
    """Register ``function`` as a filter for the column named ``column``.

    The filter is appended to ``self.filters`` as a
    ``(descriptor, function)`` pair, where the descriptor dictionary holds
    the column name and the arguments kept for the function.

    :param column: name of the column the filter applies to
    :param function: callable stored with the descriptor
    :param args: arguments stored under the 'args' key of the descriptor
    """
    descriptor = {'column': column, 'args': args}
    self.filters.append((descriptor, function))
def getDisplayName(self, visitor):
    """Build the name displayed for a visitor.

    By default this is ``visitor['remote_addr']``. When the visitor is
    marked with a true 'dns_name_replaced' value, the IP address is
    appended between brackets and occurrences of the IP inside the host
    name (written with dashes, or with no separator at all) are shortened
    to 'IP'.

    :param visitor: visitor dictionary with 'remote_addr' and 'remote_ip'
    :return: the display string
    """
    # Both behaviours are currently hard-wired to enabled
    display_visitor_ip = True
    compact_host_name = True

    address = visitor['remote_addr']
    # The flag is read from the visitor itself: the previous code looked
    # it up on "super_hit", a name that is not defined in this method, so
    # every call raised NameError.
    if display_visitor_ip and visitor.get('dns_name_replaced', False):
        host_name = address
        if compact_host_name:
            # Host names often embed the IP as 1-2-3-4 ...
            ip = visitor['remote_ip'].replace('.', '-')
            host_name = host_name.replace(ip, 'IP')
            # ... or as 1234
            ip = ip.replace('-', '')
            host_name = host_name.replace(ip, 'IP')
        address = '%s [%s]' % (host_name, visitor['remote_ip'])

    return address
#
# Global functions

View File

@ -6,7 +6,7 @@ Introduction
iwla (Intelligent Web Log Analyzer) is basically a clone of [awstats](http://www.awstats.org). The main problem with awstats is that it's a very monolithic project with everything in one big PERL file. In contrast, iwla has been designed to be very modular : a small core analysis and a lot of filters. It can be viewed as UNIX pipes. The philosophy of iwla is : add, update, delete ! That's the job of each filter : modify statistics until the final result. It's written in Python.
Nevertheless, iwla is only focused on HTTP logs. It uses data (robots definitions, search engines definitions) and design from awstats. Moreover, it's not dynamic, but only generates static HTML page (with gzip compression option).
Nevertheless, iwla is only focused on HTTP logs. It uses data (search engines definitions) and design from awstats. Moreover, it's not dynamic, but only generates static HTML page (with gzip compression option).
Demo
----
@ -16,8 +16,7 @@ A demonstration instance is available [here](https://iwla-demo.soutade.fr)
Usage
-----
./iwla [-c|--config-file file] [-C|--clean-output] [-i|--stdin] [-f FILE|--file FILE] [-d LOGLEVEL|--log-level LOGLEVEL] [-r|--reset year/month] [-z|--dont-compress] [-p] [-D|--dry-run]
./iwla [-c|--config-file file] [-C|--clean-output] [-i|--stdin] [-f FILE|--file FILE] [-d LOGLEVEL|--log-level LOGLEVEL] [-r|--reset year/month] [-z|--dont-compress] [-p] [-P|--disable-display] [-D|--dry-run]
-c : Configuration file to use (default conf.py)
-C : Clean output (database and HTML) before starting
-i : Read data from stdin instead of conf.analyzed_filename
@ -26,6 +25,7 @@ Usage
-r : Reset analysis to a specific date (month/year)
-z : Don't compress databases (bigger but faster, not compatible with compressed databases)
-p : Only generate display
-P : Don't generate display
-D : Dry run (don't write/update files to disk)
Basic usage
@ -48,6 +48,7 @@ You can also append an element to an existing default configuration list by usin
multimedia_files_append = ['xml']
or
multimedia_files_append = 'xml'
Will append 'xml' to current multimedia_files list
Then, you can launch iwla. Output HTML files are created in _output_ directory by default. To quickly see it, go into _output_ and type
@ -87,7 +88,7 @@ To use plugins, just insert their file name (without _.py_ extension) in _pre_an
Statistics are stored in dictionaries :
* **month_stats** : Statistics of current analysed month
* **valid_visitor** : A subset of month_stats without robots
* **valid_visitors** : A subset of month_stats without robots
* **days_stats** : Statistics of current analysed day
* **visits** : All visitors with all of its requests (only if 'keep_requests' is true or filtered)
* **meta** : Final result of month statistics (by year)
@ -103,6 +104,7 @@ The two functions to overload are _load(self)_ that must returns True or False i
For display plugins, a lot of code has been written in _display.py_ that simplifies the creation of HTML blocks, tables and bar graphs.
Plugins
=======
@ -116,26 +118,31 @@ Optional configuration values ends with *.
* plugins/display/filter_users.py
* plugins/display/hours_stats.py
* plugins/display/ip_to_geo.py
* plugins/display/ip_type.py
* plugins/display/istats_diff.py
* plugins/display/operating_systems.py
* plugins/display/referers_diff.py
* plugins/display/referers.py
* plugins/display/robot_bandwidth.py
* plugins/display/subdomains.py
* plugins/display/top_downloads_diff.py
* plugins/display/top_downloads.py
* plugins/display/top_hits.py
* plugins/display/top_pages_diff.py
* plugins/display/top_pages.py
* plugins/display/top_visitors.py
* plugins/display/visitor_ip.py
* plugins/post_analysis/anonymize_ip.py
* plugins/post_analysis/browsers.py
* plugins/post_analysis/feeds.py
* plugins/post_analysis/filter_users.py
* plugins/post_analysis/hours_stats.py
* plugins/post_analysis/ip_to_geo.py
* plugins/post_analysis/ip_type.py
* plugins/post_analysis/operating_systems.py
* plugins/post_analysis/referers.py
* plugins/post_analysis/reverse_dns.py
* plugins/post_analysis/subdomains.py
* plugins/post_analysis/top_downloads.py
* plugins/post_analysis/top_hits.py
* plugins/post_analysis/top_pages.py
@ -159,6 +166,7 @@ iwla
locales_path
compress_output_files
excluded_ip
excluded_domain_name
Output files :
DB_ROOT/meta.db
@ -199,7 +207,7 @@ iwla
nb_visitors
visits :
remote_addr =>
remote_ip =>
remote_addr
remote_ip
viewed_pages{0..31} # 0 contains total
@ -423,6 +431,32 @@ plugins.display.ip_to_geo
None
plugins.display.ip_type
-----------------------
Display hook
Add IPv4/IPv6 statistics
Plugin requirements :
post_analysis/ip_type
Conf values needed :
None
Output files :
OUTPUT_ROOT/year/month/index.html
Statistics creation :
None
Statistics update :
None
Statistics deletion :
None
plugins.display.istats_diff
---------------------------
@ -543,7 +577,6 @@ plugins.display.robot_bandwidth
None
Conf values needed :
display_visitor_ip*
create_all_robot_bandwidth_page*
Output files :
@ -560,6 +593,32 @@ plugins.display.robot_bandwidth
None
plugins.display.subdomains
--------------------------
Display hook
Add subdomains statistics
Plugin requirements :
post_analysis/subdomains
Conf values needed :
None
Output files :
OUTPUT_ROOT/year/month/index.html
Statistics creation :
None
Statistics update :
None
Statistics deletion :
None
plugins.display.top_downloads_diff
----------------------------------
@ -707,7 +766,33 @@ plugins.display.top_visitors
None
Conf values needed :
display_visitor_ip*
None
Output files :
OUTPUT_ROOT/year/month/index.html
Statistics creation :
None
Statistics update :
None
Statistics deletion :
None
plugins.display.visitor_ip
--------------------------
Display hook
Display IP below visitor name
Plugin requirements :
None
Conf values needed :
compact_ip*
Output files :
OUTPUT_ROOT/year/month/index.html
@ -767,7 +852,7 @@ plugins.post_analysis.browsers
Statistics creation :
visits :
remote_addr =>
remote_ip =>
browser
month_stats :
@ -790,21 +875,25 @@ plugins.post_analysis.feeds
If merge_feeds_parsers is set to True, merge feeds parsers with the same user agent
as it must be the same person with a different IP address.
Warning : When merge_feeds_parsers is activated, the displayed last access date is the most
recent date of all merged parsers found
Plugin requirements :
None
Conf values needed :
feeds
feeds_referers*
feeds_agents*
merge_feeds_parsers*
Output files :
None
Statistics creation :
remote_addr =>
remote_ip =>
feed_parser
feed_name_analysed
feed_parser_last_access (for merged parser)
Statistics update :
None
@ -856,13 +945,13 @@ plugins.post_analysis.filter_users
Statistics creation :
visits :
remote_addr =>
remote_ip =>
filtered
geo_location
Statistics update :
visits :
remote_addr =>
remote_ip =>
keep_requests
Statistics deletion :
@ -936,6 +1025,37 @@ plugins.post_analysis.ip_to_geo
None
plugins.post_analysis.ip_type
-----------------------------
Post analysis hook
Detect if IP is IPv4 or IPv6
Plugin requirements :
None
Conf values needed :
None
Output files :
None
Statistics creation :
visits :
remote_ip =>
ip_type
month_stats :
ip_type : {4: XXX, 6: XXX}
Statistics update :
None
Statistics deletion :
None
plugins.post_analysis.operating_systems
---------------------------------------
@ -954,7 +1074,7 @@ plugins.post_analysis.operating_systems
Statistics creation :
visits :
remote_addr =>
remote_ip =>
operating_system
month_stats :
@ -1037,6 +1157,34 @@ plugins.post_analysis.reverse_dns
None
plugins.post_analysis.subdomains
--------------------------------
Post analysis hook
Group top pages by subdomains
Plugin requirements :
post_analysis/top_pages
Conf values needed :
None
Output files :
None
Statistics creation :
month_stats:
subdomains =>
domain => count
Statistics update :
None
Statistics deletion :
None
plugins.post_analysis.top_downloads
-----------------------------------
@ -1160,7 +1308,8 @@ plugins.pre_analysis.robots
None
Conf values needed :
None
count_hit_only_visitors
no_referrer_domains
Output files :
None

View File

@ -6,7 +6,7 @@ Introduction
iwla (Intelligent Web Log Analyzer) is basically a clone of [awstats](http://www.awstats.org). The main problem with awstats is that it's a very monolithic project with everything in one big PERL file. In contrast, iwla has been designed to be very modular : a small core analysis and a lot of filters. It can be viewed as UNIX pipes. The philosophy of iwla is : add, update, delete ! That's the job of each filter : modify statistics until the final result. It's written in Python.
Nevertheless, iwla is only focused on HTTP logs. It uses data (robots definitions, search engines definitions) and design from awstats. Moreover, it's not dynamic, but only generates static HTML page (with gzip compression option).
Nevertheless, iwla is only focused on HTTP logs. It uses data (search engines definitions) and design from awstats. Moreover, it's not dynamic, but only generates static HTML page (with gzip compression option).
Demo
----
@ -16,8 +16,7 @@ A demonstration instance is available [here](https://iwla-demo.soutade.fr)
Usage
-----
./iwla [-c|--config-file file] [-C|--clean-output] [-i|--stdin] [-f FILE|--file FILE] [-d LOGLEVEL|--log-level LOGLEVEL] [-r|--reset year/month] [-z|--dont-compress] [-p] [-D|--dry-run]
./iwla [-c|--config-file file] [-C|--clean-output] [-i|--stdin] [-f FILE|--file FILE] [-d LOGLEVEL|--log-level LOGLEVEL] [-r|--reset year/month] [-z|--dont-compress] [-p] [-P|--disable-display] [-D|--dry-run]
-c : Configuration file to use (default conf.py)
-C : Clean output (database and HTML) before starting
-i : Read data from stdin instead of conf.analyzed_filename
@ -26,6 +25,7 @@ Usage
-r : Reset analysis to a specific date (month/year)
-z : Don't compress databases (bigger but faster, not compatible with compressed databases)
-p : Only generate display
-P : Don't generate display
-D : Dry run (don't write/update files to disk)
Basic usage
@ -48,6 +48,7 @@ You can also append an element to an existing default configuration list by usin
multimedia_files_append = ['xml']
or
multimedia_files_append = 'xml'
Will append 'xml' to current multimedia_files list
Then, you can launch iwla. Output HTML files are created in _output_ directory by default. To quickly see it, go into _output_ and type
@ -87,7 +88,7 @@ To use plugins, just insert their file name (without _.py_ extension) in _pre_an
Statistics are stored in dictionaries :
* **month_stats** : Statistics of current analysed month
* **valid_visitor** : A subset of month_stats without robots
* **valid_visitors** : A subset of month_stats without robots
* **days_stats** : Statistics of current analysed day
* **visits** : All visitors with all of its requests (only if 'keep_requests' is true or filtered)
* **meta** : Final result of month statistics (by year)
@ -103,6 +104,7 @@ The two functions to overload are _load(self)_ that must returns True or False i
For display plugins, a lot of code has been written in _display.py_ that simplifies the creation of HTML blocks, tables and bar graphs.
Plugins
=======

View File

@ -6,26 +6,31 @@
* plugins/display/filter_users.py
* plugins/display/hours_stats.py
* plugins/display/ip_to_geo.py
* plugins/display/ip_type.py
* plugins/display/istats_diff.py
* plugins/display/operating_systems.py
* plugins/display/referers_diff.py
* plugins/display/referers.py
* plugins/display/robot_bandwidth.py
* plugins/display/subdomains.py
* plugins/display/top_downloads_diff.py
* plugins/display/top_downloads.py
* plugins/display/top_hits.py
* plugins/display/top_pages_diff.py
* plugins/display/top_pages.py
* plugins/display/top_visitors.py
* plugins/display/visitor_ip.py
* plugins/post_analysis/anonymize_ip.py
* plugins/post_analysis/browsers.py
* plugins/post_analysis/feeds.py
* plugins/post_analysis/filter_users.py
* plugins/post_analysis/hours_stats.py
* plugins/post_analysis/ip_to_geo.py
* plugins/post_analysis/ip_type.py
* plugins/post_analysis/operating_systems.py
* plugins/post_analysis/referers.py
* plugins/post_analysis/reverse_dns.py
* plugins/post_analysis/subdomains.py
* plugins/post_analysis/top_downloads.py
* plugins/post_analysis/top_hits.py
* plugins/post_analysis/top_pages.py
@ -49,6 +54,7 @@ iwla
locales_path
compress_output_files
excluded_ip
excluded_domain_name
Output files :
DB_ROOT/meta.db
@ -89,7 +95,7 @@ iwla
nb_visitors
visits :
remote_addr =>
remote_ip =>
remote_addr
remote_ip
viewed_pages{0..31} # 0 contains total
@ -313,6 +319,32 @@ plugins.display.ip_to_geo
None
plugins.display.ip_type
-----------------------
Display hook
Add IPv4/IPv6 statistics
Plugin requirements :
post_analysis/ip_type
Conf values needed :
None
Output files :
OUTPUT_ROOT/year/month/index.html
Statistics creation :
None
Statistics update :
None
Statistics deletion :
None
plugins.display.istats_diff
---------------------------
@ -433,7 +465,6 @@ plugins.display.robot_bandwidth
None
Conf values needed :
display_visitor_ip*
create_all_robot_bandwidth_page*
Output files :
@ -450,6 +481,32 @@ plugins.display.robot_bandwidth
None
plugins.display.subdomains
--------------------------
Display hook
Add subdomains statistics
Plugin requirements :
post_analysis/subdomains
Conf values needed :
None
Output files :
OUTPUT_ROOT/year/month/index.html
Statistics creation :
None
Statistics update :
None
Statistics deletion :
None
plugins.display.top_downloads_diff
----------------------------------
@ -597,7 +654,33 @@ plugins.display.top_visitors
None
Conf values needed :
display_visitor_ip*
None
Output files :
OUTPUT_ROOT/year/month/index.html
Statistics creation :
None
Statistics update :
None
Statistics deletion :
None
plugins.display.visitor_ip
--------------------------
Display hook
Display IP below visitor name
Plugin requirements :
None
Conf values needed :
compact_ip*
Output files :
OUTPUT_ROOT/year/month/index.html
@ -657,7 +740,7 @@ plugins.post_analysis.browsers
Statistics creation :
visits :
remote_addr =>
remote_ip =>
browser
month_stats :
@ -680,21 +763,25 @@ plugins.post_analysis.feeds
If merge_feeds_parsers is set to True, merge feeds parsers with the same user agent
as it must be the same person with a different IP address.
Warning : When merge_feeds_parsers is activated, the displayed last access date is the most
recent date of all merged parsers found
Plugin requirements :
None
Conf values needed :
feeds
feeds_referers*
feeds_agents*
merge_feeds_parsers*
Output files :
None
Statistics creation :
remote_addr =>
remote_ip =>
feed_parser
feed_name_analysed
feed_parser_last_access (for merged parser)
Statistics update :
None
@ -746,13 +833,13 @@ plugins.post_analysis.filter_users
Statistics creation :
visits :
remote_addr =>
remote_ip =>
filtered
geo_location
Statistics update :
visits :
remote_addr =>
remote_ip =>
keep_requests
Statistics deletion :
@ -826,6 +913,37 @@ plugins.post_analysis.ip_to_geo
None
plugins.post_analysis.ip_type
-----------------------------
Post analysis hook
Detect if IP is IPv4 or IPv6
Plugin requirements :
None
Conf values needed :
None
Output files :
None
Statistics creation :
visits :
remote_ip =>
ip_type
month_stats :
ip_type : {4: XXX, 6: XXX}
Statistics update :
None
Statistics deletion :
None
plugins.post_analysis.operating_systems
---------------------------------------
@ -844,7 +962,7 @@ plugins.post_analysis.operating_systems
Statistics creation :
visits :
remote_addr =>
remote_ip =>
operating_system
month_stats :
@ -927,6 +1045,34 @@ plugins.post_analysis.reverse_dns
None
plugins.post_analysis.subdomains
--------------------------------
Post analysis hook
Group top pages by subdomains
Plugin requirements :
post_analysis/top_pages
Conf values needed :
None
Output files :
None
Statistics creation :
month_stats:
subdomains =>
domain => count
Statistics update :
None
Statistics deletion :
None
plugins.post_analysis.top_downloads
-----------------------------------
@ -1050,7 +1196,8 @@ plugins.pre_analysis.robots
None
Conf values needed :
None
count_hit_only_visitors
no_referrer_domains
Output files :
None

155
iwla.py
View File

@ -52,6 +52,7 @@ Conf values needed :
locales_path
compress_output_files
excluded_ip
excluded_domain_name
Output files :
DB_ROOT/meta.db
@ -92,7 +93,7 @@ days_stats :
nb_visitors
visits :
remote_addr =>
remote_ip =>
remote_addr
remote_ip
viewed_pages{0..31} # 0 contains total
@ -132,9 +133,9 @@ class IWLA(object):
ANALYSIS_CLASS = 'HTTP'
API_VERSION = 1
IWLA_VERSION = '0.6'
IWLA_VERSION = '0.7'
def __init__(self, logLevel, dry_run):
def __init__(self, logLevel, args):
self.meta_infos = {}
self.analyse_started = False
self.current_analysis = {}
@ -142,7 +143,7 @@ class IWLA(object):
self.cache_plugins = {}
self.display = DisplayHTMLBuild(self)
self.valid_visitors = None
self.dry_run = dry_run
self.args = args
self.log_format_extracted = re.sub(r'([^\$?\w])', r'\\\g<1>', conf.log_format)
self.log_format_extracted = re.sub(r'\$(\w+)', '(?P<\g<1>>.+)', self.log_format_extracted)
@ -155,13 +156,16 @@ class IWLA(object):
self.excluded_ip = []
for ip in conf.excluded_ip:
self.excluded_ip += [re.compile(ip)]
self.excluded_domain_name = []
for domain_name in conf.excluded_domain_name:
self.excluded_domain_name += [re.compile(domain_name)]
self.plugins = [(conf.PRE_HOOK_DIRECTORY , conf.pre_analysis_hooks),
(conf.POST_HOOK_DIRECTORY , conf.post_analysis_hooks),
(conf.DISPLAY_HOOK_DIRECTORY , conf.display_hooks)]
logging.basicConfig(format='%(name)s %(message)s', level=logLevel)
self.logger = logging.getLogger(self.__class__.__name__)
if self.dry_run:
if self.args.dry_run:
self.logger.info('==> Start (DRY RUN)')
else:
self.logger.info('==> Start')
@ -256,7 +260,8 @@ class IWLA(object):
return gzip.open(filename, prot)
def _serialize(self, obj, filename):
if self.dry_run: return
if self.args.dry_run: return
self.logger.info("==> Serialize to %s" % (filename))
base = os.path.dirname(filename)
if not os.path.exists(base):
os.makedirs(base)
@ -299,6 +304,10 @@ class IWLA(object):
if request.endswith(e):
self.logger.debug("True")
return True
# No extension -> page
if not '.' in request.split('/')[-1]:
self.logger.debug("True")
return True
self.logger.debug("False")
return False
@ -318,21 +327,26 @@ class IWLA(object):
return True
def isRobot(self, hit):
return hit['robot']
# By default robot is None
return hit['robot'] == True
def _appendHit(self, hit):
remote_addr = hit['remote_addr']
remote_ip = hit['remote_ip']
if not remote_addr: return
if not remote_ip: return
for ip in self.excluded_ip:
if ip.match(remote_addr):
if ip.match(remote_ip):
return
if not remote_addr in self.current_analysis['visits'].keys():
# Redirected page/hit
if int(hit['status']) in (301, 302, 307, 308):
return
if not remote_ip in self.current_analysis['visits'].keys():
self._createVisitor(hit)
super_hit = self.current_analysis['visits'][remote_addr]
super_hit = self.current_analysis['visits'][remote_ip]
# Don't keep all requests for robots
if not super_hit['robot']:
super_hit['requests'].append(hit)
@ -344,7 +358,6 @@ class IWLA(object):
super_hit['last_access'] = self.meta_infos['last_time']
request = hit['extract_request']
uri = request.get('extract_uri', request['http_uri'])
hit['is_page'] = self.isPage(uri)
@ -375,17 +388,18 @@ class IWLA(object):
super_hit['bandwidth'] = {0:0}
super_hit['last_access'] = self.meta_infos['last_time']
super_hit['requests'] = []
super_hit['robot'] = False
super_hit['robot'] = None
super_hit['hit_only'] = 0
def _normalizeURI(self, uri, removeFileSlash=False):
def _normalizeURI(self, uri, removeFileSlash=True):
if uri == '/': return uri
# Remove protocol
uri = self.protocol_re.sub('', uri)
# Remove double /
uri = self.slash_re.sub('/', uri)
if removeFileSlash and uri[-1] == '/':
uri = uri[:-1]
if removeFileSlash:
while len(uri) > 1 and uri[-1] == '/':
uri = uri[:-1]
return uri
def _normalizeParameters(self, parameters):
@ -416,8 +430,11 @@ class IWLA(object):
referer_groups = self.uri_re.match(hit['http_referer'])
if referer_groups