From 9b32a81ddbfe91a5edb705d9052c5a6f65223d39 Mon Sep 17 00:00:00 2001
From: Gregory Soutade
Date: Mon, 3 Feb 2025 08:04:57 +0100
Subject: [PATCH] Add "ignore_url" parameter to iwla

---
 default_conf.py |  6 ++++++
 iwla.py         | 25 +++++++++++++++++--------
 2 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/default_conf.py b/default_conf.py
index d72bc8b..55b9658 100644
--- a/default_conf.py
+++ b/default_conf.py
@@ -38,6 +38,9 @@ pages_extensions = ['/', 'htm', 'html', 'xhtml', 'py', 'pl', 'rb', 'php']
 # HTTP codes that are considered OK
 viewed_http_codes = [200, 304]
 
+# URL to ignore
+ignore_url = []
+
 # If False, doesn't cout visitors that doesn't GET a page but resources only (images, rss...)
 count_hit_only_visitors = False
 
@@ -73,3 +76,6 @@ no_referrer_domains = []
 
 # Domains used by robots
 robot_domains = []
+
+# Feeds agent identifier
+feeds_agents = [r'.*NextCloud-News']
diff --git a/iwla.py b/iwla.py
index c57f07f..cb8b45b 100755
--- a/iwla.py
+++ b/iwla.py
@@ -51,10 +51,13 @@ Conf values needed :
     analyzed_filename
     domain_name
     locales_path
+    locale
+    keep_requests*
     compress_output_files
     excluded_ip
     excluded_domain_name
     reverse_dns_timeout*
+    ignore_url*
 
 Output files :
     DB_ROOT/meta.db
@@ -165,6 +168,9 @@ class IWLA(object):
         self.excluded_domain_name = []
         for domain_name in conf.excluded_domain_name:
             self.excluded_domain_name += [re.compile(domain_name)]
+        self.ignore_url = []
+        for url in conf.ignore_url:
+            self.ignore_url += [re.compile(url)]
         self.multimedia_files_re = []
         for file_re in conf.multimedia_files_re:
             self.multimedia_files_re += [re.compile(file_re)]
@@ -365,18 +371,24 @@ class IWLA(object):
         return hit['robot'] == True
 
     def _appendHit(self, hit):
-        remote_ip = hit['remote_ip']
-
+        # Redirected page/hit
+        if int(hit['status']) in (301, 302, 307, 308):
+            return
+
+        remote_ip = hit['remote_ip']
         if not remote_ip: return
 
         for ip in self.excluded_ip:
             if ip.match(remote_ip):
                 return
 
-        # Redirected page/hit
-        if int(hit['status']) in (301, 302, 307, 308):
-            return
+        request = hit['extract_request']
+        uri = request.get('extract_uri', request['http_uri'])
 
+        for url in self.ignore_url:
+            if url.match(uri):
+                return
+
         if not remote_ip in self.current_analysis['visits'].keys():
             self._createVisitor(hit)
 
@@ -391,9 +403,6 @@ class IWLA(object):
         super_hit['bandwidth'][0] += int(hit['body_bytes_sent'])
         super_hit['last_access'] = self.meta_infos['last_time']
 
-        request = hit['extract_request']
-        uri = request.get('extract_uri', request['http_uri'])
-
         hit['is_page'] = self.isPage(uri)
 
         if super_hit['robot'] or\