From 9b32a81ddbfe91a5edb705d9052c5a6f65223d39 Mon Sep 17 00:00:00 2001
From: Gregory Soutade
Date: Mon, 3 Feb 2025 08:04:57 +0100
Subject: [PATCH] Add "ignore_url" parameter to iwla

---
 default_conf.py |  6 ++++++
 iwla.py         | 25 +++++++++++++++++--------
 2 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/default_conf.py b/default_conf.py
index d72bc8b..55b9658 100644
--- a/default_conf.py
+++ b/default_conf.py
@@ -38,6 +38,9 @@ pages_extensions = ['/', 'htm', 'html', 'xhtml', 'py', 'pl', 'rb', 'php']
 # HTTP codes that are considered OK
 viewed_http_codes = [200, 304]
 
+# URL to ignore
+ignore_url = []
+
 # If False, doesn't cout visitors that doesn't GET a page but resources only (images, rss...)
 count_hit_only_visitors = False
 
@@ -73,3 +76,6 @@ no_referrer_domains = []
 
 # Domains used by robots
 robot_domains = []
+
+# Feeds agent identifier
+feeds_agents = [r'.*NextCloud-News']
diff --git a/iwla.py b/iwla.py
index c57f07f..cb8b45b 100755
--- a/iwla.py
+++ b/iwla.py
@@ -51,10 +51,13 @@ Conf values needed :
     analyzed_filename
     domain_name
     locales_path
+    locale
+    keep_requests*
     compress_output_files
     excluded_ip
     excluded_domain_name
     reverse_dns_timeout*
+    ignore_url*
 
 Output files :
     DB_ROOT/meta.db
@@ -165,6 +168,9 @@ class IWLA(object):
         self.excluded_domain_name = []
         for domain_name in conf.excluded_domain_name:
             self.excluded_domain_name += [re.compile(domain_name)]
+        self.ignore_url = []
+        for url in conf.ignore_url:
+            self.ignore_url += [re.compile(url)]
         self.multimedia_files_re = []
         for file_re in conf.multimedia_files_re:
             self.multimedia_files_re += [re.compile(file_re)]
@@ -365,18 +371,24 @@ class IWLA(object):
         return hit['robot'] == True
 
     def _appendHit(self, hit):
-        remote_ip = hit['remote_ip']
-
+        # Redirected page/hit
+        if int(hit['status']) in (301, 302, 307, 308):
+            return
+
+        remote_ip = hit['remote_ip']
         if not remote_ip: return
 
         for ip in self.excluded_ip:
             if ip.match(remote_ip):
                 return
 
-        # Redirected page/hit
-        if int(hit['status']) in (301, 302, 307, 308):
-            return
+        request = hit['extract_request']
+        uri = request.get('extract_uri', request['http_uri'])
 
+        for url in self.ignore_url:
+            if url.match(uri):
+                return
+
         if not remote_ip in self.current_analysis['visits'].keys():
             self._createVisitor(hit)
 
@@ -391,9 +403,6 @@ class IWLA(object):
         super_hit['bandwidth'][0] += int(hit['body_bytes_sent'])
         super_hit['last_access'] = self.meta_infos['last_time']
 
-        request = hit['extract_request']
-        uri = request.get('extract_uri', request['http_uri'])
-
         hit['is_page'] = self.isPage(uri)
 
         if super_hit['robot'] or\