Add "ignore_url" parameter to iwla

This commit is contained in:
Gregory Soutade 2025-02-03 08:04:57 +01:00
parent 7b0ca661a1
commit 9b32a81ddb
2 changed files with 23 additions and 8 deletions

View File

@ -38,6 +38,9 @@ pages_extensions = ['/', 'htm', 'html', 'xhtml', 'py', 'pl', 'rb', 'php']
# HTTP codes that are considered OK
viewed_http_codes = [200, 304]
# URLs to ignore (list of regular expressions)
ignore_url = []
# If False, doesn't count visitors that don't GET a page but only resources (images, rss...)
count_hit_only_visitors = False
@ -73,3 +76,6 @@ no_referrer_domains = []
# Domains used by robots
robot_domains = []
# Feeds agent identifier
feeds_agents = [r'.*NextCloud-News']

21
iwla.py
View File

@ -51,10 +51,13 @@ Conf values needed :
analyzed_filename
domain_name
locales_path
locale
keep_requests*
compress_output_files
excluded_ip
excluded_domain_name
reverse_dns_timeout*
ignore_url*
Output files :
DB_ROOT/meta.db
@ -165,6 +168,9 @@ class IWLA(object):
self.excluded_domain_name = []
for domain_name in conf.excluded_domain_name:
self.excluded_domain_name += [re.compile(domain_name)]
self.ignore_url = []
for url in conf.ignore_url:
self.ignore_url += [re.compile(url)]
self.multimedia_files_re = []
for file_re in conf.multimedia_files_re:
self.multimedia_files_re += [re.compile(file_re)]
@ -365,16 +371,22 @@ class IWLA(object):
return hit['robot'] == True
def _appendHit(self, hit):
remote_ip = hit['remote_ip']
# Redirected page/hit
if int(hit['status']) in (301, 302, 307, 308):
return
remote_ip = hit['remote_ip']
if not remote_ip: return
for ip in self.excluded_ip:
if ip.match(remote_ip):
return
# Redirected page/hit
if int(hit['status']) in (301, 302, 307, 308):
request = hit['extract_request']
uri = request.get('extract_uri', request['http_uri'])
for url in self.ignore_url:
if url.match(uri):
return
if not remote_ip in self.current_analysis['visits'].keys():
@ -391,9 +403,6 @@ class IWLA(object):
super_hit['bandwidth'][0] += int(hit['body_bytes_sent'])
super_hit['last_access'] = self.meta_infos['last_time']
request = hit['extract_request']
uri = request.get('extract_uri', request['http_uri'])
hit['is_page'] = self.isPage(uri)
if super_hit['robot'] or\