Normalize URL before counting in stats

This commit is contained in:
Gregory Soutade 2022-06-23 21:11:43 +02:00
parent 37a33f1291
commit d36676ca38

18
iwla.py
View File

@@ -149,7 +149,8 @@ class IWLA(object):
self.log_re = re.compile(self.log_format_extracted) self.log_re = re.compile(self.log_format_extracted)
self.uri_re = re.compile(r'(?P<extract_uri>[^\?#]+)(\?(?P<extract_parameters>[^#]+))?(#.*)?') self.uri_re = re.compile(r'(?P<extract_uri>[^\?#]+)(\?(?P<extract_parameters>[^#]+))?(#.*)?')
self.domain_name_re = re.compile(r'.*%s' % conf.domain_name) self.domain_name_re = re.compile(r'.*%s' % conf.domain_name)
self.final_slashes_re = re.compile(r'/+$') self.slash_re = re.compile(r'//')
self.protocol_re = re.compile(r'^.*://')
self.excluded_ip = [] self.excluded_ip = []
for ip in conf.excluded_ip: for ip in conf.excluded_ip:
self.excluded_ip += [re.compile(ip)] self.excluded_ip += [re.compile(ip)]
@@ -372,15 +373,16 @@ class IWLA(object):
super_hit['robot'] = False super_hit['robot'] = False
super_hit['hit_only'] = 0 super_hit['hit_only'] = 0
def _normalizeURI(self, uri): def _normalizeURI(self, uri, removeFileSlash=False):
if uri == '/': return uri if uri == '/': return uri
uri = self.final_slashes_re.sub('/', uri) # Remove protocol
uri = self.protocol_re.sub('', uri)
# Remove double /
uri = self.slash_re.sub('/', uri)
if removeFileSlash and uri[-1] == '/':
uri = uri[:-1]
return uri return uri
def _removeFinalSlashes(self, uri):
if uri == '/': return uri
return self.final_slashes_re.sub('', uri)
def _normalizeParameters(self, parameters): def _normalizeParameters(self, parameters):
# No parameters # No parameters
if parameters == '?': return None if parameters == '?': return None
@@ -409,7 +411,7 @@ class IWLA(object):
referer_groups = self.uri_re.match(hit['http_referer']) referer_groups = self.uri_re.match(hit['http_referer'])
if referer_groups: if referer_groups:
hit['extract_referer'] = referer_groups.groupdict("") hit['extract_referer'] = referer_groups.groupdict("")
hit['extract_referer']['extract_uri'] = self._removeFinalSlashes(hit['extract_referer']['extract_uri']) hit['extract_referer']['extract_uri'] = self._normalizeURI(hit['extract_referer']['extract_uri'], True)
hit['extract_referer']['extract_parameters'] = self._normalizeParameters(hit['extract_referer']['extract_parameters']) hit['extract_referer']['extract_parameters'] = self._normalizeParameters(hit['extract_referer']['extract_parameters'])
return True return True