# Patch summary (iwla.py, class IWLA) -- explanatory preamble only; the text
# before the "diff --git" header is not part of any hunk.
#
# Hunk 1 (-149,7 +149,8): drops self.final_slashes_re (r'/+$') and adds two
#   compiled patterns: self.slash_re (r'//') and self.protocol_re (r'^.*://').
# Hunk 2 (-372,15 +373,16): _normalizeURI gains a removeFileSlash=False
#   parameter. It still returns '/' unchanged, then removes the protocol_re
#   match, replaces each slash_re match with '/', and, when removeFileSlash is
#   true and the last character is '/', drops that character.
#   _removeFinalSlashes is deleted.
# Hunk 3 (-409,7 +411,7): the referer's 'extract_uri' is now produced by
#   _normalizeURI(..., True) instead of _removeFinalSlashes(...).
#
# NOTE(review): the patch text below has lost its line breaks and indentation
#   (everything is on one line), so it cannot be applied as-is.
# NOTE(review): uri_re reads r'(?P[^\?#]+)(\?(?P[^#]+))?(#.*)?'; "(?P[" is not
#   valid Python regex syntax. The group names were presumably stripped during
#   extraction -- hunk 3 reads the keys 'extract_uri' and 'extract_parameters'
#   from groupdict(), which suggests those names; confirm against the
#   repository.
# NOTE(review): "uri[-1] == '/'" raises IndexError when uri is empty at that
#   point, e.g. an input of 'http://' is emptied by protocol_re.
#   uri.endswith('/') would avoid this.
# NOTE(review): slash_re (r'//') replaces non-overlapping pairs in one pass, so
#   'a///b' becomes 'a//b'; r'/{2,}' would collapse whole runs.
# NOTE(review): protocol_re (r'^.*://') is greedy and strips through the LAST
#   '://' in the string, so a path that embeds another URL loses its leading
#   part; r'^[^/]*://' would only strip a leading scheme.
diff --git a/iwla.py b/iwla.py index 7d8e4a1..c021d01 100755 --- a/iwla.py +++ b/iwla.py @@ -149,7 +149,8 @@ class IWLA(object): self.log_re = re.compile(self.log_format_extracted) self.uri_re = re.compile(r'(?P[^\?#]+)(\?(?P[^#]+))?(#.*)?') self.domain_name_re = re.compile(r'.*%s' % conf.domain_name) - self.final_slashes_re = re.compile(r'/+$') + self.slash_re = re.compile(r'//') + self.protocol_re = re.compile(r'^.*://') self.excluded_ip = [] for ip in conf.excluded_ip: self.excluded_ip += [re.compile(ip)] @@ -372,15 +373,16 @@ class IWLA(object): super_hit['robot'] = False super_hit['hit_only'] = 0 - def _normalizeURI(self, uri): + def _normalizeURI(self, uri, removeFileSlash=False): if uri == '/': return uri - uri = self.final_slashes_re.sub('/', uri) + # Remove protocol + uri = self.protocol_re.sub('', uri) + # Remove double / + uri = self.slash_re.sub('/', uri) + if removeFileSlash and uri[-1] == '/': + uri = uri[:-1] return uri - def _removeFinalSlashes(self, uri): - if uri == '/': return uri - return self.final_slashes_re.sub('', uri) - def _normalizeParameters(self, parameters): # No parameters if parameters == '?': return None @@ -409,7 +411,7 @@ class IWLA(object): referer_groups = self.uri_re.match(hit['http_referer']) if referer_groups: hit['extract_referer'] = referer_groups.groupdict("") - hit['extract_referer']['extract_uri'] = self._removeFinalSlashes(hit['extract_referer']['extract_uri']) + hit['extract_referer']['extract_uri'] = self._normalizeURI(hit['extract_referer']['extract_uri'], True) hit['extract_referer']['extract_parameters'] = self._normalizeParameters(hit['extract_referer']['extract_parameters']) return True