Handle URLs with empty referer

This commit is contained in:
Gregory Soutade 2017-08-23 20:11:17 +02:00
parent 3b3ae1ea3e
commit fffab335fa
2 changed files with 6 additions and 6 deletions

View File

@ -22,7 +22,7 @@ DB_FILENAME = 'iwla.db'
# Web server log format (nginx style). Default is apache log format # Web server log format (nginx style). Default is apache log format
log_format = '$server_name:$server_port $remote_addr - $remote_user [$time_local] ' +\ log_format = '$server_name:$server_port $remote_addr - $remote_user [$time_local] ' +\
'"$request" $status $body_bytes_sent ' +\ '"$request" $status $body_bytes_sent ' +\
'"$http_referer" "$http_user_agent"' '"$http_referer?" "$http_user_agent?"'
# Time format used in log format # Time format used in log format
time_format = '%d/%b/%Y:%H:%M:%S %z' time_format = '%d/%b/%Y:%H:%M:%S %z'

10
iwla.py
View File

@ -143,7 +143,7 @@ class IWLA(object):
self.valid_visitors = None self.valid_visitors = None
self.dry_run = dry_run self.dry_run = dry_run
self.log_format_extracted = re.sub(r'([^\$\w])', r'\\\g<1>', conf.log_format) self.log_format_extracted = re.sub(r'([^\$?\w])', r'\\\g<1>', conf.log_format)
self.log_format_extracted = re.sub(r'\$(\w+)', '(?P<\g<1>>.+)', self.log_format_extracted) self.log_format_extracted = re.sub(r'\$(\w+)', '(?P<\g<1>>.+)', self.log_format_extracted)
self.http_request_extracted = re.compile(r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)') self.http_request_extracted = re.compile(r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')
self.log_re = re.compile(self.log_format_extracted) self.log_re = re.compile(self.log_format_extracted)
@ -374,10 +374,10 @@ class IWLA(object):
groups = self.http_request_extracted.match(hit['request']) groups = self.http_request_extracted.match(hit['request'])
if groups: if groups:
hit['extract_request'] = groups.groupdict() hit['extract_request'] = groups.groupdict("")
uri_groups = self.uri_re.match(hit['extract_request']['http_uri']) uri_groups = self.uri_re.match(hit['extract_request']['http_uri'])
if uri_groups: if uri_groups:
d = uri_groups.groupdict() d = uri_groups.groupdict("")
hit['extract_request']['extract_uri'] = self._normalizeURI(d['extract_uri']) hit['extract_request']['extract_uri'] = self._normalizeURI(d['extract_uri'])
if 'extract_parameters' in d.keys(): if 'extract_parameters' in d.keys():
parameters = self._normalizeParameters(d['extract_parameters']) parameters = self._normalizeParameters(d['extract_parameters'])
@ -390,7 +390,7 @@ class IWLA(object):
if hit['http_referer']: if hit['http_referer']:
referer_groups = self.uri_re.match(hit['http_referer']) referer_groups = self.uri_re.match(hit['http_referer'])
if referer_groups: if referer_groups:
hit['extract_referer'] = referer_groups.groupdict() hit['extract_referer'] = referer_groups.groupdict("")
hit['extract_referer']['extract_uri'] = self._removeFinalSlashes(hit['extract_referer']['extract_uri']) hit['extract_referer']['extract_uri'] = self._removeFinalSlashes(hit['extract_referer']['extract_uri'])
hit['extract_referer']['extract_parameters'] = self._normalizeParameters(hit['extract_referer']['extract_parameters']) hit['extract_referer']['extract_parameters'] = self._normalizeParameters(hit['extract_referer']['extract_parameters'])
return True return True
@ -781,7 +781,7 @@ class IWLA(object):
groups = self.log_re.match(l) groups = self.log_re.match(l)
if groups: if groups:
self._newHit(groups.groupdict()) self._newHit(groups.groupdict(""))
else: else:
self.logger.warning("No match for %s" % (l)) self.logger.warning("No match for %s" % (l))
#break #break