Add "ignore_url" parameter to iwla
This commit is contained in:
parent
7b0ca661a1
commit
9b32a81ddb
|
@ -38,6 +38,9 @@ pages_extensions = ['/', 'htm', 'html', 'xhtml', 'py', 'pl', 'rb', 'php']
|
|||
# HTTP codes that are considered OK
|
||||
viewed_http_codes = [200, 304]
|
||||
|
||||
# URLs to ignore (list of regular expressions, matched from the start of the request URI)
|
||||
ignore_url = []
|
||||
|
||||
# If False, doesn't count visitors that don't GET a page but only resources (images, rss...)
|
||||
count_hit_only_visitors = False
|
||||
|
||||
|
@ -73,3 +76,6 @@ no_referrer_domains = []
|
|||
|
||||
# Domains used by robots
|
||||
robot_domains = []
|
||||
|
||||
# Feeds agent identifier
|
||||
feeds_agents = [r'.*NextCloud-News']
|
||||
|
|
21
iwla.py
21
iwla.py
|
@ -51,10 +51,13 @@ Conf values needed :
|
|||
analyzed_filename
|
||||
domain_name
|
||||
locales_path
|
||||
locale
|
||||
keep_requests*
|
||||
compress_output_files
|
||||
excluded_ip
|
||||
excluded_domain_name
|
||||
reverse_dns_timeout*
|
||||
ignore_url*
|
||||
|
||||
Output files :
|
||||
DB_ROOT/meta.db
|
||||
|
@ -165,6 +168,9 @@ class IWLA(object):
|
|||
self.excluded_domain_name = []
|
||||
for domain_name in conf.excluded_domain_name:
|
||||
self.excluded_domain_name += [re.compile(domain_name)]
|
||||
self.ignore_url = []
|
||||
for url in conf.ignore_url:
|
||||
self.ignore_url += [re.compile(url)]
|
||||
self.multimedia_files_re = []
|
||||
for file_re in conf.multimedia_files_re:
|
||||
self.multimedia_files_re += [re.compile(file_re)]
|
||||
|
@ -365,16 +371,22 @@ class IWLA(object):
|
|||
return hit['robot'] == True
|
||||
|
||||
def _appendHit(self, hit):
|
||||
remote_ip = hit['remote_ip']
|
||||
# Redirected page/hit
|
||||
if int(hit['status']) in (301, 302, 307, 308):
|
||||
return
|
||||
|
||||
remote_ip = hit['remote_ip']
|
||||
if not remote_ip: return
|
||||
|
||||
for ip in self.excluded_ip:
|
||||
if ip.match(remote_ip):
|
||||
return
|
||||
|
||||
# Redirected page/hit
|
||||
if int(hit['status']) in (301, 302, 307, 308):
|
||||
request = hit['extract_request']
|
||||
uri = request.get('extract_uri', request['http_uri'])
|
||||
|
||||
for url in self.ignore_url:
|
||||
if url.match(uri):
|
||||
return
|
||||
|
||||
if not remote_ip in self.current_analysis['visits'].keys():
|
||||
|
@ -391,9 +403,6 @@ class IWLA(object):
|
|||
super_hit['bandwidth'][0] += int(hit['body_bytes_sent'])
|
||||
super_hit['last_access'] = self.meta_infos['last_time']
|
||||
|
||||
request = hit['extract_request']
|
||||
uri = request.get('extract_uri', request['http_uri'])
|
||||
|
||||
hit['is_page'] = self.isPage(uri)
|
||||
|
||||
if super_hit['robot'] or\
|
||||
|
|
Loading…
Reference in New Issue
Block a user