Compare commits

..

No commits in common. "9b32a81ddbfe91a5edb705d9052c5a6f65223d39" and "4d0b993aecc9b468353031776bc87b2f3bb07a43" have entirely different histories.

3 changed files with 11 additions and 28 deletions

View File

@@ -38,9 +38,6 @@ pages_extensions = ['/', 'htm', 'html', 'xhtml', 'py', 'pl', 'rb', 'php']
# HTTP codes that are considered OK # HTTP codes that are considered OK
viewed_http_codes = [200, 304] viewed_http_codes = [200, 304]
# URL to ignore
ignore_url = []
# If False, doesn't count visitors that don't GET a page but only resources (images, rss...) # If False, doesn't count visitors that don't GET a page but only resources (images, rss...)
count_hit_only_visitors = False count_hit_only_visitors = False
@@ -76,6 +73,3 @@ no_referrer_domains = []
# Domains used by robots # Domains used by robots
robot_domains = [] robot_domains = []
# Feeds agent identifier
feeds_agents = [r'.*NextCloud-News']

25
iwla.py
View File

@@ -51,13 +51,10 @@ Conf values needed :
analyzed_filename analyzed_filename
domain_name domain_name
locales_path locales_path
locale
keep_requests*
compress_output_files compress_output_files
excluded_ip excluded_ip
excluded_domain_name excluded_domain_name
reverse_dns_timeout* reverse_dns_timeout*
ignore_url*
Output files : Output files :
DB_ROOT/meta.db DB_ROOT/meta.db
@@ -168,9 +165,6 @@ class IWLA(object):
self.excluded_domain_name = [] self.excluded_domain_name = []
for domain_name in conf.excluded_domain_name: for domain_name in conf.excluded_domain_name:
self.excluded_domain_name += [re.compile(domain_name)] self.excluded_domain_name += [re.compile(domain_name)]
self.ignore_url = []
for url in conf.ignore_url:
self.ignore_url += [re.compile(url)]
self.multimedia_files_re = [] self.multimedia_files_re = []
for file_re in conf.multimedia_files_re: for file_re in conf.multimedia_files_re:
self.multimedia_files_re += [re.compile(file_re)] self.multimedia_files_re += [re.compile(file_re)]
@@ -371,24 +365,18 @@ class IWLA(object):
return hit['robot'] == True return hit['robot'] == True
def _appendHit(self, hit): def _appendHit(self, hit):
# Redirected page/hit remote_ip = hit['remote_ip']
if int(hit['status']) in (301, 302, 307, 308):
return
remote_ip = hit['remote_ip']
if not remote_ip: return if not remote_ip: return
for ip in self.excluded_ip: for ip in self.excluded_ip:
if ip.match(remote_ip): if ip.match(remote_ip):
return return
request = hit['extract_request'] # Redirected page/hit
uri = request.get('extract_uri', request['http_uri']) if int(hit['status']) in (301, 302, 307, 308):
return
for url in self.ignore_url:
if url.match(uri):
return
if not remote_ip in self.current_analysis['visits'].keys(): if not remote_ip in self.current_analysis['visits'].keys():
self._createVisitor(hit) self._createVisitor(hit)
@@ -403,6 +391,9 @@ class IWLA(object):
super_hit['bandwidth'][0] += int(hit['body_bytes_sent']) super_hit['bandwidth'][0] += int(hit['body_bytes_sent'])
super_hit['last_access'] = self.meta_infos['last_time'] super_hit['last_access'] = self.meta_infos['last_time']
request = hit['extract_request']
uri = request.get('extract_uri', request['http_uri'])
hit['is_page'] = self.isPage(uri) hit['is_page'] = self.isPage(uri)
if super_hit['robot'] or\ if super_hit['robot'] or\

View File

@@ -132,10 +132,7 @@ class IWLAPreAnalysisRobots(IPlugin):
# 2) Less than 1 hit per page # 2) Less than 1 hit per page
if super_hit['viewed_pages'][0] and (super_hit['viewed_hits'][0] < super_hit['viewed_pages'][0]): if super_hit['viewed_pages'][0] and (super_hit['viewed_hits'][0] < super_hit['viewed_pages'][0]):
isRobot = True isRobot = True
# 2.5) 1 page, 1 hit
elif super_hit['viewed_pages'][0] == 1 and super_hit['viewed_hits'][0] == 1:
isRobot = True
if isRobot: if isRobot:
self._setRobot(k, super_hit) self._setRobot(k, super_hit)
continue continue
@@ -155,7 +152,8 @@ class IWLAPreAnalysisRobots(IPlugin):
# Exception for favicon.png and all apple-*icon* # Exception for favicon.png and all apple-*icon*
if int(hit['status']) >= 400 and int(hit['status']) <= 499 and\ if int(hit['status']) >= 400 and int(hit['status']) <= 499 and\
'icon' not in hit['extract_request']['http_uri']: 'icon' not in hit['extract_request']['http_uri'] and\
hit['server_name'] != 'forge.soutade.fr':
error_codes += 1 error_codes += 1
elif int(hit['status']) in (304,): elif int(hit['status']) in (304,):
not_modified_pages += 1 not_modified_pages += 1