iwla/plugins/post_analysis/referers.py

# -*- coding: utf-8 -*-
#
# Copyright Grégory Soutadé 2015
# This file is part of iwla
# iwla is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# iwla is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with iwla. If not, see <http://www.gnu.org/licenses/>.
#
import re
import urllib.parse
from iwla import IWLA
from iplugin import IPlugin
import awstats_data
"""
Post analysis hook
Extract referers and key phrases from requests
Plugin requirements :
None
Conf values needed :
domain_name
Output files :
None
Statistics creation :
None
Statistics update :
month_stats :
referers =>
2015-01-08 20:58:27 +01:00
pages => count
hits => count
robots_referers =>
2015-01-08 20:58:27 +01:00
pages => count
hits => count
search_engine_referers =>
2015-01-08 20:58:27 +01:00
pages => count
hits => count
key_phrases =>
2015-01-08 20:58:27 +01:00
phrase => count
Statistics deletion :
None
"""
class IWLAPostAnalysisReferers(IPlugin):
    def __init__(self, iwla):
        super(IWLAPostAnalysisReferers, self).__init__(iwla)
        self.API_VERSION = 1
        self.conf_requires = ['domain_name']

    def _getSearchEngine(self, hashid):
        for (k, e) in self.search_engines.items():
            for (h, h_re) in e['hashid']:
                if hashid == h:
                    return k
        return None

    def load(self):
        domain_name = self.iwla.getConfValue('domain_name', '')

        if not domain_name:
            print('domain_name must not be empty !')
            return False

        self.own_domain_re = re.compile(r'.*%s.*' % (domain_name))

        self.search_engines = {}

        for (hashid, name) in awstats_data.search_engines_hashid.items():
            hashid_re = re.compile(r'.*%s.*' % (hashid))
            if not name in self.search_engines.keys():
                self.search_engines[name] = {
                    'hashid' : [(hashid, hashid_re)]
                }
            else:
                self.search_engines[name]['hashid'].append((hashid, hashid_re))
            #print 'Hashid %s => %s' % (name, hashid)

        for (name, known_url) in awstats_data.search_engines_knwown_url.items():
            self.search_engines[name]['known_url'] = re.compile(known_url + '(?P<key_phrase>.+)')

        for (engine, not_engine) in awstats_data.not_search_engines_keys.items():
            not_engine_re = re.compile(r'.*%s.*' % (not_engine))
            key = self._getSearchEngine(engine)
            if key:
                self.search_engines[key]['not_search_engine'] = not_engine_re

        return True
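
    # Illustrative example (values assumed, not taken from awstats_data): for a
    # referer such as http://www.google.com/search?q=hello+world, the visit's
    # 'extract_parameters' would be 'q=hello+world'. If the engine's 'known_url'
    # regexp matches the 'q=' parameter, the captured key_phrase 'hello+world'
    # is unquoted to 'hello world' and counted in key_phrases.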
    def _extractKeyPhrase(self, key_phrase_re, parameters, key_phrases):
        if not parameters or not key_phrase_re: return

        for p in parameters.split('&'):
            groups = key_phrase_re.match(p)
            if groups:
                key_phrase = groups.groupdict()['key_phrase']
                try:
                    key_phrase = urllib.parse.unquote_plus(key_phrase)
                except Exception as e:
                    print(e)
                    continue
                if not key_phrase in key_phrases.keys():
                    key_phrases[key_phrase] = 1
                else:
                    key_phrases[key_phrase] += 1
                break
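
    # The hook below walks each visit's requests in reverse order, stops at the
    # first request that is not part of the current analysis, and classifies
    # every viewed, externally-referred request into one of three buckets
    # (robots_referers, search_engine_referers or plain referers), counting
    # pages and hits separately.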
    def hook(self):
        stats = self.iwla.getCurrentVisits()
        month_stats = self.iwla.getMonthStats()

        referers = month_stats.get('referers', {})
        robots_referers = month_stats.get('robots_referers', {})
        search_engine_referers = month_stats.get('search_engine_referers', {})

        key_phrases = month_stats.get('key_phrases', {})

        for (k, super_hit) in stats.items():
            for r in super_hit['requests'][::-1]:
                if not self.iwla.isValidForCurrentAnalysis(r): break
                if not r['http_referer']: continue
                if not self.iwla.hasBeenViewed(r): continue

                uri = r['extract_referer']['extract_uri']

                if self.own_domain_re.match(uri): continue

                if super_hit['robot']:
                    dictionary = robots_referers
                    # print '%s => %s' % (uri, super_hit['remote_ip'])
                else:
                    is_search_engine = False
                    for (name, engine) in self.search_engines.items():
                        for (hashid, hashid_re) in engine['hashid']:
                            if not hashid_re.match(uri): continue

                            not_engine = engine.get('not_search_engine', None)
                            # Try not engine
                            if not_engine and not_engine.match(uri): break

                            is_search_engine = True
                            uri = name

                            parameters = r['extract_referer'].get('extract_parameters', None)
                            key_phrase_re = engine.get('known_url', None)

                            self._extractKeyPhrase(key_phrase_re, parameters, key_phrases)
                            break

                    if is_search_engine:
                        dictionary = search_engine_referers
                    else:
                        dictionary = referers

                if r['is_page']:
                    key = 'pages'
                else:
                    key = 'hits'

                if not uri in dictionary: dictionary[uri] = {'pages':0, 'hits':0}
                dictionary[uri][key] += 1

        month_stats['referers'] = referers
        month_stats['robots_referers'] = robots_referers
        month_stats['search_engine_referers'] = search_engine_referers
        month_stats['key_phrases'] = key_phrases