iwla/plugins/post_analysis/referers.py

# -*- coding: utf-8 -*-
#
# Copyright Grégory Soutadé 2015

# This file is part of iwla

# iwla is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# iwla is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with iwla.  If not, see <http://www.gnu.org/licenses/>.
#

import re
import urllib.parse

from iwla import IWLA
from iplugin import IPlugin

import awstats_data

"""
Post analysis hook

Extract referers and key phrases from requests

Plugin requirements :
    None

Conf values needed :
    domain_name

Output files :
    None

Statistics creation :
    None

Statistics update :
    month_stats :
        referers =>
            uri =>
                pages => count
                hits  => count
        robots_referers =>
            uri =>
                pages => count
                hits  => count
        search_engine_referers =>
            search engine name =>
                pages => count
                hits  => count
        key_phrases =>
            phrase => count

Statistics deletion :
    None
"""

class IWLAPostAnalysisReferers(IPlugin):
    def __init__(self, iwla):
        """Initialise the plugin and declare what it needs.

        Args:
            iwla: object handed over to the parent plugin constructor.

        Sets ``API_VERSION`` to 1 and lists ``domain_name`` in
        ``conf_requires``.
        """
        super().__init__(iwla)
        self.API_VERSION = 1
        self.conf_requires = ['domain_name']

    def _getSearchEngine(self, hashid):
        for (k, e) in self.search_engines.items():
            for (h,h_re) in e['hashid']:
                if hashid == h:
                    return k
        return None

    def load(self):
        domain_name = self.iwla.getConfValue('domain_name', '')

        if not domain_name:
            print('domain_name must not be empty !')
            return False

        self.own_domain_re = re.compile(r'.*%s.*' % (domain_name))
        self.search_engines = {}

        for (hashid, name) in awstats_data.search_engines_hashid.items():
            hashid_re = re.compile(r'.*%s.*' % (hashid))
            if not name in self.search_engines.keys():
                self.search_engines[name] = {
                    'hashid' : [(hashid, hashid_re)]
                    }
            else:
                self.search_engines[name]['hashid'].append((hashid, hashid_re))
            #print 'Hashid %s => %s' % (name, hashid)

        for (name, known_url) in awstats_data.search_engines_knwown_url.items():
            self.search_engines[name]['known_url'] = re.compile(known_url + '(?P<key_phrase>.+)')

        for (engine, not_engine) in awstats_data.not_search_engines_keys.items():
            not_engine_re = re.compile(r'.*%s.*' % (not_engine))
            key = self._getSearchEngine(engine)
            if key:
                self.search_engines[key]['not_search_engine'] = not_engine_re

        return True

    def _extractKeyPhrase(self, key_phrase_re, parameters, key_phrases):
        if not parameters or not key_phrase_re: return

        for p in parameters.split('&'):
            groups = key_phrase_re.match(p)
            if groups:
                key_phrase = groups.groupdict()['key_phrase']
                try:
                    key_phrase = urllib.parse.unquote_plus(key_phrase)
                except Exception as e:
                    print(e)
                    continue
                if not key_phrase in key_phrases.keys():
                    key_phrases[key_phrase] = 1
                else:
                    key_phrases[key_phrase] += 1
                break

    def hook(self):
        stats = self.iwla.getCurrentVisits()
        month_stats = self.iwla.getMonthStats()

        referers = month_stats.get('referers', {})
        robots_referers = month_stats.get('robots_referers', {})
        search_engine_referers = month_stats.get('search_engine_referers', {})
        key_phrases = month_stats.get('key_phrases', {})

        for (k, super_hit) in stats.items():
            for r in super_hit['requests'][::-1]:
                if not self.iwla.isValidForCurrentAnalysis(r): break
                if not r['http_referer']: continue
                if not self.iwla.hasBeenViewed(r): continue

                uri = r['extract_referer']['extract_uri']
                if self.own_domain_re.match(uri): continue

                if super_hit['robot']:
                    dictionary = robots_referers
                    # print '%s => %s' % (uri, super_hit['remote_ip'])
                else:
                    is_search_engine = False
                    for (name, engine) in self.search_engines.items():
                        for (hashid, hashid_re) in engine['hashid']:
                            if not hashid_re.match(uri): continue

                            not_engine = engine.get('not_search_engine', None)
                            # Try not engine
                            if not_engine and not_engine.match(uri): break
                            is_search_engine = True
                            uri = name

                            parameters = r['extract_referer'].get('extract_parameters', None)
                            key_phrase_re = engine.get('known_url', None)

                            self._extractKeyPhrase(key_phrase_re, parameters, key_phrases)
                            break

                    if is_search_engine:
                        dictionary = search_engine_referers
                    else:
                        dictionary = referers

                if r['is_page']:
                    key = 'pages'
                else:
                    key = 'hits'
                if not uri in dictionary: dictionary[uri] = {'pages':0, 'hits':0}
                dictionary[uri][key] += 1

        month_stats['referers'] = referers
        month_stats['robots_referers'] = robots_referers
        month_stats['search_engine_referers'] = search_engine_referers
        month_stats['key_phrases'] = key_phrases