Dynastie/search.py

# -*- coding: utf-8 -*-
"""
  Copyright 2012-2013 Grégory Soutadé

  This file is part of Dynastie.

  Dynastie is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  Dynastie is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with Dynastie.  If not, see <http://www.gnu.org/licenses/>.
"""
import re
import unicodedata
import os
import operator
import pickle
from django.db import models
from dynastie.models import Post

class Search:
    """Word index used to search the posts of a blog.

    The index is a dictionary that maps a lower-cased, accent-free word to a
    list of ``[post id, weight]`` pairs.  It is pickled to
    ``<blog.src_path>/_search.db``.  The code only uses constructs that are
    valid on both Python 2 and Python 3.
    """

    # Words shorter than this are neither indexed nor searched.
    MINIMUM_LETTERS = 3

    def __init__(self):
        # HTML report filled by _addReport() and returned by create_index().
        self.report = ''

        self.tagreg = re.compile('<[^>]+>')
        # Only match real entities (&amp; &#233; &#xE9;): a lone '&' must not
        # swallow all the text up to the next ';'.
        self.htmlreg = re.compile('&#?[0-9a-zA-Z]+;')
        self.numreg = re.compile('[0-9]+')
        self.pat = re.compile(r'\s+')

        # Characters treated as word separators.  '^' and '=' are two
        # distinct entries (a missing comma used to merge them into '^=').
        self.replace_by_space = ('(', ')', '#', '\'', '{', '}', '[', ']',
                                 '-', '|', '\t', '\\', '_', '^', '=', '+', '$',
                                 '£', '%', 'µ', '*', ',', '?', ';', '.', '/',
                                 ':', '!', '§', '€', '²')

    # Imported from generator.py
    def _addReport(self, string, color=''):
        """Append one line to the HTML report, optionally colored."""
        line = '<b>' + self.__class__.__name__ + '</b> : ' + string
        if color != '':
            line = '<span style="color:' + color + '">' + line + '</span>'
        self.report = self.report + line + '<br/>\n'

    def _addWarning(self, string):
        """Append a yellow line to the report."""
        self._addReport(string, 'yellow')

    def _addError(self, string):
        """Append a red line to the report."""
        self._addReport(string, 'red')

    def _saveDatabase(self, blog, hashtable):
        """Pickle *hashtable* to ``<blog.src_path>/_search.db``."""
        # Binary mode: pickle produces bytes and _loadDatabase() reads 'rb'.
        with open(blog.src_path + '/_search.db', 'wb') as f:
            pickle.dump(hashtable, f)

    def _loadDatabase(self, blog):
        """Return the index stored for *blog*, or None when there is none."""
        filename = blog.src_path + '/_search.db'

        if not os.path.exists(filename):
            print('No search index !')
            return None

        # NOTE(review): pickle.load() executes whatever the file describes;
        # this is only safe as long as _search.db is written by this class.
        with open(filename, 'rb') as f:
            return pickle.load(f)

    def _strip_accents(self, s):
        """Return the unicode text *s* without its combining accents."""
        decomposed = unicodedata.normalize('NFD', s)
        return ''.join(c for c in decomposed
                       if unicodedata.category(c) != 'Mn')

    def _remove_tag(self, content):
        """Reduce markup to plain words separated by single spaces.

        *content* must be a native ``str``.  Entities, digits and double
        quotes are dropped; separators, tags and any run of whitespace
        (newlines included) become one space so that neighbouring words are
        never glued together.
        """
        content = self.htmlreg.sub('', content)
        content = self.numreg.sub('', content)
        content = content.replace('"', '')

        for c in self.replace_by_space:
            content = content.replace(c, ' ')

        content = self.tagreg.sub(' ', content)

        return self.pat.sub(' ', content)

    def _prepare_string(self, content):
        """Return *content* as accent-free unicode text without markup.

        Both byte strings (decoded/kept as UTF-8) and unicode strings are
        accepted.
        """
        # _remove_tag() works on the native str type of the interpreter.
        if not isinstance(content, str):
            if isinstance(content, bytes):
                content = content.decode('utf8')    # Python 3 bytes
            else:
                content = content.encode('utf8')    # Python 2 unicode

        content = self._remove_tag(content)

        # Python 2: the native str is a byte string, decode it now.
        if isinstance(content, bytes):
            content = content.decode('utf8')

        return self._strip_accents(content)

    def _indexContent(self, hashtable, index, content, word_weight):
        """Add every word of *content* to *hashtable* for post *index*.

        Each occurrence adds *word_weight* to the weight of the
        ``[index, weight]`` entry of the word; the entry is created at the
        head of the list the first time the word is seen for this post.
        """
        content = self._prepare_string(content)

        for word in content.split(' '):
            if len(word) < self.MINIMUM_LETTERS:
                continue
            entries = hashtable.setdefault(word.lower(), [])
            for entry in entries:
                if entry[0] == index:
                    entry[1] += word_weight
                    break
            else:
                # Prepend, so the most recently indexed post comes first.
                entries.insert(0, [index, word_weight])

    def _index_file(self, hashtable, filename, index):
        """Index the body (weight 1) and title (weight 5) of post *index*.

        Nothing is indexed when the post cannot be fetched or when its
        ``published`` flag equals False.  Errors raised while reading
        *filename* are propagated.
        """
        try:
            post = Post.objects.get(pk=index)
        except Exception:
            # Best effort: a file without a usable Post row is skipped.
            return
        if post.published == False:
            return

        # Read bytes; _prepare_string() decodes them as UTF-8.
        with open(filename, 'rb') as f:
            content = f.read()

        self._indexContent(hashtable, index, content, 1)
        self._indexContent(hashtable, index, post.title.encode('utf-8'), 5)

    def create_index(self, blog):
        """Rebuild and save the whole index of *blog*; return the report."""
        hashtable = {}

        root = blog.src_path + '/_post'

        if os.path.exists(root):
            # Only names made of digits are post numbers; anything else
            # (backup files, ...) would make int() fail.
            names = [name for name in os.listdir(root)
                     if re.match(r'[0-9]+\Z', name)]
            for name in sorted(names, key=int):
                self._index_file(hashtable, root + '/' + name, int(name))

        self._saveDatabase(blog, hashtable)

        self._addReport('Search generated @ ' + blog.src_path + '/_search.db')

        return self.report

    def _purge_post(self, hashtable, post_id):
        """Remove every entry of post *post_id* from *hashtable* in place."""
        for word in list(hashtable):
            kept = [entry for entry in hashtable[word] if entry[0] != post_id]
            if kept:
                hashtable[word] = kept
            else:
                del hashtable[word]

    def _index_post(self, blog, post, saveDatabase=True):
        """Add post number *post* to the stored index.

        When no index exists yet the whole index is created instead and the
        report of create_index() is returned; otherwise returns None.
        """
        hashtable = self._loadDatabase(blog)

        filename = blog.src_path + '/_post/' + str(post)

        if hashtable is None:
            return self.create_index(blog)

        self._index_file(hashtable, filename, int(post))

        if saveDatabase:
            self._saveDatabase(blog, hashtable)

    def _remove_post(self, blog, post, saveDatabase=True):
        """Remove post number *post* (int or numeric string) from the index."""
        hashtable = self._loadDatabase(blog)

        if hashtable is None:
            return

        self._purge_post(hashtable, int(post))

        if saveDatabase:
            self._saveDatabase(blog, hashtable)

    def index_post(self, blog, post):
        """Index a new post and save the index."""
        return self._index_post(blog, post, True)

    def delete_post(self, blog, post):
        """Remove a post from the index and save the index."""
        return self._remove_post(blog, post, True)

    def edit_post(self, blog, post, saveDatabase=True):
        """Replace the index entries of post number *post*.

        *saveDatabase* is kept for compatibility only: as before, the index
        is always saved.  Removal and re-indexing are applied to the same
        in-memory table so that the removal is not lost.
        """
        hashtable = self._loadDatabase(blog)

        if hashtable is None:
            self.create_index(blog)
            return

        post_id = int(post)
        self._purge_post(hashtable, post_id)
        self._index_file(hashtable, blog.src_path + '/_post/' + str(post),
                         post_id)
        self._saveDatabase(blog, hashtable)

    def search(self, blog, string):
        """Return the ids of the posts matching *string*, best match first.

        Every word of at least MINIMUM_LETTERS letters matches all indexed
        words that contain it; the weights of the matches are summed per
        post.  Returns an empty list when the blog has no index.
        """
        hashtable = self._loadDatabase(blog)
        if hashtable is None:
            return []

        string = self._prepare_string(string)

        res = {}
        for word in string.split(' '):
            if len(word) < Search.MINIMUM_LETTERS:
                continue
            word = word.lower()
            for key, entries in hashtable.items():
                if word not in key:
                    continue
                for post_id, weight in entries:
                    res[post_id] = res.get(post_id, 0) + weight

        # Highest weight first; equal weights are ordered by decreasing id
        # (presumably most recent first -- TODO confirm) so that the result
        # is deterministic.
        ranked = sorted(res.items(), key=operator.itemgetter(1, 0),
                        reverse=True)
        return [post_id for post_id, _ in ranked]
First work on search 2012-11-25 20:39:19 +01:00			`# -- coding: utf-8 --`
Add licence information in all source files 2013-02-07 18:50:54 +01:00			`"""`
			`Copyright 2012-2013 Grégory Soutadé`

			`This file is part of Dynastie.`

			`Dynastie is free software: you can redistribute it and/or modify`
			`it under the terms of the GNU General Public License as published by`
			`the Free Software Foundation, either version 3 of the License, or`
			`(at your option) any later version.`

			`Dynastie is distributed in the hope that it will be useful,`
			`but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`GNU General Public License for more details.`

			`You should have received a copy of the GNU General Public License`
			`along with Dynastie. If not, see <http://www.gnu.org/licenses/>.`
			`"""`
First work on search 2012-11-25 20:39:19 +01:00			`import re`
			`import unicodedata`
			`import os`
			`import operator`
			`import pickle`
			`from django.db import models`
Do a more pertinent search by including titles in indexing 2012-12-10 20:50:27 +01:00			`from dynastie.models import Post`
First work on search 2012-11-25 20:39:19 +01:00
			`class Search:`
			`MINIMUM_LETTERS = 3`

			`def __init__(self):`
			`self.report = ''`

			`self.tagreg = re.compile('<[^>]+>')`
			`self.htmlreg = re.compile('&[^;]+;')`
			`self.numreg = re.compile('[0-9]+')`
			`self.pat = re.compile(r'\s+')`

			`self.replace_by_space = ('(', ')', '#', '\'', '{', '}', '[', ']',`
			`'-', '\|', '\t', '\\', '_', '^' '=', '+', '$',`
			`'£', '%', 'µ', '*', ',', '?', ';', '.', '/',`
			`':', '!', '§', '€', '²')`

Speed up search 2012-12-24 19:18:19 +01:00			`# Imported from generator.py`
First work on search 2012-11-25 20:39:19 +01:00			`def _addReport(self, string, color=''):`
			`if color != '':`
			`self.report = self.report + '<span style="color:' + color + '">'`
			`self.report = self.report + '<b>' + self.__class__.__name__ + '</b> : '`
			`self.report = self.report + string`
			`if color != '':`
			`self.report = self.report + '</span>'`
			`self.report = self.report + '<br/>\n'`

			`def _addWarning(self, string):`
			`self.addReport(string, 'yellow')`

			`def _addError(self, string):`
			`self.addReport(string, 'red')`


			`def _saveDatabase(self, blog, hashtable):`
			`d = pickle.dumps(hashtable)`

			`f = open(blog.src_path + '/_search.db', 'w')`
			`f.write(d)`
			`f.close()`

			`def _loadDatabase(self, blog):`
			`filename = blog.src_path + '/_search.db'`

			`if not os.path.exists(filename):`
Basic search works for now 2012-12-10 19:30:25 +01:00			`print 'No search index !'`
First work on search 2012-11-25 20:39:19 +01:00			`return None`

			`f = open(filename, 'rb')`
			`hashtable = pickle.load(f)`
			`f.close()`

			`return hashtable`

			`def _strip_accents(self, s):`
			`return ''.join((c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn'))`

			`def _remove_tag(self, content):`
Speed up search 2012-12-24 19:18:19 +01:00			`content = self.htmlreg.sub('', content)`
			`content = self.numreg.sub('', content)`
First work on search 2012-11-25 20:39:19 +01:00
			`content = content.replace('\n', '')`
			`content = content.replace('\r', '')`
			`content = content.replace('"', '')`

			`for c in self.replace_by_space:`
			`content = content.replace(c, ' ')`

Speed up search 2012-12-24 19:18:19 +01:00			`content = self.tagreg.sub('', content)`
First work on search 2012-11-25 20:39:19 +01:00
			`content = self.pat.sub(' ', content)`

			`return content`

			`def _prepare_string(self, content):`
			`content = self._remove_tag(content)`
			`content = self._strip_accents(unicode(content, 'utf8'))`

			`return content`

Do a more pertinent search by including titles in indexing 2012-12-10 20:50:27 +01:00			`def _indexContent(self, hashtable, index, content, word_weight):`
First work on search 2012-11-25 20:39:19 +01:00			`content = self._prepare_string(content)`

Do a more pertinent search by including titles in indexing 2012-12-10 20:50:27 +01:00			`wordlist = content.split(' ')`
First work on search 2012-11-25 20:39:19 +01:00
			`for word in wordlist:`
			`if len(word) < self.MINIMUM_LETTERS:`
			`continue`
			`word = word.lower()`
			`if not word in hashtable:`
			`hashtable[word] = []`
			`if not index in hashtable[word]:`
Do a prepend instead of an append to generate search index. Allow reverse date order results display 2012-12-22 09:34:21 +01:00			`hashtable[word].insert(0, [index, word_weight])`
Do a more pertinent search by including titles in indexing 2012-12-10 20:50:27 +01:00			`else:`
			`weight = hashtable[word][1]`
			`hashtable[word][1] = weight + word_weight`

			`def _index_file(self, hashtable, filename, index):`
			`try:`
			`post = Post.objects.get(pk=index)`
			`if post.published == False: return`
			`except:`
			`return`
First work on search 2012-11-25 20:39:19 +01:00
Speed up search 2012-12-24 19:18:19 +01:00			`f = open(filename, 'r')`
			`content = f.read()`
			`f.close()`

Do a more pertinent search by including titles in indexing 2012-12-10 20:50:27 +01:00			`self._indexContent(hashtable, index, content, 1)`
			`self._indexContent(hashtable, index, post.title.encode('utf-8'), 5)`

First work on search 2012-11-25 20:39:19 +01:00			`def create_index(self, blog):`
			`hashtable = {}`

			`root = blog.src_path + '/_post'`

			`if os.path.exists(root):`
			`for post in os.listdir(root):`
			`# Not a post number`
			`if not re.search(self.numreg, post): continue`
			`self._index_file(hashtable, root + '/' + post, int(post))`

			`self._saveDatabase(blog, hashtable)`

			`self._addReport('Search generated @ ' + blog.src_path + '/_search.db')`

			`return self.report`

			`def _index_post(self, blog, post, saveDatabase=True):`
			`hashtable = self._loadDatabase(blog)`

Fix bugs in search (string conversion) 2012-12-24 18:12:51 +01:00			`filename = blog.src_path + '/_post/' + str(post)`
First work on search 2012-11-25 20:39:19 +01:00
			`if hashtable is None:`
			`return self.create_index(blog)`

			`self._index_file(hashtable, filename, int(post))`

			`if saveDatabase:`
			`self._saveDatabase(blog, hashtable)`

			`def _remove_post(self, blog, post, saveDatabase=True):`
			`hashtable = self._loadDatabase(blog)`

			`if hashtable is None: return`

			`for k, v in hashtable.items():`
Speed up search 2012-12-24 19:18:19 +01:00			`# For tuples in values`
			`for t in v:`
			`if post == v[0]:`
			`v.remove(t)`
First work on search 2012-11-25 20:39:19 +01:00
			`if saveDatabase:`
			`self._saveDatabase(blog, hashtable)`

			`def index_post(self, blog, post):`
			`return self._index_post(blog, post, True)`

			`def delete_post(self, blog, post):`
			`return self._remove_post(blog, post, True)`

			`def edit_post(self, blog, post, saveDatabase=True):`
			`self._remove_post(blog, post, False)`
			`self._index_post(blog, post, True)`

			`def search(self, blog, string):`
			`hashtable = self._loadDatabase(blog)`

Basic search works for now 2012-12-10 19:30:25 +01:00			`string = self._prepare_string(string.encode('utf-8'))`
First work on search 2012-11-25 20:39:19 +01:00
Do a more pertinent search by including titles in indexing 2012-12-10 20:50:27 +01:00			`wordlist = string.split(' ')`
First work on search 2012-11-25 20:39:19 +01:00
			`res = {}`
			`for word in wordlist:`
Basic search works for now 2012-12-10 19:30:25 +01:00			`if len(word) < Search.MINIMUM_LETTERS:`
First work on search 2012-11-25 20:39:19 +01:00			`continue`
			`word = word.lower()`
Use regexp during search computation 2012-12-22 09:39:09 +01:00			`reg = re.compile('.' + word + '.')`
			`for key in hashtable.keys():`
			`if reg.match(key):`
			`for post in hashtable[key]:`
			`if not post[0] in res:`
			`res[post[0]] = post[1]`
			`else:`
			`res[post[0]] += post[1]`
First work on search 2012-11-25 20:39:19 +01:00
Use regexp during search computation 2012-12-22 09:39:09 +01:00			`sorted_res = sorted(res.iteritems(), key=operator.itemgetter(1))`
Basic search works for now 2012-12-10 19:30:25 +01:00			`sorted_res.reverse()`

			`res = []`
			`for i in range(len(sorted_res)):`
			`res .append(sorted_res[i][0])`
			`return res`