# -*- coding: utf-8 -*-
"""
  Copyright 2016 Grégory Soutadé

  This file is part of Dénote.

  Dynastie is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  Dynastie is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with Dynastie.  If not, see <http://www.gnu.org/licenses/>.
"""
import re
import unicodedata
import os
import operator
import pickle

from django.db import models
# This second import rebinds the name ``models``; ``models.Note`` below is
# resolved against it, not against ``django.db.models``.
import models
#from models import Note


class Search(object):
    """Word index over notes, persisted with pickle.

    The index is a dict mapping a lowercase, accent-stripped word to a list
    of ``[note_id, weight]`` pairs.  It is stored in the file
    ``$DENOTE_ROOT/_search.db``.
    """

    # Words shorter than this are neither indexed nor searched.
    MINIMUM_LETTERS = 3
    # Weight added for each occurrence of a word in a note's text / title.
    TEXT_WEIGHT = 1
    TITLE_WEIGHT = 5

    def __init__(self):
        self.report = ''
        self.tagreg = re.compile(r'<[^>]+>')
        self.htmlreg = re.compile(r'&[^;]+;')
        self.numreg = re.compile(r'[0-9]+')
        self.pat = re.compile(r'\s+')
        # Characters turned into a word separator before splitting.
        # u'^' and u'=' are two separate entries: without the comma between
        # them Python concatenates the literals into u'^=' and neither
        # character is ever replaced.
        self.replace_by_space = (
            u'(', u')', u'#', u'\'', u'{', u'}', u'[', u']',
            u'-', u'|', u'\t', u'\\', u'_', u'^', u'=', u'+', u'$',
            u'£', u'%', u'µ', u'*', u',', u'?', u';', u'.', u'/',
            u':', u'!', u'§', u'€', u'²',
        )

    # Imported from generator.py
    def _addReport(self, string, color=''):
        """Append ``<class name> : <string>`` and a newline to ``self.report``.

        ``color`` is accepted but has no visible effect.
        """
        # NOTE(review): in this file the colour branches only appended empty
        # strings, so they were no-ops; markup wrapping the entry was
        # presumably lost -- confirm against generator.py before relying on
        # ``color``.
        self.report = (self.report + self.__class__.__name__ + ' : '
                       + string + '\n')

    def _addWarning(self, string):
        """Append a warning entry to the report."""
        self._addReport(string, 'yellow')

    def _addError(self, string):
        """Append an error entry to the report."""
        self._addReport(string, 'red')

    def _databasePath(self):
        """Return the path of the index file (needs ``DENOTE_ROOT`` set)."""
        return os.environ['DENOTE_ROOT'] + '/_search.db'

    def _saveDatabase(self, hashtable):
        """Pickle ``hashtable`` into the index file, replacing its content."""
        # pickle produces bytes: the file must be opened in binary mode,
        # matching the 'rb' used when loading.
        with open(self._databasePath(), 'wb') as f:
            f.write(pickle.dumps(hashtable))

    def _loadDatabase(self):
        """Return the stored index, or an empty dict when no file exists."""
        filename = self._databasePath()
        if not os.path.exists(filename):
            print('No search index !')
            return {}

        # SECURITY: unpickling executes arbitrary code; the index file must
        # only ever be written by this application.
        with open(filename, 'rb') as f:
            return pickle.load(f)

    def _strip_accents(self, s):
        """Return unicode text ``s`` without combining marks (accents)."""
        return ''.join(c for c in unicodedata.normalize('NFD', s)
                       if unicodedata.category(c) != 'Mn')

    def _remove_tag(self, content):
        """Strip entities, digits, punctuation and tags from ``content``.

        Returns the text with every whitespace run collapsed to one space.
        """
        content = self.htmlreg.sub('', content)
        content = self.numreg.sub('', content)
        # NOTE(review): line breaks are removed, not replaced by a space, so
        # the last word of a line is glued to the first word of the next one.
        for dropped in ('\n', '\r', '"'):
            content = content.replace(dropped, '')
        for c in self.replace_by_space:
            content = content.replace(c, ' ')
        # Tags are still delimited by '<' and '>' at this point because
        # neither character is in replace_by_space.
        content = self.tagreg.sub('', content)
        return self.pat.sub(' ', content)

    def _prepare_string(self, content):
        """Normalise ``content`` for indexing or searching.

        Byte strings are decoded as UTF-8 first so that the rest of the
        pipeline (unicode separators, ``unicodedata``) only sees text.
        """
        if isinstance(content, bytes):
            content = content.decode('utf-8')
        return self._strip_accents(self._remove_tag(content))

    def _indexContent(self, hashtable, index, content, word_weight):
        """Add every word of ``content`` to ``hashtable`` for note ``index``.

        Each occurrence adds ``word_weight`` to the note's weight for that
        word.  A note has at most one ``[index, weight]`` pair per word.
        """
        for word in self._prepare_string(content).split(' '):
            if len(word) < self.MINIMUM_LETTERS:
                continue
            entries = hashtable.setdefault(word.lower(), [])
            for entry in entries:
                if entry[0] == index:
                    entry[1] += word_weight
                    break
            else:
                entries.insert(0, [index, word_weight])

    def _index(self, hashtable, index):
        """Fetch note ``index`` and add its text and title to ``hashtable``."""
        note = models.Note.objects.get(pk=index)
        self._indexContent(hashtable, index, note.text, self.TEXT_WEIGHT)
        self._indexContent(hashtable, index, note.title, self.TITLE_WEIGHT)

    def _purge(self, hashtable, note):
        """Remove every entry of note id ``note`` from ``hashtable`` in place.

        Words left without any note are dropped from the index.
        """
        # Indexing stores ids as int (see _index_note), so compare as int.
        note_id = int(note)
        for word in list(hashtable):
            kept = [entry for entry in hashtable[word]
                    if entry[0] != note_id]
            if kept:
                hashtable[word] = kept
            else:
                del hashtable[word]

    def _index_note(self, note, saveDatabase=True):
        """Load the index, add note id ``note``, and optionally save it."""
        hashtable = self._loadDatabase()
        self._index(hashtable, int(note))
        if saveDatabase:
            self._saveDatabase(hashtable)

    def _remove_note(self, note, saveDatabase=True):
        """Load the index, drop note id ``note``, and optionally save it."""
        hashtable = self._loadDatabase()
        self._purge(hashtable, note)
        if saveDatabase:
            self._saveDatabase(hashtable)

    def generate_index(self, notes):
        """Index every note of ``notes`` (objects with id, text, title)."""
        # NOTE(review): this starts from the stored index rather than an
        # empty one, so running it twice accumulates weights -- confirm
        # whether a full rebuild was intended.
        hashtable = self._loadDatabase()
        for note in notes:
            self._indexContent(hashtable, note.id, note.text,
                               self.TEXT_WEIGHT)
            self._indexContent(hashtable, note.id, note.title,
                               self.TITLE_WEIGHT)
        self._saveDatabase(hashtable)

    def index_note(self, note):
        """Add note id ``note`` to the stored index."""
        return self._index_note(note, True)

    def delete_note(self, note):
        """Remove note id ``note`` from the stored index."""
        return self._remove_note(note, True)

    def edit_note(self, note, saveDatabase=True):
        """Re-index note id ``note`` after it has been modified.

        Removal and re-indexing are applied to the same in-memory index, so
        words that are no longer in the note really disappear.
        ``saveDatabase`` is kept for compatibility: as before, the index is
        always saved.
        """
        hashtable = self._loadDatabase()
        self._purge(hashtable, note)
        self._index(hashtable, int(note))
        self._saveDatabase(hashtable)

    def search(self, string):
        """Return the ids of the notes matching ``string``, best first.

        A note matches a query word when the word is a substring of one of
        its indexed words.  Its score is the sum of the matching weights.
        """
        hashtable = self._loadDatabase()
        scores = {}
        for word in self._prepare_string(string).split(' '):
            if len(word) < self.MINIMUM_LETTERS:
                continue
            word = word.lower()
            for key, entries in hashtable.items():
                # Plain substring test: no pattern is built from user input.
                if word not in key:
                    continue
                for note_id, weight in entries:
                    scores[note_id] = scores.get(note_id, 0) + weight

        ranked = sorted(scores.items(), key=operator.itemgetter(1))
        ranked.reverse()
        return [note_id for note_id, _ in ranked]