Dynastie/dynastie/search.py

216 lines
6.5 KiB
Python
Raw Permalink Normal View History

2012-11-25 20:39:19 +01:00
# -*- coding: utf-8 -*-
"""
2014-01-04 13:55:30 +01:00
Copyright 2012-2014 Grégory Soutadé
This file is part of Dynastie.
Dynastie is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Dynastie is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Dynastie. If not, see <http://www.gnu.org/licenses/>.
"""
2012-11-25 20:39:19 +01:00
import re
import unicodedata
import os
import operator
import pickle
from django.db import models
from dynastie.models import Post
2012-11-25 20:39:19 +01:00
class Search(object):
    """Word index over the posts of a blog, with a weighted substring search.

    The index is a dict mapping a normalised word (lower case, accents
    stripped) to a list of ``[post_id, weight]`` entries.  It is pickled to
    ``<blog.src_path>/_search.db``.  Words found in a post body weigh 1,
    words found in its title weigh 5.
    """

    # Words shorter than this are neither indexed nor searched.
    MINIMUM_LETTERS = 3

    def __init__(self):
        # HTML report accumulated by _addReport() and returned by
        # create_index().
        self.report = ''
        self.tagreg = re.compile(r'<[^>]+>')
        self.htmlreg = re.compile(r'&[^;]+;')
        self.numreg = re.compile(r'[0-9]+')
        self.pat = re.compile(r'\s+')
        # Punctuation treated as a word separator.  '^' and '=' are two
        # separate entries (a missing comma used to merge them into '^=').
        # Non-ASCII entries are unicode literals so that they can be matched
        # against decoded text on Python 2 as well as Python 3.
        # NOTE(review): the original tuple also held an empty string (most
        # likely a character lost along the way); replacing '' inserts a
        # space between every character, so that entry is dropped.
        self.replace_by_space = (
            '(', ')', '#', '\'', '{', '}', '[', ']',
            '-', '|', '\t', '\\', '_', '^', '=', '+', '$',
            u'£', '%', u'µ', '*', ',', '?', ';', '.', '/',
            ':', '!', u'§', u'²',
        )

    # Imported from generator.py
    def _addReport(self, string, color=''):
        """Append one HTML line to ``self.report``.

        :param string: message to append.
        :param color: optional CSS colour; when given, the line is wrapped
            in a coloured ``<span>``.
        """
        line = '<b>' + self.__class__.__name__ + '</b> : ' + string
        if color != '':
            line = '<span style="color:' + color + '">' + line + '</span>'
        self.report = self.report + line + '<br/>\n'

    def _addWarning(self, string):
        """Append a warning (yellow) line to the report."""
        self._addReport(string, 'yellow')

    def _addError(self, string):
        """Append an error (red) line to the report."""
        self._addReport(string, 'red')

    def _saveDatabase(self, blog, hashtable):
        """Pickle ``hashtable`` into ``<blog.src_path>/_search.db``."""
        # Binary mode: pickle output is bytes, and the file is read back
        # with 'rb' in _loadDatabase().
        with open(blog.src_path + '/_search.db', 'wb') as f:
            f.write(pickle.dumps(hashtable))

    def _loadDatabase(self, blog):
        """Load the index of ``blog``.

        :returns: the index dict, or ``None`` when no index file exists.
        """
        filename = blog.src_path + '/_search.db'
        if not os.path.exists(filename):
            print('No search index !')
            return None
        # SECURITY: unpickling executes arbitrary code if the file has been
        # tampered with; _search.db must only ever be written by this class.
        with open(filename, 'rb') as f:
            return pickle.load(f)

    @staticmethod
    def _to_text(content):
        """Return ``content`` as text, decoding UTF-8 bytes when needed."""
        if isinstance(content, bytes):
            return content.decode('utf8')
        return content

    def _strip_accents(self, s):
        """Return the text ``s`` without its combining marks (accents)."""
        decomposed = unicodedata.normalize('NFD', s)
        return ''.join(
            c for c in decomposed if unicodedata.category(c) != 'Mn')

    def _remove_tag(self, content):
        """Reduce HTML ``content`` to words separated by single spaces.

        HTML entities, digits and double quotes are dropped; line breaks,
        punctuation and tags become word separators.

        :param content: text or UTF-8 encoded bytes.
        :returns: the cleaned text, without leading/trailing blanks.
        """
        content = self._to_text(content)
        content = self.htmlreg.sub('', content)
        content = self.numreg.sub('', content)
        # Line breaks separate words: deleting them would glue the last
        # word of a line to the first word of the next one.
        content = content.replace('\n', ' ').replace('\r', ' ')
        content = content.replace('"', '')
        for c in self.replace_by_space:
            content = content.replace(c, ' ')
        # Same reasoning for tags: "foo<br/>bar" is two words.
        content = self.tagreg.sub(' ', content)
        return self.pat.sub(' ', content).strip()

    def _prepare_string(self, content):
        """Return ``content`` (text or UTF-8 bytes) cleaned and unaccented."""
        return self._strip_accents(self._remove_tag(content))

    def _indexContent(self, hashtable, index, content, word_weight):
        """Add every word of ``content`` to ``hashtable`` for post ``index``.

        :param hashtable: index dict, modified in place.
        :param index: identifier of the post the content belongs to.
        :param content: text or UTF-8 encoded bytes to index.
        :param word_weight: weight added for each occurrence of a word.
        """
        for word in self._prepare_string(content).split():
            if len(word) < self.MINIMUM_LETTERS:
                continue
            entries = hashtable.setdefault(word.lower(), [])
            # One entry per post: accumulate the weight if the post is
            # already listed for this word, otherwise create its entry.
            for entry in entries:
                if entry[0] == index:
                    entry[1] += word_weight
                    break
            else:
                entries.append([index, word_weight])

    def _index_file(self, hashtable, filename, index):
        """Index the body (``filename``) and the title of post ``index``.

        Posts that do not exist in the database or are explicitly
        unpublished are skipped.
        """
        try:
            post = Post.objects.get(pk=index)
        except Post.DoesNotExist:
            return
        # Only an explicit False skips the post, as in the original test.
        if post.published == False:  # noqa: E712
            return

        with open(filename, 'rb') as f:
            content = f.read()
        self._indexContent(hashtable, index, content, 1)
        self._indexContent(hashtable, index, post.title, 5)

    def create_index(self, blog):
        """Rebuild the whole index of ``blog`` from ``<src_path>/_post``.

        :returns: the accumulated HTML report.
        """
        hashtable = {}
        root = blog.src_path + '/_post'
        if os.path.exists(root):
            for name in os.listdir(root):
                # Post files are named after the post id; skip anything
                # else (a name such as "12.bak" would make int() fail).
                if not name.isdigit():
                    continue
                self._index_file(hashtable, root + '/' + name, int(name))
        self._saveDatabase(blog, hashtable)
        self._addReport('Search generated @ ' + blog.src_path + '/_search.db')
        return self.report

    @staticmethod
    def _purge_post(hashtable, post):
        """Remove every entry of post ``post`` from ``hashtable`` in place."""
        index = int(post)
        # list(): words left without any entry are deleted while looping.
        for word in list(hashtable):
            kept = [entry for entry in hashtable[word] if entry[0] != index]
            if kept:
                hashtable[word] = kept
            else:
                del hashtable[word]

    def _index_post(self, blog, post, saveDatabase=True):
        """Add post ``post`` to the index of ``blog``.

        When no index exists yet, the whole index is created instead and
        the report of create_index() is returned.
        """
        hashtable = self._loadDatabase(blog)
        filename = blog.src_path + '/_post/' + str(post)
        if hashtable is None:
            return self.create_index(blog)
        self._index_file(hashtable, filename, int(post))
        if saveDatabase:
            self._saveDatabase(blog, hashtable)

    def _remove_post(self, blog, post, saveDatabase=True):
        """Remove post ``post`` from the index of ``blog`` (no-op if none)."""
        hashtable = self._loadDatabase(blog)
        if hashtable is None:
            return
        self._purge_post(hashtable, post)
        if saveDatabase:
            self._saveDatabase(blog, hashtable)

    def index_post(self, blog, post):
        """Index a post and save the index."""
        return self._index_post(blog, post, True)

    def delete_post(self, blog, post):
        """Remove a post from the index and save the index."""
        return self._remove_post(blog, post, True)

    def edit_post(self, blog, post, saveDatabase=True):
        """Re-index post ``post`` after it has been modified.

        The stale entries are removed and the post is indexed again on the
        same in-memory table, so the removal is no longer lost by reloading
        the index from disk between the two steps.

        :param saveDatabase: write the updated index back to disk.
        """
        hashtable = self._loadDatabase(blog)
        if hashtable is None:
            self.create_index(blog)
            return
        self._purge_post(hashtable, post)
        filename = blog.src_path + '/_post/' + str(post)
        self._index_file(hashtable, filename, int(post))
        if saveDatabase:
            self._saveDatabase(blog, hashtable)

    def search(self, blog, string):
        """Return the ids of the posts matching ``string``, best first.

        Every searched word of at least MINIMUM_LETTERS letters matches the
        indexed words that contain it; the weights of all matches are
        summed per post.  The relative order of posts with equal scores is
        unspecified.

        :returns: list of post ids, empty when there is no index or no
            match.
        """
        hashtable = self._loadDatabase(blog)
        if not hashtable:
            return []

        scores = {}
        for word in self._prepare_string(string).split():
            if len(word) < self.MINIMUM_LETTERS:
                continue
            word = word.lower()
            for key, entries in hashtable.items():
                # Plain substring test: same matches as '.*word.*' without
                # building a regex from user input.
                if word not in key:
                    continue
                for entry in entries:
                    scores[entry[0]] = scores.get(entry[0], 0) + entry[1]

        return sorted(scores, key=scores.get, reverse=True)