2012-11-25 20:39:19 +01:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
import re
|
|
|
|
import unicodedata
|
|
|
|
import os
|
|
|
|
import operator
|
|
|
|
import pickle
|
|
|
|
from django.db import models
|
2012-12-10 20:50:27 +01:00
|
|
|
from dynastie.models import Post
|
2012-11-25 20:39:19 +01:00
|
|
|
|
|
|
|
class Search(object):
    """Word index over the posts of a blog, persisted with pickle.

    The index (called ``hashtable`` throughout) is a dict mapping a
    lower-cased, accent-stripped word to a list of ``[post_id, weight]``
    entries.  It is stored in ``<blog.src_path>/_search.db`` and rebuilt
    from the files found in ``<blog.src_path>/_post``.
    """

    # Words shorter than this are neither indexed nor searched.
    MINIMUM_LETTERS = 3

    def __init__(self):
        """Prepare the report buffer and the text clean-up tables."""
        self.report = ''

        self.tagreg = re.compile(r'<[^>]+>')
        self.htmlreg = re.compile(r'&[^;]+;')
        self.numreg = re.compile(r'[0-9]+')
        self.pat = re.compile(r'\s+')

        # Characters treated as word separators.  '^' and '=' are two
        # separate entries: a missing comma used to merge them into '^='.
        self.replace_by_space = (
            u'(', u')', u'#', u'\'', u'{', u'}', u'[', u']',
            u'-', u'|', u'\t', u'\\', u'_', u'^', u'=', u'+', u'$',
            u'£', u'%', u'µ', u'*', u',', u'?', u';', u'.', u'/',
            u':', u'!', u'§', u'€', u'²',
        )

        # Single-pass translation table used by _remove_tag.
        table = dict((ord(c), u' ') for c in self.replace_by_space)
        # Line breaks separate words; deleting them would glue the last
        # word of a line to the first word of the next one.
        table[ord(u'\n')] = u' '
        table[ord(u'\r')] = u' '
        # Double quotes are dropped rather than replaced.
        table[ord(u'"')] = None
        self._translation = table

    # Imported from generator.py
    def _addReport(self, string, color=''):
        """Append one HTML line to ``self.report``.

        string -- message to append
        color  -- optional CSS color; when given the line is wrapped in a
                  ``<span>`` with that color
        """
        line = '<b>' + self.__class__.__name__ + '</b> : ' + string
        if color != '':
            line = '<span style="color:' + color + '">' + line + '</span>'
        self.report = self.report + line + '<br/>\n'

    def _addWarning(self, string):
        """Append a yellow line to the report."""
        self._addReport(string, 'yellow')

    def _addError(self, string):
        """Append a red line to the report."""
        self._addReport(string, 'red')

    @staticmethod
    def _database_path(blog):
        """Return the path of the pickled index of ``blog``."""
        return blog.src_path + '/_search.db'

    @staticmethod
    def _to_text(content):
        """Return ``content`` as text, decoding byte strings as UTF-8."""
        if isinstance(content, bytes):
            return content.decode('utf8')
        return content

    def _saveDatabase(self, blog, hashtable):
        """Pickle ``hashtable`` into the index file of ``blog``."""
        # Binary mode: pickle data is bytes and _loadDatabase reads 'rb'.
        with open(self._database_path(blog), 'wb') as f:
            pickle.dump(hashtable, f)

    def _loadDatabase(self, blog):
        """Return the index of ``blog``, or None when no index file exists."""
        filename = self._database_path(blog)

        if not os.path.exists(filename):
            print('No search index !')
            return None

        # NOTE: unpickling runs arbitrary code if the file was tampered
        # with; the index file must only ever be written by this class.
        with open(filename, 'rb') as f:
            return pickle.load(f)

    def _strip_accents(self, s):
        """Return the text ``s`` without combining marks (accents)."""
        decomposed = unicodedata.normalize('NFD', s)
        return u''.join(c for c in decomposed
                        if unicodedata.category(c) != 'Mn')

    def _remove_tag(self, content):
        """Return ``content`` reduced to space-separated words.

        HTML entities, digits and tags are removed, separator characters
        become spaces and runs of whitespace collapse to a single space.
        Byte strings are decoded as UTF-8 first; the result is text.
        """
        content = self._to_text(content)
        content = self.htmlreg.sub('', content)
        content = self.numreg.sub('', content)
        content = content.translate(self._translation)
        # Tags are stripped after the separators so that what is left of
        # their attributes disappears together with the tag.
        content = self.tagreg.sub('', content)
        return self.pat.sub(' ', content)

    def _prepare_string(self, content):
        """Return ``content`` cleaned by _remove_tag and without accents."""
        return self._strip_accents(self._remove_tag(content))

    def _indexContent(self, hashtable, index, content, word_weight):
        """Add every word of ``content`` to ``hashtable`` for post ``index``.

        Each occurrence of a word adds ``word_weight`` to the weight of the
        ``[index, weight]`` entry of that word; the entry is created (at the
        front of the list) the first time the post uses the word.
        """
        content = self._prepare_string(content)

        for word in content.split(' '):
            if len(word) < self.MINIMUM_LETTERS:
                continue
            word = word.lower()
            entries = hashtable.setdefault(word, [])
            for entry in entries:
                if entry[0] == index:
                    entry[1] += word_weight
                    break
            else:
                entries.insert(0, [index, word_weight])

    def _index_file(self, hashtable, filename, index):
        """Index the body (weight 1) and the title (weight 5) of a post.

        Nothing is indexed when the post cannot be fetched or is not
        published.
        """
        try:
            post = Post.objects.get(pk=index)
        except Exception:
            # Best effort: a post that cannot be fetched is skipped.
            return
        if not post.published:
            return

        # Read raw bytes; _indexContent decodes them as UTF-8.
        with open(filename, 'rb') as f:
            content = f.read()

        self._indexContent(hashtable, index, content, 1)
        self._indexContent(hashtable, index, post.title, 5)

    def create_index(self, blog):
        """Rebuild and save the whole index of ``blog``; return the report."""
        hashtable = {}

        root = blog.src_path + '/_post'

        if os.path.exists(root):
            for post in os.listdir(root):
                # Not a post number
                if not post.isdigit():
                    continue
                self._index_file(hashtable, root + '/' + post, int(post))

        self._saveDatabase(blog, hashtable)

        self._addReport('Search generated @ ' + blog.src_path + '/_search.db')

        return self.report

    @staticmethod
    def _remove_from_table(hashtable, post_id):
        """Drop every entry of ``post_id``; words left empty are deleted."""
        # list() because keys are deleted while walking the table.
        for word, entries in list(hashtable.items()):
            kept = [entry for entry in entries if entry[0] != post_id]
            if kept:
                entries[:] = kept
            else:
                del hashtable[word]

    def _index_post(self, blog, post, saveDatabase=True):
        """Add post number ``post`` to the index of ``blog``.

        When no index exists yet the whole index is rebuilt instead and
        the report of create_index is returned.
        """
        hashtable = self._loadDatabase(blog)

        filename = blog.src_path + '/_post/' + str(post)

        if hashtable is None:
            return self.create_index(blog)

        self._index_file(hashtable, filename, int(post))

        if saveDatabase:
            self._saveDatabase(blog, hashtable)

    def _remove_post(self, blog, post, saveDatabase=True):
        """Remove post number ``post`` from the index of ``blog``."""
        hashtable = self._loadDatabase(blog)

        if hashtable is None:
            return

        self._remove_from_table(hashtable, int(post))

        if saveDatabase:
            self._saveDatabase(blog, hashtable)

    def index_post(self, blog, post):
        """Index a new post and save the index."""
        return self._index_post(blog, post, True)

    def delete_post(self, blog, post):
        """Remove a post from the index and save the index."""
        return self._remove_post(blog, post, True)

    def edit_post(self, blog, post, saveDatabase=True):
        """Re-index post number ``post`` after its content changed.

        The old entries are removed and the new ones added on the same
        in-memory table, which is then saved when ``saveDatabase`` is
        true.  When no index exists yet the whole index is rebuilt.
        """
        hashtable = self._loadDatabase(blog)

        if hashtable is None:
            self.create_index(blog)
            return

        post_id = int(post)
        self._remove_from_table(hashtable, post_id)
        self._index_file(hashtable, blog.src_path + '/_post/' + str(post),
                         post_id)

        if saveDatabase:
            self._saveDatabase(blog, hashtable)

    def search(self, blog, string):
        """Return the ids of the posts matching ``string``, best first.

        Every query word of at least MINIMUM_LETTERS letters matches the
        indexed words that contain it; the weights of all matches are
        summed per post.  Posts are ordered by decreasing weight, ties by
        decreasing id.  An empty list is returned when there is no index.
        """
        hashtable = self._loadDatabase(blog)
        if hashtable is None:
            return []

        string = self._prepare_string(string)

        res = {}
        for word in string.split(' '):
            if len(word) < Search.MINIMUM_LETTERS:
                continue
            word = word.lower()
            for key, entries in hashtable.items():
                # Plain substring test: same matches as '.*word.*'
                # without building a regex from user input.
                if word not in key:
                    continue
                for post_id, weight in entries:
                    res[post_id] = res.get(post_id, 0) + weight

        ranked = sorted(res.items(), key=operator.itemgetter(1, 0),
                        reverse=True)
        return [post_id for post_id, _ in ranked]
|