# -*- coding: utf-8 -*-
# Extracted from patch: diff --git a/search.py b/search.py (new file mode 100644, index 0000000..0d77d66)
import re
import unicodedata
import os
import operator
import pickle

try:
    # Nothing in this module uses ``models``; the name is only kept so that
    # ``from dynastie.search import *`` keeps exporting it.  The guard lets the
    # indexer be imported (and tested) where Django is not installed.
    from django.db import models
except ImportError:
    models = None


class Search(object):
    """Tiny full-text index for blog posts.

    The index is a dict mapping a lower-cased, accent-free word to the list
    of integer post ids whose source file contains that word.  It is pickled
    to ``<blog.src_path>/_search.db``; post sources are read from
    ``<blog.src_path>/_post/<post id>``.

    The only thing required from ``blog`` is a ``src_path`` string attribute.
    """

    # Words shorter than this are neither indexed nor searched.
    MINIMUM_LETTERS = 3

    # A post source file is named after its numeric id, nothing else.
    _POST_NAME = re.compile(r'[0-9]+\Z')

    def __init__(self):
        self.report = ''

        self.tagreg = re.compile(r'<[^>]+>')
        # Only real character references (&amp; &#233; &#xE9;).  A looser
        # "&[^;]+;" would eat all text between a bare '&' and the next ';'.
        self.htmlreg = re.compile(
            r'&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);')
        self.numreg = re.compile(r'[0-9]+')
        self.pat = re.compile(r'\s+', re.UNICODE)
        self.wordreg = re.compile(r'\w+', re.UNICODE)

        # Punctuation treated as a word separator.  Note the comma between
        # '^' and '=': without it the two literals fuse into '^='.
        self.replace_by_space = (
            u'(', u')', u'#', u'\'', u'{', u'}', u'[', u']',
            u'-', u'|', u'\t', u'\\', u'_', u'^', u'=', u'+', u'$',
            u'£', u'%', u'µ', u'*', u',', u'?', u';', u'.', u'/',
            u':', u'!', u'§', u'€', u'²',
        )
        # One-pass translation table built from the tuple above.
        self._space_table = dict(
            (ord(char), u' ') for char in self.replace_by_space)

    # ------------------------------------------------------------------
    # Reporting
    # ------------------------------------------------------------------
    def _addReport(self, string, color=''):
        """Append ``"<ClassName> : <string>\\n"`` to ``self.report``.

        ``color`` is accepted for API compatibility.  In this copy of the
        source the colour wrappers are empty strings, so it does not change
        the output.  NOTE(review): markup may have been stripped from the
        original literals -- restore it here if so.
        """
        self.report += self.__class__.__name__ + ' : ' + string + '\n'

    def _addWarning(self, string):
        """Append a warning line to the report."""
        self._addReport(string, 'yellow')

    def _addError(self, string):
        """Append an error line to the report."""
        self._addReport(string, 'red')

    # ------------------------------------------------------------------
    # Persistence
    # ------------------------------------------------------------------
    def _saveDatabase(self, blog, hashtable):
        """Pickle ``hashtable`` to ``<blog.src_path>/_search.db``."""
        # Pickle data is binary: the file must be opened with 'b'.
        with open(blog.src_path + '/_search.db', 'wb') as f:
            pickle.dump(hashtable, f)

    def _loadDatabase(self, blog):
        """Return the pickled index for ``blog``, or None if none exists."""
        filename = blog.src_path + '/_search.db'

        if not os.path.exists(filename):
            return None

        # SECURITY: unpickling executes arbitrary code.  This is only safe
        # as long as _search.db can be written by this application alone.
        with open(filename, 'rb') as f:
            return pickle.load(f)

    # ------------------------------------------------------------------
    # Text normalisation
    # ------------------------------------------------------------------
    def _to_text(self, content):
        """Decode UTF-8 bytes to text; pass text through unchanged."""
        if isinstance(content, bytes):
            return content.decode('utf8')
        return content

    def _remove_reg(self, content, reg):
        """Replace every match of ``reg`` in ``content`` by one space.

        ``reg`` may be a compiled pattern or a pattern string.  This is a
        single linear pass.  Leading whitespace is not trimmed here; the
        caller in this class collapses whitespace afterwards.
        """
        return re.sub(reg, u' ', content)

    def _strip_accents(self, s):
        """Return text ``s`` with combining marks removed (``é`` -> ``e``)."""
        decomposed = unicodedata.normalize('NFD', s)
        return u''.join(
            c for c in decomposed if unicodedata.category(c) != 'Mn')

    def _remove_tag(self, content):
        """Strip entities, digits, quotes, punctuation and HTML tags.

        Returns text in which every run of whitespace (newlines included)
        is collapsed to a single space.
        """
        content = self._to_text(content)

        content = self._remove_reg(content, self.htmlreg)
        content = self._remove_reg(content, self.numreg)

        content = content.replace(u'"', u'')
        # '<' and '>' are not in the table, so tags survive until tagreg.
        content = content.translate(self._space_table)

        content = self._remove_reg(content, self.tagreg)

        # Newlines are whitespace too: they become a separator here instead
        # of being deleted, which used to glue words across lines.
        return self.pat.sub(u' ', content)

    def _prepare_string(self, content):
        """Normalise ``content`` (bytes or text) for word extraction."""
        content = self._remove_tag(self._to_text(content))
        return self._strip_accents(content)

    def _extract_words(self, content):
        """Return the lower-cased indexable words found in ``content``."""
        text = self._prepare_string(content)
        return [word.lower() for word in self.wordreg.findall(text)
                if len(word) >= self.MINIMUM_LETTERS]

    # ------------------------------------------------------------------
    # Index maintenance
    # ------------------------------------------------------------------
    def _index_file(self, hashtable, filename, index):
        """Add every word of file ``filename`` to ``hashtable`` under ``index``."""
        with open(filename, 'rb') as f:
            content = f.read()

        for word in set(self._extract_words(content)):
            postings = hashtable.setdefault(word, [])
            if index not in postings:
                postings.append(index)

    def _forget_post(self, hashtable, post):
        """Remove post id ``post`` from every posting list, in memory.

        ``post`` may be an int or a numeric string; ids are stored as ints.
        Words left without any post are dropped from the index.
        """
        index = int(post)
        for word in list(hashtable):
            postings = hashtable[word]
            if index in postings:
                postings.remove(index)
                if not postings:
                    del hashtable[word]

    def create_index(self, blog):
        """Rebuild the whole index from ``<blog.src_path>/_post``.

        Files whose name is not purely numeric are ignored.  Returns the
        accumulated report string.
        """
        hashtable = {}

        root = blog.src_path + '/_post'

        if os.path.isdir(root):
            for post in os.listdir(root):
                # Not a post number
                if not self._POST_NAME.match(post):
                    continue
                self._index_file(hashtable, root + '/' + post, int(post))

        self._saveDatabase(blog, hashtable)

        self._addReport('Search generated @ ' + blog.src_path + '/_search.db')

        return self.report

    def _index_post(self, blog, post, saveDatabase=True):
        """Index one post; build the full index first if none exists.

        ``post`` may be an int or a numeric string.  Returns the report when
        a full rebuild was needed, otherwise None.
        """
        hashtable = self._loadDatabase(blog)

        if hashtable is None:
            return self.create_index(blog)

        filename = blog.src_path + '/_post/' + str(post)
        self._index_file(hashtable, filename, int(post))

        if saveDatabase:
            self._saveDatabase(blog, hashtable)

    def _remove_post(self, blog, post, saveDatabase=True):
        """Drop one post from the stored index (no-op without an index)."""
        hashtable = self._loadDatabase(blog)

        if hashtable is None:
            return

        self._forget_post(hashtable, post)

        if saveDatabase:
            self._saveDatabase(blog, hashtable)

    def index_post(self, blog, post):
        """Index a newly created post and save the index."""
        return self._index_post(blog, post, True)

    def delete_post(self, blog, post):
        """Remove a post from the index and save the index."""
        return self._remove_post(blog, post, True)

    def edit_post(self, blog, post, saveDatabase=True):
        """Re-index a modified post.

        Removal and re-indexing are applied to the same in-memory table, so
        words that disappeared from the post really leave the index.
        """
        hashtable = self._loadDatabase(blog)

        if hashtable is None:
            # No index yet: a full build already covers this post.
            self.create_index(blog)
            return

        self._forget_post(hashtable, post)
        filename = blog.src_path + '/_post/' + str(post)
        self._index_file(hashtable, filename, int(post))

        if saveDatabase:
            self._saveDatabase(blog, hashtable)

    # ------------------------------------------------------------------
    # Querying
    # ------------------------------------------------------------------
    def search(self, blog, string):
        """Return ``[(post_id, score), ...]`` sorted by decreasing score.

        The score of a post is the number of query words it matches.  A
        query word that is not indexed is shortened one letter at a time,
        down to MINIMUM_LETTERS, until a known word is found.  Returns an
        empty list when nothing matches or no index exists.
        """
        hashtable = self._loadDatabase(blog)
        if not hashtable:
            return []

        scores = {}
        for word in self._extract_words(string):
            while word not in hashtable and len(word) > self.MINIMUM_LETTERS:
                word = word[:-1]
            for post in hashtable.get(word, ()):
                scores[post] = scores.get(post, 0) + 1

        return sorted(scores.items(), key=operator.itemgetter(1),
                      reverse=True)

# (the patch continues) diff --git a/templates/generate.html b/templates/generate.html index 9ef2591..ea6f2e6 100644 ---
a/templates/generate.html +++ b/templates/generate.html @@ -10,7 +10,7 @@ {% endif %}

-Add a post Generate blog
+Add a post Generate blog Generate search index

{% if report|length == 0 %} Any engine selected

{% else %} diff --git a/templates/view_blog.html b/templates/view_blog.html index 4fac7c0..3b98a8d 100644 --- a/templates/view_blog.html +++ b/templates/view_blog.html @@ -10,7 +10,7 @@ {% endif %}

-Add a post Generate blog +Add a post Generate blog Generate search index

{% if posts|length == 0 %}

diff --git a/urls.py b/urls.py index 7883b20..e2b26d6 100644 --- a/urls.py +++ b/urls.py @@ -31,9 +31,10 @@ urlpatterns = patterns('', url(r'^comment/add/(\d+)/(\d+)$', 'dynastie.views.add_comment', name='add_comment'), url(r'^comment/edit/(\d+)$', 'dynastie.views.edit_comment', name='edit_comment'), url(r'^comment/delete/(\d+)$','dynastie.views.delete_comment',name='delete_comment'), - url(r'^tag/(\d+)$', 'dynastie.views.tag', name='tag'), - url(r'^tag/edit/(\d+)$', 'dynastie.views.edit_tag', name='edit_tag'), - url(r'^tag/delete/(\d+)$', 'dynastie.views.delete_tag', name='delete_tag'), + url(r'^tag/(\d+)$', 'dynastie.views.tag', name='tag'), + url(r'^tag/edit/(\d+)$', 'dynastie.views.edit_tag', name='edit_tag'), + url(r'^tag/delete/(\d+)$', 'dynastie.views.delete_tag', name='delete_tag'), + url(r'^search/generate/(\d+)$', 'dynastie.views.generate_search',name='generate_search'), # url(r'^dynastie/', include('dynastie.foo.urls')), # Uncomment the admin/doc line below to enable admin documentation: diff --git a/views.py b/views.py index 96c766d..dbbefb0 100644 --- a/views.py +++ b/views.py @@ -12,6 +12,7 @@ from django.core.mail import EmailMultiAlternatives from dynastie.models import * from dynastie.forms import * +from dynastie.search import * from django.template.defaultfilters import register from django.template import Variable, VariableDoesNotExist @@ -399,7 +400,7 @@ def edit_blog(request, blog_id): @login_required def add_post(request, blog_id): - (b,p) = have_I_right(request, blog_id) + (b,_) = have_I_right(request, blog_id) if request.method == 'POST': # If the form has been submitted... if 'add' in request.POST: @@ -410,6 +411,8 @@ def add_post(request, blog_id): if form.is_valid(): # All validation rules pass form = form.save() form.createPost(content, request.POST['text_tags']) + s = Search() + s.index_post(b, form.id) # Process the data in form.cleaned_data # ... 
return HttpResponseRedirect('/blog/' + blog_id) # Redirect after POST @@ -437,6 +440,8 @@ def edit_post(request, post_id): post.remove() form.save() post.createPost(request.POST['content'], request.POST['text_tags']) + s = Search() + s.edit_post(b, post_id) # Process the data in form.cleaned_data # ... return HttpResponseRedirect('/blog/' + str(blog_id)) # Redirect after POST @@ -469,17 +474,16 @@ def edit_post(request, post_id): def delete_post(request, post_id): (b, post) = have_I_right(request, None, post_id) + s = Search() + s.edit_post(b, post_id) + post.delete() return HttpResponseRedirect('/blog/' + str(b.id)) -@login_required -def generate(request, blog_id): +def _generate(request, blog_id, report): b,_ = have_I_right(request, blog_id) - b.create_paths() - report = b.generate() - count = Post.objects.filter(blog=b).count() nb_pages = int(count/50) posts = Post.objects.filter(blog=b).order_by('-creation_date')[0:50] @@ -501,6 +505,27 @@ def generate(request, blog_id): return render(request, 'templates/generate.html', c) +@login_required +def generate(request, blog_id): + b,_ = have_I_right(request, blog_id) + + b.create_paths() + report = b.generate() + + return _generate(request, blog_id, report) + +@login_required +def generate_search(request, blog_id): + b,_ = have_I_right(request, blog_id) + + b.create_paths() + + s = Search() + + report = s.create_index(b) + + return _generate(request, blog_id, report) + @login_required def preview(request, blog_id): from dynastie.generators import post