From 839b935d4783e2c76cc95283b7d977f96f18ee4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gr=C3=A9gory=20Soutad=C3=A9?= Date: Mon, 10 Dec 2012 20:50:27 +0100 Subject: [PATCH] Do a more pertinent search by including titles in indexing --- search.py | 37 +++++++++++++++++++++----------- sites/blog.soutade.fr/about.html | 13 +++++++---- 2 files changed, 34 insertions(+), 16 deletions(-) diff --git a/search.py b/search.py index 503a27f..cdad7c6 100644 --- a/search.py +++ b/search.py @@ -5,6 +5,7 @@ import os import operator import pickle from django.db import models +from dynastie.models import Post class Search: MINIMUM_LETTERS = 3 @@ -16,7 +17,6 @@ class Search: self.htmlreg = re.compile('&[^;]+;') self.numreg = re.compile('[0-9]+') self.pat = re.compile(r'\s+') - self.wordreg = re.compile('\w+') self.replace_by_space = ('(', ')', '#', '\'', '{', '}', '[', ']', '-', '|', '\t', '\\', '_', '^' '=', '+', '$', @@ -97,14 +97,10 @@ class Search: return content - def _index_file(self, hashtable, filename, index): - f = open(filename, 'r') - content = f.read() - f.close() - + def _indexContent(self, hashtable, index, content, word_weight): content = self._prepare_string(content) - wordlist = re.findall(self.wordreg, content) + wordlist = content.split(' ') for word in wordlist: if len(word) < self.MINIMUM_LETTERS: @@ -113,8 +109,25 @@ class Search: if not word in hashtable: hashtable[word] = [] if not index in hashtable[word]: - hashtable[word].append(index) + hashtable[word].append([index, word_weight]) + else: + weight = hashtable[word][1] + hashtable[word][1] = weight + word_weight + def _index_file(self, hashtable, filename, index): + f = open(filename, 'r') + content = f.read() + f.close() + + try: + post = Post.objects.get(pk=index) + if post.published == False: return + except: + return + + self._indexContent(hashtable, index, content, 1) + self._indexContent(hashtable, index, post.title.encode('utf-8'), 5) + def create_index(self, blog): hashtable = {} @@ -172,7 +185,7 @@ class Search: string = self._prepare_string(string.encode('utf-8')) - wordlist = re.findall(self.wordreg, string) + wordlist = string.split(' ') res = {} for word in wordlist: @@ -184,9 +197,9 @@ class Search: if word not in hashtable: continue for post in hashtable[word]: - if not post in res: - res[post] = 0 - res[post] = res[post] + 1 + if not post[0] in res: + res[post[0]] = post[1] + res[post[0]] += post[1] sorted_res = sorted(res.iteritems(), key=operator.itemgetter(1)) sorted_res.reverse() diff --git a/sites/blog.soutade.fr/about.html b/sites/blog.soutade.fr/about.html index 03223cf..fcc8506 100755 --- a/sites/blog.soutade.fr/about.html +++ b/sites/blog.soutade.fr/about.html @@ -43,15 +43,20 @@