From b7c4cf4e2fb102ccc3eae0577bb9ea365934a83b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gr=C3=A9gory=20Soutad=C3=A9?= Date: Mon, 24 Dec 2012 19:18:19 +0100 Subject: [PATCH] Speed up search --- search.py | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/search.py b/search.py index b551203..ec74e5d 100644 --- a/search.py +++ b/search.py @@ -23,6 +23,7 @@ class Search: '£', '%', 'µ', '*', ',', '?', ';', '.', '/', ':', '!', '§', '€', '²') + # Imported from generator.py def _addReport(self, string, color=''): if color != '': self.report = self.report + '' @@ -59,24 +60,12 @@ class Search: return hashtable - def _remove_reg(self, content, reg): - found = re.search(reg, content) - while found != None: - #print str(found.start()) + ' ' + str(found.end()) - # print content[found.start(0):found.end(0)] - # print "=============================================" - content = content[:found.start(0)].lstrip() + ' ' + content[found.end(0):] - - found = re.search(reg, content) - - return content - def _strip_accents(self, s): return ''.join((c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')) def _remove_tag(self, content): - content = self._remove_reg(content, self.htmlreg) - content = self._remove_reg(content, self.numreg) + content = self.htmlreg.sub('', content) + content = self.numreg.sub('', content) content = content.replace('\n', '') content = content.replace('\r', '') @@ -85,7 +74,7 @@ class Search: for c in self.replace_by_space: content = content.replace(c, ' ') - content = self._remove_reg(content, self.tagreg) + content = self.tagreg.sub('', content) content = self.pat.sub(' ', content) @@ -115,16 +104,16 @@ class Search: hashtable[word][1] = weight + word_weight def _index_file(self, hashtable, filename, index): - f = open(filename, 'r') - content = f.read() - f.close() - try: post = Post.objects.get(pk=index) if post.published == False: return except: return + f = open(filename, 'r') + content = f.read() + f.close() + self._indexContent(hashtable, index, content, 1) self._indexContent(hashtable, index, post.title.encode('utf-8'), 5) @@ -164,8 +153,10 @@ class Search: if hashtable is None: return for k, v in hashtable.items(): - if post in v: - v.remove(post) + # For tuples in values + for t in v: + if post == v[0]: + v.remove(t) if saveDatabase: self._saveDatabase(blog, hashtable)