From ce934b9f1bc597ba53c0b82b67cd3bcb3d14a5ac Mon Sep 17 00:00:00 2001 From: Jack Stdin Date: Thu, 18 Feb 2016 13:18:38 +0300 Subject: [PATCH] =?UTF-8?q?=D0=94=D0=BE=D0=B1=D0=B0=D0=B2=D0=BB=D0=B5?= =?UTF-8?q?=D0=BD=D0=B0=20=D0=BF=D0=BE=D1=81=D1=82-=D0=BE=D0=B1=D1=80?= =?UTF-8?q?=D0=B0=D0=B1=D0=BE=D1=82=D0=BA=D0=B0=20=D1=87=D0=B0=D1=81=D1=82?= =?UTF-8?q?=D0=BE=20=D0=B2=D1=81=D1=82=D1=80=D0=B5=D1=87=D0=B0=D1=8E=D1=89?= =?UTF-8?q?=D0=B8=D1=85=D1=81=D1=8F=20=D1=81=D0=BB=D0=BE=D0=B2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- aore/fias/search.py | 32 +++++++++++++++++++++++++++----- aore/fias/wordentry.py | 7 +++++++ 2 files changed, 34 insertions(+), 5 deletions(-) diff --git a/aore/fias/search.py b/aore/fias/search.py index 8b80d39..b771b39 100644 --- a/aore/fias/search.py +++ b/aore/fias/search.py @@ -1,13 +1,14 @@ # -*- coding: utf-8 -*- -import logging import re import Levenshtein import sphinxapi +import time from aore.config import sphinx_conf from aore.fias.wordentry import WordEntry from aore.miscutils.trigram import trigram +from aore.config import basic class SphinxSearch: @@ -24,6 +25,8 @@ class SphinxSearch: regression_coef = 0.08 max_result = 10 + exclude_freq_words = True + def __init__(self, db): self.db = db self.client_sugg = sphinxapi.SphinxClient() @@ -96,15 +99,20 @@ class SphinxSearch: if word_entry.MT_ADD_SOCR: word_entry.add_variation_socr() + # Получает список объектов (слово), пропуская часто используемые слова def __get_word_entries(self, words, strong): we_list = [] for word in words: if word != '': we = WordEntry(self.db, word) - self.__add_word_variations(we, strong) + if self.exclude_freq_words and we.is_freq_word: + pass + else: + self.__add_word_variations(we, strong) + + assert we.get_variations() != "", "Cannot process sentence." + we_list.append(we) - assert we.get_variations() != "", "Cannot process sentence." - we_list.append(we) return we_list def find(self, text, strong): @@ -112,15 +120,29 @@ class SphinxSearch: phrase = unicode(phrase).replace('-', '').replace('@', '').lower() return re.split(r"[ ,:.#$]+", phrase) + # сплитим текст на слова words = split_phrase(text) + + # получаем список объектов word_entries = self.__get_word_entries(words, strong) word_count = len(word_entries) + + # проверяем, есть ли вообще что-либо в списке объектов слов (или же все убрали как частое) + assert word_count > 0, "No legal words is specified" + + # формируем строки для поиска в Сфинксе for x in range(word_count, max(0, word_count - 3), -1): self.client_show.AddQuery("\"{}\"/{}".format(" ".join(x.get_variations() for x in word_entries), x), sphinx_conf.index_addjobj) self.__configure(sphinx_conf.index_addjobj) + + start_t = time.time() rs = self.client_show.RunQueries() + elapsed_t = time.time() - start_t + + if basic.logging: + print(elapsed_t) results = [] parsed_ids = [] @@ -132,7 +154,7 @@ class SphinxSearch: if not ma['attrs']['aoid'] in parsed_ids: parsed_ids.append(ma['attrs']['aoid']) results.append( - dict(aoid=ma['attrs']['aoid'], text=ma['attrs']['fullname'], ratio=ma['weight'], cort=i)) + dict(aoid=ma['attrs']['aoid'], text=unicode(ma['attrs']['fullname']), ratio=ma['weight'], cort=i)) results.sort(key=lambda x: Levenshtein.ratio(text, x['text']), reverse=True) diff --git a/aore/fias/wordentry.py b/aore/fias/wordentry.py index 03b0b63..2e34047 100644 --- a/aore/fias/wordentry.py +++ b/aore/fias/wordentry.py @@ -112,3 +112,10 @@ class WordEntry: def get_type(self): return ", ".join([x for x in self.match_types if self.__dict__[x]]) + + def __unicode__(self): + return self.word + + def __str__(self): + return str(self.word) +