Добавлена пост-обработка часто встречающихся слов

2016-02-18 13:18:38 +03:00
parent 5d9be67b00
commit ce934b9f1b
2 changed files with 34 additions and 5 deletions
--- a/aore/fias/search.py
+++ b/aore/fias/search.py
@@ -1,13 +1,14 @@
 # -*- coding: utf-8 -*-
 import logging
 import re
 import Levenshtein
 import sphinxapi
 import time
 from aore.config import sphinx_conf
 from aore.fias.wordentry import WordEntry
 from aore.miscutils.trigram import trigram
 from aore.config import basic
 class SphinxSearch:
@@ -24,6 +25,8 @@ class SphinxSearch:
    regression_coef = 0.08
    max_result = 10
    exclude_freq_words = True
    def __init__(self, db):
        self.db = db
        self.client_sugg = sphinxapi.SphinxClient()
@@ -96,15 +99,20 @@ class SphinxSearch:
        if word_entry.MT_ADD_SOCR:
            word_entry.add_variation_socr()
    # Получает список объектов (слово), пропуская часто используемые слова
    def __get_word_entries(self, words, strong):
        we_list = []
        for word in words:
            if word != '':
                we = WordEntry(self.db, word)
-                self.__add_word_variations(we, strong)
+                if self.exclude_freq_words and we.is_freq_word:
                    pass
                else:
                    self.__add_word_variations(we, strong)
                    assert we.get_variations() != "", "Cannot process sentence."
                    we_list.append(we)
                assert we.get_variations() != "", "Cannot process sentence."
                we_list.append(we)
        return we_list
    def find(self, text, strong):
@@ -112,15 +120,29 @@ class SphinxSearch:
            phrase = unicode(phrase).replace('-', '').replace('@', '').lower()
            return re.split(r"[ ,:.#$]+", phrase)
        # сплитим текст на слова
        words = split_phrase(text)
        # получаем список объектов
        word_entries = self.__get_word_entries(words, strong)
        word_count = len(word_entries)
        # проверяем, есть ли вообще что-либо в списке объектов слов (или же все убрали как частое)
        assert word_count > 0, "No legal words is specified"
        # формируем строки для поиска в Сфинксе
        for x in range(word_count, max(0, word_count - 3), -1):
            self.client_show.AddQuery("\"{}\"/{}".format(" ".join(x.get_variations() for x in word_entries), x),
                                      sphinx_conf.index_addjobj)
        self.__configure(sphinx_conf.index_addjobj)
        start_t = time.time()
        rs = self.client_show.RunQueries()
        elapsed_t = time.time() - start_t
        if basic.logging:
            print(elapsed_t)
        results = []
        parsed_ids = []
@@ -132,7 +154,7 @@ class SphinxSearch:
                if not ma['attrs']['aoid'] in parsed_ids:
                    parsed_ids.append(ma['attrs']['aoid'])
                    results.append(
-                        dict(aoid=ma['attrs']['aoid'], text=ma['attrs']['fullname'], ratio=ma['weight'], cort=i))
+                        dict(aoid=ma['attrs']['aoid'], text=unicode(ma['attrs']['fullname']), ratio=ma['weight'], cort=i))
        results.sort(key=lambda x: Levenshtein.ratio(text, x['text']), reverse=True)
--- a/aore/fias/wordentry.py
+++ b/aore/fias/wordentry.py
@@ -112,3 +112,10 @@ class WordEntry:
    def get_type(self):
        return ", ".join([x for x in self.match_types if self.__dict__[x]])
    def __unicode__(self):
        return self.word
    def __str__(self):
        return str(self.word)