Added post-processing of frequently occurring words

Jack Stdin 2016-02-18 13:18:38 +03:00
parent 5d9be67b00
commit ce934b9f1b
2 changed files with 34 additions and 5 deletions
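
In substance, the commit makes SphinxSearch skip words that occur too often in the corpus: such words no longer get search variations and are left out of the word-entry list, gated by the new exclude_freq_words flag. A minimal standalone sketch of that idea (the frequency table and cutoff below are hypothetical stand-ins, not the project's actual data source):

# Illustrative only: drop words whose corpus frequency exceeds a cutoff.
# FREQ and FREQ_CUTOFF are made-up stand-ins for the statistics that the
# real WordEntry objects read from the database.
FREQ = {"ul": 120000, "lenina": 800, "10": 95000}
FREQ_CUTOFF = 50000

def drop_frequent(words, exclude_freq_words=True):
    if not exclude_freq_words:
        return list(words)
    return [w for w in words if FREQ.get(w, 0) < FREQ_CUTOFF]

print(drop_frequent(["ul", "lenina", "10"]))  # -> ['lenina']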


@@ -1,13 +1,14 @@
# -*- coding: utf-8 -*-
import logging
import re
import Levenshtein
import sphinxapi
import time
from aore.config import sphinx_conf
from aore.fias.wordentry import WordEntry
from aore.miscutils.trigram import trigram
from aore.config import basic
class SphinxSearch:
@@ -24,6 +25,8 @@ class SphinxSearch:
    regression_coef = 0.08
    max_result = 10
    exclude_freq_words = True

    def __init__(self, db):
        self.db = db
        self.client_sugg = sphinxapi.SphinxClient()
@@ -96,15 +99,20 @@ class SphinxSearch:
        if word_entry.MT_ADD_SOCR:
            word_entry.add_variation_socr()

    # Builds the list of word-entry objects, skipping frequently used words
    def __get_word_entries(self, words, strong):
        we_list = []
        for word in words:
            if word != '':
                we = WordEntry(self.db, word)
                if self.exclude_freq_words and we.is_freq_word:
                    pass
                else:
                    self.__add_word_variations(we, strong)
                    assert we.get_variations() != "", "Cannot process sentence."
                    we_list.append(we)
        return we_list

    def find(self, text, strong):
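
The new branch relies on WordEntry.is_freq_word, which is not shown in this commit. A purely hypothetical sketch of how such a flag could be computed, assuming a per-word document-frequency lookup and a fixed share threshold (names and numbers are assumptions, not the project's implementation):

# Hypothetical: a word counts as "frequent" when it occurs in more than
# MAX_DOC_SHARE of all indexed address objects.
MAX_DOC_SHARE = 0.3

def is_freq_word(doc_freq, total_docs):
    # doc_freq: documents containing the word (assumed to come from a
    # frequency table in the DB); total_docs: corpus size.
    return total_docs > 0 and float(doc_freq) / total_docs > MAX_DOC_SHARE

print(is_freq_word(doc_freq=900000, total_docs=1200000))  # True  -> skip the word
print(is_freq_word(doc_freq=40000, total_docs=1200000))   # False -> keep the word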
@@ -112,15 +120,29 @@ class SphinxSearch:
            phrase = unicode(phrase).replace('-', '').replace('@', '').lower()
            return re.split(r"[ ,:.#$]+", phrase)

        # split the text into words
        words = split_phrase(text)
        # get the list of word-entry objects
        word_entries = self.__get_word_entries(words, strong)
        word_count = len(word_entries)

        # check that anything at all is left in the word-entry list
        # (every word may have been dropped as too frequent)
        assert word_count > 0, "No legal words is specified"

        # build the query strings for Sphinx
        for x in range(word_count, max(0, word_count - 3), -1):
            self.client_show.AddQuery("\"{}\"/{}".format(" ".join(x.get_variations() for x in word_entries), x),
                                      sphinx_conf.index_addjobj)
        self.__configure(sphinx_conf.index_addjobj)

        start_t = time.time()
        rs = self.client_show.RunQueries()
        elapsed_t = time.time() - start_t
        if basic.logging:
            print(elapsed_t)

        results = []
        parsed_ids = []
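
The loop above uses the Sphinx extended-syntax quorum operator: a quoted word list followed by /N matches documents containing at least N of the listed words, so up to three queries are issued with the threshold relaxed step by step. A short sketch of the strings it produces for a three-word phrase (variations simplified to bare words; the real get_variations() output is richer):

# Reproduce the query strings built in find() for word_count == 3.
variations = ["moskva", "tverskaya", "6"]  # stand-ins for get_variations() output
word_count = len(variations)

for x in range(word_count, max(0, word_count - 3), -1):
    print("\"{}\"/{}".format(" ".join(variations), x))
# "moskva tverskaya 6"/3
# "moskva tverskaya 6"/2
# "moskva tverskaya 6"/1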
@@ -132,7 +154,7 @@ class SphinxSearch:
                if not ma['attrs']['aoid'] in parsed_ids:
                    parsed_ids.append(ma['attrs']['aoid'])
                    results.append(
                        dict(aoid=ma['attrs']['aoid'], text=unicode(ma['attrs']['fullname']), ratio=ma['weight'], cort=i))

        results.sort(key=lambda x: Levenshtein.ratio(text, x['text']), reverse=True)

aore/fias/wordentry.py

@@ -112,3 +112,10 @@ class WordEntry:
    def get_type(self):
        return ", ".join([x for x in self.match_types if self.__dict__[x]])

    def __unicode__(self):
        return self.word

    def __str__(self):
        return str(self.word)
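
Under Python 2 (which this code targets, judging by the unicode() calls), unicode(obj) dispatches to __unicode__ and str(obj) to __str__, which is why both methods are added. A tiny stand-alone illustration with a hypothetical Word class (note that str() would raise UnicodeEncodeError if the word held non-ASCII text):

# -*- coding: utf-8 -*-
# Python 2 sketch of the protocol used by the two methods above.
class Word(object):
    def __init__(self, word):
        self.word = word

    def __unicode__(self):
        return self.word

    def __str__(self):
        return str(self.word)

w = Word(u"lenina")
print(unicode(w))  # goes through __unicode__ -> lenina
print(str(w))      # goes through __str__; would fail for non-ASCII words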