Добавлена пост-обработка часто встречающихся слов
This commit is contained in:
parent
5d9be67b00
commit
ce934b9f1b
@@ -1,13 +1,14 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
import logging
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
import Levenshtein
|
import Levenshtein
|
||||||
import sphinxapi
|
import sphinxapi
|
||||||
|
import time
|
||||||
|
|
||||||
from aore.config import sphinx_conf
|
from aore.config import sphinx_conf
|
||||||
from aore.fias.wordentry import WordEntry
|
from aore.fias.wordentry import WordEntry
|
||||||
from aore.miscutils.trigram import trigram
|
from aore.miscutils.trigram import trigram
|
||||||
|
from aore.config import basic
|
||||||
|
|
||||||
|
|
||||||
class SphinxSearch:
|
class SphinxSearch:
|
||||||
@@ -24,6 +25,8 @@ class SphinxSearch:
|
|||||||
regression_coef = 0.08
|
regression_coef = 0.08
|
||||||
max_result = 10
|
max_result = 10
|
||||||
|
|
||||||
|
exclude_freq_words = True
|
||||||
|
|
||||||
def __init__(self, db):
|
def __init__(self, db):
|
||||||
self.db = db
|
self.db = db
|
||||||
self.client_sugg = sphinxapi.SphinxClient()
|
self.client_sugg = sphinxapi.SphinxClient()
|
||||||
@@ -96,15 +99,20 @@ class SphinxSearch:
|
|||||||
if word_entry.MT_ADD_SOCR:
|
if word_entry.MT_ADD_SOCR:
|
||||||
word_entry.add_variation_socr()
|
word_entry.add_variation_socr()
|
||||||
|
|
||||||
|
# Получает список объектов (слово), пропуская часто используемые слова
|
||||||
def __get_word_entries(self, words, strong):
    """Build the list of WordEntry objects for the words of a query.

    Empty strings in ``words`` are ignored.  When ``self.exclude_freq_words``
    is set and the entry reports ``is_freq_word``, the word is left out of
    the result altogether, so the returned list may be empty.

    :param words: iterable of word strings (may contain empty strings).
    :param strong: forwarded unchanged to ``__add_word_variations``.
    :return: list of WordEntry objects with their variations filled in.
    :raises AssertionError: if an entry ends up with an empty variation
        string after its variations were added.
    """
    we_list = []
    for word in words:
        if word == '':
            continue

        we = WordEntry(self.db, word)

        # Frequently used words are skipped entirely: no variations are
        # built for them and they do not take part in the query.
        if self.exclude_freq_words and we.is_freq_word:
            continue

        self.__add_word_variations(we, strong)

        # Raised explicitly (instead of a bare ``assert``) so the check is
        # still performed when Python runs with -O; the exception type and
        # message are the same as before.
        if we.get_variations() == "":
            raise AssertionError("Cannot process sentence.")
        we_list.append(we)

    return we_list
|
||||||
|
|
||||||
def find(self, text, strong):
|
def find(self, text, strong):
|
||||||
@@ -112,15 +120,29 @@ class SphinxSearch:
|
|||||||
phrase = unicode(phrase).replace('-', '').replace('@', '').lower()
|
phrase = unicode(phrase).replace('-', '').replace('@', '').lower()
|
||||||
return re.split(r"[ ,:.#$]+", phrase)
|
return re.split(r"[ ,:.#$]+", phrase)
|
||||||
|
|
||||||
|
# сплитим текст на слова
|
||||||
words = split_phrase(text)
|
words = split_phrase(text)
|
||||||
|
|
||||||
|
# получаем список объектов
|
||||||
word_entries = self.__get_word_entries(words, strong)
|
word_entries = self.__get_word_entries(words, strong)
|
||||||
word_count = len(word_entries)
|
word_count = len(word_entries)
|
||||||
|
|
||||||
|
# проверяем, есть ли вообще что-либо в списке объектов слов (или же все убрали как частое)
|
||||||
|
assert word_count > 0, "No legal words is specified"
|
||||||
|
|
||||||
|
# формируем строки для поиска в Сфинксе
|
||||||
for x in range(word_count, max(0, word_count - 3), -1):
|
for x in range(word_count, max(0, word_count - 3), -1):
|
||||||
self.client_show.AddQuery("\"{}\"/{}".format(" ".join(x.get_variations() for x in word_entries), x),
|
self.client_show.AddQuery("\"{}\"/{}".format(" ".join(x.get_variations() for x in word_entries), x),
|
||||||
sphinx_conf.index_addjobj)
|
sphinx_conf.index_addjobj)
|
||||||
|
|
||||||
self.__configure(sphinx_conf.index_addjobj)
|
self.__configure(sphinx_conf.index_addjobj)
|
||||||
|
|
||||||
|
start_t = time.time()
|
||||||
rs = self.client_show.RunQueries()
|
rs = self.client_show.RunQueries()
|
||||||
|
elapsed_t = time.time() - start_t
|
||||||
|
|
||||||
|
if basic.logging:
|
||||||
|
print(elapsed_t)
|
||||||
|
|
||||||
results = []
|
results = []
|
||||||
parsed_ids = []
|
parsed_ids = []
|
||||||
@@ -132,7 +154,7 @@ class SphinxSearch:
|
|||||||
if not ma['attrs']['aoid'] in parsed_ids:
|
if not ma['attrs']['aoid'] in parsed_ids:
|
||||||
parsed_ids.append(ma['attrs']['aoid'])
|
parsed_ids.append(ma['attrs']['aoid'])
|
||||||
results.append(
|
results.append(
|
||||||
dict(aoid=ma['attrs']['aoid'], text=ma['attrs']['fullname'], ratio=ma['weight'], cort=i))
|
dict(aoid=ma['attrs']['aoid'], text=unicode(ma['attrs']['fullname']), ratio=ma['weight'], cort=i))
|
||||||
|
|
||||||
results.sort(key=lambda x: Levenshtein.ratio(text, x['text']), reverse=True)
|
results.sort(key=lambda x: Levenshtein.ratio(text, x['text']), reverse=True)
|
||||||
|
|
||||||
|
@@ -112,3 +112,10 @@ class WordEntry:
|
|||||||
|
|
||||||
def get_type(self):
    """Return the names of the active match types as one string.

    Walks ``self.match_types`` and keeps every name whose same-named
    entry in the instance ``__dict__`` is truthy.

    :return: the selected names joined with ``", "`` (empty string when
        none is active).
    :raises KeyError: if a name from ``match_types`` is not present in
        the instance ``__dict__``.
    """
    return ", ".join([x for x in self.match_types if self.__dict__[x]])
|
||||||
|
|
||||||
|
def __unicode__(self):
    """Return the stored word (``self.word``) unchanged."""
    return self.word
|
||||||
|
|
||||||
|
def __str__(self):
    """Return the stored word as a native ``str``.

    Under Python 2 ``str()`` on a non-ASCII ``unicode`` value raises
    UnicodeEncodeError, so such values are encoded to UTF-8 instead.
    NOTE(review): assumes ``self.word`` is normally a text string, as
    suggested by ``__unicode__`` returning it directly; confirm against
    the constructor.
    """
    word = self.word
    if isinstance(word, str):
        # Already the native string type; nothing to convert.
        return word
    try:
        # Python 2 ``unicode``: encode explicitly rather than relying on
        # the implicit ASCII codec.
        return word.encode('utf-8')
    except AttributeError:
        # Not a string at all; keep the previous generic conversion.
        return str(word)
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user