# py-phias/aore/fias/search.py
# -*- coding: utf-8 -*-
import re
import time

import Levenshtein
import sphinxapi

from aore.config import basic, sphinx_conf
from aore.fias.wordentry import WordEntry
from aore.miscutils.trigram import trigram


class SphinxSearch:
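    """Fuzzy full-text address search backed by Sphinx.

    Expands each query word with trigram-based suggestions and wildcard
    variations, then runs quorum queries against the address index
    (sphinx_conf.index_addjobj).
    """
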
    # Search tuning parameters
    delta_len = 2
    rating_limit_soft = 0.41
    rating_limit_soft_count = 6
    rating_limit_hard = 0.82
    rating_limit_hard_count = 3
    default_rating_delta = 2
    regression_coef = 0.08
    max_result = 10
    exclude_freq_words = True

    def __init__(self, db):
        self.db = db
        # Client used for trigram suggestion queries.
        self.client_sugg = sphinxapi.SphinxClient()
        self.client_sugg.SetServer(sphinx_conf.host_name, sphinx_conf.port)
        self.client_sugg.SetLimits(0, self.max_result)
        self.client_sugg.SetConnectTimeout(3.0)
        # Client used for the main address search queries.
        self.client_show = sphinxapi.SphinxClient()
        self.client_show.SetServer(sphinx_conf.host_name, sphinx_conf.port)
        self.client_show.SetLimits(0, self.max_result)
        self.client_show.SetConnectTimeout(3.0)
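
    # Prepares the relevant Sphinx client: the suggestion client gets a
    # word-length range filter and a custom "krank" ranking expression,
    # the main client uses plain BM25 relevance ranking.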
    def __configure(self, index_name, wlen=None):
        self.client_sugg.ResetFilters()
        if index_name == sphinx_conf.index_sugg and wlen:
            self.client_sugg.SetRankingMode(sphinxapi.SPH_RANK_WORDCOUNT)
            self.client_sugg.SetFilterRange("len", int(wlen) - self.delta_len, int(wlen) + self.delta_len)
            self.client_sugg.SetSelect("word, len, @weight+{}-abs(len-{}) AS krank".format(self.delta_len, wlen))
            self.client_sugg.SetSortMode(sphinxapi.SPH_SORT_EXTENDED, "krank DESC")
        else:
            self.client_show.SetRankingMode(sphinxapi.SPH_RANK_BM25)
            self.client_show.SetSortMode(sphinxapi.SPH_SORT_RELEVANCE)
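
    # Queries the suggestion index with a trigram phrase and returns up to
    # `count` candidate words whose Jaro similarity to the original word is
    # at least `rating_limit`.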
    def __get_suggest(self, word, rating_limit, count):
        word_len = str(len(word) / 2)
        trigrammed_word = '"{}"/1'.format(trigram(word))
        self.__configure(sphinx_conf.index_sugg, word_len)
        result = self.client_sugg.Query(trigrammed_word, sphinx_conf.index_sugg)
        # If no suggestions were found for this word (can that even happen?),
        # return an empty list.
        if not result['matches']:
            return []
        maxrank = result['matches'][0]['attrs']['krank']
        maxleven = None
        outlist = list()
        for match in result['matches']:
            if len(outlist) >= count:
                break
            if maxrank - match['attrs']['krank'] < self.default_rating_delta:
                jaro_rating = Levenshtein.jaro(word, match['attrs']['word'])
                if not maxleven:
                    maxleven = jaro_rating - jaro_rating * self.regression_coef
                if jaro_rating >= rating_limit and jaro_rating >= maxleven:
                    outlist.append([match['attrs']['word'], jaro_rating])
        outlist.sort(key=lambda x: x[1], reverse=True)
        return outlist
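
    # Expands a word entry with search variations according to its
    # match-type flags: fuzzy suggestions (soft or hard threshold, skipped
    # when a strong match is required), a trailing-star wildcard, the word
    # as-is, and its abbreviated (socr) form.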
    def __add_word_variations(self, word_entry, strong):
        if word_entry.MT_MANY_SUGG and not strong:
            suggs = self.__get_suggest(word_entry.word, self.rating_limit_soft, self.rating_limit_soft_count)
            for suggestion in suggs:
                word_entry.add_variation(suggestion[0])
        if word_entry.MT_SOME_SUGG and not strong:
            suggs = self.__get_suggest(word_entry.word, self.rating_limit_hard, self.rating_limit_hard_count)
            for suggestion in suggs:
                word_entry.add_variation(suggestion[0])
        if word_entry.MT_LAST_STAR:
            word_entry.add_variation(word_entry.word + '*')
        if word_entry.MT_AS_IS:
            word_entry.add_variation(word_entry.word)
        if word_entry.MT_ADD_SOCR:
            word_entry.add_variation_socr()

    # Builds the list of word entry objects, skipping frequently used words.
    def __get_word_entries(self, words, strong):
        we_list = []
        for word in words:
            if word != '':
                we = WordEntry(self.db, word)
                if not (self.exclude_freq_words and we.is_freq_word):
                    self.__add_word_variations(we, strong)
                    assert we.get_variations() != "", "Cannot process sentence."
                    we_list.append(we)
        return we_list
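
    # Splits the input text into words, expands each word with its
    # variations and runs a set of quorum queries against the address
    # index, relaxing the required word count from all words down to
    # all-but-two.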
    def find(self, text, strong):
        def split_phrase(phrase):
            phrase = unicode(phrase).replace('-', '').replace('@', '').lower()
            return re.split(r"[ ,:.#$]+", phrase)

        # Split the text into words.
        words = split_phrase(text)
        # Build the list of word entry objects.
        word_entries = self.__get_word_entries(words, strong)
        word_count = len(word_entries)
        # Make sure something is left in the list of word entries
        # (everything might have been dropped as a frequent word).
        assert word_count > 0, "No valid words were specified"

        # The ranking and sort mode must be set before the queries are
        # added, because AddQuery snapshots the client settings.
        self.__configure(sphinx_conf.index_addjobj)
        # Build the quorum query strings for Sphinx.
        for x in range(word_count, max(0, word_count - 3), -1):
            self.client_show.AddQuery("\"{}\"/{}".format(" ".join(we.get_variations() for we in word_entries), x),
                                      sphinx_conf.index_addjobj)

        start_t = time.time()
        rs = self.client_show.RunQueries()
        elapsed_t = time.time() - start_t
        # Log the query time when debug logging is enabled.
        if basic.logging:
            print(elapsed_t)

        results = []
        parsed_ids = []
        for i in range(0, len(rs)):
            for ma in rs[i]['matches']:
                if len(results) >= self.max_result:
                    break
                if ma['attrs']['aoid'] not in parsed_ids:
                    parsed_ids.append(ma['attrs']['aoid'])
                    results.append(
                        dict(aoid=ma['attrs']['aoid'], text=unicode(ma['attrs']['fullname']), ratio=ma['weight'],
                             cort=i))

        results.sort(key=lambda x: Levenshtein.ratio(text, x['text']), reverse=True)
        return results
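

# A minimal usage sketch (an assumption, not part of the original module):
# `db` stands for whatever database handle the surrounding application
# passes to WordEntry, and a searchd instance must be reachable at
# sphinx_conf.host_name:sphinx_conf.port with the configured indexes.
#
#   searcher = SphinxSearch(db)
#   for hit in searcher.find(u"<address text>", strong=False):
#       print(hit['text'])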