py-phias/aore/fias/search.py
2016-02-13 21:59:37 +03:00

144 lines
5.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
import logging
import re
import Levenshtein
import sphinxapi
from aore.config import sphinx_conf
from aore.fias.wordentry import WordEntry
from aore.miscutils.trigram import trigram
# Text type used to normalise incoming phrases: ``unicode`` on Python 2,
# plain ``str`` on Python 3 (where the ``unicode`` builtin no longer exists).
try:
    _text_type = unicode
except NameError:
    _text_type = str


class SphinxSearch:
    """Address search on top of two Sphinx indexes.

    ``client_sugg`` queries the suggestion index (``sphinx_conf.index_sugg``)
    to obtain spelling variations for single words; ``client_show`` queries
    the address index (``sphinx_conf.index_addjobj``) with the expanded
    phrase and yields the final matches.
    """

    # Tunables
    # Allowed +/- deviation of the "len" attribute of a suggestion.
    delta_len = 2
    # Minimal Jaro rating / maximal number of suggestions (MT_MANY_SUGG).
    rating_limit_soft = 0.41
    rating_limit_soft_count = 6
    # Minimal Jaro rating / maximal number of suggestions (MT_SOME_SUGG).
    rating_limit_hard = 0.82
    rating_limit_hard_count = 3
    # A suggestion is considered only if its "krank" is closer than this
    # to the best match's "krank".
    default_rating_delta = 2
    # Fraction subtracted from the first Jaro rating to form the floor
    # that all accepted suggestions must reach.
    regression_coef = 0.08
    # Maximal number of matches requested from Sphinx and returned by find().
    max_result = 10

    def __init__(self, db):
        """Store the database handle and create both Sphinx clients.

        :param db: database handle, passed through to ``WordEntry``.
        """
        self.db = db
        self.client_sugg = self.__make_client()
        self.client_show = self.__make_client()

    def __make_client(self):
        """Return a Sphinx client bound to the configured searchd server."""
        client = sphinxapi.SphinxClient()
        client.SetServer(sphinx_conf.host_name, sphinx_conf.port)
        client.SetLimits(0, self.max_result)
        client.SetConnectTimeout(3.0)
        return client

    def __configure(self, index_name, wlen=None):
        """Apply ranking, filter and sort settings for ``index_name``.

        With the suggestion index and a truthy ``wlen`` the suggestion client
        is set up to rank by word count, keep only entries whose ``len`` lies
        within ``delta_len`` of ``wlen`` and sort by the computed ``krank``.
        In every other case the address client is set to BM25 ranking sorted
        by relevance.

        :param index_name: name of the index that is about to be queried.
        :param wlen: reference length for suggestions (anything ``int()``
            accepts); ignored for the address index.
        """
        # Filters of the suggestion client are dropped on every call, whichever
        # branch is taken below.
        self.client_sugg.ResetFilters()
        if index_name == sphinx_conf.index_sugg and wlen:
            self.client_sugg.SetRankingMode(sphinxapi.SPH_RANK_WORDCOUNT)
            self.client_sugg.SetFilterRange("len", int(wlen) - self.delta_len, int(wlen) + self.delta_len)
            self.client_sugg.SetSelect("word, len, @weight+{}-abs(len-{}) AS krank".format(self.delta_len, wlen))
            self.client_sugg.SetSortMode(sphinxapi.SPH_SORT_EXTENDED, "krank DESC")
        else:
            self.client_show.SetRankingMode(sphinxapi.SPH_RANK_BM25)
            self.client_show.SetSortMode(sphinxapi.SPH_SORT_RELEVANCE)

    def __get_suggest(self, word, rating_limit, count):
        """Return spelling suggestions for ``word``.

        :param word: the word to look up.
        :param rating_limit: minimal Jaro rating a suggestion must reach.
        :param count: maximal number of suggestions to return.
        :returns: list of ``[suggested_word, jaro_rating]`` pairs sorted by
            rating, best first; empty when nothing matched or the query
            failed.
        """
        # Floor division gives the same integer on Python 2 and Python 3.
        # NOTE(review): the halving presumably compensates for 2-byte UTF-8
        # characters - confirm it is still wanted for unicode input.
        word_len = str(len(word) // 2)
        trigrammed_word = '"{}"/1'.format(trigram(word))

        self.__configure(sphinx_conf.index_sugg, word_len)
        result = self.client_sugg.Query(trigrammed_word, sphinx_conf.index_sugg)

        # Query() yields None when searchd is unreachable or reports an error.
        if result is None:
            logging.warning("Suggestion query failed: %s", self.client_sugg.GetLastError())
            return []

        # If no suggestions were found for this word (does that ever happen?)
        # return [].
        matches = result['matches']
        if not matches:
            return []

        maxrank = matches[0]['attrs']['krank']
        maxleven = None

        outlist = []
        for match in matches:
            if len(outlist) >= count:
                break

            attrs = match['attrs']
            # Skip candidates ranked too far below the best match.
            if maxrank - attrs['krank'] >= self.default_rating_delta:
                continue

            jaro_rating = Levenshtein.jaro(word, attrs['word'])
            # The floor is derived from the first rated candidate.
            # NOTE(review): a rating of exactly 0.0 leaves the floor falsy, so
            # it is recomputed from the next candidate - confirm this is meant.
            if not maxleven:
                maxleven = jaro_rating - jaro_rating * self.regression_coef
            if jaro_rating >= rating_limit and jaro_rating >= maxleven:
                outlist.append([attrs['word'], jaro_rating])

        outlist.sort(key=lambda item: item[1], reverse=True)
        return outlist

    def __add_suggestions(self, word_entry, rating_limit, count):
        """Add every suggestion found for the entry's word as a variation."""
        for suggestion in self.__get_suggest(word_entry.word, rating_limit, count):
            word_entry.add_variation(suggestion[0])

    def __add_word_variations(self, word_entry, strong):
        """Fill ``word_entry`` with the variations its ``MT_*`` flags ask for.

        :param word_entry: the ``WordEntry`` to extend in place.
        :param strong: when true, no suggestions are looked up.
        """
        if word_entry.MT_MANY_SUGG and not strong:
            self.__add_suggestions(word_entry, self.rating_limit_soft, self.rating_limit_soft_count)
        if word_entry.MT_SOME_SUGG and not strong:
            self.__add_suggestions(word_entry, self.rating_limit_hard, self.rating_limit_hard_count)
        if word_entry.MT_LAST_STAR:
            word_entry.add_variation(word_entry.word + '*')
        if word_entry.MT_AS_IS:
            word_entry.add_variation(word_entry.word)
        if word_entry.MT_ADD_SOCR:
            word_entry.add_variation_socr()

    def __get_word_entries(self, words, strong):
        """Build a ``WordEntry`` with variations for every non-empty word.

        :param words: iterable of words; empty strings are skipped.
        :param strong: forwarded to the variation builder.
        :returns: list of ``WordEntry`` objects in input order.
        :raises AssertionError: if a word ends up without any variation.
        """
        we_list = []
        for word in words:
            if word == '':
                continue
            we = WordEntry(self.db, word)
            self.__add_word_variations(we, strong)
            # Raised explicitly (same type and message as the former assert)
            # so the check is not stripped when Python runs with -O.
            if we.get_variations() == "":
                raise AssertionError("Cannot process sentence.")
            we_list.append(we)
        return we_list

    def find(self, text, strong):
        """Search the address index for ``text``.

        Up to three queries are sent in one batch; the number after the slash
        in each query starts at the word count and decreases by one per query.

        :param text: the phrase to search for.
        :param strong: when true, no spelling suggestions are used and the
            result is ordered by Levenshtein ratio against ``text``.
        :returns: list of at most ``max_result`` dicts with the keys
            ``aoid``, ``text``, ``ratio`` (Sphinx weight) and ``cort`` (index
            of the query that produced the match). Empty when ``text`` holds
            no words or the search request fails; a failure is logged.
        """
        def split_phrase(phrase):
            phrase = _text_type(phrase).replace('-', '').replace('@', '').lower()
            return re.split(r"[ ,:.#$]+", phrase)

        word_entries = self.__get_word_entries(split_phrase(text), strong)
        word_count = len(word_entries)
        # Nothing to search for: without this guard no query would be queued
        # and RunQueries() would hand back None.
        if not word_count:
            return []

        # Sphinx stores the current client settings together with each query
        # at AddQuery() time, so ranking and sorting must be set up first.
        self.__configure(sphinx_conf.index_addjobj)

        # The expanded phrase is identical for every query of the batch.
        variations = " ".join(entry.get_variations() for entry in word_entries)
        for quorum in range(word_count, max(0, word_count - 3), -1):
            query = "\"{}\"/{}".format(variations, quorum)
            logging.info(query)
            self.client_show.AddQuery(query, sphinx_conf.index_addjobj)

        logging.info("QUERY ")
        rs = self.client_show.RunQueries()
        if rs is None:
            logging.error("Sphinx search failed: %s", self.client_show.GetLastError())
            return []
        logging.info("OK")

        results = []
        seen_ids = set()
        for cort, result_set in enumerate(rs):
            if len(results) >= self.max_result:
                break
            # A result set that reports an error carries no 'matches' key.
            for ma in result_set.get('matches') or []:
                if len(results) >= self.max_result:
                    break
                aoid = ma['attrs']['aoid']
                if aoid in seen_ids:
                    continue
                seen_ids.add(aoid)
                results.append(
                    dict(aoid=aoid, text=ma['attrs']['fullname'], ratio=ma['weight'], cort=cort))

        if strong:
            results.sort(key=lambda x: Levenshtein.ratio(text, x['text']), reverse=True)
        return results