Начало работы над логикой релевантности.
This commit is contained in:
@@ -1,25 +1,35 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import json
|
||||
import re
|
||||
|
||||
import Levenshtein
|
||||
import psycopg2
|
||||
import sphinxapi
|
||||
import aore.sphinxapi as sphinxapi
|
||||
|
||||
from aore.config import db as dbparams
|
||||
from aore.config import db as dbparams, sphinx_index_sugg, sphinx_index_addjobj
|
||||
from aore.dbutils.dbimpl import DBImpl
|
||||
from aore.fias.word import WordEntry
|
||||
from aore.fias.wordentry import WordEntry
|
||||
from aore.miscutils.trigram import trigram
|
||||
|
||||
|
||||
class SphinxSearch:
|
||||
def __init__(self):
    """Set up tuning constants, the database handle and two Sphinx clients.

    ``self.client`` is the client the suggestion lookup queries with;
    ``self.client1`` is the client ``find`` uses for the address-object
    index. Both point at the same searchd instance on localhost:9312.
    """
    # Allowed deviation of a suggestion's "len" attribute from the
    # query word's; also folded into the "krank" ranking expression.
    self.delta_len = 2
    # Minimum Jaro similarity passed to the suggestion lookup: the soft
    # limit for MT_MANY_SUGG words, the hard limit for MT_SOME_SUGG words.
    self.rating_limit_soft = 0.4
    self.rating_limit_hard = 0.82
    # A match is considered only while its krank is within this distance
    # of the best match's krank.
    self.default_rating_delta = 2
    # Fraction of the first accepted Jaro rating that later candidates
    # may fall short by.
    self.regression_coef = 0.04

    self.db = DBImpl(psycopg2, dbparams)

    # Client for the suggestion index.
    suggest_client = sphinxapi.SphinxClient()
    suggest_client.SetServer("localhost", 9312)
    suggest_client.SetLimits(0, 10)
    self.client = suggest_client

    # Client for the address-object index; the only one with an explicit
    # connect timeout.
    address_client = sphinxapi.SphinxClient()
    address_client.SetServer("localhost", 9312)
    address_client.SetLimits(0, 10)
    address_client.SetConnectTimeout(7.0)
    self.client1 = address_client
|
||||
|
||||
def __configure(self, index_name, wlen=None):
|
||||
if index_name == "idx_fias_sugg":
|
||||
if wlen:
|
||||
@@ -28,15 +38,13 @@ class SphinxSearch:
|
||||
self.client.SetFilterRange("len", int(wlen) - self.delta_len, int(wlen) + self.delta_len)
|
||||
self.client.SetSelect("word, len, @weight+{}-abs(len-{}) AS krank".format(self.delta_len, wlen))
|
||||
self.client.SetSortMode(sphinxapi.SPH_SORT_EXTENDED, "krank DESC")
|
||||
else:
|
||||
self.client.SetMatchMode(sphinxapi.MA)
|
||||
|
||||
def __get_suggest(self, word):
|
||||
def __get_suggest(self, word, rating_limit, count):
|
||||
word_len = str(len(word) / 2)
|
||||
trigrammed_word = '"{}"/1'.format(trigram(word))
|
||||
|
||||
self.__configure("idx_fias_sugg", word_len)
|
||||
result = self.client.Query(trigrammed_word, 'idx_fias_sugg')
|
||||
self.__configure(sphinx_index_sugg, word_len)
|
||||
result = self.client.Query(trigrammed_word, sphinx_index_sugg)
|
||||
|
||||
# If no suggestions were found for this word (can that even happen?),
|
||||
# return [].
|
||||
@@ -44,29 +52,63 @@ class SphinxSearch:
|
||||
return []
|
||||
|
||||
maxrank = result['matches'][0]['attrs']['krank']
|
||||
maxleven = None
|
||||
|
||||
outlist = list()
|
||||
for match in result['matches']:
|
||||
if maxrank - match['attrs']['krank'] < 2:
|
||||
outlist.append([match['attrs']['word'], Levenshtein.jaro(word, match['attrs']['word'])])
|
||||
if len(outlist) >= count:
|
||||
break;
|
||||
|
||||
if maxrank - match['attrs']['krank'] < self.default_rating_delta:
|
||||
jaro_rating = Levenshtein.jaro(word, match['attrs']['word'])
|
||||
if not maxleven:
|
||||
maxleven = jaro_rating - jaro_rating * self.regression_coef
|
||||
if jaro_rating >= rating_limit and jaro_rating >= maxleven:
|
||||
outlist.append([match['attrs']['word'], jaro_rating])
|
||||
|
||||
outlist.sort(key=lambda x: x[1], reverse=True)
|
||||
|
||||
for x in outlist:
|
||||
print x[0], x[1]
|
||||
return outlist
|
||||
|
||||
def __split_phrase(self, phrase):
|
||||
phrase = unicode(phrase).replace('-', '').replace('@', '').lower()
|
||||
return re.split(r"[ ,:.]+", phrase)
|
||||
return re.split(r"[ ,:.#$]+", phrase)
|
||||
|
||||
def __process_words(self, words):
|
||||
def __add_word_variations(self, word_entry):
|
||||
if word_entry.MT_MANY_SUGG:
|
||||
suggs = self.__get_suggest(word_entry.word, self.rating_limit_soft, 6)
|
||||
for suggestion in suggs:
|
||||
word_entry.add_variation(suggestion[0])
|
||||
if word_entry.MT_SOME_SUGG:
|
||||
suggs = self.__get_suggest(word_entry.word, self.rating_limit_hard, 3)
|
||||
for suggestion in suggs:
|
||||
word_entry.add_variation(suggestion[0])
|
||||
if word_entry.MT_LAST_STAR:
|
||||
word_entry.add_variation(word_entry.word+'*')
|
||||
if word_entry.MT_AS_IS:
|
||||
word_entry.add_variation(word_entry.word)
|
||||
if word_entry.MT_ADD_SOCR:
|
||||
word_entry.add_variation_socr()
|
||||
|
||||
|
||||
def __get_word_entries(self, words):
|
||||
for word in words:
|
||||
yield WordEntry(self.db, word)
|
||||
if word != '':
|
||||
we = WordEntry(self.db, word)
|
||||
self.__add_word_variations(we)
|
||||
yield we
|
||||
|
||||
|
||||
def find(self, text):
|
||||
words = self.__split_phrase(text)
|
||||
word_entries = self.__process_words(words)
|
||||
for word_entry in word_entries:
|
||||
print word_entry, word_entry.get_type()
|
||||
# result = self.client.Query(text)
|
||||
# print json.dumps(result)
|
||||
# logging.info("12")
|
||||
word_entries = self.__get_word_entries(words)
|
||||
sentence = "{}".format(" MAYBE ".join(x.get_variations() for x in word_entries))
|
||||
#self.__configure(sphinx_index_addjobj)
|
||||
self.client1.SetMatchMode(sphinxapi.SPH_MATCH_EXTENDED2)
|
||||
self.client1.SetRankingMode(sphinxapi.SPH_RANK_SPH04)
|
||||
#self.client1.SetF
|
||||
rs = self.client1.Query(sentence, sphinx_index_addjobj)
|
||||
print rs
|
||||
for ma in rs['matches']:
|
||||
print ma['attrs']['fullname'], ma['weight']
|
||||
print sentence
|
||||
|
||||
Reference in New Issue
Block a user