Добавлен коостяк для подсказок
This commit is contained in:
@@ -1,17 +1,89 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import json
|
||||
import logging
|
||||
|
||||
import Levenshtein
|
||||
import psycopg2
|
||||
import sphinxapi
|
||||
|
||||
import logging
|
||||
import json
|
||||
from aore.config import db as dbparams
|
||||
from aore.dbutils.dbimpl import DBImpl
|
||||
from aore.miscutils.trigram import trigram
|
||||
|
||||
|
||||
class SphinxSearch:
|
||||
def __init__(self):
|
||||
self.delta_len = 2
|
||||
self.db = DBImpl(psycopg2, dbparams)
|
||||
self.client = sphinxapi.SphinxClient()
|
||||
self.client.SetServer("localhost", 9312)
|
||||
self.client.SetLimits(0, 10)
|
||||
|
||||
def __configure(self, index_name, wlen=None):
|
||||
if index_name == "idx_fias_sugg":
|
||||
if wlen:
|
||||
self.client.SetMatchMode(sphinxapi.SPH_MATCH_EXTENDED2)
|
||||
self.client.SetRankingMode(sphinxapi.SPH_RANK_WORDCOUNT)
|
||||
self.client.SetFilterRange("len", int(wlen) - self.delta_len, int(wlen) + self.delta_len)
|
||||
self.client.SetSelect("word, len, @weight+{}-abs(len-{}) AS krank".format(self.delta_len, wlen))
|
||||
self.client.SetSortMode(sphinxapi.SPH_SORT_EXTENDED, "krank DESC")
|
||||
else:
|
||||
self.client.SetMatchMode(sphinxapi.MA)
|
||||
|
||||
# Types =
|
||||
class SRankType:
|
||||
names = dict(
|
||||
SRANK_EXACTLY_MISSPRINT=['00'], # Точно - опечатка, нужно много подсказок, без word*
|
||||
SRANK_EXACTLY_TYPING=['01', '11'], # Точно - слово недопечатано, не надо подсказок, только word*
|
||||
SRANK_PROBABLY_TYPING=['0*'], # Возможно - слово недопечатано, немного подсказок и word*
|
||||
SRANK_PROBABLY_FOUND=['10'], # Возможно - слово введено точно, немного подсказок, без word*
|
||||
SRANK_PROBABLY_COMPLEX=['1*']
|
||||
# Возможно, слово сложное, есть и точное совпадние, по маске Нужно немного подсказок и word*
|
||||
)
|
||||
|
||||
def __init__(self, rtype):
|
||||
self.rtype = rtype
|
||||
for x, y in self.names.iteritems():
|
||||
self.__dict__[x] = self.rtype in y
|
||||
|
||||
def __get_strong_and_uncomplete_ranks(self, word):
|
||||
word_len = str(len(word) / 2)
|
||||
sql_qry = "SELECT COUNT(*) FROM \"AOTRIG\" WHERE word LIKE '{}%' AND LENGTH(word) > {} " \
|
||||
"UNION ALL SELECT COUNT(*) FROM \"AOTRIG\" WHERE word='{}'".format(
|
||||
word, word_len, word)
|
||||
|
||||
result = self.db.get_rows(sql_qry)
|
||||
strong_rank = result[1][0]
|
||||
uncomplete_rank = result[0][0]
|
||||
if uncomplete_rank > 1:
|
||||
uncomplete_rank = '*'
|
||||
|
||||
return self.SRankType(str(strong_rank) + str(uncomplete_rank))
|
||||
|
||||
def get_suggest(self, word):
|
||||
word_len = str(len(word) / 2)
|
||||
trigrammed_word = '"{}"/2'.format(trigram(word))
|
||||
|
||||
self.__configure("idx_fias_sugg", word_len)
|
||||
result = self.client.Query(trigrammed_word, 'idx_fias_sugg')
|
||||
|
||||
# Если по данному слову не найдено подсказок (а такое бывает?)
|
||||
# возвращаем []
|
||||
if not result['matches']:
|
||||
return []
|
||||
|
||||
maxrank = result['matches'][0]['attrs']['krank']
|
||||
outlist = list()
|
||||
for match in result['matches']:
|
||||
if maxrank - match['attrs']['krank'] < 2:
|
||||
outlist.append([match['attrs']['word'], Levenshtein.jaro(word, match['attrs']['word'])])
|
||||
outlist.sort(key=lambda x: x[1], reverse=True)
|
||||
|
||||
for x in outlist:
|
||||
print x[0], x[1]
|
||||
return outlist
|
||||
|
||||
def find(self, text):
|
||||
# TODO: ADD index
|
||||
logging.info("12")
|
||||
|
||||
Reference in New Issue
Block a user