Добавлен костяк для подсказок

This commit is contained in:
Jack Stdin 2016-01-15 15:15:10 +03:00
parent 8156fa3d8d
commit 1e861f7285
7 changed files with 220 additions and 65 deletions

View File

@ -30,10 +30,7 @@ class DBImpl:
self.transaction_rollback() self.transaction_rollback()
raise BaseException("Error execute sql query. Reason : {}".format(format_exc())) raise BaseException("Error execute sql query. Reason : {}".format(format_exc()))
def get_rows(self, query_string, for_dict=True): def get_rows(self, query_string):
if for_dict:
cur = self.connection.cursor(self.db_engine.cursors.DictCursor)
else:
cur = self.connection.cursor() cur = self.connection.cursor()
cur.execute(query_string) cur.execute(query_string)

View File

@ -18,4 +18,7 @@ db_shemas['ADDROBJ'] = DbSchema("ADDROBJ",
db_shemas['SOCRBASE'] = DbSchema("SOCRBASE", ["LEVEL", "SOCRNAME", "SCNAME", "KOD_T_ST"], "kod_t_st", db_shemas['SOCRBASE'] = DbSchema("SOCRBASE", ["LEVEL", "SOCRNAME", "SCNAME", "KOD_T_ST"], "kod_t_st",
"AddressObjectType") "AddressObjectType")
db_shemas['AOTRIG'] = DbSchema("AOTRIG", ["WORD", "TRIGRAMM"], "word",
None)
allowed_tables = ["ADDROBJ", "SOCRBASE"] allowed_tables = ["ADDROBJ", "SOCRBASE"]

View File

@ -1,17 +1,89 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import json
import logging
import Levenshtein
import psycopg2
import sphinxapi import sphinxapi
import logging from aore.config import db as dbparams
import json from aore.dbutils.dbimpl import DBImpl
from aore.miscutils.trigram import trigram
class SphinxSearch: class SphinxSearch:
def __init__(self): def __init__(self):
self.delta_len = 2
self.db = DBImpl(psycopg2, dbparams)
self.client = sphinxapi.SphinxClient() self.client = sphinxapi.SphinxClient()
self.client.SetServer("localhost", 9312) self.client.SetServer("localhost", 9312)
self.client.SetLimits(0, 10) self.client.SetLimits(0, 10)
# Configure self.client (the Sphinx client) for a query against `index_name`.
#
# index_name: name of the Sphinx index; only "idx_fias_sugg" is checked here.
# wlen: optional word length (int or numeric string, it is passed through int());
#       used to filter and rank suggestion candidates by length.
#
# NOTE(review): indentation is lost in this view, so it is not clear whether the
# `else` below pairs with `if wlen:` or with the index_name check -- confirm
# against the real file.
def __configure(self, index_name, wlen=None):
if index_name == "idx_fias_sugg":
if wlen:
self.client.SetMatchMode(sphinxapi.SPH_MATCH_EXTENDED2)
self.client.SetRankingMode(sphinxapi.SPH_RANK_WORDCOUNT)
# Keep only candidates whose "len" attribute is within +/- delta_len of wlen.
self.client.SetFilterRange("len", int(wlen) - self.delta_len, int(wlen) + self.delta_len)
# krank = match weight + delta_len - |len - wlen|: candidates whose length
# is closer to wlen get a higher krank.
self.client.SetSelect("word, len, @weight+{}-abs(len-{}) AS krank".format(self.delta_len, wlen))
# Highest krank first.
self.client.SetSortMode(sphinxapi.SPH_SORT_EXTENDED, "krank DESC")
else:
# NOTE(review): `sphinxapi.MA` does not follow the SPH_MATCH_* naming used
# above and looks like an unfinished identifier -- confirm the intended mode.
self.client.SetMatchMode(sphinxapi.MA)
# Types =
class SRankType:
    """Classification of a typed word, decoded from a two-character rank code.

    ``names`` maps each flag name to the list of rank codes that switch it on.
    After construction the instance carries one boolean attribute per flag
    name (for example ``SRANK_EXACTLY_TYPING``), which is True when the given
    code is listed for that flag.
    """

    names = dict(
        # Definitely a typo: many suggestions are needed, without word*
        SRANK_EXACTLY_MISSPRINT=['00'],
        # Definitely an unfinished word: no suggestions needed, only word*
        SRANK_EXACTLY_TYPING=['01', '11'],
        # Possibly an unfinished word: a few suggestions and word*
        SRANK_PROBABLY_TYPING=['0*'],
        # Possibly the word was entered exactly: a few suggestions, without word*
        SRANK_PROBABLY_FOUND=['10'],
        # Possibly a complex word: there is both an exact match and a match by
        # mask. A few suggestions and word* are needed
        SRANK_PROBABLY_COMPLEX=['1*'],
    )

    def __init__(self, rtype):
        """Store the rank code and derive one boolean flag per entry of ``names``.

        :param rtype: rank code string such as ``'01'`` or ``'1*'``.
        """
        self.rtype = rtype
        # items() exists on both Python 2 and 3 (iteritems() is Python 2 only).
        for flag_name, codes in self.names.items():
            setattr(self, flag_name, rtype in codes)
# Build an SRankType for `word` from two row counts in the "AOTRIG" table:
#   * first SELECT: rows whose word starts with `word` and is longer than word_len;
#   * second SELECT: rows whose word equals `word` exactly.
# Returns self.SRankType(<exact count> + <prefix count, or '*' when it is > 1>).
#
# NOTE(review): the query is assembled with str.format from `word`; if `word`
# can come from user input this is open to SQL injection. get_rows() is called
# with a query string only, so parameter binding would need a DBImpl change.
def __get_strong_and_uncomplete_ranks(self, word):
# NOTE(review): presumably len(word) / 2 relies on Python 2 integer division -- confirm.
word_len = str(len(word) / 2)
sql_qry = "SELECT COUNT(*) FROM \"AOTRIG\" WHERE word LIKE '{}%' AND LENGTH(word) > {} " \
"UNION ALL SELECT COUNT(*) FROM \"AOTRIG\" WHERE word='{}'".format(
word, word_len, word)
result = self.db.get_rows(sql_qry)
# Rows are read by position: index 1 is taken as the exact-match count and
# index 0 as the prefix count.
# NOTE(review): this assumes UNION ALL returns rows in SELECT order -- confirm.
strong_rank = result[1][0]
uncomplete_rank = result[0][0]
# More than one prefix match is collapsed to the wildcard marker '*'.
if uncomplete_rank > 1:
uncomplete_rank = '*'
return self.SRankType(str(strong_rank) + str(uncomplete_rank))
def get_suggest(self, word):
    """Return spelling suggestions for ``word``, best candidate first.

    The word is converted to trigrams and sent to the ``idx_fias_sugg``
    index. Matches whose ``krank`` is less than 2 below the first match are
    kept and scored with ``Levenshtein.jaro`` against the input word.

    :param word: the word to find suggestions for.
    :return: list of ``[candidate_word, jaro_score]`` pairs sorted by score,
        highest first; an empty list when the query returned nothing.
    """
    # Floor division gives the same result as ``/`` on Python 2 ints and keeps
    # the value an integer on Python 3 as well.
    # NOTE(review): halving the length presumably assumes 2-byte UTF-8
    # characters in a byte string -- confirm.
    word_len = str(len(word) // 2)
    trigrammed_word = '"{}"/2'.format(trigram(word))
    self.__configure("idx_fias_sugg", word_len)
    result = self.client.Query(trigrammed_word, 'idx_fias_sugg')

    # Guard against a falsy result (the client reports a failed query that
    # way) as well as an empty match list, instead of failing on subscripting.
    if not result or not result['matches']:
        return []

    matches = result['matches']
    maxrank = matches[0]['attrs']['krank']
    outlist = [
        [match['attrs']['word'], Levenshtein.jaro(word, match['attrs']['word'])]
        for match in matches
        if maxrank - match['attrs']['krank'] < 2
    ]
    outlist.sort(key=lambda pair: pair[1], reverse=True)

    # Diagnostic output goes to the logger rather than stdout.
    for candidate, score in outlist:
        logging.debug("%s %s", candidate, score)

    return outlist
def find(self, text): def find(self, text):
# TODO: ADD index # TODO: ADD index
logging.info("12") logging.info("12")

View File

@ -5,75 +5,121 @@ import os
from bottle import template from bottle import template
from aore.config import db as dbconfig, sphinx_index_addjobj, sphinx_var_dir, trashfolder from aore.aoutils.aoxmltableentry import AoXmlTableEntry
from aore.dbutils.dbhandler import DbHandler
from trigram import trigram
from aore.config import db as dbconfig, sphinx_index_addjobj, sphinx_var_dir, trashfolder, sphinx_index_sugg
def configure_sphinx(indexer_binary): class SphinxHelper:
def __init__(self, ):
self.index_binary = None
self.files = dict()
def configure_indexer(self, indexer_binary):
logging.info("Start configuring Sphinx...") logging.info("Start configuring Sphinx...")
self.index_binary = indexer_binary
# Create ADDROBJ config # Create ADDROBJ config
addrobj_cfg_name = get_addrobj_config() self.files['addrobj.conf'] = self.__create_ao_index_config()
# Indexing it... # Indexing ADDROBJ config
run_index_cmd = "{} -c {} --all".format(indexer_binary, addrobj_cfg_name) run_index_cmd = "{} -c {} --all".format(self.index_binary, self.files['addrobj.conf'])
logging.info("Run indexer (indexing ADDROBJ)...") logging.info("Indexing main ({})...".format(sphinx_index_addjobj))
os.system(run_index_cmd) #os.system(run_index_cmd)
logging.info("{} index was created.".format(sphinx_index_addjobj)) logging.info("{} index was created.".format(sphinx_index_addjobj))
# Produce dict file # Produce dict file
sugg_dict_name = get_suggestion_dict(indexer_binary, addrobj_cfg_name) self.files['dict.txt'] = self.__create_suggestion_dict()
# Put dict into db
#self.files['dict.csv'] = self.__dbexport_sugg_dict()
# Create SUGGEST config
self.files['suggest.conf'] = self.__create_sugg_index_config()
run_index_cmd = "{} -c {} --all".format(self.index_binary, self.files['suggest.conf'])
logging.info("Indexing main ({})...".format(sphinx_index_sugg))
os.system(run_index_cmd)
logging.info("{} index was created.".format(sphinx_index_sugg))
def get_suggestion_dict(indexer_binary, addrobj_cfg_name): def __create_sugg_index_config(self):
logging.info("Make suggestion dict...") fname = os.path.abspath(trashfolder + "suggest.conf")
dict_file_name = os.path.abspath(trashfolder + "suggdict.txt") logging.info("Creating config {}".format(fname))
run_builddict_cmd = "{} {} -c {} --buildstops {} 200000 --buildfreqs".format(indexer_binary, sphinx_index_addjobj,
addrobj_cfg_name, dict_file_name) conf_data = template('aore/templates/sphinx/idx_suggest.conf', db_host=dbconfig['host'],
os.system(run_builddict_cmd) db_user=dbconfig['user'],
db_password=dbconfig['password'],
db_name=dbconfig['database'], db_port=dbconfig['port'],
index_name=sphinx_index_sugg,
sphinx_var_path=sphinx_var_dir)
f = open(fname, "w")
f.write(conf_data)
f.close()
logging.info("Done.") logging.info("Done.")
return dict_file_name return fname
def __dbexport_sugg_dict(self):
logging.info("Place suggestion dict to DB {}...".format(self.files['dict.txt']))
dict_dat_fname = os.path.abspath(trashfolder + "suggdict.csv")
def get_addrobj_config(): with open(self.files['dict.txt'], "r") as dict_file, open(dict_dat_fname, "w") as exit_file:
config_fname = os.path.abspath(trashfolder + "addrobj.conf") line = None
logging.info("Creating config {}".format(config_fname)) while line != '':
nodes = []
line = dict_file.readline()
if line == '':
break
conf_data = template('aore/templates/sphinx/idx_addrobj.conf', db_host=dbconfig['host'], db_user=dbconfig['user'], keyword = line.split(' ')[0]
if not keyword:
raise BaseException("Cannot process {}".format(self.files['dict.txt']))
nodes.append(keyword)
nodes.append(trigram(keyword))
exit_file.write("\t".join(nodes) + "\n")
aodp = DbHandler()
aodp.bulk_csv(AoXmlTableEntry.OperationType.update, "AOTRIG", 8, dict_dat_fname)
logging.info("Done.")
def __create_ao_index_config(self):
fname = os.path.abspath(trashfolder + "addrobj.conf")
logging.info("Creating config {}".format(fname))
conf_data = template('aore/templates/sphinx/idx_addrobj.conf', db_host=dbconfig['host'],
db_user=dbconfig['user'],
db_password=dbconfig['password'], db_password=dbconfig['password'],
db_name=dbconfig['database'], db_port=dbconfig['port'], db_name=dbconfig['database'], db_port=dbconfig['port'],
sql_query=template('aore/templates/postgre/sphinx_query.sql').replace("\n", " \\\n"), sql_query=template('aore/templates/postgre/sphinx_query.sql').replace("\n", " \\\n"),
index_name=sphinx_index_addjobj, index_name=sphinx_index_addjobj,
sphinx_var_path=sphinx_var_dir) sphinx_var_path=sphinx_var_dir)
f = open(config_fname, "w") f = open(fname, "w")
f.write(conf_data) f.write(conf_data)
f.close() f.close()
logging.info("Done.") logging.info("Done.")
return config_fname return fname
def __create_suggestion_dict(self):
    """Return the absolute path of the suggestion dictionary file.

    The indexer command line is assembled from ``self.index_binary`` and the
    ADDROBJ config stored in ``self.files``, but it is not executed here: the
    call that runs it is disabled.

    :return: absolute path built from ``trashfolder`` + ``"suggdict.txt"``.
    """
    dict_path = os.path.abspath(trashfolder + "suggdict.txt")
    logging.info("Make suggestion dict ({})...".format(dict_path))

    command_template = "{} {} -c {} --buildstops {} 200000 --buildfreqs"
    run_builddict_cmd = command_template.format(
        self.index_binary,
        sphinx_index_addjobj,
        self.files['addrobj.conf'],
        dict_path,
    )
    # NOTE(review): execution is disabled, so the dictionary is not rebuilt here.
    #os.system(run_builddict_cmd)

    logging.info("Done.")
    return dict_path
# TRASH # TRASH
def produce_sphinx_config(config_name): # conf_data = template('aore/templates/sphinx/sphinx.conf', sphinx_var_path=sphinx_var_dir)
conf_data = template('aore/templates/sphinx/sphinx.conf', sphinx_var_path=sphinx_var_dir)
if os.path.isfile(config_name):
choice = raw_input(
"WARNING! File {} already exists. It will be overwritten, "
"all settings all setting will be lost! Are you sure? [y/n]: ".format(
config_name))
if choice.lower() != 'y':
logging.warning("Aborted.")
return
conf_file = open(config_name, "w")
conf_file.write(conf_data)
conf_file.close()
logging.info("Success! Re-index db: \n"
"\t$indexer -c {} --all --rotate\n"
"and then re/start your Sphinx:\n"
"\t$/etc/init.d/sphinxsearch stop\n"
"\t$/etc/init.d/sphinxsearch start".format(config_name))

11
aore/miscutils/trigram.py Normal file
View File

@ -0,0 +1,11 @@
# -*- coding: utf-8 -*-
def trigram(inp):
    """Return the space-separated character trigrams of ``inp``.

    The input is padded with two underscores on each side before every
    three-character window is taken, so ``"ab"`` yields ``"__a _ab ab_ b__"``.
    """
    padded = u"__" + inp + u"__"
    return " ".join(padded[start:start + 3] for start in range(len(padded) - 2))

View File

@ -0,0 +1,23 @@
# Sphinx source/index template for the suggestion index.
# The {{...}} placeholders are filled in when the template is rendered.

# Data source: reads the words and their trigram strings from the "AOTRIG" table.
source {{index_name}}
{
type = pgsql
sql_host = {{db_host}}
sql_user = {{db_user}}
sql_pass = {{db_password}}
sql_db = {{db_name}}
sql_port = {{db_port}}
# len is computed in SQL as LENGTH(word).
sql_query = SELECT id, trigramm, word, LENGTH(word) AS len FROM "AOTRIG"
# trigramm is declared as a string field; len and word are stored as attributes.
sql_field_string = trigramm
sql_attr_uint = len
sql_attr_string = word
}
# Index built from the source above; its files live under <sphinx_var_path>/data/.
index {{index_name}}
{
source = {{index_name}}
path = {{sphinx_var_path}}/data/{{index_name}}
docinfo = extern
charset_type = utf-8
}

View File

@ -3,7 +3,7 @@
import optparse import optparse
from aore.aoutils.aoupdater import AoUpdater from aore.aoutils.aoupdater import AoUpdater
from aore.miscutils.sphinx import configure_sphinx from aore.miscutils.sphinx import SphinxHelper
from aore.fias.search import SphinxSearch from aore.fias.search import SphinxSearch
@ -42,7 +42,10 @@ def main():
update_base(options.source, int(options.update_count)) update_base(options.source, int(options.update_count))
if options.sphinx and options.indexer_path: if options.sphinx and options.indexer_path:
configure_sphinx(options.indexer_path) sphinxh = SphinxHelper()
sphinxh.configure_indexer(options.indexer_path)
if __name__ == '__main__': if __name__ == '__main__':
main() sph = SphinxSearch()
sph.get_suggest('апасьево')
#main()