Добавлен костяк для подсказок
This commit is contained in:
parent
8156fa3d8d
commit
1e861f7285
@ -30,10 +30,7 @@ class DBImpl:
|
|||||||
self.transaction_rollback()
|
self.transaction_rollback()
|
||||||
raise BaseException("Error execute sql query. Reason : {}".format(format_exc()))
|
raise BaseException("Error execute sql query. Reason : {}".format(format_exc()))
|
||||||
|
|
||||||
def get_rows(self, query_string, for_dict=True):
|
def get_rows(self, query_string):
|
||||||
if for_dict:
|
|
||||||
cur = self.connection.cursor(self.db_engine.cursors.DictCursor)
|
|
||||||
else:
|
|
||||||
cur = self.connection.cursor()
|
cur = self.connection.cursor()
|
||||||
cur.execute(query_string)
|
cur.execute(query_string)
|
||||||
|
|
||||||
|
@ -18,4 +18,7 @@ db_shemas['ADDROBJ'] = DbSchema("ADDROBJ",
|
|||||||
db_shemas['SOCRBASE'] = DbSchema("SOCRBASE", ["LEVEL", "SOCRNAME", "SCNAME", "KOD_T_ST"], "kod_t_st",
|
db_shemas['SOCRBASE'] = DbSchema("SOCRBASE", ["LEVEL", "SOCRNAME", "SCNAME", "KOD_T_ST"], "kod_t_st",
|
||||||
"AddressObjectType")
|
"AddressObjectType")
|
||||||
|
|
||||||
|
db_shemas['AOTRIG'] = DbSchema("AOTRIG", ["WORD", "TRIGRAMM"], "word",
|
||||||
|
None)
|
||||||
|
|
||||||
allowed_tables = ["ADDROBJ", "SOCRBASE"]
|
allowed_tables = ["ADDROBJ", "SOCRBASE"]
|
||||||
|
@ -1,17 +1,89 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
|
||||||
|
import Levenshtein
|
||||||
|
import psycopg2
|
||||||
import sphinxapi
|
import sphinxapi
|
||||||
|
|
||||||
import logging
|
from aore.config import db as dbparams
|
||||||
import json
|
from aore.dbutils.dbimpl import DBImpl
|
||||||
|
from aore.miscutils.trigram import trigram
|
||||||
|
|
||||||
|
|
||||||
class SphinxSearch:
|
class SphinxSearch:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
self.delta_len = 2
|
||||||
|
self.db = DBImpl(psycopg2, dbparams)
|
||||||
self.client = sphinxapi.SphinxClient()
|
self.client = sphinxapi.SphinxClient()
|
||||||
self.client.SetServer("localhost", 9312)
|
self.client.SetServer("localhost", 9312)
|
||||||
self.client.SetLimits(0, 10)
|
self.client.SetLimits(0, 10)
|
||||||
|
|
||||||
|
def __configure(self, index_name, wlen=None):
|
||||||
|
if index_name == "idx_fias_sugg":
|
||||||
|
if wlen:
|
||||||
|
self.client.SetMatchMode(sphinxapi.SPH_MATCH_EXTENDED2)
|
||||||
|
self.client.SetRankingMode(sphinxapi.SPH_RANK_WORDCOUNT)
|
||||||
|
self.client.SetFilterRange("len", int(wlen) - self.delta_len, int(wlen) + self.delta_len)
|
||||||
|
self.client.SetSelect("word, len, @weight+{}-abs(len-{}) AS krank".format(self.delta_len, wlen))
|
||||||
|
self.client.SetSortMode(sphinxapi.SPH_SORT_EXTENDED, "krank DESC")
|
||||||
|
else:
|
||||||
|
self.client.SetMatchMode(sphinxapi.MA)
|
||||||
|
|
||||||
|
# Types =
|
||||||
|
class SRankType:
|
||||||
|
names = dict(
|
||||||
|
SRANK_EXACTLY_MISSPRINT=['00'], # Точно - опечатка, нужно много подсказок, без word*
|
||||||
|
SRANK_EXACTLY_TYPING=['01', '11'], # Точно - слово недопечатано, не надо подсказок, только word*
|
||||||
|
SRANK_PROBABLY_TYPING=['0*'], # Возможно - слово недопечатано, немного подсказок и word*
|
||||||
|
SRANK_PROBABLY_FOUND=['10'], # Возможно - слово введено точно, немного подсказок, без word*
|
||||||
|
SRANK_PROBABLY_COMPLEX=['1*']
|
||||||
|
# Возможно, слово сложное, есть и точное совпадение, и по маске. Нужно немного подсказок и word*
|
||||||
|
)
|
||||||
|
|
||||||
|
def __init__(self, rtype):
|
||||||
|
self.rtype = rtype
|
||||||
|
for x, y in self.names.iteritems():
|
||||||
|
self.__dict__[x] = self.rtype in y
|
||||||
|
|
||||||
|
def __get_strong_and_uncomplete_ranks(self, word):
|
||||||
|
word_len = str(len(word) / 2)
|
||||||
|
sql_qry = "SELECT COUNT(*) FROM \"AOTRIG\" WHERE word LIKE '{}%' AND LENGTH(word) > {} " \
|
||||||
|
"UNION ALL SELECT COUNT(*) FROM \"AOTRIG\" WHERE word='{}'".format(
|
||||||
|
word, word_len, word)
|
||||||
|
|
||||||
|
result = self.db.get_rows(sql_qry)
|
||||||
|
strong_rank = result[1][0]
|
||||||
|
uncomplete_rank = result[0][0]
|
||||||
|
if uncomplete_rank > 1:
|
||||||
|
uncomplete_rank = '*'
|
||||||
|
|
||||||
|
return self.SRankType(str(strong_rank) + str(uncomplete_rank))
|
||||||
|
|
||||||
|
def get_suggest(self, word):
|
||||||
|
word_len = str(len(word) / 2)
|
||||||
|
trigrammed_word = '"{}"/2'.format(trigram(word))
|
||||||
|
|
||||||
|
self.__configure("idx_fias_sugg", word_len)
|
||||||
|
result = self.client.Query(trigrammed_word, 'idx_fias_sugg')
|
||||||
|
|
||||||
|
# Если по данному слову не найдено подсказок (а такое бывает?)
|
||||||
|
# возвращаем []
|
||||||
|
if not result['matches']:
|
||||||
|
return []
|
||||||
|
|
||||||
|
maxrank = result['matches'][0]['attrs']['krank']
|
||||||
|
outlist = list()
|
||||||
|
for match in result['matches']:
|
||||||
|
if maxrank - match['attrs']['krank'] < 2:
|
||||||
|
outlist.append([match['attrs']['word'], Levenshtein.jaro(word, match['attrs']['word'])])
|
||||||
|
outlist.sort(key=lambda x: x[1], reverse=True)
|
||||||
|
|
||||||
|
for x in outlist:
|
||||||
|
print x[0], x[1]
|
||||||
|
return outlist
|
||||||
|
|
||||||
def find(self, text):
|
def find(self, text):
|
||||||
# TODO: ADD index
|
# TODO: ADD index
|
||||||
logging.info("12")
|
logging.info("12")
|
||||||
|
@ -5,75 +5,121 @@ import os
|
|||||||
|
|
||||||
from bottle import template
|
from bottle import template
|
||||||
|
|
||||||
from aore.config import db as dbconfig, sphinx_index_addjobj, sphinx_var_dir, trashfolder
|
from aore.aoutils.aoxmltableentry import AoXmlTableEntry
|
||||||
|
from aore.dbutils.dbhandler import DbHandler
|
||||||
|
from trigram import trigram
|
||||||
|
|
||||||
|
from aore.config import db as dbconfig, sphinx_index_addjobj, sphinx_var_dir, trashfolder, sphinx_index_sugg
|
||||||
|
|
||||||
|
|
||||||
def configure_sphinx(indexer_binary):
|
class SphinxHelper:
|
||||||
|
def __init__(self, ):
|
||||||
|
self.index_binary = None
|
||||||
|
self.files = dict()
|
||||||
|
|
||||||
|
def configure_indexer(self, indexer_binary):
|
||||||
logging.info("Start configuring Sphinx...")
|
logging.info("Start configuring Sphinx...")
|
||||||
|
self.index_binary = indexer_binary
|
||||||
|
|
||||||
# Create ADDROBJ config
|
# Create ADDROBJ config
|
||||||
addrobj_cfg_name = get_addrobj_config()
|
self.files['addrobj.conf'] = self.__create_ao_index_config()
|
||||||
|
|
||||||
# Indexing it...
|
# Indexing ADDROBJ config
|
||||||
run_index_cmd = "{} -c {} --all".format(indexer_binary, addrobj_cfg_name)
|
run_index_cmd = "{} -c {} --all".format(self.index_binary, self.files['addrobj.conf'])
|
||||||
logging.info("Run indexer (indexing ADDROBJ)...")
|
logging.info("Indexing main ({})...".format(sphinx_index_addjobj))
|
||||||
os.system(run_index_cmd)
|
#os.system(run_index_cmd)
|
||||||
logging.info("{} index was created.".format(sphinx_index_addjobj))
|
logging.info("{} index was created.".format(sphinx_index_addjobj))
|
||||||
|
|
||||||
# Produce dict file
|
# Produce dict file
|
||||||
sugg_dict_name = get_suggestion_dict(indexer_binary, addrobj_cfg_name)
|
self.files['dict.txt'] = self.__create_suggestion_dict()
|
||||||
|
|
||||||
|
# Put dict into db
|
||||||
|
#self.files['dict.csv'] = self.__dbexport_sugg_dict()
|
||||||
|
|
||||||
|
# Create SUGGEST config
|
||||||
|
self.files['suggest.conf'] = self.__create_sugg_index_config()
|
||||||
|
run_index_cmd = "{} -c {} --all".format(self.index_binary, self.files['suggest.conf'])
|
||||||
|
logging.info("Indexing main ({})...".format(sphinx_index_sugg))
|
||||||
|
os.system(run_index_cmd)
|
||||||
|
logging.info("{} index was created.".format(sphinx_index_sugg))
|
||||||
|
|
||||||
|
|
||||||
def get_suggestion_dict(indexer_binary, addrobj_cfg_name):
|
def __create_sugg_index_config(self):
|
||||||
logging.info("Make suggestion dict...")
|
fname = os.path.abspath(trashfolder + "suggest.conf")
|
||||||
dict_file_name = os.path.abspath(trashfolder + "suggdict.txt")
|
logging.info("Creating config {}".format(fname))
|
||||||
run_builddict_cmd = "{} {} -c {} --buildstops {} 200000 --buildfreqs".format(indexer_binary, sphinx_index_addjobj,
|
|
||||||
addrobj_cfg_name, dict_file_name)
|
conf_data = template('aore/templates/sphinx/idx_suggest.conf', db_host=dbconfig['host'],
|
||||||
os.system(run_builddict_cmd)
|
db_user=dbconfig['user'],
|
||||||
|
db_password=dbconfig['password'],
|
||||||
|
db_name=dbconfig['database'], db_port=dbconfig['port'],
|
||||||
|
index_name=sphinx_index_sugg,
|
||||||
|
sphinx_var_path=sphinx_var_dir)
|
||||||
|
|
||||||
|
f = open(fname, "w")
|
||||||
|
f.write(conf_data)
|
||||||
|
f.close()
|
||||||
|
|
||||||
logging.info("Done.")
|
logging.info("Done.")
|
||||||
|
|
||||||
return dict_file_name
|
return fname
|
||||||
|
|
||||||
|
def __dbexport_sugg_dict(self):
|
||||||
|
logging.info("Place suggestion dict to DB {}...".format(self.files['dict.txt']))
|
||||||
|
dict_dat_fname = os.path.abspath(trashfolder + "suggdict.csv")
|
||||||
|
|
||||||
def get_addrobj_config():
|
with open(self.files['dict.txt'], "r") as dict_file, open(dict_dat_fname, "w") as exit_file:
|
||||||
config_fname = os.path.abspath(trashfolder + "addrobj.conf")
|
line = None
|
||||||
logging.info("Creating config {}".format(config_fname))
|
while line != '':
|
||||||
|
nodes = []
|
||||||
|
line = dict_file.readline()
|
||||||
|
if line == '':
|
||||||
|
break
|
||||||
|
|
||||||
conf_data = template('aore/templates/sphinx/idx_addrobj.conf', db_host=dbconfig['host'], db_user=dbconfig['user'],
|
keyword = line.split(' ')[0]
|
||||||
|
if not keyword:
|
||||||
|
raise BaseException("Cannot process {}".format(self.files['dict.txt']))
|
||||||
|
|
||||||
|
nodes.append(keyword)
|
||||||
|
nodes.append(trigram(keyword))
|
||||||
|
|
||||||
|
exit_file.write("\t".join(nodes) + "\n")
|
||||||
|
|
||||||
|
aodp = DbHandler()
|
||||||
|
aodp.bulk_csv(AoXmlTableEntry.OperationType.update, "AOTRIG", 8, dict_dat_fname)
|
||||||
|
logging.info("Done.")
|
||||||
|
|
||||||
|
def __create_ao_index_config(self):
|
||||||
|
fname = os.path.abspath(trashfolder + "addrobj.conf")
|
||||||
|
logging.info("Creating config {}".format(fname))
|
||||||
|
|
||||||
|
conf_data = template('aore/templates/sphinx/idx_addrobj.conf', db_host=dbconfig['host'],
|
||||||
|
db_user=dbconfig['user'],
|
||||||
db_password=dbconfig['password'],
|
db_password=dbconfig['password'],
|
||||||
db_name=dbconfig['database'], db_port=dbconfig['port'],
|
db_name=dbconfig['database'], db_port=dbconfig['port'],
|
||||||
sql_query=template('aore/templates/postgre/sphinx_query.sql').replace("\n", " \\\n"),
|
sql_query=template('aore/templates/postgre/sphinx_query.sql').replace("\n", " \\\n"),
|
||||||
index_name=sphinx_index_addjobj,
|
index_name=sphinx_index_addjobj,
|
||||||
sphinx_var_path=sphinx_var_dir)
|
sphinx_var_path=sphinx_var_dir)
|
||||||
|
|
||||||
f = open(config_fname, "w")
|
f = open(fname, "w")
|
||||||
f.write(conf_data)
|
f.write(conf_data)
|
||||||
f.close()
|
f.close()
|
||||||
|
|
||||||
logging.info("Done.")
|
logging.info("Done.")
|
||||||
|
|
||||||
return config_fname
|
return fname
|
||||||
|
|
||||||
|
def __create_suggestion_dict(self):
|
||||||
|
fname = os.path.abspath(trashfolder + "suggdict.txt")
|
||||||
|
logging.info("Make suggestion dict ({})...".format(fname))
|
||||||
|
|
||||||
|
run_builddict_cmd = "{} {} -c {} --buildstops {} 200000 --buildfreqs".format(self.index_binary,
|
||||||
|
sphinx_index_addjobj,
|
||||||
|
self.files['addrobj.conf'], fname)
|
||||||
|
#os.system(run_builddict_cmd)
|
||||||
|
logging.info("Done.")
|
||||||
|
|
||||||
|
return fname
|
||||||
|
|
||||||
|
|
||||||
# TRASH
|
# TRASH
|
||||||
def produce_sphinx_config(config_name):
|
# conf_data = template('aore/templates/sphinx/sphinx.conf', sphinx_var_path=sphinx_var_dir)
|
||||||
conf_data = template('aore/templates/sphinx/sphinx.conf', sphinx_var_path=sphinx_var_dir)
|
|
||||||
|
|
||||||
if os.path.isfile(config_name):
|
|
||||||
choice = raw_input(
|
|
||||||
"WARNING! File {} already exists. It will be overwritten, "
|
|
||||||
"all settings all setting will be lost! Are you sure? [y/n]: ".format(
|
|
||||||
config_name))
|
|
||||||
if choice.lower() != 'y':
|
|
||||||
logging.warning("Aborted.")
|
|
||||||
return
|
|
||||||
|
|
||||||
conf_file = open(config_name, "w")
|
|
||||||
conf_file.write(conf_data)
|
|
||||||
conf_file.close()
|
|
||||||
|
|
||||||
logging.info("Success! Re-index db: \n"
|
|
||||||
"\t$indexer -c {} --all --rotate\n"
|
|
||||||
"and then re/start your Sphinx:\n"
|
|
||||||
"\t$/etc/init.d/sphinxsearch stop\n"
|
|
||||||
"\t$/etc/init.d/sphinxsearch start".format(config_name))
|
|
11
aore/miscutils/trigram.py
Normal file
11
aore/miscutils/trigram.py
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
|
||||||
|
def trigram(inp):
|
||||||
|
inp = u"__"+inp+u"__"
|
||||||
|
output = []
|
||||||
|
|
||||||
|
for i in range(0, len(inp) - 2):
|
||||||
|
output.append(inp[i:i + 3])
|
||||||
|
|
||||||
|
return " ".join(output)
|
23
aore/templates/sphinx/idx_suggest.conf
Normal file
23
aore/templates/sphinx/idx_suggest.conf
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
source {{index_name}}
|
||||||
|
{
|
||||||
|
type = pgsql
|
||||||
|
sql_host = {{db_host}}
|
||||||
|
sql_user = {{db_user}}
|
||||||
|
sql_pass = {{db_password}}
|
||||||
|
sql_db = {{db_name}}
|
||||||
|
sql_port = {{db_port}}
|
||||||
|
|
||||||
|
sql_query = SELECT id, trigramm, word, LENGTH(word) AS len FROM "AOTRIG"
|
||||||
|
|
||||||
|
sql_field_string = trigramm
|
||||||
|
sql_attr_uint = len
|
||||||
|
sql_attr_string = word
|
||||||
|
}
|
||||||
|
|
||||||
|
index {{index_name}}
|
||||||
|
{
|
||||||
|
source = {{index_name}}
|
||||||
|
path = {{sphinx_var_path}}/data/{{index_name}}
|
||||||
|
docinfo = extern
|
||||||
|
charset_type = utf-8
|
||||||
|
}
|
@ -3,7 +3,7 @@
|
|||||||
import optparse
|
import optparse
|
||||||
|
|
||||||
from aore.aoutils.aoupdater import AoUpdater
|
from aore.aoutils.aoupdater import AoUpdater
|
||||||
from aore.miscutils.sphinx import configure_sphinx
|
from aore.miscutils.sphinx import SphinxHelper
|
||||||
from aore.fias.search import SphinxSearch
|
from aore.fias.search import SphinxSearch
|
||||||
|
|
||||||
|
|
||||||
@ -42,7 +42,10 @@ def main():
|
|||||||
update_base(options.source, int(options.update_count))
|
update_base(options.source, int(options.update_count))
|
||||||
|
|
||||||
if options.sphinx and options.indexer_path:
|
if options.sphinx and options.indexer_path:
|
||||||
configure_sphinx(options.indexer_path)
|
sphinxh = SphinxHelper()
|
||||||
|
sphinxh.configure_indexer(options.indexer_path)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
main()
|
sph = SphinxSearch()
|
||||||
|
sph.get_suggest('апасьево')
|
||||||
|
#main()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user