Добавлен костяк для подсказок

This commit is contained in:
Jack Stdin 2016-01-15 15:15:10 +03:00
parent 8156fa3d8d
commit 1e861f7285
7 changed files with 220 additions and 65 deletions

View File

@ -30,11 +30,8 @@ class DBImpl:
self.transaction_rollback() self.transaction_rollback()
raise BaseException("Error execute sql query. Reason : {}".format(format_exc())) raise BaseException("Error execute sql query. Reason : {}".format(format_exc()))
def get_rows(self, query_string, for_dict=True): def get_rows(self, query_string):
if for_dict: cur = self.connection.cursor()
cur = self.connection.cursor(self.db_engine.cursors.DictCursor)
else:
cur = self.connection.cursor()
cur.execute(query_string) cur.execute(query_string)
rows = cur.fetchall() rows = cur.fetchall()

View File

@ -18,4 +18,7 @@ db_shemas['ADDROBJ'] = DbSchema("ADDROBJ",
db_shemas['SOCRBASE'] = DbSchema("SOCRBASE", ["LEVEL", "SOCRNAME", "SCNAME", "KOD_T_ST"], "kod_t_st", db_shemas['SOCRBASE'] = DbSchema("SOCRBASE", ["LEVEL", "SOCRNAME", "SCNAME", "KOD_T_ST"], "kod_t_st",
"AddressObjectType") "AddressObjectType")
db_shemas['AOTRIG'] = DbSchema("AOTRIG", ["WORD", "TRIGRAMM"], "word",
None)
allowed_tables = ["ADDROBJ", "SOCRBASE"] allowed_tables = ["ADDROBJ", "SOCRBASE"]

View File

@ -1,17 +1,89 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import json
import logging
import Levenshtein
import psycopg2
import sphinxapi import sphinxapi
import logging from aore.config import db as dbparams
import json from aore.dbutils.dbimpl import DBImpl
from aore.miscutils.trigram import trigram
class SphinxSearch: class SphinxSearch:
def __init__(self): def __init__(self):
self.delta_len = 2
self.db = DBImpl(psycopg2, dbparams)
self.client = sphinxapi.SphinxClient() self.client = sphinxapi.SphinxClient()
self.client.SetServer("localhost", 9312) self.client.SetServer("localhost", 9312)
self.client.SetLimits(0, 10) self.client.SetLimits(0, 10)
# Prepare the shared Sphinx client (self.client) for a query against `index_name`.
# For "idx_fias_sugg" with a `wlen` given, the client is switched to extended
# matching with word-count ranking, restricted to rows whose "len" attribute lies
# between wlen - delta_len and wlen + delta_len, and sorted by the computed
# "krank" expression (@weight + delta_len - abs(len - wlen)), highest first.
# NOTE(review): indentation was lost in this view, so it is not clear whether the
# `else` below pairs with `if wlen:` or with the index-name check -- confirm.
def __configure(self, index_name, wlen=None):
if index_name == "idx_fias_sugg":
if wlen:
self.client.SetMatchMode(sphinxapi.SPH_MATCH_EXTENDED2)
self.client.SetRankingMode(sphinxapi.SPH_RANK_WORDCOUNT)
self.client.SetFilterRange("len", int(wlen) - self.delta_len, int(wlen) + self.delta_len)
self.client.SetSelect("word, len, @weight+{}-abs(len-{}) AS krank".format(self.delta_len, wlen))
self.client.SetSortMode(sphinxapi.SPH_SORT_EXTENDED, "krank DESC")
else:
# NOTE(review): "MA" does not look like a sphinxapi match-mode constant (the one
# used above is named SPH_MATCH_*); probably an unfinished edit -- confirm the
# intended mode before this branch is exercised.
self.client.SetMatchMode(sphinxapi.MA)
# Types =
class SRankType:
    """Flags describing how a typed word relates to the suggestion dictionary.

    An instance is built from a two-character rank code and exposes one
    boolean attribute per entry of ``names``; the attribute is True when
    the code is listed for that entry.

    :param rtype: rank code string, e.g. ``'00'``, ``'01'``, ``'1*'``
    """

    # Rank codes that switch each flag on.
    names = dict(
        # Definitely a typo: many suggestions needed, no word* wildcard.
        SRANK_EXACTLY_MISSPRINT=['00'],
        # Definitely an unfinished word: no suggestions needed, only word*.
        SRANK_EXACTLY_TYPING=['01', '11'],
        # Possibly an unfinished word: a few suggestions plus word*.
        SRANK_PROBABLY_TYPING=['0*'],
        # Possibly typed exactly: a few suggestions, no word*.
        SRANK_PROBABLY_FOUND=['10'],
        # Possibly a compound word: there is both an exact match and a
        # match by mask; a few suggestions plus word* are needed.
        SRANK_PROBABLY_COMPLEX=['1*']
    )

    def __init__(self, rtype):
        self.rtype = rtype
        # items() is available on Python 2 and 3 alike, unlike iteritems().
        for flag_name, codes in self.names.items():
            setattr(self, flag_name, rtype in codes)
def __get_strong_and_uncomplete_ranks(self, word):
word_len = str(len(word) / 2)
sql_qry = "SELECT COUNT(*) FROM \"AOTRIG\" WHERE word LIKE '{}%' AND LENGTH(word) > {} " \
"UNION ALL SELECT COUNT(*) FROM \"AOTRIG\" WHERE word='{}'".format(
word, word_len, word)
result = self.db.get_rows(sql_qry)
strong_rank = result[1][0]
uncomplete_rank = result[0][0]
if uncomplete_rank > 1:
uncomplete_rank = '*'
return self.SRankType(str(strong_rank) + str(uncomplete_rank))
def get_suggest(self, word):
    """Return spelling suggestions for *word* from the "idx_fias_sugg" index.

    The word is converted to trigrams and sent as a quoted ``"..."/2``
    query. Only matches whose ``krank`` is less than 2 below the first
    match's ``krank`` are kept.

    :param word: the word to find suggestions for
    :return: list of ``[suggested_word, jaro_similarity]`` pairs sorted by
        similarity, highest first; an empty list when the query fails or
        returns no matches
    """
    # Floor division keeps this an integer on Python 3 as well.
    word_len = str(len(word) // 2)
    trigrammed_word = '"{}"/2'.format(trigram(word))

    self.__configure("idx_fias_sugg", word_len)
    result = self.client.Query(trigrammed_word, 'idx_fias_sugg')

    # Query() gives None when the request fails; treat that the same as
    # "no suggestions found" instead of failing on the subscript below.
    if not result or not result['matches']:
        return []

    maxrank = result['matches'][0]['attrs']['krank']
    outlist = []
    for match in result['matches']:
        attrs = match['attrs']
        if maxrank - attrs['krank'] < 2:
            outlist.append([attrs['word'], Levenshtein.jaro(word, attrs['word'])])
    outlist.sort(key=lambda pair: pair[1], reverse=True)

    # Diagnostic output goes through logging rather than stdout.
    for suggestion, score in outlist:
        logging.debug("%s %s", suggestion, score)

    return outlist
def find(self, text): def find(self, text):
# TODO: ADD index # TODO: ADD index
logging.info("12") logging.info("12")

View File

@ -5,75 +5,121 @@ import os
from bottle import template from bottle import template
from aore.config import db as dbconfig, sphinx_index_addjobj, sphinx_var_dir, trashfolder from aore.aoutils.aoxmltableentry import AoXmlTableEntry
from aore.dbutils.dbhandler import DbHandler
from trigram import trigram
from aore.config import db as dbconfig, sphinx_index_addjobj, sphinx_var_dir, trashfolder, sphinx_index_sugg
def configure_sphinx(indexer_binary): class SphinxHelper:
logging.info("Start configuring Sphinx...") def __init__(self, ):
self.index_binary = None
self.files = dict()
# Create ADDROBJ config def configure_indexer(self, indexer_binary):
addrobj_cfg_name = get_addrobj_config() logging.info("Start configuring Sphinx...")
self.index_binary = indexer_binary
# Indexing it... # Create ADDROBJ config
run_index_cmd = "{} -c {} --all".format(indexer_binary, addrobj_cfg_name) self.files['addrobj.conf'] = self.__create_ao_index_config()
logging.info("Run indexer (indexing ADDROBJ)...")
os.system(run_index_cmd)
logging.info("{} index was created.".format(sphinx_index_addjobj))
# Produce dict file # Indexing ADDROBJ config
sugg_dict_name = get_suggestion_dict(indexer_binary, addrobj_cfg_name) run_index_cmd = "{} -c {} --all".format(self.index_binary, self.files['addrobj.conf'])
logging.info("Indexing main ({})...".format(sphinx_index_addjobj))
#os.system(run_index_cmd)
logging.info("{} index was created.".format(sphinx_index_addjobj))
# Produce dict file
self.files['dict.txt'] = self.__create_suggestion_dict()
# Put dict into db
#self.files['dict.csv'] = self.__dbexport_sugg_dict()
# Create SUGGEST config
self.files['suggest.conf'] = self.__create_sugg_index_config()
run_index_cmd = "{} -c {} --all".format(self.index_binary, self.files['suggest.conf'])
logging.info("Indexing main ({})...".format(sphinx_index_sugg))
os.system(run_index_cmd)
logging.info("{} index was created.".format(sphinx_index_sugg))
def get_suggestion_dict(indexer_binary, addrobj_cfg_name): def __create_sugg_index_config(self):
logging.info("Make suggestion dict...") fname = os.path.abspath(trashfolder + "suggest.conf")
dict_file_name = os.path.abspath(trashfolder + "suggdict.txt") logging.info("Creating config {}".format(fname))
run_builddict_cmd = "{} {} -c {} --buildstops {} 200000 --buildfreqs".format(indexer_binary, sphinx_index_addjobj,
addrobj_cfg_name, dict_file_name)
os.system(run_builddict_cmd)
logging.info("Done.")
return dict_file_name conf_data = template('aore/templates/sphinx/idx_suggest.conf', db_host=dbconfig['host'],
db_user=dbconfig['user'],
db_password=dbconfig['password'],
db_name=dbconfig['database'], db_port=dbconfig['port'],
index_name=sphinx_index_sugg,
sphinx_var_path=sphinx_var_dir)
f = open(fname, "w")
f.write(conf_data)
f.close()
def get_addrobj_config(): logging.info("Done.")
config_fname = os.path.abspath(trashfolder + "addrobj.conf")
logging.info("Creating config {}".format(config_fname))
conf_data = template('aore/templates/sphinx/idx_addrobj.conf', db_host=dbconfig['host'], db_user=dbconfig['user'], return fname
db_password=dbconfig['password'],
db_name=dbconfig['database'], db_port=dbconfig['port'],
sql_query=template('aore/templates/postgre/sphinx_query.sql').replace("\n", " \\\n"),
index_name=sphinx_index_addjobj,
sphinx_var_path=sphinx_var_dir)
f = open(config_fname, "w") def __dbexport_sugg_dict(self):
f.write(conf_data) logging.info("Place suggestion dict to DB {}...".format(self.files['dict.txt']))
f.close() dict_dat_fname = os.path.abspath(trashfolder + "suggdict.csv")
logging.info("Done.") with open(self.files['dict.txt'], "r") as dict_file, open(dict_dat_fname, "w") as exit_file:
line = None
while line != '':
nodes = []
line = dict_file.readline()
if line == '':
break
return config_fname keyword = line.split(' ')[0]
if not keyword:
raise BaseException("Cannot process {}".format(self.files['dict.txt']))
nodes.append(keyword)
nodes.append(trigram(keyword))
exit_file.write("\t".join(nodes) + "\n")
aodp = DbHandler()
aodp.bulk_csv(AoXmlTableEntry.OperationType.update, "AOTRIG", 8, dict_dat_fname)
logging.info("Done.")
def __create_ao_index_config(self):
    """Render the ADDROBJ Sphinx config and write it as ``addrobj.conf``.

    The file is placed under ``trashfolder``; the path is built by plain
    concatenation, so ``trashfolder`` must already end with a separator.

    :return: absolute path of the written config file
    """
    fname = os.path.abspath(trashfolder + "addrobj.conf")
    logging.info("Creating config {}".format(fname))

    conf_data = template('aore/templates/sphinx/idx_addrobj.conf', db_host=dbconfig['host'],
                         db_user=dbconfig['user'],
                         db_password=dbconfig['password'],
                         db_name=dbconfig['database'], db_port=dbconfig['port'],
                         # Every newline of the SQL gets a trailing " \" --
                         # presumably so the multi-line query stays a single
                         # continued value inside the Sphinx config.
                         sql_query=template('aore/templates/postgre/sphinx_query.sql').replace("\n", " \\\n"),
                         index_name=sphinx_index_addjobj,
                         sphinx_var_path=sphinx_var_dir)

    # The context manager closes the file even when the write fails.
    with open(fname, "w") as conf_file:
        conf_file.write(conf_data)

    logging.info("Done.")

    return fname
def __create_suggestion_dict(self):
    """Compute the suggestion dictionary path and prepare the build command.

    The indexer command line is assembled from ``self.index_binary`` and
    the previously created ``addrobj.conf`` path, but its execution is
    currently disabled, so nothing is run here.

    :return: absolute path of ``suggdict.txt`` under ``trashfolder``
    """
    dict_path = os.path.abspath(trashfolder + "suggdict.txt")
    logging.info("Make suggestion dict ({})...".format(dict_path))

    cmd_args = (self.index_binary, sphinx_index_addjobj, self.files['addrobj.conf'], dict_path)
    run_builddict_cmd = "{} {} -c {} --buildstops {} 200000 --buildfreqs".format(*cmd_args)
    # Execution is switched off in the source; the command is only built.
    #os.system(run_builddict_cmd)
    logging.info("Done.")

    return dict_path
# TRASH # TRASH
def produce_sphinx_config(config_name): # conf_data = template('aore/templates/sphinx/sphinx.conf', sphinx_var_path=sphinx_var_dir)
conf_data = template('aore/templates/sphinx/sphinx.conf', sphinx_var_path=sphinx_var_dir)
if os.path.isfile(config_name):
choice = raw_input(
"WARNING! File {} already exists. It will be overwritten, "
"all settings all setting will be lost! Are you sure? [y/n]: ".format(
config_name))
if choice.lower() != 'y':
logging.warning("Aborted.")
return
conf_file = open(config_name, "w")
conf_file.write(conf_data)
conf_file.close()
logging.info("Success! Re-index db: \n"
"\t$indexer -c {} --all --rotate\n"
"and then re/start your Sphinx:\n"
"\t$/etc/init.d/sphinxsearch stop\n"
"\t$/etc/init.d/sphinxsearch start".format(config_name))

11
aore/miscutils/trigram.py Normal file
View File

@ -0,0 +1,11 @@
# -*- coding: utf-8 -*-
def trigram(inp):
    """Split a word into space-separated character trigrams.

    The word is padded with two underscores on each side, so a word of
    ``n`` characters yields ``n + 2`` trigrams.

    :param inp: the word, as text or as UTF-8 encoded bytes
    :return: the trigrams joined by single spaces, as text
    """
    # A non-ASCII byte string cannot be concatenated with the unicode
    # padding below, so decode it first. ``bytes`` is ``str`` on Python 2,
    # which covers plain (non-unicode) literals there.
    if isinstance(inp, bytes):
        inp = inp.decode("utf-8")

    padded = u"__" + inp + u"__"
    return u" ".join(padded[i:i + 3] for i in range(len(padded) - 2))

View File

@ -0,0 +1,23 @@
# Sphinx source and index definition; presumably the suggestion-index template.
# The double-brace placeholders are filled in when the template is rendered.
source {{index_name}}
{
# Rows are read from a PostgreSQL database using the connection settings below.
type = pgsql
sql_host = {{db_host}}
sql_user = {{db_user}}
sql_pass = {{db_password}}
sql_db = {{db_name}}
sql_port = {{db_port}}
# One document per "AOTRIG" row; the word length is selected as "len".
sql_query = SELECT id, trigramm, word, LENGTH(word) AS len FROM "AOTRIG"
sql_field_string = trigramm
sql_attr_uint = len
sql_attr_string = word
}
index {{index_name}}
{
source = {{index_name}}
# Index files are stored under the data directory of the Sphinx var path.
path = {{sphinx_var_path}}/data/{{index_name}}
docinfo = extern
charset_type = utf-8
}

View File

@ -3,7 +3,7 @@
import optparse import optparse
from aore.aoutils.aoupdater import AoUpdater from aore.aoutils.aoupdater import AoUpdater
from aore.miscutils.sphinx import configure_sphinx from aore.miscutils.sphinx import SphinxHelper
from aore.fias.search import SphinxSearch from aore.fias.search import SphinxSearch
@ -42,7 +42,10 @@ def main():
update_base(options.source, int(options.update_count)) update_base(options.source, int(options.update_count))
if options.sphinx and options.indexer_path: if options.sphinx and options.indexer_path:
configure_sphinx(options.indexer_path) sphinxh = SphinxHelper()
sphinxh.configure_indexer(options.indexer_path)
if __name__ == '__main__': if __name__ == '__main__':
main() sph = SphinxSearch()
sph.get_suggest('апасьево')
#main()