From 1e861f72858619bfcaba63c5ea291b1a11e89c82 Mon Sep 17 00:00:00 2001 From: Jack Stdin Date: Fri, 15 Jan 2016 15:15:10 +0300 Subject: [PATCH] =?UTF-8?q?=D0=94=D0=BE=D0=B1=D0=B0=D0=B2=D0=BB=D0=B5?= =?UTF-8?q?=D0=BD=20=D0=BA=D0=BE=D0=BE=D1=81=D1=82=D1=8F=D0=BA=20=D0=B4?= =?UTF-8?q?=D0=BB=D1=8F=20=D0=BF=D0=BE=D0=B4=D1=81=D0=BA=D0=B0=D0=B7=D0=BE?= =?UTF-8?q?=D0=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- aore/dbutils/dbimpl.py | 7 +- aore/dbutils/dbschemas.py | 3 + aore/fias/search.py | 76 +++++++++++- aore/miscutils/sphinx.py | 156 ++++++++++++++++--------- aore/miscutils/trigram.py | 11 ++ aore/templates/sphinx/idx_suggest.conf | 23 ++++ manage.py | 9 +- 7 files changed, 220 insertions(+), 65 deletions(-) create mode 100644 aore/miscutils/trigram.py create mode 100644 aore/templates/sphinx/idx_suggest.conf diff --git a/aore/dbutils/dbimpl.py b/aore/dbutils/dbimpl.py index 7c26126..75b5281 100644 --- a/aore/dbutils/dbimpl.py +++ b/aore/dbutils/dbimpl.py @@ -30,11 +30,8 @@ class DBImpl: self.transaction_rollback() raise BaseException("Error execute sql query. Reason : {}".format(format_exc())) - def get_rows(self, query_string, for_dict=True): - if for_dict: - cur = self.connection.cursor(self.db_engine.cursors.DictCursor) - else: - cur = self.connection.cursor() + def get_rows(self, query_string): + cur = self.connection.cursor() cur.execute(query_string) rows = cur.fetchall() diff --git a/aore/dbutils/dbschemas.py b/aore/dbutils/dbschemas.py index 13f1f52..02fdd28 100644 --- a/aore/dbutils/dbschemas.py +++ b/aore/dbutils/dbschemas.py @@ -18,4 +18,7 @@ db_shemas['ADDROBJ'] = DbSchema("ADDROBJ", db_shemas['SOCRBASE'] = DbSchema("SOCRBASE", ["LEVEL", "SOCRNAME", "SCNAME", "KOD_T_ST"], "kod_t_st", "AddressObjectType") +db_shemas['AOTRIG'] = DbSchema("AOTRIG", ["WORD", "TRIGRAMM"], "word", + None) + allowed_tables = ["ADDROBJ", "SOCRBASE"] diff --git a/aore/fias/search.py b/aore/fias/search.py index ee8d5f6..aa5861a 100644 --- a/aore/fias/search.py +++ b/aore/fias/search.py @@ -1,17 +1,89 @@ # -*- coding: utf-8 -*- +import json +import logging + +import Levenshtein +import psycopg2 import sphinxapi -import logging -import json +from aore.config import db as dbparams +from aore.dbutils.dbimpl import DBImpl +from aore.miscutils.trigram import trigram class SphinxSearch: def __init__(self): + self.delta_len = 2 + self.db = DBImpl(psycopg2, dbparams) self.client = sphinxapi.SphinxClient() self.client.SetServer("localhost", 9312) self.client.SetLimits(0, 10) + def __configure(self, index_name, wlen=None): + if index_name == "idx_fias_sugg": + if wlen: + self.client.SetMatchMode(sphinxapi.SPH_MATCH_EXTENDED2) + self.client.SetRankingMode(sphinxapi.SPH_RANK_WORDCOUNT) + self.client.SetFilterRange("len", int(wlen) - self.delta_len, int(wlen) + self.delta_len) + self.client.SetSelect("word, len, @weight+{}-abs(len-{}) AS krank".format(self.delta_len, wlen)) + self.client.SetSortMode(sphinxapi.SPH_SORT_EXTENDED, "krank DESC") + else: + self.client.SetMatchMode(sphinxapi.MA) + + # Types = + class SRankType: + names = dict( + SRANK_EXACTLY_MISSPRINT=['00'], # Точно - опечатка, нужно много подсказок, без word* + SRANK_EXACTLY_TYPING=['01', '11'], # Точно - слово недопечатано, не надо подсказок, только word* + SRANK_PROBABLY_TYPING=['0*'], # Возможно - слово недопечатано, немного подсказок и word* + SRANK_PROBABLY_FOUND=['10'], # Возможно - слово введено точно, немного подсказок, без word* + SRANK_PROBABLY_COMPLEX=['1*'] + # Возможно, слово сложное, есть и точное совпадние, по маске Нужно немного подсказок и word* + ) + + def __init__(self, rtype): + self.rtype = rtype + for x, y in self.names.iteritems(): + self.__dict__[x] = self.rtype in y + + def __get_strong_and_uncomplete_ranks(self, word): + word_len = str(len(word) / 2) + sql_qry = "SELECT COUNT(*) FROM \"AOTRIG\" WHERE word LIKE '{}%' AND LENGTH(word) > {} " \ + "UNION ALL SELECT COUNT(*) FROM \"AOTRIG\" WHERE word='{}'".format( + word, word_len, word) + + result = self.db.get_rows(sql_qry) + strong_rank = result[1][0] + uncomplete_rank = result[0][0] + if uncomplete_rank > 1: + uncomplete_rank = '*' + + return self.SRankType(str(strong_rank) + str(uncomplete_rank)) + + def get_suggest(self, word): + word_len = str(len(word) / 2) + trigrammed_word = '"{}"/2'.format(trigram(word)) + + self.__configure("idx_fias_sugg", word_len) + result = self.client.Query(trigrammed_word, 'idx_fias_sugg') + + # Если по данному слову не найдено подсказок (а такое бывает?) + # возвращаем [] + if not result['matches']: + return [] + + maxrank = result['matches'][0]['attrs']['krank'] + outlist = list() + for match in result['matches']: + if maxrank - match['attrs']['krank'] < 2: + outlist.append([match['attrs']['word'], Levenshtein.jaro(word, match['attrs']['word'])]) + outlist.sort(key=lambda x: x[1], reverse=True) + + for x in outlist: + print x[0], x[1] + return outlist + def find(self, text): # TODO: ADD index logging.info("12") diff --git a/aore/miscutils/sphinx.py b/aore/miscutils/sphinx.py index 77faffa..44b21ca 100644 --- a/aore/miscutils/sphinx.py +++ b/aore/miscutils/sphinx.py @@ -5,75 +5,121 @@ import os from bottle import template -from aore.config import db as dbconfig, sphinx_index_addjobj, sphinx_var_dir, trashfolder +from aore.aoutils.aoxmltableentry import AoXmlTableEntry +from aore.dbutils.dbhandler import DbHandler +from trigram import trigram + +from aore.config import db as dbconfig, sphinx_index_addjobj, sphinx_var_dir, trashfolder, sphinx_index_sugg -def configure_sphinx(indexer_binary): - logging.info("Start configuring Sphinx...") +class SphinxHelper: + def __init__(self, ): + self.index_binary = None + self.files = dict() - # Create ADDROBJ config - addrobj_cfg_name = get_addrobj_config() + def configure_indexer(self, indexer_binary): + logging.info("Start configuring Sphinx...") + self.index_binary = indexer_binary - # Indexing it... - run_index_cmd = "{} -c {} --all".format(indexer_binary, addrobj_cfg_name) - logging.info("Run indexer (indexing ADDROBJ)...") - os.system(run_index_cmd) - logging.info("{} index was created.".format(sphinx_index_addjobj)) + # Create ADDROBJ config + self.files['addrobj.conf'] = self.__create_ao_index_config() - # Produce dict file - sugg_dict_name = get_suggestion_dict(indexer_binary, addrobj_cfg_name) + # Indexing ADDROBJ config + run_index_cmd = "{} -c {} --all".format(self.index_binary, self.files['addrobj.conf']) + logging.info("Indexing main ({})...".format(sphinx_index_addjobj)) + #os.system(run_index_cmd) + logging.info("{} index was created.".format(sphinx_index_addjobj)) + + # Produce dict file + self.files['dict.txt'] = self.__create_suggestion_dict() + + # Put dict into db + #self.files['dict.csv'] = self.__dbexport_sugg_dict() + + # Create SUGGEST config + self.files['suggest.conf'] = self.__create_sugg_index_config() + run_index_cmd = "{} -c {} --all".format(self.index_binary, self.files['suggest.conf']) + logging.info("Indexing main ({})...".format(sphinx_index_sugg)) + os.system(run_index_cmd) + logging.info("{} index was created.".format(sphinx_index_sugg)) -def get_suggestion_dict(indexer_binary, addrobj_cfg_name): - logging.info("Make suggestion dict...") - dict_file_name = os.path.abspath(trashfolder + "suggdict.txt") - run_builddict_cmd = "{} {} -c {} --buildstops {} 200000 --buildfreqs".format(indexer_binary, sphinx_index_addjobj, - addrobj_cfg_name, dict_file_name) - os.system(run_builddict_cmd) - logging.info("Done.") + def __create_sugg_index_config(self): + fname = os.path.abspath(trashfolder + "suggest.conf") + logging.info("Creating config {}".format(fname)) - return dict_file_name + conf_data = template('aore/templates/sphinx/idx_suggest.conf', db_host=dbconfig['host'], + db_user=dbconfig['user'], + db_password=dbconfig['password'], + db_name=dbconfig['database'], db_port=dbconfig['port'], + index_name=sphinx_index_sugg, + sphinx_var_path=sphinx_var_dir) + f = open(fname, "w") + f.write(conf_data) + f.close() -def get_addrobj_config(): - config_fname = os.path.abspath(trashfolder + "addrobj.conf") - logging.info("Creating config {}".format(config_fname)) + logging.info("Done.") - conf_data = template('aore/templates/sphinx/idx_addrobj.conf', db_host=dbconfig['host'], db_user=dbconfig['user'], - db_password=dbconfig['password'], - db_name=dbconfig['database'], db_port=dbconfig['port'], - sql_query=template('aore/templates/postgre/sphinx_query.sql').replace("\n", " \\\n"), - index_name=sphinx_index_addjobj, - sphinx_var_path=sphinx_var_dir) + return fname - f = open(config_fname, "w") - f.write(conf_data) - f.close() + def __dbexport_sugg_dict(self): + logging.info("Place suggestion dict to DB {}...".format(self.files['dict.txt'])) + dict_dat_fname = os.path.abspath(trashfolder + "suggdict.csv") - logging.info("Done.") + with open(self.files['dict.txt'], "r") as dict_file, open(dict_dat_fname, "w") as exit_file: + line = None + while line != '': + nodes = [] + line = dict_file.readline() + if line == '': + break - return config_fname + keyword = line.split(' ')[0] + if not keyword: + raise BaseException("Cannot process {}".format(self.files['dict.txt'])) + + nodes.append(keyword) + nodes.append(trigram(keyword)) + + exit_file.write("\t".join(nodes) + "\n") + + aodp = DbHandler() + aodp.bulk_csv(AoXmlTableEntry.OperationType.update, "AOTRIG", 8, dict_dat_fname) + logging.info("Done.") + + def __create_ao_index_config(self): + fname = os.path.abspath(trashfolder + "addrobj.conf") + logging.info("Creating config {}".format(fname)) + + conf_data = template('aore/templates/sphinx/idx_addrobj.conf', db_host=dbconfig['host'], + db_user=dbconfig['user'], + db_password=dbconfig['password'], + db_name=dbconfig['database'], db_port=dbconfig['port'], + sql_query=template('aore/templates/postgre/sphinx_query.sql').replace("\n", " \\\n"), + index_name=sphinx_index_addjobj, + sphinx_var_path=sphinx_var_dir) + + f = open(fname, "w") + f.write(conf_data) + f.close() + + logging.info("Done.") + + return fname + + def __create_suggestion_dict(self): + fname = os.path.abspath(trashfolder + "suggdict.txt") + logging.info("Make suggestion dict ({})...".format(fname)) + + run_builddict_cmd = "{} {} -c {} --buildstops {} 200000 --buildfreqs".format(self.index_binary, + sphinx_index_addjobj, + self.files['addrobj.conf'], fname) + #os.system(run_builddict_cmd) + logging.info("Done.") + + return fname # TRASH -def produce_sphinx_config(config_name): - conf_data = template('aore/templates/sphinx/sphinx.conf', sphinx_var_path=sphinx_var_dir) - - if os.path.isfile(config_name): - choice = raw_input( - "WARNING! File {} already exists. It will be overwritten, " - "all settings all setting will be lost! Are you sure? [y/n]: ".format( - config_name)) - if choice.lower() != 'y': - logging.warning("Aborted.") - return - - conf_file = open(config_name, "w") - conf_file.write(conf_data) - conf_file.close() - - logging.info("Success! Re-index db: \n" - "\t$indexer -c {} --all --rotate\n" - "and then re/start your Sphinx:\n" - "\t$/etc/init.d/sphinxsearch stop\n" - "\t$/etc/init.d/sphinxsearch start".format(config_name)) +# conf_data = template('aore/templates/sphinx/sphinx.conf', sphinx_var_path=sphinx_var_dir) \ No newline at end of file diff --git a/aore/miscutils/trigram.py b/aore/miscutils/trigram.py new file mode 100644 index 0000000..dae37fa --- /dev/null +++ b/aore/miscutils/trigram.py @@ -0,0 +1,11 @@ +# -*- coding: utf-8 -*- + + +def trigram(inp): + inp = u"__"+inp+u"__" + output = [] + + for i in range(0, len(inp) - 2): + output.append(inp[i:i + 3]) + + return " ".join(output) diff --git a/aore/templates/sphinx/idx_suggest.conf b/aore/templates/sphinx/idx_suggest.conf new file mode 100644 index 0000000..ce3e0f7 --- /dev/null +++ b/aore/templates/sphinx/idx_suggest.conf @@ -0,0 +1,23 @@ +source {{index_name}} +{ + type = pgsql + sql_host = {{db_host}} + sql_user = {{db_user}} + sql_pass = {{db_password}} + sql_db = {{db_name}} + sql_port = {{db_port}} + + sql_query = SELECT id, trigramm, word, LENGTH(word) AS len FROM "AOTRIG" + + sql_field_string = trigramm + sql_attr_uint = len + sql_attr_string = word +} + +index {{index_name}} +{ + source = {{index_name}} + path = {{sphinx_var_path}}/data/{{index_name}} + docinfo = extern + charset_type = utf-8 +} \ No newline at end of file diff --git a/manage.py b/manage.py index c14ef9e..2636e22 100644 --- a/manage.py +++ b/manage.py @@ -3,7 +3,7 @@ import optparse from aore.aoutils.aoupdater import AoUpdater -from aore.miscutils.sphinx import configure_sphinx +from aore.miscutils.sphinx import SphinxHelper from aore.fias.search import SphinxSearch @@ -42,7 +42,10 @@ def main(): update_base(options.source, int(options.update_count)) if options.sphinx and options.indexer_path: - configure_sphinx(options.indexer_path) + sphinxh = SphinxHelper() + sphinxh.configure_indexer(options.indexer_path) if __name__ == '__main__': - main() + sph = SphinxSearch() + sph.get_suggest('апасьево') + #main()