diff --git a/aore/config.py b/aore/config.py index a6fa77b..da9b7f4 100644 --- a/aore/config.py +++ b/aore/config.py @@ -37,9 +37,14 @@ SPHINX_VAR_DIRS = dict( # config_type = "test" # Main section -sphinx_index_addjobj="idx_fias_addrobj" -sphinx_index_sugg="idx_fias_sugg" -sphinx_var_dir=SPHINX_VAR_DIRS[config_type] +sphinx = dict( + host_name="localhost", + port=9312, + index_addjobj="idx_fias_addrobj", + index_sugg="idx_fias_sugg", + var_dir=SPHINX_VAR_DIRS[config_type] +) + db = DB_INSTANCES[config_type] unrar = UNRAR_PATHES[config_type] trashfolder = "files/" diff --git a/aore/dbutils/dbimpl.py b/aore/dbutils/dbimpl.py index 75b5281..09a1a1e 100644 --- a/aore/dbutils/dbimpl.py +++ b/aore/dbutils/dbimpl.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- from traceback import format_exc +import psycopg2.extras class DBImpl: @@ -30,8 +31,11 @@ class DBImpl: self.transaction_rollback() raise BaseException("Error execute sql query. Reason : {}".format(format_exc())) - def get_rows(self, query_string): - cur = self.connection.cursor() + def get_rows(self, query_string, dict_cursor=False): + if dict_cursor: + cur = self.connection.cursor(cursor_factory=psycopg2.extras.RealDictCursor) + else: + cur = self.connection.cursor() cur.execute(query_string) rows = cur.fetchall() diff --git a/aore/fias/fiasfactory.py b/aore/fias/fiasfactory.py index ffaca33..4f8803a 100644 --- a/aore/fias/fiasfactory.py +++ b/aore/fias/fiasfactory.py @@ -1,11 +1,17 @@ # -*- coding: utf-8 -*- +import psycopg2 +from bottle import template + +from aore.dbutils.dbimpl import DBImpl from aore.fias.search import SphinxSearch -import logging +from aore.config import db as dbparams class FiasFactory: def __init__(self): - self.searcher = SphinxSearch() + self.db = DBImpl(psycopg2, dbparams) + self.searcher = SphinxSearch(self.db) + self.expand_templ = template('aore/templates/postgre/expand_query.sql', aoid="//aoid") # text - строка поиска # strong - строгий поиск (True) или "мягкий" (False) (с допущением ошибок, опечаток) @@ -13,6 +19,21 @@ class FiasFactory: def find(self, text, strong=False, out_format="simple"): try: results = self.searcher.find(text, strong) - print results - except: - return [] + except Exception, err: + return dict(error=err.args[0]) + + return results + + # Нормализует подаваемый AOID или AOGUID в актуальный AOID + def normalize(self, aoid_guid): + pass + + # Разворачивает AOID в представление (перед этим нормализует) + def expand(self, aoid_guid): + try: + sql_query = self.expand_templ.replace("//aoid", aoid_guid) + rows = self.db.get_rows(sql_query, True) + except Exception, err: + return dict(error=err.args[0]) + + return rows diff --git a/aore/fias/search.py b/aore/fias/search.py index 0e261bf..a56653f 100644 --- a/aore/fias/search.py +++ b/aore/fias/search.py @@ -1,40 +1,42 @@ # -*- coding: utf-8 -*- -import json +import logging import re import Levenshtein -import psycopg2 import sphinxapi -import logging -from aore.config import db as dbparams, sphinx_index_sugg, sphinx_index_addjobj -from aore.dbutils.dbimpl import DBImpl +from aore.config import sphinx from aore.fias.wordentry import WordEntry from aore.miscutils.trigram import trigram class SphinxSearch: - def __init__(self): + def __init__(self, db): self.delta_len = 2 + self.rating_limit_soft = 0.4 + self.rating_limit_soft_count = 6 + self.word_length_soft = 3 + self.rating_limit_hard = 0.82 + self.rating_limit_hard_count = 3 + self.default_rating_delta = 2 self.regression_coef = 0.04 - self.db = DBImpl(psycopg2, dbparams) - + self.db = db self.client_sugg = sphinxapi.SphinxClient() - self.client_sugg.SetServer("127.0.0.1", 9312) + self.client_sugg.SetServer(sphinx.host, sphinx.port) self.client_sugg.SetLimits(0, 10) - self.client_sugg.SetConnectTimeout(7.0) + self.client_sugg.SetConnectTimeout(3.0) self.client_show = sphinxapi.SphinxClient() - self.client_show.SetServer("127.0.0.1", 9312) + self.client_show.SetServer(sphinx.host, sphinx.port) self.client_show.SetLimits(0, 10) - self.client_show.SetConnectTimeout(7.0) + self.client_show.SetConnectTimeout(3.0) def __configure(self, index_name, wlen=None): - if index_name == "idx_fias_sugg": + if index_name == sphinx.index_sugg: if wlen: self.client_sugg.SetMatchMode(sphinxapi.SPH_MATCH_EXTENDED2) self.client_sugg.SetRankingMode(sphinxapi.SPH_RANK_WORDCOUNT) @@ -43,14 +45,15 @@ class SphinxSearch: self.client_sugg.SetSortMode(sphinxapi.SPH_SORT_EXTENDED, "krank DESC") else: self.client_show.SetMatchMode(sphinxapi.SPH_MATCH_EXTENDED2) - #self.client_show.SetRankingMode(sphinxapi.SPH_RANK_BM25) + self.client_show.SetRankingMode(sphinxapi.SPH_RANK_BM25) + self.client_show.SetSortMode(sphinxapi.SPH_SORT_RELEVANCE) def __get_suggest(self, word, rating_limit, count): word_len = str(len(word) / 2) trigrammed_word = '"{}"/1'.format(trigram(word)) - self.__configure(sphinx_index_sugg, word_len) - result = self.client_sugg.Query(trigrammed_word, sphinx_index_sugg) + self.__configure(sphinx.index_sugg, word_len) + result = self.client_sugg.Query(trigrammed_word, sphinx.index_sugg) # Если по данному слову не найдено подсказок (а такое бывает?) # возвращаем [] @@ -83,15 +86,15 @@ class SphinxSearch: def __add_word_variations(self, word_entry, strong): if word_entry.MT_MANY_SUGG and not strong: - suggs = self.__get_suggest(word_entry.word, self.rating_limit_soft, 6) + suggs = self.__get_suggest(word_entry.word, self.rating_limit_soft, self.rating_limit_soft_count) for suggestion in suggs: word_entry.add_variation(suggestion[0]) if word_entry.MT_SOME_SUGG and not strong: - suggs = self.__get_suggest(word_entry.word, self.rating_limit_hard, 3) + suggs = self.__get_suggest(word_entry.word, self.rating_limit_hard, self.rating_limit_hard_count) for suggestion in suggs: word_entry.add_variation(suggestion[0]) if word_entry.MT_LAST_STAR: - word_entry.add_variation(word_entry.word+'*') + word_entry.add_variation(word_entry.word + '*') if word_entry.MT_AS_IS: word_entry.add_variation(word_entry.word) if word_entry.MT_ADD_SOCR: @@ -99,6 +102,8 @@ class SphinxSearch: def __get_word_entries(self, words, strong): for word in words: + if not strong and len(word) < self.word_length_soft: + continue if word != '': we = WordEntry(self.db, word) self.__add_word_variations(we, strong) @@ -112,16 +117,15 @@ class SphinxSearch: word_entries = self.__get_word_entries(words, strong) sentence = "{}".format(" MAYBE ".join(x.get_variations() for x in word_entries)) - self.__configure(sphinx_index_addjobj) - logging.info("QUERY "+sentence) - rs = self.client_show.Query(sentence, sphinx_index_addjobj) - logging.info("OK") - - print json.dumps(rs) - + self.__configure(sphinx.index_addjobj) + logging.info("QUERY " + sentence) + rs = self.client_show.Query(sentence, sphinx.index_addjobj) logging.info("OK") results = [] for ma in rs['matches']: - results.append([ma['attrs']['aoid'], ma['attrs']['fullname'], ma['weight']]) + results.append(dict(aoid=ma['attrs']['aoid'], text=ma['attrs']['fullname'], ratio=ma['weight'])) + + if strong: + results.sort(key=lambda x: Levenshtein.ratio(text, x['text']), reverse=True) return results diff --git a/aore/fias/wordentry.py b/aore/fias/wordentry.py index f42a50a..50d4421 100644 --- a/aore/fias/wordentry.py +++ b/aore/fias/wordentry.py @@ -80,7 +80,10 @@ class WordEntry: outmask = "" for ra in result: if ra[0] > 1: - outmask += 'x' + if word_len > 2: + outmask += 'x' + else: + outmask += '1' else: outmask += str(ra[0]) diff --git a/aore/miscutils/sphinx.py b/aore/miscutils/sphinx.py index 3af57a2..8bed437 100644 --- a/aore/miscutils/sphinx.py +++ b/aore/miscutils/sphinx.py @@ -7,12 +7,12 @@ from bottle import template from aore.updater.aoxmltableentry import AoXmlTableEntry from aore.updater.dbhandler import DbHandler -from aore.config import db as dbconfig, sphinx_index_addjobj, sphinx_var_dir, trashfolder, sphinx_index_sugg +from aore.config import db as dbconfig, sphinx, trashfolder from trigram import trigram class SphinxHelper: - def __init__(self, ): + def __init__(self): self.index_binary = None self.files = dict() @@ -58,8 +58,8 @@ class SphinxHelper: db_user=dbconfig['user'], db_password=dbconfig['password'], db_name=dbconfig['database'], db_port=dbconfig['port'], - index_name=sphinx_index_sugg, - sphinx_var_path=sphinx_var_dir) + index_name=sphinx.index_sugg, + sphinx_var_path=sphinx.var_dir) f = open(fname, "w") f.write(conf_data) @@ -112,8 +112,8 @@ class SphinxHelper: db_password=dbconfig['password'], db_name=dbconfig['database'], db_port=dbconfig['port'], sql_query=template('aore/templates/postgre/sphinx_query.sql').replace("\n", " \\\n"), - index_name=sphinx_index_addjobj, - sphinx_var_path=sphinx_var_dir) + index_name=sphinx.index_addjobj, + sphinx_var_path=sphinx.var_dir) f = open(fname, "w") f.write(conf_data) @@ -128,7 +128,7 @@ class SphinxHelper: logging.info("Make suggestion dict ({})...".format(fname)) run_builddict_cmd = "{} {} -c {} --buildstops {} 200000 --buildfreqs".format(self.index_binary, - sphinx_index_addjobj, + sphinx.index_addjobj, self.files['addrobj.conf'], fname) os.system(run_builddict_cmd) logging.info("Done.") @@ -139,7 +139,7 @@ class SphinxHelper: out_filename = os.path.abspath(config_fname) logging.info("Creating main config {}...".format(out_filename)) - conf_data = template('aore/templates/sphinx/sphinx.conf', sphinx_var_path=sphinx_var_dir) + conf_data = template('aore/templates/sphinx/sphinx.conf', sphinx_var_path=sphinx.var_dir) f = open(out_filename, "w") for fname, fpath in self.files.iteritems(): diff --git a/aore/templates/postgre/expand_query.sql b/aore/templates/postgre/expand_query.sql new file mode 100644 index 0000000..3a63279 --- /dev/null +++ b/aore/templates/postgre/expand_query.sql @@ -0,0 +1,9 @@ +WITH RECURSIVE child_to_parents AS ( + SELECT "ADDROBJ".* FROM "ADDROBJ" + WHERE aoid = '{{ aoid }}' + UNION ALL + SELECT "ADDROBJ".* FROM "ADDROBJ", child_to_parents + WHERE "ADDROBJ".aoguid = child_to_parents.parentguid + AND "ADDROBJ".actstatus = True AND "ADDROBJ".livestatus = True AND "ADDROBJ".nextid IS NULL +) +SELECT DISTINCT ON (scname) cs.aoid, cs.aoguid, cs.shortname, cs.formalname, cs.aolevel, s.socrname FROM child_to_parents cs LEFT JOIN "SOCRBASE" s ON s.scname=cs.shortname ORDER BY scname, aolevel; diff --git a/aore/templates/sphinx/idx_addrobj.conf b/aore/templates/sphinx/idx_addrobj.conf index 6a38675..112df1a 100644 --- a/aore/templates/sphinx/idx_addrobj.conf +++ b/aore/templates/sphinx/idx_addrobj.conf @@ -17,16 +17,10 @@ source {{index_name}} index {{ index_name }} { - docinfo = extern - morphology = stem_ru - min_stemming_len = 3 - - stopwords = min_word_len = 1 - charset_type = utf-8 min_prefix_len = 1 min_infix_len = 0 - enable_star = 1 + ngram_len = 1 # strip html by default html_strip = 1 diff --git a/manage.py b/manage.py index 874b953..5499ecd 100644 --- a/manage.py +++ b/manage.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- - +import json import optparse from aore.fias.fiasfactory import FiasFactory @@ -117,7 +117,9 @@ def main(): # 4 Debug purposes.. if options.test: sph = FiasFactory() - sph.find('ул кемровая пасраул алтай майминский р-н') + print json.dumps(sph.expand("453091f5-2336-4aea-9b90-c4060dca0b33")) + print json.dumps(sph.find('с паспаул ул кедровая', True)) + print json.dumps(sph.find('с паспаул ул кедровая')) if __name__ == '__main__':