From 897ea660464006cd1df578ebc165953ed80c65da Mon Sep 17 00:00:00 2001 From: Jack Stdin Date: Wed, 3 Feb 2016 17:30:47 +0300 Subject: [PATCH] =?UTF-8?q?=D0=98=D0=B7=D0=BC=D0=B5=D0=BD=D0=B5=D0=BD?= =?UTF-8?q?=D0=B0=20=D0=B3=D0=B5=D0=BD=D0=B5=D1=80=D0=B0=D1=86=D0=B8=D1=8F?= =?UTF-8?q?=20=D0=B8=D0=BD=D0=B4=D0=B5=D0=BA=D1=81=D0=BE=D0=B2,=20=D0=BD?= =?UTF-8?q?=D0=B0=D1=87=D0=B0=D0=BB=D0=BE=20=D0=BE=D0=B1=D1=80=D0=B0=D0=B1?= =?UTF-8?q?=D0=BE=D1=82=D0=BA=D0=B8=20"=D1=87=D0=B0=D1=81=D1=82=D1=8B?= =?UTF-8?q?=D1=85"=20=D1=81=D0=BB=D0=BE=D0=B2.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 2 +- aore/config/common.py | 1 + aore/fias/search.py | 30 ++++++++++++++------------ aore/fias/wordentry.py | 21 +++++++++++------- aore/miscutils/sphinx.py | 7 +++--- aore/phias.py | 10 ++++----- aore/templates/sphinx/idx_addrobj.conf | 4 ++-- 7 files changed, 42 insertions(+), 33 deletions(-) diff --git a/README.md b/README.md index 006b87a..9ac37ad 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ _Внимание_! Только Python 2.7, только PostgreSQL, тольк 1. Python 2.7.x, pip 2. PostgreSql 9.5 и выше (из-за синтаксиса _ON CONFLICT ... DO_) -3. Sphinx 2.2.3 и новее (из-за синтаксиса _MAYBE_) +3. Sphinx 2.2.1 и новее 4. Web-сервер с поддержкой WSGI, любой, по Вашему желанию. ### Windows diff --git a/aore/config/common.py b/aore/config/common.py index 328fccb..9afbcc5 100644 --- a/aore/config/common.py +++ b/aore/config/common.py @@ -7,6 +7,7 @@ class sphinx_conf: index_addjobj = "idx_fias_addrobj" index_sugg = "idx_fias_sugg" var_dir = None + min_length_to_star = 3 class db_conf: diff --git a/aore/fias/search.py b/aore/fias/search.py index 1b61a54..7a7f923 100644 --- a/aore/fias/search.py +++ b/aore/fias/search.py @@ -11,21 +11,21 @@ from aore.miscutils.trigram import trigram class SphinxSearch: + # Config's + delta_len = 2 + + rating_limit_soft = 0.41 + rating_limit_soft_count = 6 + word_length_soft = 3 + + rating_limit_hard = 0.82 + rating_limit_hard_count = 3 + + default_rating_delta = 2 + regression_coef = 0.08 + max_result = 10 + def __init__(self, db): - self.delta_len = 2 - - self.rating_limit_soft = 0.41 - self.rating_limit_soft_count = 6 - self.word_length_soft = 3 - - self.rating_limit_hard = 0.82 - self.rating_limit_hard_count = 3 - - self.default_rating_delta = 2 - self.regression_coef = 0.08 - - self.max_result = 10 - self.db = db self.client_sugg = sphinxapi.SphinxClient() self.client_sugg.SetServer(sphinx_conf.host_name, sphinx_conf.port) @@ -38,6 +38,7 @@ class SphinxSearch: self.client_show.SetConnectTimeout(3.0) def __configure(self, index_name, wlen=None): + self.client_sugg.ResetFilters() if index_name == sphinx_conf.index_sugg and wlen: self.client_sugg.SetRankingMode(sphinxapi.SPH_RANK_WORDCOUNT) self.client_sugg.SetFilterRange("len", int(wlen) - self.delta_len, int(wlen) + self.delta_len) @@ -116,6 +117,7 @@ class SphinxSearch: word_entries = self.__get_word_entries(words, strong) word_count = len(word_entries) for x in range(word_count, max(0, word_count - 3), -1): + logging.info("\"{}\"/{}".format(" ".join(x.get_variations() for x in word_entries), x)) self.client_show.AddQuery("\"{}\"/{}".format(" ".join(x.get_variations() for x in word_entries), x), sphinx_conf.index_addjobj) diff --git a/aore/fias/wordentry.py b/aore/fias/wordentry.py index ce93aab..9feaf8c 100644 --- a/aore/fias/wordentry.py +++ b/aore/fias/wordentry.py @@ -1,6 +1,8 @@ # -*- coding: utf-8 -*- import re +from aore.config import sphinx_conf + class WordEntry: # Варианты распеределния для слов с первыми двумя символами, где: @@ -39,14 +41,13 @@ class WordEntry: MT_ADD_SOCR=['..10', '..x0'] ) - min_word_len_to_star = 4 - def __init__(self, db, word): self.db = db self.word = str(word) self.word_len = len(unicode(self.word)) self.variations = [] self.scname = None + self.is_freq_word = False self.ranks = self.__get_ranks() for x, y in self.match_types.iteritems(): @@ -59,7 +60,7 @@ class WordEntry: self.MT_AS_IS = False # Строка слишком котроткая, то по лайку не ищем, будет очень долго - if self.MT_LAST_STAR and self.word_len < self.min_word_len_to_star: + if self.MT_LAST_STAR and self.word_len < sphinx_conf.min_length_to_star: self.MT_LAST_STAR = False self.MT_AS_IS = True @@ -79,8 +80,9 @@ class WordEntry: sql_qry = "SELECT COUNT(*), NULL FROM \"AOTRIG\" WHERE word LIKE '{}%' AND LENGTH(word) > {} " \ "UNION ALL SELECT COUNT(*), NULL FROM \"AOTRIG\" WHERE word='{}' " \ "UNION ALL SELECT COUNT(*), MAX(scname) FROM \"SOCRBASE\" WHERE socrname ILIKE '{}'" \ - "UNION ALL SELECT COUNT(*), NULL FROM \"SOCRBASE\" WHERE scname ILIKE '{}';".format( - self.word, self.word_len, self.word, self.word, self.word) + "UNION ALL SELECT COUNT(*), NULL FROM \"SOCRBASE\" WHERE scname ILIKE '{}'" \ + "UNION ALL SELECT frequency, NULL FROM \"AOTRIG\" WHERE word='{}';".format( + self.word, self.word_len, self.word, self.word, self.word, self.word) result = self.db.get_rows(sql_qry) @@ -88,6 +90,9 @@ class WordEntry: if not self.scname: self.scname = result[2][1] + if len(result) == 5 and result[4][0] > 30000: + self.is_freq_word = True + # Формируем список найденных величин совпадений: # result[x] # x = 0, поиск по неполному совпадению (лайк*), и по длине строки больше исходной @@ -95,11 +100,11 @@ class WordEntry: # x = 2, поиск по базе сокращений (по полному) # x = 3, то же, но по краткому out_mask_list = [] - for ra in result: - if ra[0] > 1: + for i in range(0, 4): + if result[i][0] > 1: out_mask_list.append('x') else: - out_mask_list.append(str(ra[0])) + out_mask_list.append(str(result[i][0])) return ''.join(out_mask_list) diff --git a/aore/miscutils/sphinx.py b/aore/miscutils/sphinx.py index 5fc3b7f..f1a162e 100644 --- a/aore/miscutils/sphinx.py +++ b/aore/miscutils/sphinx.py @@ -15,6 +15,7 @@ class SphinxHelper: def __init__(self): self.index_binary = None self.files = dict() + self.aodp = DbHandler() def configure_indexer(self, indexer_binary, config_filename): logging.info("Start configuring Sphinx...") @@ -98,8 +99,7 @@ class SphinxHelper: except: pass - aodp = DbHandler() - aodp.bulk_csv(AoXmlTableEntry.OperationType.update, "AOTRIG", csv_counter, dict_dat_fname) + self.aodp.bulk_csv(AoXmlTableEntry.OperationType.update, "AOTRIG", csv_counter, dict_dat_fname) logging.info("Done.") def __create_ao_index_config(self): @@ -112,7 +112,8 @@ class SphinxHelper: db_name=db_conf.database, db_port=db_conf.port, sql_query=template('aore/templates/postgre/sphinx_query.sql').replace("\n", " \\\n"), index_name=sphinx_conf.index_addjobj, - sphinx_var_path=sphinx_conf.var_dir) + sphinx_var_path=sphinx_conf.var_dir, + min_length_to_star=sphinx_conf.min_length_to_star) f = open(fname, "w") f.write(conf_data) diff --git a/aore/phias.py b/aore/phias.py index 5adf1e9..770df35 100644 --- a/aore/phias.py +++ b/aore/phias.py @@ -11,20 +11,20 @@ app = Bottle() fias_factory = FiasFactory() -@app.route('/expand/') +@app.route(r'/expand/') def expand(aoid): response.content_type = 'application/json' return json.dumps(fias_factory.expand(aoid)) -@app.route('/normalize/') +@app.route(r'/normalize/') def normalize(aoid): response.content_type = 'application/json' return json.dumps(fias_factory.normalize(aoid)) -@app.route('/find/') -@app.route('/find//') +@app.route(r'/find/') +@app.route(r'/find//') def find(text, strong=False): strong = (strong == "strong") response.content_type = 'application/json' @@ -33,6 +33,6 @@ def find(text, strong=False): @app.error(404) -def error404(): +def error404(error): response.content_type = 'application/json' return json.dumps(dict(error="Page not found")) diff --git a/aore/templates/sphinx/idx_addrobj.conf b/aore/templates/sphinx/idx_addrobj.conf index 112df1a..1ce634d 100644 --- a/aore/templates/sphinx/idx_addrobj.conf +++ b/aore/templates/sphinx/idx_addrobj.conf @@ -18,9 +18,9 @@ source {{index_name}} index {{ index_name }} { min_word_len = 1 - min_prefix_len = 1 + min_prefix_len = {{min_length_to_star}} min_infix_len = 0 - ngram_len = 1 + bigram_index = all # strip html by default html_strip = 1