Изменена генерация индексов, начало обработки "частых" слов.
This commit is contained in:
parent
29a26132e1
commit
897ea66046
@ -35,7 +35,7 @@ _Внимание_! Только Python 2.7, только PostgreSQL, тольк
|
|||||||
|
|
||||||
1. Python 2.7.x, pip
|
1. Python 2.7.x, pip
|
||||||
2. PostgreSQL 9.5 и выше (из-за синтаксиса _ON CONFLICT ... DO_)
|
2. PostgreSQL 9.5 и выше (из-за синтаксиса _ON CONFLICT ... DO_)
|
||||||
3. Sphinx 2.2.3 и новее (из-за синтаксиса _MAYBE_)
|
3. Sphinx 2.2.1 и новее
|
||||||
4. Web-сервер с поддержкой WSGI, любой, по Вашему желанию.
|
4. Web-сервер с поддержкой WSGI, любой, по Вашему желанию.
|
||||||
|
|
||||||
### Windows
|
### Windows
|
||||||
|
@ -7,6 +7,7 @@ class sphinx_conf:
|
|||||||
index_addjobj = "idx_fias_addrobj"
|
index_addjobj = "idx_fias_addrobj"
|
||||||
index_sugg = "idx_fias_sugg"
|
index_sugg = "idx_fias_sugg"
|
||||||
var_dir = None
|
var_dir = None
|
||||||
|
min_length_to_star = 3
|
||||||
|
|
||||||
|
|
||||||
class db_conf:
|
class db_conf:
|
||||||
|
@ -11,21 +11,21 @@ from aore.miscutils.trigram import trigram
|
|||||||
|
|
||||||
|
|
||||||
class SphinxSearch:
|
class SphinxSearch:
|
||||||
|
# Config's
|
||||||
|
delta_len = 2
|
||||||
|
|
||||||
|
rating_limit_soft = 0.41
|
||||||
|
rating_limit_soft_count = 6
|
||||||
|
word_length_soft = 3
|
||||||
|
|
||||||
|
rating_limit_hard = 0.82
|
||||||
|
rating_limit_hard_count = 3
|
||||||
|
|
||||||
|
default_rating_delta = 2
|
||||||
|
regression_coef = 0.08
|
||||||
|
max_result = 10
|
||||||
|
|
||||||
def __init__(self, db):
|
def __init__(self, db):
|
||||||
self.delta_len = 2
|
|
||||||
|
|
||||||
self.rating_limit_soft = 0.41
|
|
||||||
self.rating_limit_soft_count = 6
|
|
||||||
self.word_length_soft = 3
|
|
||||||
|
|
||||||
self.rating_limit_hard = 0.82
|
|
||||||
self.rating_limit_hard_count = 3
|
|
||||||
|
|
||||||
self.default_rating_delta = 2
|
|
||||||
self.regression_coef = 0.08
|
|
||||||
|
|
||||||
self.max_result = 10
|
|
||||||
|
|
||||||
self.db = db
|
self.db = db
|
||||||
self.client_sugg = sphinxapi.SphinxClient()
|
self.client_sugg = sphinxapi.SphinxClient()
|
||||||
self.client_sugg.SetServer(sphinx_conf.host_name, sphinx_conf.port)
|
self.client_sugg.SetServer(sphinx_conf.host_name, sphinx_conf.port)
|
||||||
@ -38,6 +38,7 @@ class SphinxSearch:
|
|||||||
self.client_show.SetConnectTimeout(3.0)
|
self.client_show.SetConnectTimeout(3.0)
|
||||||
|
|
||||||
def __configure(self, index_name, wlen=None):
|
def __configure(self, index_name, wlen=None):
|
||||||
|
self.client_sugg.ResetFilters()
|
||||||
if index_name == sphinx_conf.index_sugg and wlen:
|
if index_name == sphinx_conf.index_sugg and wlen:
|
||||||
self.client_sugg.SetRankingMode(sphinxapi.SPH_RANK_WORDCOUNT)
|
self.client_sugg.SetRankingMode(sphinxapi.SPH_RANK_WORDCOUNT)
|
||||||
self.client_sugg.SetFilterRange("len", int(wlen) - self.delta_len, int(wlen) + self.delta_len)
|
self.client_sugg.SetFilterRange("len", int(wlen) - self.delta_len, int(wlen) + self.delta_len)
|
||||||
@ -116,6 +117,7 @@ class SphinxSearch:
|
|||||||
word_entries = self.__get_word_entries(words, strong)
|
word_entries = self.__get_word_entries(words, strong)
|
||||||
word_count = len(word_entries)
|
word_count = len(word_entries)
|
||||||
for x in range(word_count, max(0, word_count - 3), -1):
|
for x in range(word_count, max(0, word_count - 3), -1):
|
||||||
|
logging.info("\"{}\"/{}".format(" ".join(x.get_variations() for x in word_entries), x))
|
||||||
self.client_show.AddQuery("\"{}\"/{}".format(" ".join(x.get_variations() for x in word_entries), x),
|
self.client_show.AddQuery("\"{}\"/{}".format(" ".join(x.get_variations() for x in word_entries), x),
|
||||||
sphinx_conf.index_addjobj)
|
sphinx_conf.index_addjobj)
|
||||||
|
|
||||||
|
@ -1,6 +1,8 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
from aore.config import sphinx_conf
|
||||||
|
|
||||||
|
|
||||||
class WordEntry:
|
class WordEntry:
|
||||||
# Варианты распределения для слов с первыми двумя символами, где:
|
# Варианты распределения для слов с первыми двумя символами, где:
|
||||||
@ -39,14 +41,13 @@ class WordEntry:
|
|||||||
MT_ADD_SOCR=['..10', '..x0']
|
MT_ADD_SOCR=['..10', '..x0']
|
||||||
)
|
)
|
||||||
|
|
||||||
min_word_len_to_star = 4
|
|
||||||
|
|
||||||
def __init__(self, db, word):
|
def __init__(self, db, word):
|
||||||
self.db = db
|
self.db = db
|
||||||
self.word = str(word)
|
self.word = str(word)
|
||||||
self.word_len = len(unicode(self.word))
|
self.word_len = len(unicode(self.word))
|
||||||
self.variations = []
|
self.variations = []
|
||||||
self.scname = None
|
self.scname = None
|
||||||
|
self.is_freq_word = False
|
||||||
self.ranks = self.__get_ranks()
|
self.ranks = self.__get_ranks()
|
||||||
|
|
||||||
for x, y in self.match_types.iteritems():
|
for x, y in self.match_types.iteritems():
|
||||||
@ -59,7 +60,7 @@ class WordEntry:
|
|||||||
self.MT_AS_IS = False
|
self.MT_AS_IS = False
|
||||||
|
|
||||||
# Если строка слишком короткая, то по лайку не ищем: будет очень долго
|
# Если строка слишком короткая, то по лайку не ищем: будет очень долго
|
||||||
if self.MT_LAST_STAR and self.word_len < self.min_word_len_to_star:
|
if self.MT_LAST_STAR and self.word_len < sphinx_conf.min_length_to_star:
|
||||||
self.MT_LAST_STAR = False
|
self.MT_LAST_STAR = False
|
||||||
self.MT_AS_IS = True
|
self.MT_AS_IS = True
|
||||||
|
|
||||||
@ -79,8 +80,9 @@ class WordEntry:
|
|||||||
sql_qry = "SELECT COUNT(*), NULL FROM \"AOTRIG\" WHERE word LIKE '{}%' AND LENGTH(word) > {} " \
|
sql_qry = "SELECT COUNT(*), NULL FROM \"AOTRIG\" WHERE word LIKE '{}%' AND LENGTH(word) > {} " \
|
||||||
"UNION ALL SELECT COUNT(*), NULL FROM \"AOTRIG\" WHERE word='{}' " \
|
"UNION ALL SELECT COUNT(*), NULL FROM \"AOTRIG\" WHERE word='{}' " \
|
||||||
"UNION ALL SELECT COUNT(*), MAX(scname) FROM \"SOCRBASE\" WHERE socrname ILIKE '{}'" \
|
"UNION ALL SELECT COUNT(*), MAX(scname) FROM \"SOCRBASE\" WHERE socrname ILIKE '{}'" \
|
||||||
"UNION ALL SELECT COUNT(*), NULL FROM \"SOCRBASE\" WHERE scname ILIKE '{}';".format(
|
"UNION ALL SELECT COUNT(*), NULL FROM \"SOCRBASE\" WHERE scname ILIKE '{}'" \
|
||||||
self.word, self.word_len, self.word, self.word, self.word)
|
"UNION ALL SELECT frequency, NULL FROM \"AOTRIG\" WHERE word='{}';".format(
|
||||||
|
self.word, self.word_len, self.word, self.word, self.word, self.word)
|
||||||
|
|
||||||
result = self.db.get_rows(sql_qry)
|
result = self.db.get_rows(sql_qry)
|
||||||
|
|
||||||
@ -88,6 +90,9 @@ class WordEntry:
|
|||||||
if not self.scname:
|
if not self.scname:
|
||||||
self.scname = result[2][1]
|
self.scname = result[2][1]
|
||||||
|
|
||||||
|
if len(result) == 5 and result[4][0] > 30000:
|
||||||
|
self.is_freq_word = True
|
||||||
|
|
||||||
# Формируем список найденных величин совпадений:
|
# Формируем список найденных величин совпадений:
|
||||||
# result[x]
|
# result[x]
|
||||||
# x = 0, поиск по неполному совпадению (лайк*), и по длине строки больше исходной
|
# x = 0, поиск по неполному совпадению (лайк*), и по длине строки больше исходной
|
||||||
@ -95,11 +100,11 @@ class WordEntry:
|
|||||||
# x = 2, поиск по базе сокращений (по полному)
|
# x = 2, поиск по базе сокращений (по полному)
|
||||||
# x = 3, то же, но по краткому
|
# x = 3, то же, но по краткому
|
||||||
out_mask_list = []
|
out_mask_list = []
|
||||||
for ra in result:
|
for i in range(0, 4):
|
||||||
if ra[0] > 1:
|
if result[i][0] > 1:
|
||||||
out_mask_list.append('x')
|
out_mask_list.append('x')
|
||||||
else:
|
else:
|
||||||
out_mask_list.append(str(ra[0]))
|
out_mask_list.append(str(result[i][0]))
|
||||||
|
|
||||||
return ''.join(out_mask_list)
|
return ''.join(out_mask_list)
|
||||||
|
|
||||||
|
@ -15,6 +15,7 @@ class SphinxHelper:
|
|||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.index_binary = None
|
self.index_binary = None
|
||||||
self.files = dict()
|
self.files = dict()
|
||||||
|
self.aodp = DbHandler()
|
||||||
|
|
||||||
def configure_indexer(self, indexer_binary, config_filename):
|
def configure_indexer(self, indexer_binary, config_filename):
|
||||||
logging.info("Start configuring Sphinx...")
|
logging.info("Start configuring Sphinx...")
|
||||||
@ -98,8 +99,7 @@ class SphinxHelper:
|
|||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
aodp = DbHandler()
|
self.aodp.bulk_csv(AoXmlTableEntry.OperationType.update, "AOTRIG", csv_counter, dict_dat_fname)
|
||||||
aodp.bulk_csv(AoXmlTableEntry.OperationType.update, "AOTRIG", csv_counter, dict_dat_fname)
|
|
||||||
logging.info("Done.")
|
logging.info("Done.")
|
||||||
|
|
||||||
def __create_ao_index_config(self):
|
def __create_ao_index_config(self):
|
||||||
@ -112,7 +112,8 @@ class SphinxHelper:
|
|||||||
db_name=db_conf.database, db_port=db_conf.port,
|
db_name=db_conf.database, db_port=db_conf.port,
|
||||||
sql_query=template('aore/templates/postgre/sphinx_query.sql').replace("\n", " \\\n"),
|
sql_query=template('aore/templates/postgre/sphinx_query.sql').replace("\n", " \\\n"),
|
||||||
index_name=sphinx_conf.index_addjobj,
|
index_name=sphinx_conf.index_addjobj,
|
||||||
sphinx_var_path=sphinx_conf.var_dir)
|
sphinx_var_path=sphinx_conf.var_dir,
|
||||||
|
min_length_to_star=sphinx_conf.min_length_to_star)
|
||||||
|
|
||||||
f = open(fname, "w")
|
f = open(fname, "w")
|
||||||
f.write(conf_data)
|
f.write(conf_data)
|
||||||
|
@ -11,20 +11,20 @@ app = Bottle()
|
|||||||
fias_factory = FiasFactory()
|
fias_factory = FiasFactory()
|
||||||
|
|
||||||
|
|
||||||
@app.route('/expand/<aoid:re:[\w]{8}(-[\w]{4}){3}-[\w]{12}>')
|
@app.route(r'/expand/<aoid:re:[\w]{8}(-[\w]{4}){3}-[\w]{12}>')
|
||||||
def expand(aoid):
|
def expand(aoid):
|
||||||
response.content_type = 'application/json'
|
response.content_type = 'application/json'
|
||||||
return json.dumps(fias_factory.expand(aoid))
|
return json.dumps(fias_factory.expand(aoid))
|
||||||
|
|
||||||
|
|
||||||
@app.route('/normalize/<aoid:re:[\w]{8}(-[\w]{4}){3}-[\w]{12}>')
|
@app.route(r'/normalize/<aoid:re:[\w]{8}(-[\w]{4}){3}-[\w]{12}>')
|
||||||
def normalize(aoid):
|
def normalize(aoid):
|
||||||
response.content_type = 'application/json'
|
response.content_type = 'application/json'
|
||||||
return json.dumps(fias_factory.normalize(aoid))
|
return json.dumps(fias_factory.normalize(aoid))
|
||||||
|
|
||||||
|
|
||||||
@app.route('/find/<text>')
|
@app.route(r'/find/<text>')
|
||||||
@app.route('/find/<text>/<strong>')
|
@app.route(r'/find/<text>/<strong>')
|
||||||
def find(text, strong=False):
|
def find(text, strong=False):
|
||||||
strong = (strong == "strong")
|
strong = (strong == "strong")
|
||||||
response.content_type = 'application/json'
|
response.content_type = 'application/json'
|
||||||
@ -33,6 +33,6 @@ def find(text, strong=False):
|
|||||||
|
|
||||||
|
|
||||||
@app.error(404)
|
@app.error(404)
|
||||||
def error404():
|
def error404(error):
|
||||||
response.content_type = 'application/json'
|
response.content_type = 'application/json'
|
||||||
return json.dumps(dict(error="Page not found"))
|
return json.dumps(dict(error="Page not found"))
|
||||||
|
@ -18,9 +18,9 @@ source {{index_name}}
|
|||||||
index {{ index_name }}
|
index {{ index_name }}
|
||||||
{
|
{
|
||||||
min_word_len = 1
|
min_word_len = 1
|
||||||
min_prefix_len = 1
|
min_prefix_len = {{min_length_to_star}}
|
||||||
min_infix_len = 0
|
min_infix_len = 0
|
||||||
ngram_len = 1
|
bigram_index = all
|
||||||
|
|
||||||
# strip html by default
|
# strip html by default
|
||||||
html_strip = 1
|
html_strip = 1
|
||||||
|
Loading…
x
Reference in New Issue
Block a user