Начало работы с парсером входных данных (строки адресов)

This commit is contained in:
Jack Stdin 2016-01-15 17:06:14 +03:00
parent f91fb27150
commit 327a1c994e
2 changed files with 37 additions and 15 deletions

View File

@ -1,7 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import json import re
import logging
import Levenshtein import Levenshtein
import psycopg2 import psycopg2
@ -38,8 +37,9 @@ class SphinxSearch:
SRANK_EXACTLY_TYPING=['01', '11'], # Точно - слово недопечатано, не надо подсказок, только word* SRANK_EXACTLY_TYPING=['01', '11'], # Точно - слово недопечатано, не надо подсказок, только word*
SRANK_PROBABLY_TYPING=['0*'], # Возможно - слово недопечатано, немного подсказок и word* SRANK_PROBABLY_TYPING=['0*'], # Возможно - слово недопечатано, немного подсказок и word*
SRANK_PROBABLY_FOUND=['10'], # Возможно - слово введено точно, немного подсказок, без word* SRANK_PROBABLY_FOUND=['10'], # Возможно - слово введено точно, немного подсказок, без word*
SRANK_PROBABLY_COMPLEX=['1*'] SRANK_PROBABLY_COMPLEX=['1*'],
# Возможно, слово сложное, есть и точное совпадение, по маске. Нужно немного подсказок и word* # Возможно, слово сложное, есть и точное совпадение, по маске. Нужно немного подсказок и word*
SRANK_PROBABLY_SOCR=['1!'] # Возможно - сокращение, не трогаем вообще
) )
def __init__(self, rtype): def __init__(self, rtype):
@ -47,8 +47,11 @@ class SphinxSearch:
for x, y in self.names.iteritems(): for x, y in self.names.iteritems():
self.__dict__[x] = self.rtype in y self.__dict__[x] = self.rtype in y
def __str__(self):
    """Return the active rank names as one comma-separated string.

    A name from ``self.names`` is included when the entry stored under
    that name in the instance ``__dict__`` is truthy.
    """
    active = (flag for flag in self.names if self.__dict__[flag])
    return ", ".join(active)
def __get_strong_and_uncomplete_ranks(self, word): def __get_strong_and_uncomplete_ranks(self, word):
word_len = str(len(word) / 2) word_len = len(word)
sql_qry = "SELECT COUNT(*) FROM \"AOTRIG\" WHERE word LIKE '{}%' AND LENGTH(word) > {} " \ sql_qry = "SELECT COUNT(*) FROM \"AOTRIG\" WHERE word LIKE '{}%' AND LENGTH(word) > {} " \
"UNION ALL SELECT COUNT(*) FROM \"AOTRIG\" WHERE word='{}'".format( "UNION ALL SELECT COUNT(*) FROM \"AOTRIG\" WHERE word='{}'".format(
word, word_len, word) word, word_len, word)
@ -56,14 +59,18 @@ class SphinxSearch:
result = self.db.get_rows(sql_qry) result = self.db.get_rows(sql_qry)
strong_rank = result[1][0] strong_rank = result[1][0]
uncomplete_rank = result[0][0] uncomplete_rank = result[0][0]
if uncomplete_rank > 1:
uncomplete_rank = '*' if uncomplete_rank > 1000 and word_len < 4:
uncomplete_rank = '!'
else:
if uncomplete_rank > 1:
uncomplete_rank = '*'
return self.SRankType(str(strong_rank) + str(uncomplete_rank)) return self.SRankType(str(strong_rank) + str(uncomplete_rank))
def get_suggest(self, word): def __get_suggest(self, word):
word_len = str(len(word) / 2) word_len = str(len(word) / 2)
trigrammed_word = '"{}"/2'.format(trigram(word)) trigrammed_word = '"{}"/1'.format(trigram(word))
self.__configure("idx_fias_sugg", word_len) self.__configure("idx_fias_sugg", word_len)
result = self.client.Query(trigrammed_word, 'idx_fias_sugg') result = self.client.Query(trigrammed_word, 'idx_fias_sugg')
@ -84,9 +91,17 @@ class SphinxSearch:
print x[0], x[1] print x[0], x[1]
return outlist return outlist
def __split_phrase(self, phrase):
    """Normalize an address phrase and split it into lowercase tokens.

    Hyphens and '@' are deleted (not replaced by a space), the text is
    lower-cased, and the result is split on runs of spaces, commas,
    colons and periods.

    :param phrase: text, a byte string, or any object convertible to text.
        Byte strings are decoded as UTF-8 -- assumed from the file's
        declared source encoding; confirm for callers that pass other
        encodings.
    :return: list of non-empty text tokens, in input order.
    :raises UnicodeDecodeError: if a byte string is not valid UTF-8.
    """
    text_type = type(u'')
    if isinstance(phrase, bytes):
        # Converting bytes with unicode() relies on the interpreter's default
        # codec (ASCII on Python 2 unless changed), which fails on Cyrillic.
        phrase = phrase.decode('utf-8')
    elif not isinstance(phrase, text_type):
        phrase = text_type(phrase)
    normalized = phrase.replace('-', '').replace('@', '').lower()
    # re.split yields '' at either end when the phrase starts or ends with a
    # delimiter; drop those so no empty word reaches the per-word processing.
    return [token for token in re.split(r"[ ,:.]+", normalized) if token]
def __process_word(self, word):
print word, self.__get_strong_and_uncomplete_ranks(word)
def find(self, text): def find(self, text):
# TODO: ADD index words = self.__split_phrase(text)
logging.info("12") for word in words:
result = self.client.Query(text) self.__process_word(word)
print json.dumps(result) # result = self.client.Query(text)
logging.info("12") # print json.dumps(result)
# logging.info("12")

View File

@ -32,9 +32,12 @@ def main():
help="Path to sphinx indexer binary. Must be specified for '--sphinx-configure'") help="Path to sphinx indexer binary. Must be specified for '--sphinx-configure'")
p.add_option('--output-conf', '-o', p.add_option('--output-conf', '-o',
help="Output config filename. Must be specified for '--sphinx-configure'") help="Output config filename. Must be specified for '--sphinx-configure'")
p.add_option('--test', '-t', action="store_true", dest="test",
help="Test")
options, arguments = p.parse_args() options, arguments = p.parse_args()
# Manage DB
if options.database: if options.database:
# create new database # create new database
if options.database == "create": if options.database == "create":
@ -43,11 +46,15 @@ def main():
if options.database == "update": if options.database == "update":
update_base(options.source, int(options.update_count)) update_base(options.source, int(options.update_count))
# Manage Sphinx
if options.sphinx and options.indexer_path and options.output_conf: if options.sphinx and options.indexer_path and options.output_conf:
sphinxh = SphinxHelper() sphinxh = SphinxHelper()
sphinxh.configure_indexer(options.indexer_path, options.output_conf) sphinxh.configure_indexer(options.indexer_path, options.output_conf)
# For debug purposes.
if options.test:
sph = SphinxSearch()
sph.find('город Гавно д. пидарская, ул Кощеева')
if __name__ == '__main__': if __name__ == '__main__':
#sph = SphinxSearch()
#sph.get_suggest('апасьево')
main() main()