Улучшен поиск, правки кода.

This commit is contained in:
Jack Stdin 2016-02-02 13:27:42 +03:00
parent 90cae604fa
commit 8088bff07a
13 changed files with 107 additions and 46 deletions

27
LICENSE.txt Normal file
View File

@ -0,0 +1,27 @@
Copyright (c) 2016, hellotan
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of py-phias nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -4,6 +4,8 @@ from traceback import format_exc
import psycopg2.extras
from aore.miscutils.exceptions import FiasException
class DBImpl:
def __init__(self, engine, db_config):
@ -28,20 +30,26 @@ class DBImpl:
try:
cur = self.get_cursor()
cur.execute(sql_query)
cur.close()
self.transaction_commit()
except:
self.transaction_rollback()
raise BaseException("Error execute sql query. Reason : {}".format(format_exc()))
raise FiasException("Error execute sql query. Reason : {}".format(format_exc()))
def get_rows(self, query_string, dict_cursor=False):
if dict_cursor:
cur = self.connection.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
else:
cur = self.connection.cursor()
cur.execute(query_string)
rows = cur.fetchall()
if cur:
try:
cur.execute(query_string)
rows = cur.fetchall()
cur.close()
self.transaction_commit()
except:
self.transaction_rollback()
raise FiasException("Error execute sql query. Reason : {}".format(format_exc()))
return rows

View File

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
import psycopg2
from bottle import template
import sys
from aore.dbutils.dbimpl import DBImpl
from aore.fias.search import SphinxSearch
from aore.config import db_conf
@ -49,7 +49,7 @@ class FiasFactory:
results = self.searcher.find(text, strong)
except Exception, err:
return dict(error=err.message)
return dict(error=err.args[0])
return results
@ -61,7 +61,7 @@ class FiasFactory:
sql_query = self.normalize_templ.replace("//aoid", aoid_guid)
rows = self.db.get_rows(sql_query, True)
except Exception, err:
return dict(error=err.message)
return dict(error=err.args[0])
if len(rows) == 0:
return []
@ -80,6 +80,6 @@ class FiasFactory:
sql_query = self.expand_templ.replace("//aoid", normalized_id)
rows = self.db.get_rows(sql_query, True)
except Exception, err:
return dict(error=err.message)
return dict(error=err.args[0])
return rows

View File

@ -36,15 +36,12 @@ class SphinxSearch:
self.client_show.SetConnectTimeout(3.0)
def __configure(self, index_name, wlen=None):
if index_name == sphinx_conf.index_sugg:
if wlen:
self.client_sugg.SetMatchMode(sphinxapi.SPH_MATCH_EXTENDED2)
self.client_sugg.SetRankingMode(sphinxapi.SPH_RANK_WORDCOUNT)
self.client_sugg.SetFilterRange("len", int(wlen) - self.delta_len, int(wlen) + self.delta_len)
self.client_sugg.SetSelect("word, len, @weight+{}-abs(len-{}) AS krank".format(self.delta_len, wlen))
self.client_sugg.SetSortMode(sphinxapi.SPH_SORT_EXTENDED, "krank DESC")
if index_name == sphinx_conf.index_sugg and wlen:
self.client_sugg.SetRankingMode(sphinxapi.SPH_RANK_BM25)
self.client_sugg.SetFilterRange("len", int(wlen) - self.delta_len, int(wlen) + self.delta_len)
self.client_sugg.SetSelect("word, len, @weight+{}-abs(len-{}) AS krank".format(self.delta_len, wlen))
self.client_sugg.SetSortMode(sphinxapi.SPH_SORT_EXTENDED, "krank DESC")
else:
self.client_show.SetMatchMode(sphinxapi.SPH_MATCH_EXTENDED2)
self.client_show.SetRankingMode(sphinxapi.SPH_RANK_BM25)
self.client_show.SetSortMode(sphinxapi.SPH_SORT_RELEVANCE)
@ -108,11 +105,11 @@ class SphinxSearch:
we = WordEntry(self.db, word)
self.__add_word_variations(we, strong)
if we.get_variations() == "()":
raise BaseException("Cannot process sentence.")
assert we.get_variations() != "()", "Cannot process sentence."
yield we
def find(self, text, strong):
logging.info("FIND ")
words = self.__split_phrase(text)
word_entries = self.__get_word_entries(words, strong)
sentence = "{}".format(" MAYBE ".join(x.get_variations() for x in word_entries))

View File

@ -16,7 +16,7 @@ class WordEntry:
# -1x - одно по лайку и много точных. Быть не может.
# x0 - много по лайку и нет точных. Недопечатка. Немного подсказок и *.
# x1 - много по лайку и один точный. Чет нашли. Как есть и *.
# xx - много по лайку и много точных. Оставляем как есть и *
# xx - много по лайку и много точных. Оставляем как есть и * TODO В данном случае лайк лучше убрать
#
# Теперь по сокращениям. Они работают отдельно (ПОКА ЧТО)
# 3rd - кол-во слов по точному совпадению по полному сокращению.
@ -29,7 +29,7 @@ class WordEntry:
# 11 - найдено одно полное и одно малое. Бывает (допустим, 'сад'). Добавляем как есть.
# -1x - найдено одно полное и куча малых. Ну бред.
# x0 - найдено куча полных и ни одного малого. Добавляем малое.
# x1 - Куча полных и 1 малое. TODO Хз, бывает ли. Не обрабатываем.
# x1 - Куча полных и 1 малое. Хз, бывает ли. Не обрабатываем.
# xx - Куча полных и куча малых. Не обрабатываем.
match_types = dict(
MT_MANY_SUGG=['0000'],
@ -39,9 +39,12 @@ class WordEntry:
MT_ADD_SOCR=['..10', '..x0']
)
min_word_len_to_star = 4
def __init__(self, db, word):
self.db = db
self.word = str(word)
self.word_len = len(unicode(self.word))
self.variations = []
self.scname = None
self.ranks = self.__get_ranks()
@ -51,9 +54,15 @@ class WordEntry:
for z in y:
self.__dict__[x] = self.__dict__[x] or re.search(z, self.ranks) is not None
# Если ищем по лайку, то точное совпадение не ищем (оно и так будет включено)
if self.MT_LAST_STAR:
self.MT_AS_IS = False
# Если строка слишком короткая, то по лайку не ищем: будет очень долго
if self.MT_LAST_STAR and self.word_len < self.min_word_len_to_star:
self.MT_LAST_STAR = False
self.MT_AS_IS = True
def add_variation_socr(self):
if self.scname:
self.add_variation(self.scname)
@ -65,28 +74,32 @@ class WordEntry:
return "({})".format(" | ".join(self.variations))
def __get_ranks(self):
word_len = len(unicode(self.word))
sql_qry = "SELECT COUNT(*), NULL FROM \"AOTRIG\" WHERE word LIKE '{}%' AND LENGTH(word) > {} " \
"UNION ALL SELECT COUNT(*), NULL FROM \"AOTRIG\" WHERE word='{}' " \
"UNION ALL SELECT COUNT(*), MAX(scname) FROM \"SOCRBASE\" WHERE socrname ILIKE '{}'" \
"UNION ALL SELECT COUNT(*), NULL FROM \"SOCRBASE\" WHERE scname ILIKE '{}'".format(
self.word, word_len, self.word, self.word, self.word)
"UNION ALL SELECT COUNT(*), NULL FROM \"SOCRBASE\" WHERE scname ILIKE '{}';".format(
self.word, self.word_len, self.word, self.word, self.word)
result = self.db.get_rows(sql_qry)
# Проставляем "сокращенное" сокращение, если нашли полное
if not self.scname:
self.scname = result[2][1]
outmask = ""
# Формируем список найденных величин совпадений:
# result[x]
# x = 0, поиск по неполному совпадению (лайк*), и по длине строки больше исходной
# x = 1, поиск по точному совпадению
# x = 2, поиск по базе сокращений (по полному)
# x = 3, то же, но по краткому
out_mask_list = []
for ra in result:
if ra[0] > 1:
if word_len > 2:
outmask += 'x'
else:
outmask += '1'
out_mask_list.append('x')
else:
outmask += str(ra[0])
out_mask_list.append(str(ra[0]))
return outmask
return ''.join(out_mask_list)
def get_type(self):
return ", ".join([x for x in self.match_types if self.__dict__[x]])

View File

@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
class FiasException(Exception):
    """Exception type raised by the aore package modules.

    The string form is the ``repr`` of the first constructor argument,
    or an empty string when the exception was created without arguments.
    """

    def __str__(self):
        """Return ``repr`` of the first argument, or ``""`` if there is none."""
        # Without this guard, formatting an argument-less exception would
        # itself fail with IndexError and hide the original error.
        if not self.args:
            return ""
        return repr(self.args[0])

View File

@ -85,8 +85,7 @@ class SphinxHelper:
splitting_seq = line.split(' ')
keyword = splitting_seq[0]
freq = splitting_seq[1].rstrip('\n')
if not keyword or not freq:
raise BaseException("Cannot process {}".format(self.files['dict.txt']))
assert keyword and freq, "Cannot process {}".format(self.files['dict.txt'])
nodes.append(keyword)
nodes.append(trigram(keyword))

View File

@ -2,7 +2,7 @@
import json
import logging
from bottle import Bottle
from bottle import Bottle, response
from aore.fias.fiasfactory import FiasFactory
@ -13,21 +13,29 @@ fias_factory = FiasFactory()
@app.route('/expand/<aoid:re:[\w]{8}(-[\w]{4}){3}-[\w]{12}>')
def expand(aoid):
response.content_type = 'application/json'
return json.dumps(fias_factory.expand(aoid))
@app.route('/normalize/<aoid:re:[\w]{8}(-[\w]{4}){3}-[\w]{12}>')
def normalize(aoid):
response.content_type = 'application/json'
return json.dumps(fias_factory.normalize(aoid))
@app.route('/find/<text>')
@app.route('/find/<text>/<strong>')
def find(text, strong=False):
logging.warning("START")
strong = (strong == "strong")
return json.dumps(fias_factory.find(text, strong))
response.content_type = 'application/json'
res = json.dumps(fias_factory.find(text, strong))
logging.warning("END")
return res
@app.error(404)
def error404(error):
response.content_type = 'application/json'
return json.dumps(dict(error="Page not found"))

View File

@ -3,6 +3,7 @@ import os
from aore.config import folders
from aore.dbutils.dbschemas import db_shemas
from aore.miscutils.exceptions import FiasException
from xmlparser import XMLParser
@ -10,7 +11,7 @@ class AoDataParser:
def __init__(self, datasource, pagesize):
self.datasource = datasource
if self.datasource.table_name not in db_shemas:
raise BaseException("Cannot parse {}: Not configured.".format(self.datasource.table_name))
raise FiasException("Cannot parse {}: Not configured.".format(self.datasource.table_name))
else:
self.allowed_fields = db_shemas[self.datasource.table_name].fields

View File

@ -8,6 +8,7 @@ import rarfile
import requests
from aore.config import folders, unrar_config
from aore.miscutils.exceptions import FiasException
from aoxmltableentry import AoXmlTableEntry
@ -29,7 +30,7 @@ class AoRar:
if chunk:
f.write(chunk)
except:
raise BaseException("Error downloading. Reason : {}".format(format_exc()))
raise FiasException("Error downloading. Reason : {}".format(format_exc()))
logging.info("Downloaded {} bytes".format(request.headers['Content-length']))
return local_filename

View File

@ -63,7 +63,7 @@ class Updater:
def __init_update_entries(self, updates_generator):
if self.mode == "http":
assert updates_generator
assert updates_generator, "No generator"
self.tablelist_generator = self.__get_updates_from_rar
self.updalist_generator = updates_generator
else:

View File

@ -5,16 +5,17 @@ setup(
version='0.0.1',
packages=['aore', 'aore.fias', 'aore.config', 'aore.dbutils', 'aore.updater', 'aore.miscutils'],
url='https://github.com/jar3b/py-phias',
license='',
license='BSD',
author='hellotan',
author_email='hellotan@live.ru',
description='Python application that can operate with FIAS (Russian Address Object DB)',
install_requires=['lxml',
'psycopg2',
'bottle',
'pysimplesoap',
'python-Levenshtein',
'enum34',
'rarfile',
'requests']
install_requires=
['lxml',
'psycopg2>=2.6.0',
'bottle>=0.12.0',
'pysimplesoap',
'python-Levenshtein',
'enum34',
'rarfile',
'requests']
)