Улучшен поиск, правки кода.

This commit is contained in:
Jack Stdin
2016-02-02 13:27:42 +03:00
parent 90cae604fa
commit 8088bff07a
13 changed files with 107 additions and 46 deletions

View File

@@ -4,6 +4,8 @@ from traceback import format_exc
import psycopg2.extras
from aore.miscutils.exceptions import FiasException
class DBImpl:
def __init__(self, engine, db_config):
@@ -28,20 +30,26 @@ class DBImpl:
try:
cur = self.get_cursor()
cur.execute(sql_query)
cur.close()
self.transaction_commit()
except:
self.transaction_rollback()
raise BaseException("Error execute sql query. Reason : {}".format(format_exc()))
raise FiasException("Error execute sql query. Reason : {}".format(format_exc()))
def get_rows(self, query_string, dict_cursor=False):
    """Run a query and return every row it produces.

    The query is executed exactly once. The cursor is closed whether or not
    the query succeeds; the transaction is committed on success and rolled
    back on failure.

    :param query_string: complete SQL statement to execute.
    :param dict_cursor: when true, the cursor is created with
        psycopg2.extras.RealDictCursor as its cursor factory.
    :return: the result of ``cursor.fetchall()``.
    :raises FiasException: if executing or fetching fails; the message
        carries the formatted traceback of the underlying error.
    """
    if dict_cursor:
        cur = self.connection.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
    else:
        cur = self.connection.cursor()

    try:
        try:
            cur.execute(query_string)
            rows = cur.fetchall()
        finally:
            # Release the cursor on the error path as well.
            cur.close()
        self.transaction_commit()
    except Exception:
        # Only ordinary errors are wrapped; KeyboardInterrupt and SystemExit
        # propagate untouched.
        self.transaction_rollback()
        raise FiasException("Error execute sql query. Reason : {}".format(format_exc()))

    return rows

View File

@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
import psycopg2
from bottle import template
import sys
from aore.dbutils.dbimpl import DBImpl
from aore.fias.search import SphinxSearch
from aore.config import db_conf
@@ -49,7 +49,7 @@ class FiasFactory:
results = self.searcher.find(text, strong)
except Exception, err:
return dict(error=err.message)
return dict(error=err.args[0])
return results
@@ -61,7 +61,7 @@ class FiasFactory:
sql_query = self.normalize_templ.replace("//aoid", aoid_guid)
rows = self.db.get_rows(sql_query, True)
except Exception, err:
return dict(error=err.message)
return dict(error=err.args[0])
if len(rows) == 0:
return []
@@ -80,6 +80,6 @@ class FiasFactory:
sql_query = self.expand_templ.replace("//aoid", normalized_id)
rows = self.db.get_rows(sql_query, True)
except Exception, err:
return dict(error=err.message)
return dict(error=err.args[0])
return rows

View File

@@ -36,15 +36,12 @@ class SphinxSearch:
self.client_show.SetConnectTimeout(3.0)
def __configure(self, index_name, wlen=None):
if index_name == sphinx_conf.index_sugg:
if wlen:
self.client_sugg.SetMatchMode(sphinxapi.SPH_MATCH_EXTENDED2)
self.client_sugg.SetRankingMode(sphinxapi.SPH_RANK_WORDCOUNT)
self.client_sugg.SetFilterRange("len", int(wlen) - self.delta_len, int(wlen) + self.delta_len)
self.client_sugg.SetSelect("word, len, @weight+{}-abs(len-{}) AS krank".format(self.delta_len, wlen))
self.client_sugg.SetSortMode(sphinxapi.SPH_SORT_EXTENDED, "krank DESC")
if index_name == sphinx_conf.index_sugg and wlen:
self.client_sugg.SetRankingMode(sphinxapi.SPH_RANK_BM25)
self.client_sugg.SetFilterRange("len", int(wlen) - self.delta_len, int(wlen) + self.delta_len)
self.client_sugg.SetSelect("word, len, @weight+{}-abs(len-{}) AS krank".format(self.delta_len, wlen))
self.client_sugg.SetSortMode(sphinxapi.SPH_SORT_EXTENDED, "krank DESC")
else:
self.client_show.SetMatchMode(sphinxapi.SPH_MATCH_EXTENDED2)
self.client_show.SetRankingMode(sphinxapi.SPH_RANK_BM25)
self.client_show.SetSortMode(sphinxapi.SPH_SORT_RELEVANCE)
@@ -108,11 +105,11 @@ class SphinxSearch:
we = WordEntry(self.db, word)
self.__add_word_variations(we, strong)
if we.get_variations() == "()":
raise BaseException("Cannot process sentence.")
assert we.get_variations() != "()", "Cannot process sentence."
yield we
def find(self, text, strong):
logging.info("FIND ")
words = self.__split_phrase(text)
word_entries = self.__get_word_entries(words, strong)
sentence = "{}".format(" MAYBE ".join(x.get_variations() for x in word_entries))

View File

@@ -16,7 +16,7 @@ class WordEntry:
# -1x - одно по лайку и много точных. Быть не может.
# x0 - много по лайку и нет точных. Недопечатка. Немного подсказок и *.
# x1 - много по лайку и один точный. Чет нашли. Как есть и *.
# xx - много по лайку и много точных. Оставляем как есть и *
# xx - много по лайку и много точных. Оставляем как есть и * TODO В данном случае лайк лучше убрать
#
# Теперь по сокращениям. Они работают отдельно (ПОКА ЧТО)
# 3rd - кол-во слов по точному совпадению по полному сокращению.
@@ -29,7 +29,7 @@ class WordEntry:
# 11 - найдено одно полное и одно малое. Бывает (допустим, 'сад'). Добавляем как есть.
# -1x - найдено одно полное и куча малых. Ну бред.
# x0 - найдено куча полных и ни одного малого. Добавляем малое.
# x1 - Куча полных и 1 малое. TODO Хз, бывает ли. Не обрабатываем.
# x1 - Куча полных и 1 малое. Хз, бывает ли. Не обрабатываем.
# xx - Куча полных и куча малых. Не обрабатываем.
match_types = dict(
MT_MANY_SUGG=['0000'],
@@ -39,9 +39,12 @@ class WordEntry:
MT_ADD_SOCR=['..10', '..x0']
)
min_word_len_to_star = 4
def __init__(self, db, word):
self.db = db
self.word = str(word)
self.word_len = len(unicode(self.word))
self.variations = []
self.scname = None
self.ranks = self.__get_ranks()
@@ -51,9 +54,15 @@ class WordEntry:
for z in y:
self.__dict__[x] = self.__dict__[x] or re.search(z, self.ranks) is not None
# Если ищем по лайку, то точное совпадение не ищем (оно и так будет включено)
if self.MT_LAST_STAR:
self.MT_AS_IS = False
# Если строка слишком короткая, то по лайку не ищем: будет очень долго
if self.MT_LAST_STAR and self.word_len < self.min_word_len_to_star:
self.MT_LAST_STAR = False
self.MT_AS_IS = True
def add_variation_socr(self):
if self.scname:
self.add_variation(self.scname)
@@ -65,28 +74,32 @@ class WordEntry:
return "({})".format(" | ".join(self.variations))
def __get_ranks(self):
    """Build the four-character match mask for this word.

    One UNION ALL query returns four rows of counts:

    * result[0] - prefix matches in "AOTRIG" (``LIKE 'word%'``) whose
      length is greater than the word's own length;
    * result[1] - exact matches in "AOTRIG";
    * result[2] - matches in "SOCRBASE" by full name (``socrname``);
    * result[3] - matches in "SOCRBASE" by short name (``scname``).

    Each count becomes one mask character: the count itself when it is
    0 or 1, and 'x' when it is greater than 1.

    Side effect: if ``self.scname`` is not set yet, it is filled with the
    ``MAX(scname)`` value from the full-name row.

    :return: the mask string, one character per result row.
    """
    word_len = len(unicode(self.word))

    # NOTE(review): the statement is built by string formatting because
    # self.db.get_rows() is called with a finished SQL string only. Single
    # quotes are doubled so the word cannot terminate the literal; this is
    # not a full substitute for bound parameters, and LIKE wildcards inside
    # the word are still interpreted by the database - TODO confirm upstream
    # sanitising.
    word = self.word.replace("'", "''")

    sql_qry = "SELECT COUNT(*), NULL FROM \"AOTRIG\" WHERE word LIKE '{}%' AND LENGTH(word) > {} " \
              "UNION ALL SELECT COUNT(*), NULL FROM \"AOTRIG\" WHERE word='{}' " \
              "UNION ALL SELECT COUNT(*), MAX(scname) FROM \"SOCRBASE\" WHERE socrname ILIKE '{}'" \
              "UNION ALL SELECT COUNT(*), NULL FROM \"SOCRBASE\" WHERE scname ILIKE '{}';".format(
        word, word_len, word, word, word)

    result = self.db.get_rows(sql_qry)

    # Remember the short abbreviation when the full one was found.
    if not self.scname:
        self.scname = result[2][1]

    return ''.join('x' if ra[0] > 1 else str(ra[0]) for ra in result)
def get_type(self):
    """Return the names of the match types currently set on this entry.

    Walks the keys of ``match_types`` and keeps those whose same-named
    instance attribute is truthy.

    :return: the kept names joined with ", " (empty string if none).
    """
    active_names = []
    for type_name in self.match_types:
        if self.__dict__[type_name]:
            active_names.append(type_name)
    return ", ".join(active_names)

121
aore/manage.py Normal file
View File

@@ -0,0 +1,121 @@
# -*- coding: utf-8 -*-
import json
import logging
import optparse
from aore.fias.fiasfactory import FiasFactory
from aore.miscutils.sphinx import SphinxHelper
from aore.updater.soapreceiver import SoapReceiver
from aore.updater.updater import Updater
logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
def print_fias_versions():
    """Print the installed FIAS version and the list of known updates.

    The installed version comes from ``Updater.get_current_fias_version()``
    and the update list from ``SoapReceiver.get_update_list()``. The row
    whose ``intver`` equals the installed version is prefixed with '*',
    every other row with a space.
    """
    imp = SoapReceiver()
    current_version = Updater.get_current_fias_version()
    all_versions = imp.get_update_list()

    print("Installed version: {}".format(current_version))
    print("Avaliable updates:")
    print("Number\t\tDate")
    for upd in all_versions:
        mark_current = '*' if int(upd['intver']) == current_version else ' '
        # Single-argument print() behaves the same as the print statement.
        print("{}{}\t\t{}".format(mark_current, upd['intver'], upd['strver']))
def parse_update_str(updates_str):
    """Parse the '--update-version' value into a list of version numbers.

    Accepted forms (spaces and letter case are ignored):

    * ``"all"``          - no restriction, returns ``None``;
    * ``"111"``          - a single version;
    * ``"111-113"``      - an inclusive range (111, 112, 113);
    * ``"111,113-115"``  - any comma-separated mix of the above.

    Empty items produced by stray commas are skipped.

    :param updates_str: raw option value.
    :return: ``None`` for "all", otherwise a list of ints in input order.
    :raises ValueError: if an item is not an integer or an integer range.
    """
    normalized = updates_str.lower().replace(' ', '')
    if normalized == "all":
        return None

    out_list = []
    for u_entry in normalized.split(','):
        if not u_entry:
            continue
        if '-' in u_entry:
            u_range = u_entry.split('-')
            # range() excludes its stop value, so add one to keep the upper
            # bound the user typed.
            out_list.extend(range(int(u_range[0]), int(u_range[1]) + 1))
        else:
            out_list.append(int(u_entry))

    return out_list
def get_allowed_updates(updates_str, mode="create"):
    """Yield the update entries that should be processed.

    * ``mode == "create"`` and no explicit version: yields only the last
      entry returned by ``SoapReceiver.get_update_list()`` and stops
      (presumably the newest snapshot - confirm the list ordering).
    * ``mode == "create"`` with explicit versions: exactly one version
      number is allowed.
    * Otherwise: yields, in ascending ``intver`` order, every entry that is
      newer than the installed version and, when versions were given
      explicitly, is among them.

    :param updates_str: raw '--update-version' value, see parse_update_str.
    :param mode: "create" or "update".
    :raises AssertionError: if several versions are requested for "create".
    """
    imp = SoapReceiver()
    current_version = Updater.get_current_fias_version()
    all_versions = list(imp.get_update_list())

    user_defined_list = parse_update_str(updates_str)

    if mode == "create":
        if not user_defined_list:
            yield all_versions[-1]
            # Stop here: falling through would call len() on None.
            return
        assert len(user_defined_list) == 1, "Ony single update number allowed for DB create"

    out_list = [
        uv for uv in all_versions
        if uv['intver'] > current_version
        and (not user_defined_list or uv['intver'] in user_defined_list)
    ]
    out_list.sort(key=lambda item: item['intver'])

    for ol_entry in out_list:
        yield ol_entry
def main():
    """Command-line entry point.

    Depending on the options given, this shows the known FIAS versions,
    creates or updates the database, and/or writes a Sphinx configuration.
    '--show-versions' prints the list and returns without doing anything
    else.
    """
    # Parse options
    p = optparse.OptionParser()
    p.add_option('--database', '-b', action="store", type="string",
                 help="Database management. Values: "
                      "create - create new DB, "
                      "update - update existing DB without losing the data")
    p.add_option('--update-version', '-u', default="all", type="string",
                 help="Valid only for updating via HTTP. "
                      "Version update numbers for processing. Can be 111 or 111-222 or 111,222,333."
                      "For '--database-create' only one value (like 111) may be specified. If not specified, "
                      "all updates will be processed (for '--database update') or last DB snapshot "
                      "(for '--database create')")
    p.add_option('--show-versions', '-v', action="store_true", dest="show_versions", default=False,
                 help="Show available fias versions. "
                      "These version numbers are required for the '--update-version' option")
    p.add_option('--source', '-s', default="http",
                 help="Create/update DB from source. Value: 'http' or absolute path to folder containing XMLs")
    # The default must be the boolean False: the string "False" is truthy and
    # would make the flag look set on every run.
    p.add_option('--sphinx-configure', '-c', action="store_true", dest="sphinx", default=False,
                 help="Configure Sphinx. Creates a sphinx.conf file specified in '--output-conf'")
    p.add_option('--indexer-path', '-i',
                 help="Path to Sphinx indexer binary. Required for '--sphinx-configure'")
    p.add_option('--output-conf', '-o',
                 help="Output config filename. Required for '--sphinx-configure'")

    options, arguments = p.parse_args()

    # Show FIAS updates
    if options.show_versions:
        print_fias_versions()
        return

    # Manage DB
    if options.database:
        aoupdater = Updater(options.source)
        allowed_updates = None
        # The version filter is only computed for the HTTP source.
        if options.source == "http":
            allowed_updates = get_allowed_updates(options.update_version, options.database)

        if options.database == "create":
            # create new database
            aoupdater.create(allowed_updates)
        elif options.database == "update":
            # update database
            aoupdater.update(allowed_updates)

    # Manage Sphinx
    if options.sphinx and options.indexer_path and options.output_conf:
        sphinxh = SphinxHelper()
        sphinxh.configure_indexer(options.indexer_path, options.output_conf)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
class FiasException(Exception):
    """Exception type raised by this package for its own errors.

    ``str()`` of an instance is the ``repr()`` of its first argument; an
    instance created without arguments renders as the empty string.
    """

    def __str__(self):
        if not self.args:
            # No message was supplied: defer to the base class rather than
            # failing with IndexError while reporting an error.
            return Exception.__str__(self)
        return repr(self.args[0])

View File

@@ -85,8 +85,7 @@ class SphinxHelper:
splitting_seq = line.split(' ')
keyword = splitting_seq[0]
freq = splitting_seq[1].rstrip('\n')
if not keyword or not freq:
raise BaseException("Cannot process {}".format(self.files['dict.txt']))
assert keyword and freq, "Cannot process {}".format(self.files['dict.txt'])
nodes.append(keyword)
nodes.append(trigram(keyword))

View File

@@ -2,7 +2,7 @@
import json
import logging
from bottle import Bottle
from bottle import Bottle, response
from aore.fias.fiasfactory import FiasFactory
@@ -13,21 +13,29 @@ fias_factory = FiasFactory()
@app.route('/expand/<aoid:re:[\w]{8}(-[\w]{4}){3}-[\w]{12}>')
def expand(aoid):
    """Serve ``fias_factory.expand(aoid)`` as a JSON response."""
    response.content_type = 'application/json'
    payload = fias_factory.expand(aoid)
    return json.dumps(payload)
@app.route('/normalize/<aoid:re:[\w]{8}(-[\w]{4}){3}-[\w]{12}>')
def normalize(aoid):
    """Serve ``fias_factory.normalize(aoid)`` as a JSON response."""
    response.content_type = 'application/json'
    payload = fias_factory.normalize(aoid)
    return json.dumps(payload)
@app.route('/find/<text>')
@app.route('/find/<text>/<strong>')
def find(text, strong=False):
    """Serve ``fias_factory.find(text, strong)`` as a JSON response.

    :param text: search text taken from the URL.
    :param strong: optional second URL segment; the search is strong only
        when this segment is exactly "strong".
    """
    logging.warning("START")
    strong = (strong == "strong")
    # Set the content type before building the body, as the other handlers
    # do; an early return here would skip it and the closing log line.
    response.content_type = 'application/json'
    res = json.dumps(fias_factory.find(text, strong))
    logging.warning("END")
    return res
@app.error(404)
def error404(error):
    """Answer a 404 with a JSON body carrying a "Page not found" error."""
    response.content_type = 'application/json'
    body = {"error": "Page not found"}
    return json.dumps(body)

View File

@@ -3,6 +3,7 @@ import os
from aore.config import folders
from aore.dbutils.dbschemas import db_shemas
from aore.miscutils.exceptions import FiasException
from xmlparser import XMLParser
@@ -10,7 +11,7 @@ class AoDataParser:
def __init__(self, datasource, pagesize):
self.datasource = datasource
if self.datasource.table_name not in db_shemas:
raise BaseException("Cannot parse {}: Not configured.".format(self.datasource.table_name))
raise FiasException("Cannot parse {}: Not configured.".format(self.datasource.table_name))
else:
self.allowed_fields = db_shemas[self.datasource.table_name].fields

View File

@@ -8,6 +8,7 @@ import rarfile
import requests
from aore.config import folders, unrar_config
from aore.miscutils.exceptions import FiasException
from aoxmltableentry import AoXmlTableEntry
@@ -29,7 +30,7 @@ class AoRar:
if chunk:
f.write(chunk)
except:
raise BaseException("Error downloading. Reason : {}".format(format_exc()))
raise FiasException("Error downloading. Reason : {}".format(format_exc()))
logging.info("Downloaded {} bytes".format(request.headers['Content-length']))
return local_filename

View File

@@ -63,7 +63,7 @@ class Updater:
def __init_update_entries(self, updates_generator):
if self.mode == "http":
assert updates_generator
assert updates_generator, "No generator"
self.tablelist_generator = self.__get_updates_from_rar
self.updalist_generator = updates_generator
else: