def violet_ratio(pattern, candidate):
    """Score how well ``candidate`` matches ``pattern``, token by token.

    Both strings are split on runs of the delimiter characters `` ,:.#$-``.
    The pattern tokens are visited from last to first, and each one is
    greedily paired with the remaining candidate token that has the highest
    ``Levenshtein.ratio``. On a tie the candidate token with the larger index
    wins, because the scan runs from the end of the list and only a strictly
    better ratio replaces the current best. A paired candidate token is
    removed so that it cannot be matched a second time.

    Every pattern token contributes ``index * ratio`` to the score, where
    ``index`` is the position of the chosen token in the *current* (already
    shrunk) candidate list. When no candidate tokens are left, the pattern
    token contributes ``-1``. Finally the number of candidate tokens that
    stayed unmatched is subtracted from the total.

    :param pattern: text that is being searched for.
    :param candidate: text that is compared against ``pattern``.
    :return: numeric score (sum of the per-token contributions minus the
        count of leftover candidate tokens).
    """
    delimiters = r"[ ,:.#$-]+"
    pattern_tokens = re.split(delimiters, pattern)
    candidate_tokens = re.split(delimiters, candidate)

    scores = []

    for token in reversed(pattern_tokens):
        # -1 / -1 mean "nothing matched"; they yield the -1 penalty below
        # when the candidate list has already been exhausted.
        best_index = -1
        best_ratio = -1

        # Scan from the end so that ties keep the right-most candidate token.
        for index in range(len(candidate_tokens) - 1, -1, -1):
            ratio = Levenshtein.ratio(token, candidate_tokens[index])
            if best_ratio < ratio:
                best_ratio = ratio
                best_index = index

        scores.append(best_index * abs(best_ratio))

        # Consume the matched token so it cannot be paired again. No extra
        # bookkeeping on the index range is needed: it is rebuilt for every
        # pattern token, and ``range`` objects have no ``remove()`` on
        # Python 3, so mutating it would raise AttributeError there.
        if best_index > -1:
            del candidate_tokens[best_index]

    return sum(scores) - len(candidate_tokens)
aore.updater.aoxmltableentry import AoXmlTableEntry from aore.updater.dbhandler import DbHandler @@ -18,16 +18,16 @@ class SphinxHelper: self.aodp = DbHandler() # Создаем временную папку, если ее нет - if not os.path.exists(folders.temp): - os.makedirs(folders.temp) + if not os.path.exists(Folders.temp): + os.makedirs(Folders.temp) # оздаем 3 папки для Сфинкса - if not os.path.exists(sphinx_conf.var_dir+'/run'): - os.makedirs(sphinx_conf.var_dir+'/run') - if not os.path.exists(sphinx_conf.var_dir+'/log'): - os.makedirs(sphinx_conf.var_dir+'/log') - if not os.path.exists(sphinx_conf.var_dir+'/data'): - os.makedirs(sphinx_conf.var_dir+'/data') + if not os.path.exists(SphinxConfig.var_dir+ '/run'): + os.makedirs(SphinxConfig.var_dir + '/run') + if not os.path.exists(SphinxConfig.var_dir+ '/log'): + os.makedirs(SphinxConfig.var_dir + '/log') + if not os.path.exists(SphinxConfig.var_dir+ '/data'): + os.makedirs(SphinxConfig.var_dir + '/data') def configure_indexer(self, indexer_binary, config_filename): logging.info("Start configuring Sphinx...") @@ -64,15 +64,15 @@ class SphinxHelper: logging.info("Successfully configured. 
Please restart searchd.") def __create_sugg_index_config(self): - fname = os.path.abspath(folders.temp + "/suggest.conf") + fname = os.path.abspath(Folders.temp + "/suggest.conf") logging.info("Creating config %s", fname) - conf_data = template('aore/templates/sphinx/idx_suggest.conf', db_host=db_conf.host, - db_user=db_conf.user, - db_password=db_conf.password, - db_name=db_conf.database, db_port=db_conf.port, - index_name=sphinx_conf.index_sugg, - sphinx_var_path=sphinx_conf.var_dir) + conf_data = template('aore/templates/sphinx/idx_suggest.conf', db_host=DatabaseConfig.host, + db_user=DatabaseConfig.user, + db_password=DatabaseConfig.password, + db_name=DatabaseConfig.database, db_port=DatabaseConfig.port, + index_name=SphinxConfig.index_sugg, + sphinx_var_path=SphinxConfig.var_dir) f = open(fname, "w") f.write(conf_data) @@ -84,7 +84,7 @@ class SphinxHelper: def __dbexport_sugg_dict(self): logging.info("Place suggestion dict to DB %s...", self.files['dict.txt']) - dict_dat_fname = os.path.abspath(folders.temp + "/suggdict.csv") + dict_dat_fname = os.path.abspath(Folders.temp + "/suggdict.csv") csv_counter = 0 with open(self.files['dict.txt'], "r") as dict_file, open(dict_dat_fname, "w") as exit_file: @@ -115,17 +115,17 @@ class SphinxHelper: logging.info("Done.") def __create_ao_index_config(self): - fname = os.path.abspath(folders.temp + "/addrobj.conf") + fname = os.path.abspath(Folders.temp + "/addrobj.conf") logging.info("Creating config %s", fname) - conf_data = template('aore/templates/sphinx/idx_addrobj.conf', db_host=db_conf.host, - db_user=db_conf.user, - db_password=db_conf.password, - db_name=db_conf.database, db_port=db_conf.port, + conf_data = template('aore/templates/sphinx/idx_addrobj.conf', db_host=DatabaseConfig.host, + db_user=DatabaseConfig.user, + db_password=DatabaseConfig.password, + db_name=DatabaseConfig.database, db_port=DatabaseConfig.port, sql_query=template('aore/templates/postgre/sphinx_query.sql').replace("\n", " \\\n"), - 
index_name=sphinx_conf.index_addjobj, - sphinx_var_path=sphinx_conf.var_dir, - min_length_to_star=sphinx_conf.min_length_to_star) + index_name=SphinxConfig.index_addjobj, + sphinx_var_path=SphinxConfig.var_dir, + min_length_to_star=SphinxConfig.min_length_to_star) f = open(fname, "w") f.write(conf_data) @@ -136,11 +136,11 @@ class SphinxHelper: return fname def __create_suggestion_dict(self): - fname = os.path.abspath(folders.temp + "/suggdict.txt") + fname = os.path.abspath(Folders.temp + "/suggdict.txt") logging.info("Make suggestion dict (%s)...", fname) run_builddict_cmd = "{} {} -c {} --buildstops {} 200000 --buildfreqs".format(self.index_binary, - sphinx_conf.index_addjobj, + SphinxConfig.index_addjobj, self.files['addrobj.conf'], fname) os.system(run_builddict_cmd) logging.info("Done.") @@ -152,8 +152,8 @@ class SphinxHelper: logging.info("Creating main config %s...", out_filename) conf_data = template('aore/templates/sphinx/sphinx.conf', - sphinx_listen=sphinx_conf.listen.replace("unix://", ''), - sphinx_var_path=sphinx_conf.var_dir) + sphinx_listen=SphinxConfig.listen.replace("unix://", ''), + sphinx_var_path=SphinxConfig.var_dir) f = open(out_filename, "w") for fname, fpath in self.files.iteritems(): diff --git a/aore/search/fiasfactory.py b/aore/search/fiasfactory.py index b8cdae8..edee085 100644 --- a/aore/search/fiasfactory.py +++ b/aore/search/fiasfactory.py @@ -8,14 +8,14 @@ import psycopg2 import traceback from bottle import template -from aore.config import db_conf, basic +from aore.config import DatabaseConfig, BasicConfig from aore.dbutils.dbimpl import DBImpl from search import SphinxSearch class FiasFactory: def __init__(self): - self.db = DBImpl(psycopg2, db_conf) + self.db = DBImpl(psycopg2, DatabaseConfig) self.searcher = SphinxSearch(self.db) self.expand_templ = template('aore/templates/postgre/expand_query.sql', aoid="//aoid") self.normalize_templ = template('aore/templates/postgre/normalize_query.sql', aoid="//aoid") @@ -57,7 +57,7 @@ 
class FiasFactory: results = self.searcher.find(text, strong) except Exception, err: - if basic.logging: + if BasicConfig.logging: logging.error(traceback.format_exc(err)) return dict(error=err.args[0]) @@ -71,7 +71,7 @@ class FiasFactory: sql_query = self.normalize_templ.replace("//aoid", aoid_guid) rows = self.db.get_rows(sql_query, True) except Exception, err: - if basic.logging: + if BasicConfig.logging: logging.error(traceback.format_exc(err)) return dict(error=err.args[0]) @@ -92,7 +92,7 @@ class FiasFactory: sql_query = self.expand_templ.replace("//aoid", normalized_id) rows = self.db.get_rows(sql_query, True) except Exception, err: - if basic.logging: + if BasicConfig.logging: logging.error(traceback.format_exc(err)) return dict(error=err.args[0]) diff --git a/aore/search/search.py b/aore/search/search.py index d734e64..0b879ca 100644 --- a/aore/search/search.py +++ b/aore/search/search.py @@ -6,9 +6,10 @@ import time import Levenshtein import sphinxapi -from aore.config import basic -from aore.config import sphinx_conf +from aore.config import BasicConfig +from aore.config import SphinxConfig from aore.miscutils.exceptions import FiasException +from aore.miscutils.fysearch import violet_ratio from aore.miscutils.trigram import trigram from wordentry import WordEntry from wordvariation import VariationType @@ -28,12 +29,12 @@ class SphinxSearch: def __init__(self, db): self.db = db - sphinx_host = sphinx_conf.listen + sphinx_host = SphinxConfig.listen sphinx_port = None # Получаем строку подключения для Sphinx - if ":" in sphinx_conf.listen and "unix:/" not in sphinx_conf.listen: - sphinx_host, sphinx_port = sphinx_conf.listen.split(":") + if ":" in SphinxConfig.listen and "unix:/" not in SphinxConfig.listen: + sphinx_host, sphinx_port = SphinxConfig.listen.split(":") sphinx_port = int(sphinx_port) # Настраиваем подключение для подсказок @@ -50,7 +51,7 @@ class SphinxSearch: def __configure(self, index_name, word_len): self.client_sugg.ResetFilters() - if 
index_name == sphinx_conf.index_sugg: + if index_name == SphinxConfig.index_sugg: self.client_sugg.SetRankingMode(sphinxapi.SPH_RANK_WORDCOUNT) self.client_sugg.SetFilterRange("len", int(word_len) - self.delta_len, int(word_len) + self.delta_len) self.client_sugg.SetSelect("word, len, @weight+{}-abs(len-{}) AS krank".format(self.delta_len, word_len)) @@ -64,8 +65,8 @@ class SphinxSearch: word_len = str(len(word) / 2) trigrammed_word = '"{}"/1'.format(trigram(word)) - self.__configure(sphinx_conf.index_sugg, word_len) - result = self.client_sugg.Query(trigrammed_word, sphinx_conf.index_sugg) + self.__configure(SphinxConfig.index_sugg, word_len) + result = self.client_sugg.Query(trigrammed_word, SphinxConfig.index_sugg) # Если по данному слову не найдено подсказок (а такое бывает?) # возвращаем [] @@ -135,16 +136,16 @@ class SphinxSearch: good_vars_word_count = len(set([v.parent for v in good_vars])) freq_vars_word_count = len(set([v.parent for v in freq_vars])) - self.__configure(sphinx_conf.index_addjobj, word_count) + self.__configure(SphinxConfig.index_addjobj, word_count) # формируем строки для поиска в Сфинксе for i in range(good_vars_word_count, max(0, good_vars_word_count - 3), -1): first_q = "@fullname \"{}\"/{}".format(" ".join(good_var.text for good_var in good_vars), i) if self.search_freq_words and freq_vars_word_count: second_q = " @sname {}".format(" ".join(freq_var.text for freq_var in freq_vars)) - self.client_show.AddQuery(first_q + second_q, sphinx_conf.index_addjobj) + self.client_show.AddQuery(first_q + second_q, SphinxConfig.index_addjobj) del second_q - self.client_show.AddQuery(first_q, sphinx_conf.index_addjobj) + self.client_show.AddQuery(first_q, SphinxConfig.index_addjobj) del first_q start_t = time.time() @@ -154,7 +155,7 @@ class SphinxSearch: if rs is None: raise FiasException("Cannot find sentence.") - if basic.logging: + if BasicConfig.logging: logging.info("Sphinx time for {} = {}".format(text, elapsed_t)) results = [] @@ -172,4 
+173,19 @@ class SphinxSearch: ratio=match['attrs']['krank'], cort=i)) + # При строгом поиске нам надо еще добавить fuzzy и выбрать самое большое значение при отклонении + # выше заданного + if strong: + for result in results: + result['strong_rank'] = violet_ratio(text, result['text'].lower()) + + # Сортируем по убыванию признака + results.sort(key=lambda x: x['strong_rank'], reverse=True) + + # Если подряд два одинаково релеватных результата - это плохо, на автомат такое отдавать нельзя + if abs(results[0]['strong_rank'] - results[1]['strong_rank']) == 0.0: + raise FiasException("No matches") + else: + return results[0] + return results diff --git a/aore/search/wordentry.py b/aore/search/wordentry.py index 300008b..99a3b07 100644 --- a/aore/search/wordentry.py +++ b/aore/search/wordentry.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- import re -from aore.config import sphinx_conf +from aore.config import SphinxConfig from aore.search.wordvariation import WordVariation, VariationType @@ -68,7 +68,7 @@ class WordEntry: self.MT_AS_IS = False # Строка слишком котроткая, то по лайку не ищем, сфинкс такого не прожует - if self.MT_LAST_STAR and self.word_len < sphinx_conf.min_length_to_star: + if self.MT_LAST_STAR and self.word_len < SphinxConfig.min_length_to_star: self.MT_LAST_STAR = False self.MT_AS_IS = True diff --git a/aore/updater/aodataparser.py b/aore/updater/aodataparser.py index 32c6ff8..e0e2522 100644 --- a/aore/updater/aodataparser.py +++ b/aore/updater/aodataparser.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- import os -from aore.config import folders +from aore.config import Folders from aore.dbutils.dbschemas import db_shemas from aore.miscutils.exceptions import FiasException from aore.updater.xmlparser import XMLParser @@ -16,8 +16,8 @@ class AoDataParser: self.allowed_fields = db_shemas[self.datasource.table_name].fields # Создаем временную папку, если ее нет - if not os.path.exists(folders.temp): - os.makedirs(folders.temp) + if not 
os.path.exists(Folders.temp): + os.makedirs(Folders.temp) self.pagesize = pagesize self.currentpage = 0 @@ -56,7 +56,7 @@ class AoDataParser: self.data_bereit_callback = data_callback self.currentpage = 0 self.base_filename = \ - folders.temp + "/fd_" + \ + Folders.temp + "/fd_" + \ str(self.datasource.operation_type) + "_" + \ self.datasource.table_name + ".csv.part{}" self.counter = self.pagesize + 1 diff --git a/aore/updater/aorar.py b/aore/updater/aorar.py index b8b095d..05c1d36 100644 --- a/aore/updater/aorar.py +++ b/aore/updater/aorar.py @@ -7,14 +7,14 @@ from traceback import format_exc import rarfile import requests -from aore.config import folders, unrar_config +from aore.config import Folders, UnrarConfig from aore.miscutils.exceptions import FiasException from aoxmltableentry import AoXmlTableEntry class AoRar: def __init__(self): - rarfile.UNRAR_TOOL = unrar_config.path + rarfile.UNRAR_TOOL = UnrarConfig.path self.fname = None self.mode = None @@ -25,12 +25,12 @@ class AoRar: def download(self, url): logging.info("Downloading %s", url) try: - local_filename = os.path.abspath(folders.temp + "/" + url.split('/')[-1]) + local_filename = os.path.abspath(Folders.temp + "/" + url.split('/')[-1]) if os.path.isfile(local_filename): os.remove(local_filename) else: - if not os.path.exists(folders.temp): - os.makedirs(folders.temp) + if not os.path.exists(Folders.temp): + os.makedirs(Folders.temp) request = requests.get(url, stream=True) with open(local_filename, 'wb') as f: diff --git a/aore/updater/dbhandler.py b/aore/updater/dbhandler.py index 5e99337..96e00f6 100644 --- a/aore/updater/dbhandler.py +++ b/aore/updater/dbhandler.py @@ -5,7 +5,7 @@ import logging import psycopg2 from bottle import template -from aore.config import db_conf +from aore.config import DatabaseConfig from aore.dbutils.dbimpl import DBImpl from aore.dbutils.dbschemas import db_shemas from aore.updater.aoxmltableentry import AoXmlTableEntry @@ -13,7 +13,7 @@ from 
aore.updater.aoxmltableentry import AoXmlTableEntry class DbHandler: def __init__(self): - self.db = DBImpl(psycopg2, db_conf) + self.db = DBImpl(psycopg2, DatabaseConfig) def bulk_csv(self, operation_type, table_name, processed_count, csv_file_name): sql_query = None diff --git a/aore/updater/updater.py b/aore/updater/updater.py index 0568819..5babd7f 100644 --- a/aore/updater/updater.py +++ b/aore/updater/updater.py @@ -5,7 +5,7 @@ from os import walk, path import psycopg2 -from aore.config import db_conf +from aore.config import DatabaseConfig from aore.dbutils.dbimpl import DBImpl from aore.dbutils.dbschemas import allowed_tables, db_shemas from aore.updater.aodataparser import AoDataParser @@ -36,7 +36,7 @@ class Updater: def get_current_fias_version(cls): db = None try: - db = DBImpl(psycopg2, db_conf) + db = DBImpl(psycopg2, DatabaseConfig) rows = db.get_rows('SELECT version FROM "CONFIG" WHERE id=0', True) assert len(rows) > 0, "Cannot get a version" return rows[0]['version'] @@ -48,7 +48,7 @@ class Updater: @classmethod def __set__update_version(cls, updver=0): - db = DBImpl(psycopg2, db_conf) + db = DBImpl(psycopg2, DatabaseConfig) try: assert isinstance(updver, int), "Update version must be of int type." 
db.execute('UPDATE "CONFIG" SET version={} WHERE id=0'.format(updver)) @@ -69,9 +69,10 @@ class Updater: return mode def __get_updates_from_folder(self, foldername): - # TODO: Вычислять версию, если берем данные из каталога - yield dict(intver=self.__get_update_version_from_console(), - textver="Unknown", delta_url=foldername, + fias_db_version = self.__get_update_version_from_console() + yield dict(intver=fias_db_version, + textver="Version {}".format(fias_db_version), + delta_url=foldername, complete_url=foldername) @staticmethod diff --git a/config.example.py b/config.example.py index 23c87b6..8c43376 100644 --- a/config.example.py +++ b/config.example.py @@ -2,17 +2,24 @@ from aore import config # Config section -config.sphinx_conf.listen = "127.0.0.1:9312" -config.sphinx_conf.var_dir = "C:\\Sphinx" -config.db_conf.database = "pyfias" -config.db_conf.host = "192.168.0.1" -config.db_conf.port = 5432 -config.db_conf.user = "postgres" -config.db_conf.password = "postgres" +# Address and port where sphinx was listening, +# may be a unix socket like 'unix://tmp/pyphias.sock' +config.SphinxConfig.listen = "127.0.0.1:9312" +# Base sphinx folder +config.SphinxConfig.var_dir = "C:\\Sphinx" -config.unrar_config.path = "C:\\Program Files\\WinRAR\\unrar.exe" -config.folders.temp = "E:\\!TEMP" +# DB config +config.DatabaseConfig.database = "fias_db" +config.DatabaseConfig.host = "192.168.0.1" +config.DatabaseConfig.port = 5432 +config.DatabaseConfig.user = "postgres" +config.DatabaseConfig.password = "postgres" -config.basic.logging = True -config.basic.logfile = "pyphias.log" +# Path to unrar, in Linux may be 'unrar' +config.UnrarConfig.path = "C:\\Program Files\\WinRAR\\unrar.exe" +# Temp folder, in Linux may be '/tmp/myfolder' +config.Folders.temp = "E:\\!TEMP" + +config.BasicConfig.logging = True +config.BasicConfig.logfile = "pyphias.log" \ No newline at end of file diff --git a/passenger_wsgi.py b/passenger_wsgi.py index 2ddf04b..da2543f 100644 --- a/passenger_wsgi.py 
+++ b/passenger_wsgi.py @@ -9,7 +9,7 @@ except ImportError: assert "No config" # Define main app -phias_app = phias.App(config.basic.logfile) +phias_app = phias.App(config.BasicConfig.logfile) # Define wsgi app application = phias_app.get_app() diff --git a/requirements.txt b/requirements.txt index 4c5c823..669c6a1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,4 @@ enum34>=1.0.0 rarfile requests>=2.8.1 soap2py==1.16 +sphinxapi