Merge pull request #9 from jar3b/strong_search

Implemented "strong" search
This commit is contained in:
jar3b 2016-03-17 17:14:50 +04:00
commit aa604cc7b7
13 changed files with 153 additions and 82 deletions

View File

@ -1,29 +1,45 @@
# -*- coding: utf-8 -*-
class basic:
class BasicConfig:
    """General application switches, read as class attributes.

    The values are meant to be overridden from a user config module by
    assigning to the class attributes directly.
    """

    # When true, callers report errors and timings through ``logging``.
    logging = False
    # Log file name handed to the application object at start-up.
    logfile = ""

    def __init__(self):
        pass
class sphinx_conf:
class SphinxConfig:
    """Sphinx search daemon settings, read as class attributes."""

    # "host:port" address of searchd; a value containing "unix:/" is
    # treated by the search client as a socket path and is not split.
    listen = "127.0.0.1:9312"
    # Name of the index holding address objects.
    index_addjobj = "idx_fias_addrobj"
    # Name of the index used for word suggestions.
    index_sugg = "idx_fias_sugg"
    # Base Sphinx directory; "/run", "/log" and "/data" are created under it.
    var_dir = None
    # Words shorter than this are not searched with a trailing star.
    min_length_to_star = 3

    def __init__(self):
        pass
class db_conf:
class DatabaseConfig:
    """Database connection settings, read as class attributes.

    All values default to ``None`` and must be filled in by the user
    config before a connection is opened.
    """

    # Database server host name or address.
    host = None
    # Login name.
    user = None
    # Login password.
    password = None
    # Name of the database to connect to.
    database = None
    # Server port.
    port = None

    def __init__(self):
        pass
class unrar_config:
class UnrarConfig:
    """Location of the external ``unrar`` tool."""

    # Path to the unrar executable; assigned to ``rarfile.UNRAR_TOOL``.
    path = None

    def __init__(self):
        pass
class folders:
class Folders:
    """Working directories used by the application."""

    # Temporary folder for downloads and generated files; users of this
    # setting create the folder when it does not exist yet.
    temp = None

    def __init__(self):
        pass

View File

@ -0,0 +1,30 @@
# -*- coding: utf-8 -*-
import re
import Levenshtein
def violet_ratio(pattern, candidate, ratio_func=None):
    """Score how well the words of ``candidate`` cover the words of ``pattern``.

    Both strings are split into words on runs of `` ,:.#$-``; empty pieces
    (produced by a leading or trailing separator, or by an empty string) are
    dropped so that they cannot consume a candidate word.

    Pattern words are processed from last to first. Each one is paired with
    the most similar word still left in ``candidate`` (ties go to the word
    with the highest index) and that word is then removed from the pool.
    A pairing contributes ``index_of_chosen_word * similarity`` to the score;
    a pattern word with no candidate word left contributes ``-1``. Finally,
    the number of candidate words that were never paired is subtracted.

    :param pattern: the text that was searched for.
    :param candidate: the text of a found result.
    :param ratio_func: optional ``f(a, b) -> number`` similarity function;
        defaults to ``Levenshtein.ratio``.
    :return: the score as a number; larger means a better match.
    """
    if ratio_func is None:
        ratio_func = Levenshtein.ratio

    separators = r"[ ,:.#$-]+"
    pattern_words = [word for word in re.split(separators, pattern) if word]
    candidate_words = [word for word in re.split(separators, candidate) if word]

    score = 0
    for word in reversed(pattern_words):
        best_idx = -1
        best_ratio = -1

        # Scan from the end with a strict comparison so that, among equally
        # similar words, the one with the highest index wins.
        for idx in range(len(candidate_words) - 1, -1, -1):
            ratio = ratio_func(word, candidate_words[idx])
            if ratio > best_ratio:
                best_ratio = ratio
                best_idx = idx

        # With an empty pool this is -1 * abs(-1) == -1: a penalty for a
        # pattern word that has nothing left to match.
        score += best_idx * abs(best_ratio)

        if best_idx > -1:
            # A candidate word can be paired only once.
            del candidate_words[best_idx]

    return score - len(candidate_words)

View File

@ -5,7 +5,7 @@ import os
from bottle import template
from aore.config import folders, db_conf, sphinx_conf
from aore.config import Folders, DatabaseConfig, SphinxConfig
from aore.miscutils.trigram import trigram
from aore.updater.aoxmltableentry import AoXmlTableEntry
from aore.updater.dbhandler import DbHandler
@ -18,16 +18,16 @@ class SphinxHelper:
self.aodp = DbHandler()
# Создаем временную папку, если ее нет
if not os.path.exists(folders.temp):
os.makedirs(folders.temp)
if not os.path.exists(Folders.temp):
os.makedirs(Folders.temp)
# Создаем 3 папки для Сфинкса
if not os.path.exists(sphinx_conf.var_dir+'/run'):
os.makedirs(sphinx_conf.var_dir+'/run')
if not os.path.exists(sphinx_conf.var_dir+'/log'):
os.makedirs(sphinx_conf.var_dir+'/log')
if not os.path.exists(sphinx_conf.var_dir+'/data'):
os.makedirs(sphinx_conf.var_dir+'/data')
if not os.path.exists(SphinxConfig.var_dir+ '/run'):
os.makedirs(SphinxConfig.var_dir + '/run')
if not os.path.exists(SphinxConfig.var_dir+ '/log'):
os.makedirs(SphinxConfig.var_dir + '/log')
if not os.path.exists(SphinxConfig.var_dir+ '/data'):
os.makedirs(SphinxConfig.var_dir + '/data')
def configure_indexer(self, indexer_binary, config_filename):
logging.info("Start configuring Sphinx...")
@ -64,15 +64,15 @@ class SphinxHelper:
logging.info("Successfully configured. Please restart searchd.")
def __create_sugg_index_config(self):
fname = os.path.abspath(folders.temp + "/suggest.conf")
fname = os.path.abspath(Folders.temp + "/suggest.conf")
logging.info("Creating config %s", fname)
conf_data = template('aore/templates/sphinx/idx_suggest.conf', db_host=db_conf.host,
db_user=db_conf.user,
db_password=db_conf.password,
db_name=db_conf.database, db_port=db_conf.port,
index_name=sphinx_conf.index_sugg,
sphinx_var_path=sphinx_conf.var_dir)
conf_data = template('aore/templates/sphinx/idx_suggest.conf', db_host=DatabaseConfig.host,
db_user=DatabaseConfig.user,
db_password=DatabaseConfig.password,
db_name=DatabaseConfig.database, db_port=DatabaseConfig.port,
index_name=SphinxConfig.index_sugg,
sphinx_var_path=SphinxConfig.var_dir)
f = open(fname, "w")
f.write(conf_data)
@ -84,7 +84,7 @@ class SphinxHelper:
def __dbexport_sugg_dict(self):
logging.info("Place suggestion dict to DB %s...", self.files['dict.txt'])
dict_dat_fname = os.path.abspath(folders.temp + "/suggdict.csv")
dict_dat_fname = os.path.abspath(Folders.temp + "/suggdict.csv")
csv_counter = 0
with open(self.files['dict.txt'], "r") as dict_file, open(dict_dat_fname, "w") as exit_file:
@ -115,17 +115,17 @@ class SphinxHelper:
logging.info("Done.")
def __create_ao_index_config(self):
fname = os.path.abspath(folders.temp + "/addrobj.conf")
fname = os.path.abspath(Folders.temp + "/addrobj.conf")
logging.info("Creating config %s", fname)
conf_data = template('aore/templates/sphinx/idx_addrobj.conf', db_host=db_conf.host,
db_user=db_conf.user,
db_password=db_conf.password,
db_name=db_conf.database, db_port=db_conf.port,
conf_data = template('aore/templates/sphinx/idx_addrobj.conf', db_host=DatabaseConfig.host,
db_user=DatabaseConfig.user,
db_password=DatabaseConfig.password,
db_name=DatabaseConfig.database, db_port=DatabaseConfig.port,
sql_query=template('aore/templates/postgre/sphinx_query.sql').replace("\n", " \\\n"),
index_name=sphinx_conf.index_addjobj,
sphinx_var_path=sphinx_conf.var_dir,
min_length_to_star=sphinx_conf.min_length_to_star)
index_name=SphinxConfig.index_addjobj,
sphinx_var_path=SphinxConfig.var_dir,
min_length_to_star=SphinxConfig.min_length_to_star)
f = open(fname, "w")
f.write(conf_data)
@ -136,11 +136,11 @@ class SphinxHelper:
return fname
def __create_suggestion_dict(self):
fname = os.path.abspath(folders.temp + "/suggdict.txt")
fname = os.path.abspath(Folders.temp + "/suggdict.txt")
logging.info("Make suggestion dict (%s)...", fname)
run_builddict_cmd = "{} {} -c {} --buildstops {} 200000 --buildfreqs".format(self.index_binary,
sphinx_conf.index_addjobj,
SphinxConfig.index_addjobj,
self.files['addrobj.conf'], fname)
os.system(run_builddict_cmd)
logging.info("Done.")
@ -152,8 +152,8 @@ class SphinxHelper:
logging.info("Creating main config %s...", out_filename)
conf_data = template('aore/templates/sphinx/sphinx.conf',
sphinx_listen=sphinx_conf.listen.replace("unix://", ''),
sphinx_var_path=sphinx_conf.var_dir)
sphinx_listen=SphinxConfig.listen.replace("unix://", ''),
sphinx_var_path=SphinxConfig.var_dir)
f = open(out_filename, "w")
for fname, fpath in self.files.iteritems():

View File

@ -8,14 +8,14 @@ import psycopg2
import traceback
from bottle import template
from aore.config import db_conf, basic
from aore.config import DatabaseConfig, BasicConfig
from aore.dbutils.dbimpl import DBImpl
from search import SphinxSearch
class FiasFactory:
def __init__(self):
self.db = DBImpl(psycopg2, db_conf)
self.db = DBImpl(psycopg2, DatabaseConfig)
self.searcher = SphinxSearch(self.db)
self.expand_templ = template('aore/templates/postgre/expand_query.sql', aoid="//aoid")
self.normalize_templ = template('aore/templates/postgre/normalize_query.sql', aoid="//aoid")
@ -57,7 +57,7 @@ class FiasFactory:
results = self.searcher.find(text, strong)
except Exception, err:
if basic.logging:
if BasicConfig.logging:
logging.error(traceback.format_exc(err))
return dict(error=err.args[0])
@ -71,7 +71,7 @@ class FiasFactory:
sql_query = self.normalize_templ.replace("//aoid", aoid_guid)
rows = self.db.get_rows(sql_query, True)
except Exception, err:
if basic.logging:
if BasicConfig.logging:
logging.error(traceback.format_exc(err))
return dict(error=err.args[0])
@ -92,7 +92,7 @@ class FiasFactory:
sql_query = self.expand_templ.replace("//aoid", normalized_id)
rows = self.db.get_rows(sql_query, True)
except Exception, err:
if basic.logging:
if BasicConfig.logging:
logging.error(traceback.format_exc(err))
return dict(error=err.args[0])

View File

@ -6,9 +6,10 @@ import time
import Levenshtein
import sphinxapi
from aore.config import basic
from aore.config import sphinx_conf
from aore.config import BasicConfig
from aore.config import SphinxConfig
from aore.miscutils.exceptions import FiasException
from aore.miscutils.fysearch import violet_ratio
from aore.miscutils.trigram import trigram
from wordentry import WordEntry
from wordvariation import VariationType
@ -28,12 +29,12 @@ class SphinxSearch:
def __init__(self, db):
self.db = db
sphinx_host = sphinx_conf.listen
sphinx_host = SphinxConfig.listen
sphinx_port = None
# Получаем строку подключения для Sphinx
if ":" in sphinx_conf.listen and "unix:/" not in sphinx_conf.listen:
sphinx_host, sphinx_port = sphinx_conf.listen.split(":")
if ":" in SphinxConfig.listen and "unix:/" not in SphinxConfig.listen:
sphinx_host, sphinx_port = SphinxConfig.listen.split(":")
sphinx_port = int(sphinx_port)
# Настраиваем подключение для подсказок
@ -50,7 +51,7 @@ class SphinxSearch:
def __configure(self, index_name, word_len):
self.client_sugg.ResetFilters()
if index_name == sphinx_conf.index_sugg:
if index_name == SphinxConfig.index_sugg:
self.client_sugg.SetRankingMode(sphinxapi.SPH_RANK_WORDCOUNT)
self.client_sugg.SetFilterRange("len", int(word_len) - self.delta_len, int(word_len) + self.delta_len)
self.client_sugg.SetSelect("word, len, @weight+{}-abs(len-{}) AS krank".format(self.delta_len, word_len))
@ -64,8 +65,8 @@ class SphinxSearch:
word_len = str(len(word) / 2)
trigrammed_word = '"{}"/1'.format(trigram(word))
self.__configure(sphinx_conf.index_sugg, word_len)
result = self.client_sugg.Query(trigrammed_word, sphinx_conf.index_sugg)
self.__configure(SphinxConfig.index_sugg, word_len)
result = self.client_sugg.Query(trigrammed_word, SphinxConfig.index_sugg)
# Если по данному слову не найдено подсказок (а такое бывает?)
# возвращаем []
@ -135,16 +136,16 @@ class SphinxSearch:
good_vars_word_count = len(set([v.parent for v in good_vars]))
freq_vars_word_count = len(set([v.parent for v in freq_vars]))
self.__configure(sphinx_conf.index_addjobj, word_count)
self.__configure(SphinxConfig.index_addjobj, word_count)
# формируем строки для поиска в Сфинксе
for i in range(good_vars_word_count, max(0, good_vars_word_count - 3), -1):
first_q = "@fullname \"{}\"/{}".format(" ".join(good_var.text for good_var in good_vars), i)
if self.search_freq_words and freq_vars_word_count:
second_q = " @sname {}".format(" ".join(freq_var.text for freq_var in freq_vars))
self.client_show.AddQuery(first_q + second_q, sphinx_conf.index_addjobj)
self.client_show.AddQuery(first_q + second_q, SphinxConfig.index_addjobj)
del second_q
self.client_show.AddQuery(first_q, sphinx_conf.index_addjobj)
self.client_show.AddQuery(first_q, SphinxConfig.index_addjobj)
del first_q
start_t = time.time()
@ -154,7 +155,7 @@ class SphinxSearch:
if rs is None:
raise FiasException("Cannot find sentence.")
if basic.logging:
if BasicConfig.logging:
logging.info("Sphinx time for {} = {}".format(text, elapsed_t))
results = []
@ -172,4 +173,19 @@ class SphinxSearch:
ratio=match['attrs']['krank'],
cort=i))
# При строгом поиске нам надо еще добавить fuzzy и выбрать самое большое значение при отклонении
# выше заданного
if strong:
for result in results:
result['strong_rank'] = violet_ratio(text, result['text'].lower())
# Сортируем по убыванию признака
results.sort(key=lambda x: x['strong_rank'], reverse=True)
# Если подряд два одинаково релевантных результата - это плохо, на автомат такое отдавать нельзя
if abs(results[0]['strong_rank'] - results[1]['strong_rank']) == 0.0:
raise FiasException("No matches")
else:
return results[0]
return results

View File

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
import re
from aore.config import sphinx_conf
from aore.config import SphinxConfig
from aore.search.wordvariation import WordVariation, VariationType
@ -68,7 +68,7 @@ class WordEntry:
self.MT_AS_IS = False
# Если строка слишком короткая, то по лайку не ищем, сфинкс такого не прожует
if self.MT_LAST_STAR and self.word_len < sphinx_conf.min_length_to_star:
if self.MT_LAST_STAR and self.word_len < SphinxConfig.min_length_to_star:
self.MT_LAST_STAR = False
self.MT_AS_IS = True

View File

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
import os
from aore.config import folders
from aore.config import Folders
from aore.dbutils.dbschemas import db_shemas
from aore.miscutils.exceptions import FiasException
from aore.updater.xmlparser import XMLParser
@ -16,8 +16,8 @@ class AoDataParser:
self.allowed_fields = db_shemas[self.datasource.table_name].fields
# Создаем временную папку, если ее нет
if not os.path.exists(folders.temp):
os.makedirs(folders.temp)
if not os.path.exists(Folders.temp):
os.makedirs(Folders.temp)
self.pagesize = pagesize
self.currentpage = 0
@ -56,7 +56,7 @@ class AoDataParser:
self.data_bereit_callback = data_callback
self.currentpage = 0
self.base_filename = \
folders.temp + "/fd_" + \
Folders.temp + "/fd_" + \
str(self.datasource.operation_type) + "_" + \
self.datasource.table_name + ".csv.part{}"
self.counter = self.pagesize + 1

View File

@ -7,14 +7,14 @@ from traceback import format_exc
import rarfile
import requests
from aore.config import folders, unrar_config
from aore.config import Folders, UnrarConfig
from aore.miscutils.exceptions import FiasException
from aoxmltableentry import AoXmlTableEntry
class AoRar:
def __init__(self):
rarfile.UNRAR_TOOL = unrar_config.path
rarfile.UNRAR_TOOL = UnrarConfig.path
self.fname = None
self.mode = None
@ -25,12 +25,12 @@ class AoRar:
def download(self, url):
logging.info("Downloading %s", url)
try:
local_filename = os.path.abspath(folders.temp + "/" + url.split('/')[-1])
local_filename = os.path.abspath(Folders.temp + "/" + url.split('/')[-1])
if os.path.isfile(local_filename):
os.remove(local_filename)
else:
if not os.path.exists(folders.temp):
os.makedirs(folders.temp)
if not os.path.exists(Folders.temp):
os.makedirs(Folders.temp)
request = requests.get(url, stream=True)
with open(local_filename, 'wb') as f:

View File

@ -5,7 +5,7 @@ import logging
import psycopg2
from bottle import template
from aore.config import db_conf
from aore.config import DatabaseConfig
from aore.dbutils.dbimpl import DBImpl
from aore.dbutils.dbschemas import db_shemas
from aore.updater.aoxmltableentry import AoXmlTableEntry
@ -13,7 +13,7 @@ from aore.updater.aoxmltableentry import AoXmlTableEntry
class DbHandler:
def __init__(self):
self.db = DBImpl(psycopg2, db_conf)
self.db = DBImpl(psycopg2, DatabaseConfig)
def bulk_csv(self, operation_type, table_name, processed_count, csv_file_name):
sql_query = None

View File

@ -5,7 +5,7 @@ from os import walk, path
import psycopg2
from aore.config import db_conf
from aore.config import DatabaseConfig
from aore.dbutils.dbimpl import DBImpl
from aore.dbutils.dbschemas import allowed_tables, db_shemas
from aore.updater.aodataparser import AoDataParser
@ -36,7 +36,7 @@ class Updater:
def get_current_fias_version(cls):
db = None
try:
db = DBImpl(psycopg2, db_conf)
db = DBImpl(psycopg2, DatabaseConfig)
rows = db.get_rows('SELECT version FROM "CONFIG" WHERE id=0', True)
assert len(rows) > 0, "Cannot get a version"
return rows[0]['version']
@ -48,7 +48,7 @@ class Updater:
@classmethod
def __set__update_version(cls, updver=0):
db = DBImpl(psycopg2, db_conf)
db = DBImpl(psycopg2, DatabaseConfig)
try:
assert isinstance(updver, int), "Update version must be of int type."
db.execute('UPDATE "CONFIG" SET version={} WHERE id=0'.format(updver))
@ -69,9 +69,10 @@ class Updater:
return mode
def __get_updates_from_folder(self, foldername):
# TODO: Вычислять версию, если берем данные из каталога
yield dict(intver=self.__get_update_version_from_console(),
textver="Unknown", delta_url=foldername,
fias_db_version = self.__get_update_version_from_console()
yield dict(intver=fias_db_version,
textver="Version {}".format(fias_db_version),
delta_url=foldername,
complete_url=foldername)
@staticmethod

View File

@ -2,17 +2,24 @@
from aore import config
# Config section
config.sphinx_conf.listen = "127.0.0.1:9312"
config.sphinx_conf.var_dir = "C:\\Sphinx"
config.db_conf.database = "pyfias"
config.db_conf.host = "192.168.0.1"
config.db_conf.port = 5432
config.db_conf.user = "postgres"
config.db_conf.password = "postgres"
# Address and port that Sphinx listens on;
# may also be a unix socket, e.g. 'unix://tmp/pyphias.sock'
config.SphinxConfig.listen = "127.0.0.1:9312"
# Base sphinx folder
config.SphinxConfig.var_dir = "C:\\Sphinx"
config.unrar_config.path = "C:\\Program Files\\WinRAR\\unrar.exe"
config.folders.temp = "E:\\!TEMP"
# DB config
config.DatabaseConfig.database = "fias_db"
config.DatabaseConfig.host = "192.168.0.1"
config.DatabaseConfig.port = 5432
config.DatabaseConfig.user = "postgres"
config.DatabaseConfig.password = "postgres"
config.basic.logging = True
config.basic.logfile = "pyphias.log"
# Path to unrar, in Linux may be 'unrar'
config.UnrarConfig.path = "C:\\Program Files\\WinRAR\\unrar.exe"
# Temp folder, in Linux may be '/tmp/myfolder'
config.Folders.temp = "E:\\!TEMP"
config.BasicConfig.logging = True
config.BasicConfig.logfile = "pyphias.log"

View File

@ -9,7 +9,7 @@ except ImportError:
assert "No config"
# Define main app
phias_app = phias.App(config.basic.logfile)
phias_app = phias.App(config.BasicConfig.logfile)
# Define wsgi app
application = phias_app.get_app()

View File

@ -7,3 +7,4 @@ enum34>=1.0.0
rarfile
requests>=2.8.1
soap2py==1.16
sphinxapi