Start of work on the relevance logic.
commit 586768b315 (parent 6c09dd2cdb)
@@ -24,7 +24,6 @@ class DBImpl:
     def execute(self, sql_query):
         try:
             cur = self.get_cursor()
-            print sql_query
            cur.execute(sql_query)
             self.transaction_commit()
         except:
@@ -1,25 +1,35 @@
 # -*- coding: utf-8 -*-
+import json
 import re
 
 import Levenshtein
 import psycopg2
-import sphinxapi
+import aore.sphinxapi as sphinxapi
 
-from aore.config import db as dbparams
+from aore.config import db as dbparams, sphinx_index_sugg, sphinx_index_addjobj
 from aore.dbutils.dbimpl import DBImpl
-from aore.fias.word import WordEntry
+from aore.fias.wordentry import WordEntry
 from aore.miscutils.trigram import trigram
 
 
 class SphinxSearch:
     def __init__(self):
         self.delta_len = 2
+        self.rating_limit_soft = 0.4
+        self.rating_limit_hard = 0.82
+        self.default_rating_delta = 2
+        self.regression_coef = 0.04
 
         self.db = DBImpl(psycopg2, dbparams)
         self.client = sphinxapi.SphinxClient()
         self.client.SetServer("localhost", 9312)
         self.client.SetLimits(0, 10)
 
+        self.client1 = sphinxapi.SphinxClient()
+        self.client1.SetServer("localhost", 9312)
+        self.client1.SetLimits(0, 10)
+        self.client1.SetConnectTimeout(7.0)
+
     def __configure(self, index_name, wlen=None):
         if index_name == "idx_fias_sugg":
             if wlen:
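The init changes split Sphinx access across two clients: client keeps serving the trigram suggestion index, while the new client1 is reserved for full address-object searches in extended match syntax. A rough sketch of how the second client is meant to be driven (the index name and the query phrase here are invented for illustration; the commit reads the real name from sphinx_index_addjobj):

import aore.sphinxapi as sphinxapi

client1 = sphinxapi.SphinxClient()
client1.SetServer("localhost", 9312)
client1.SetLimits(0, 10)
client1.SetConnectTimeout(7.0)
client1.SetMatchMode(sphinxapi.SPH_MATCH_EXTENDED2)  # extended query syntax
client1.SetRankingMode(sphinxapi.SPH_RANK_SPH04)     # SPH04 boosts matches at field start

# find() will join one OR-group per word with MAYBE (hypothetical phrase):
sentence = "(москва) MAYBE (ул | улица) MAYBE (туристская)"
rs = client1.Query(sentence, "idx_fias_addrobj")  # index name is an assumption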
@@ -28,15 +38,13 @@ class SphinxSearch:
                 self.client.SetFilterRange("len", int(wlen) - self.delta_len, int(wlen) + self.delta_len)
                 self.client.SetSelect("word, len, @weight+{}-abs(len-{}) AS krank".format(self.delta_len, wlen))
                 self.client.SetSortMode(sphinxapi.SPH_SORT_EXTENDED, "krank DESC")
-        else:
-            self.client.SetMatchMode(sphinxapi.MA)
 
-    def __get_suggest(self, word):
+    def __get_suggest(self, word, rating_limit, count):
         word_len = str(len(word) / 2)
         trigrammed_word = '"{}"/1'.format(trigram(word))
 
-        self.__configure("idx_fias_sugg", word_len)
-        result = self.client.Query(trigrammed_word, 'idx_fias_sugg')
+        self.__configure(sphinx_index_sugg, word_len)
+        result = self.client.Query(trigrammed_word, sphinx_index_sugg)
 
         # If no suggestions were found for this word (does that ever happen?)
         # we return []
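For context on the query built above: trigram() turns a word into its letter 3-grams, and the "..."/1 wrapper is Sphinx's quorum operator, so any shared trigram is enough to match; the krank expression then rewards candidates whose stored length is close to the query word's. A minimal sketch, assuming trigram() pads the word and emits space-separated 3-grams (the padding scheme of aore.miscutils.trigram is an assumption):

# Hypothetical stand-in for aore.miscutils.trigram (padding scheme assumed):
def trigram(word):
    padded = "__" + word + "_"   # pad so edge trigrams are distinct
    return " ".join(padded[i:i + 3] for i in range(len(padded) - 2))

trigrammed_word = '"{}"/1'.format(trigram("ленина"))
# => '"__л _ле лен ени нин ина на_"/1'
# Sphinx treats "..."/1 as a quorum: a word in the suggestion index matches
# if it shares at least one trigram with the query word.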
@@ -44,29 +52,63 @@ class SphinxSearch:
             return []
 
         maxrank = result['matches'][0]['attrs']['krank']
+        maxleven = None
 
         outlist = list()
         for match in result['matches']:
-            if maxrank - match['attrs']['krank'] < 2:
-                outlist.append([match['attrs']['word'], Levenshtein.jaro(word, match['attrs']['word'])])
+            if len(outlist) >= count:
+                break;
+
+            if maxrank - match['attrs']['krank'] < self.default_rating_delta:
+                jaro_rating = Levenshtein.jaro(word, match['attrs']['word'])
+                if not maxleven:
+                    maxleven = jaro_rating - jaro_rating * self.regression_coef
+                if jaro_rating >= rating_limit and jaro_rating >= maxleven:
+                    outlist.append([match['attrs']['word'], jaro_rating])
 
         outlist.sort(key=lambda x: x[1], reverse=True)
 
-        for x in outlist:
-            print x[0], x[1]
         return outlist
 
     def __split_phrase(self, phrase):
         phrase = unicode(phrase).replace('-', '').replace('@', '').lower()
-        return re.split(r"[ ,:.]+", phrase)
+        return re.split(r"[ ,:.#$]+", phrase)
 
-    def __process_words(self, words):
+    def __add_word_variations(self, word_entry):
+        if word_entry.MT_MANY_SUGG:
+            suggs = self.__get_suggest(word_entry.word, self.rating_limit_soft, 6)
+            for suggestion in suggs:
+                word_entry.add_variation(suggestion[0])
+        if word_entry.MT_SOME_SUGG:
+            suggs = self.__get_suggest(word_entry.word, self.rating_limit_hard, 3)
+            for suggestion in suggs:
+                word_entry.add_variation(suggestion[0])
+        if word_entry.MT_LAST_STAR:
+            word_entry.add_variation(word_entry.word+'*')
+        if word_entry.MT_AS_IS:
+            word_entry.add_variation(word_entry.word)
+        if word_entry.MT_ADD_SOCR:
+            word_entry.add_variation_socr()
+
+
+    def __get_word_entries(self, words):
         for word in words:
-            yield WordEntry(self.db, word)
+            if word != '':
+                we = WordEntry(self.db, word)
+                self.__add_word_variations(we)
+                yield we
 
     def find(self, text):
         words = self.__split_phrase(text)
-        word_entries = self.__process_words(words)
-        for word_entry in word_entries:
-            print word_entry, word_entry.get_type()
-        # result = self.client.Query(text)
-        # print json.dumps(result)
-        # logging.info("12")
+        word_entries = self.__get_word_entries(words)
+        sentence = "{}".format(" MAYBE ".join(x.get_variations() for x in word_entries))
+        #self.__configure(sphinx_index_addjobj)
+        self.client1.SetMatchMode(sphinxapi.SPH_MATCH_EXTENDED2)
+        self.client1.SetRankingMode(sphinxapi.SPH_RANK_SPH04)
+        #self.client1.SetF
+        rs = self.client1.Query(sentence, sphinx_index_addjobj)
+        print rs
+        for ma in rs['matches']:
+            print ma['attrs']['fullname'], ma['weight']
+        print sentence
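The rewritten __get_suggest is the heart of this commit's relevance logic: candidates arrive sorted by krank, each is re-scored with Jaro similarity, the first accepted candidate fixes a floor at (1 - regression_coef) of its own score, and every later one must clear both that floor and the caller's rating_limit. A worked example with the committed constants (the candidate words are made up):

import Levenshtein  # pip install python-Levenshtein

rating_limit = 0.4       # rating_limit_soft from __init__
regression_coef = 0.04   # regression_coef from __init__

word = "ленина"
# Hypothetical Sphinx matches, already sorted by krank DESC:
candidates = ["ленина", "ленинка", "лесная"]

maxleven = None
outlist = []
for cand in candidates:
    jaro_rating = Levenshtein.jaro(word, cand)
    if maxleven is None:
        # the best candidate fixes the floor at 96% of its own score
        maxleven = jaro_rating - jaro_rating * regression_coef
    if jaro_rating >= rating_limit and jaro_rating >= maxleven:
        outlist.append([cand, jaro_rating])

# "ленина" scores 1.0 and sets the floor to 0.96; "ленинка" (~0.95) and
# "лесная" (~0.78) clear rating_limit but fall below the floor and are dropped.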
@@ -41,33 +41,52 @@ class WordEntry:
 
     def __init__(self, db, word):
         self.db = db
-        self.word = word
-        self.ranks = self.__get_word_entity()
+        self.word = str(word)
+        self.variations = []
+        self.scname = None
+        self.ranks = self.__get_ranks()
 
 
         for x, y in self.match_types.iteritems():
             self.__dict__[x] = False
             for z in y:
                 self.__dict__[x] = self.__dict__[x] or re.search(z, self.ranks) is not None
 
-    def __get_word_entity(self):
+        if self.MT_LAST_STAR:
+            self.MT_AS_IS = False
+
+    def add_variation_socr(self):
+        if self.scname:
+            self.add_variation(self.scname)
+
+    def add_variation(self, variation_string):
+        self.variations.append(variation_string)
+
+    def get_variations(self):
+        #if len(self.variations) == 1:
+        #    return "\"{}\"".format(self.variations[0])
+        return "({})".format(" | ".join(self.variations))
+
+    def __get_ranks(self):
         word_len = len(self.word)
-        sql_qry = "SELECT COUNT(*) FROM \"AOTRIG\" WHERE word LIKE '{}%' AND LENGTH(word) > {} " \
-                  "UNION ALL SELECT COUNT(*) FROM \"AOTRIG\" WHERE word='{}' " \
-                  "UNION ALL SELECT COUNT(*) FROM \"SOCRBASE\" WHERE socrname ILIKE '{}'" \
-                  "UNION ALL SELECT COUNT(*) FROM \"SOCRBASE\" WHERE scname ILIKE '{}'".format(
+        sql_qry = "SELECT COUNT(*), NULL FROM \"AOTRIG\" WHERE word LIKE '{}%' AND LENGTH(word) > {} " \
+                  "UNION ALL SELECT COUNT(*), NULL FROM \"AOTRIG\" WHERE word='{}' " \
+                  "UNION ALL SELECT COUNT(*), MAX(scname) FROM \"SOCRBASE\" WHERE socrname ILIKE '{}'" \
+                  "UNION ALL SELECT COUNT(*), NULL FROM \"SOCRBASE\" WHERE scname ILIKE '{}'".format(
             self.word, word_len, self.word, self.word, self.word)
 
         result = self.db.get_rows(sql_qry)
+        if not self.scname:
+            self.scname = result[2][1]
+
         outmask = ""
         for ra in result:
             if ra[0] > 1:
                 outmask += 'x'
             else:
                 outmask += str(ra[0])
 
         return outmask
 
     def get_type(self):
         return ", ".join([x for x in self.match_types if self.__dict__[x]])
 
-    def __str__(self):
-        return str(self.word)
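For reference, __get_ranks compresses the four UNION ALL counts into a character mask (a digit, or 'x' for "more than one") that the match_types regexes then classify; the third row now also carries MAX(scname) so the abbreviation can be replayed by add_variation_socr. A worked example, assuming get_rows() returns rows in statement order (the counts are invented):

# Hypothetical result rows for the word "ул":
result = [
    (15, None),   # prefix matches in AOTRIG longer than the word
    (1,  None),   # exact AOTRIG match
    (0,  None),   # socrname matches; second column is MAX(scname)
    (1,  None),   # scname (abbreviation) matches
]

scname = result[2][1]   # stashed for add_variation_socr(); None here

outmask = ""
for ra in result:
    outmask += 'x' if ra[0] > 1 else str(ra[0])

print(outmask)  # "x101" -- the match_types regexes classify this mask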
@@ -1 +1 @@
-COPY "{{tablename}}" ({{fieldslist}}) FROM '{{csvname}}' DELIMITER '{{delim}}' NULL 'NULL'
+COPY "{{tablename}}" ({{fieldslist}}) FROM '{{csvname}}' DELIMITER '{{delim}}' NULL 'NULL'
@@ -2,4 +2,4 @@ DROP TABLE IF EXISTS "{{tablename}}_TEMP";
 CREATE TEMP TABLE "{{tablename}}_TEMP" ON COMMIT DROP AS SELECT *
 FROM "{{tablename}}" WITH NO DATA;
 COPY "{{tablename}}_TEMP" ({{fieldslist}}) FROM '{{csvname}}' DELIMITER '{{delim}}' NULL 'NULL';
-DELETE FROM "{{tablename}}" WHERE {{uniquekey}} IN (SELECT {{uniquekey}} FROM "{{tablename}}_TEMP");
+DELETE FROM "{{tablename}}" WHERE {{uniquekey}} IN (SELECT {{uniquekey}} FROM "{{tablename}}_TEMP");
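Rendered against a concrete table, the template above is the usual copy-then-merge idiom: stage the CSV in a session-local temp table, then clear colliding rows before the ON CONFLICT insert that the next template performs. A hypothetical rendering (table, field list, CSV path and delimiter are made up):

DROP TABLE IF EXISTS "SOCRBASE_TEMP";
CREATE TEMP TABLE "SOCRBASE_TEMP" ON COMMIT DROP AS SELECT *
FROM "SOCRBASE" WITH NO DATA;
COPY "SOCRBASE_TEMP" (socrname, scname) FROM '/tmp/socrbase.csv' DELIMITER ';' NULL 'NULL';
DELETE FROM "SOCRBASE" WHERE id IN (SELECT id FROM "SOCRBASE_TEMP");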
@@ -7,4 +7,4 @@ FROM
 "{{tablename}}_TEMP" ON CONFLICT ({{uniquekey}}) DO UPDATE SET {{updaterule}};
 % if tablename=="ADDROBJ":
 DELETE FROM "{{tablename}}" WHERE ACTSTATUS = FALSE OR NEXTID IS NOT NULL;
-% end
+% end
@@ -2,4 +2,4 @@ CREATE INDEX "sphinx_ind_aolevel" ON "ADDROBJ" USING btree ("aolevel");
 CREATE INDEX "sphinx_ind_parentguid" ON "ADDROBJ" USING btree ("parentguid");
 CREATE INDEX "sphinx_ind_livestatus" ON "ADDROBJ" USING btree ("livestatus");
 CREATE INDEX "sphinx_ind_aoguid" ON "ADDROBJ" USING btree ("aoguid");
-CREATE INDEX "AOTRIG_word_idx" ON "AOTRIG" USING btree ("word");
+CREATE INDEX "AOTRIG_word_idx" ON "AOTRIG" USING btree ("word");
@@ -37,4 +37,4 @@ CREATE TABLE "AOTRIG" (
 CONSTRAINT "id_aotrig" PRIMARY KEY ("id")
 )
 WITH (OIDS =FALSE
-);
+);
@@ -27,11 +27,11 @@ def main():
     p.add_option('--source', '-s', default="http",
                  help="Create/update DB from source. Value: \"http\" or absolute path to folder")
     p.add_option('--sphinx-configure', '-c', action="store_true", dest="sphinx", default="False",
-                 help="Configure sphinx. Creates sphinx.conf in working direcory")
+                 help="Configure sphinx. Creates sphinx.conf specified in '--output-conf'")
     p.add_option('--indexer-path', '-i',
-                 help="Path to sphinx indexer binary. Must be specified for '--sphinx-configure'")
+                 help="Path to sphinx indexer binary. Required for '--sphinx-configure'")
     p.add_option('--output-conf', '-o',
-                 help="Output config filename. Must be specified for '--sphinx-configure'")
+                 help="Output config filename. Required for '--sphinx-configure'")
     p.add_option('--test', '-t', action="store_true", dest="test",
                  help="Test")
 
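With the help-text cleanup, the contract reads plainly: --indexer-path and --output-conf are both required whenever --sphinx-configure is given. A hypothetical invocation (the script name and paths are invented):

    python manage.py --sphinx-configure --indexer-path /usr/bin/indexer --output-conf ./sphinx.conf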
@@ -54,7 +54,7 @@ def main():
     # 4 Debug purposes..
     if options.test:
         sph = SphinxSearch()
-        sph.find('гор Горно-алтайск проспект Ленина')
+        sph.find('#москва$#северное тушино$$$ул#туристская')
 
 if __name__ == '__main__':
     main()
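The new test phrase only tokenizes because of the regex change in __split_phrase: '#' and '$' now act as separators, while '-' and '@' are stripped outright. Roughly (a sketch mirroring the committed Python 2 code):

import re

def split_phrase(phrase):
    # mirrors SphinxSearch.__split_phrase after this commit
    phrase = unicode(phrase).replace('-', '').replace('@', '').lower()
    return re.split(r"[ ,:.#$]+", phrase)

tokens = split_phrase('#москва$#северное тушино$$$ул#туристская')
# yields '', 'москва', 'северное', 'тушино', 'ул', 'туристская' --
# the leading empty token is exactly why __get_word_entries now skips ''.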