Начало работы над логикой релевантности.

This commit is contained in:
Jack Stdin 2016-01-17 01:18:22 +03:00
parent 6c09dd2cdb
commit 586768b315
9 changed files with 102 additions and 42 deletions

View File

@@ -24,7 +24,6 @@ class DBImpl:
def execute(self, sql_query): def execute(self, sql_query):
try: try:
cur = self.get_cursor() cur = self.get_cursor()
print sql_query
cur.execute(sql_query) cur.execute(sql_query)
self.transaction_commit() self.transaction_commit()
except: except:

View File

@@ -1,25 +1,35 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import json
import re import re
import Levenshtein import Levenshtein
import psycopg2 import psycopg2
import sphinxapi import aore.sphinxapi as sphinxapi
from aore.config import db as dbparams from aore.config import db as dbparams, sphinx_index_sugg, sphinx_index_addjobj
from aore.dbutils.dbimpl import DBImpl from aore.dbutils.dbimpl import DBImpl
from aore.fias.word import WordEntry from aore.fias.wordentry import WordEntry
from aore.miscutils.trigram import trigram from aore.miscutils.trigram import trigram
class SphinxSearch: class SphinxSearch:
def __init__(self): def __init__(self):
self.delta_len = 2 self.delta_len = 2
self.rating_limit_soft = 0.4
self.rating_limit_hard = 0.82
self.default_rating_delta = 2
self.regression_coef = 0.04
self.db = DBImpl(psycopg2, dbparams) self.db = DBImpl(psycopg2, dbparams)
self.client = sphinxapi.SphinxClient() self.client = sphinxapi.SphinxClient()
self.client.SetServer("localhost", 9312) self.client.SetServer("localhost", 9312)
self.client.SetLimits(0, 10) self.client.SetLimits(0, 10)
self.client1 = sphinxapi.SphinxClient()
self.client1.SetServer("localhost", 9312)
self.client1.SetLimits(0, 10)
self.client1.SetConnectTimeout(7.0)
def __configure(self, index_name, wlen=None): def __configure(self, index_name, wlen=None):
if index_name == "idx_fias_sugg": if index_name == "idx_fias_sugg":
if wlen: if wlen:
@@ -28,15 +38,13 @@ class SphinxSearch:
self.client.SetFilterRange("len", int(wlen) - self.delta_len, int(wlen) + self.delta_len) self.client.SetFilterRange("len", int(wlen) - self.delta_len, int(wlen) + self.delta_len)
self.client.SetSelect("word, len, @weight+{}-abs(len-{}) AS krank".format(self.delta_len, wlen)) self.client.SetSelect("word, len, @weight+{}-abs(len-{}) AS krank".format(self.delta_len, wlen))
self.client.SetSortMode(sphinxapi.SPH_SORT_EXTENDED, "krank DESC") self.client.SetSortMode(sphinxapi.SPH_SORT_EXTENDED, "krank DESC")
else:
self.client.SetMatchMode(sphinxapi.MA)
def __get_suggest(self, word): def __get_suggest(self, word, rating_limit, count):
word_len = str(len(word) / 2) word_len = str(len(word) / 2)
trigrammed_word = '"{}"/1'.format(trigram(word)) trigrammed_word = '"{}"/1'.format(trigram(word))
self.__configure("idx_fias_sugg", word_len) self.__configure(sphinx_index_sugg, word_len)
result = self.client.Query(trigrammed_word, 'idx_fias_sugg') result = self.client.Query(trigrammed_word, sphinx_index_sugg)
# Если по данному слову не найдено подсказок (а такое бывает?) # Если по данному слову не найдено подсказок (а такое бывает?)
# возвращаем [] # возвращаем []
@@ -44,29 +52,63 @@ class SphinxSearch:
return [] return []
maxrank = result['matches'][0]['attrs']['krank'] maxrank = result['matches'][0]['attrs']['krank']
maxleven = None
outlist = list() outlist = list()
for match in result['matches']: for match in result['matches']:
if maxrank - match['attrs']['krank'] < 2: if len(outlist) >= count:
outlist.append([match['attrs']['word'], Levenshtein.jaro(word, match['attrs']['word'])]) break;
if maxrank - match['attrs']['krank'] < self.default_rating_delta:
jaro_rating = Levenshtein.jaro(word, match['attrs']['word'])
if not maxleven:
maxleven = jaro_rating - jaro_rating * self.regression_coef
if jaro_rating >= rating_limit and jaro_rating >= maxleven:
outlist.append([match['attrs']['word'], jaro_rating])
outlist.sort(key=lambda x: x[1], reverse=True) outlist.sort(key=lambda x: x[1], reverse=True)
for x in outlist:
print x[0], x[1]
return outlist return outlist
def __split_phrase(self, phrase): def __split_phrase(self, phrase):
phrase = unicode(phrase).replace('-', '').replace('@', '').lower() phrase = unicode(phrase).replace('-', '').replace('@', '').lower()
return re.split(r"[ ,:.]+", phrase) return re.split(r"[ ,:.#$]+", phrase)
def __process_words(self, words): def __add_word_variations(self, word_entry):
if word_entry.MT_MANY_SUGG:
suggs = self.__get_suggest(word_entry.word, self.rating_limit_soft, 6)
for suggestion in suggs:
word_entry.add_variation(suggestion[0])
if word_entry.MT_SOME_SUGG:
suggs = self.__get_suggest(word_entry.word, self.rating_limit_hard, 3)
for suggestion in suggs:
word_entry.add_variation(suggestion[0])
if word_entry.MT_LAST_STAR:
word_entry.add_variation(word_entry.word+'*')
if word_entry.MT_AS_IS:
word_entry.add_variation(word_entry.word)
if word_entry.MT_ADD_SOCR:
word_entry.add_variation_socr()
def __get_word_entries(self, words):
for word in words: for word in words:
yield WordEntry(self.db, word) if word != '':
we = WordEntry(self.db, word)
self.__add_word_variations(we)
yield we
def find(self, text): def find(self, text):
words = self.__split_phrase(text) words = self.__split_phrase(text)
word_entries = self.__process_words(words) word_entries = self.__get_word_entries(words)
for word_entry in word_entries: sentence = "{}".format(" MAYBE ".join(x.get_variations() for x in word_entries))
print word_entry, word_entry.get_type() #self.__configure(sphinx_index_addjobj)
# result = self.client.Query(text) self.client1.SetMatchMode(sphinxapi.SPH_MATCH_EXTENDED2)
# print json.dumps(result) self.client1.SetRankingMode(sphinxapi.SPH_RANK_SPH04)
# logging.info("12") #self.client1.SetF
rs = self.client1.Query(sentence, sphinx_index_addjobj)
print rs
for ma in rs['matches']:
print ma['attrs']['fullname'], ma['weight']
print sentence

View File

@@ -41,33 +41,52 @@ class WordEntry:
def __init__(self, db, word): def __init__(self, db, word):
self.db = db self.db = db
self.word = word self.word = str(word)
self.ranks = self.__get_word_entity() self.variations = []
self.scname = None
self.ranks = self.__get_ranks()
for x, y in self.match_types.iteritems(): for x, y in self.match_types.iteritems():
self.__dict__[x] = False self.__dict__[x] = False
for z in y: for z in y:
self.__dict__[x] = self.__dict__[x] or re.search(z, self.ranks) is not None self.__dict__[x] = self.__dict__[x] or re.search(z, self.ranks) is not None
def __get_word_entity(self): if self.MT_LAST_STAR:
self.MT_AS_IS = False
def add_variation_socr(self):
if self.scname:
self.add_variation(self.scname)
def add_variation(self, variation_string):
self.variations.append(variation_string)
def get_variations(self):
#if len(self.variations) == 1:
# return "\"{}\"".format(self.variations[0])
return "({})".format(" | ".join(self.variations))
def __get_ranks(self):
word_len = len(self.word) word_len = len(self.word)
sql_qry = "SELECT COUNT(*) FROM \"AOTRIG\" WHERE word LIKE '{}%' AND LENGTH(word) > {} " \ sql_qry = "SELECT COUNT(*), NULL FROM \"AOTRIG\" WHERE word LIKE '{}%' AND LENGTH(word) > {} " \
"UNION ALL SELECT COUNT(*) FROM \"AOTRIG\" WHERE word='{}' " \ "UNION ALL SELECT COUNT(*), NULL FROM \"AOTRIG\" WHERE word='{}' " \
"UNION ALL SELECT COUNT(*) FROM \"SOCRBASE\" WHERE socrname ILIKE '{}'" \ "UNION ALL SELECT COUNT(*), MAX(scname) FROM \"SOCRBASE\" WHERE socrname ILIKE '{}'" \
"UNION ALL SELECT COUNT(*) FROM \"SOCRBASE\" WHERE scname ILIKE '{}'".format( "UNION ALL SELECT COUNT(*), NULL FROM \"SOCRBASE\" WHERE scname ILIKE '{}'".format(
self.word, word_len, self.word, self.word, self.word) self.word, word_len, self.word, self.word, self.word)
result = self.db.get_rows(sql_qry) result = self.db.get_rows(sql_qry)
if not self.scname:
self.scname = result[2][1]
outmask = "" outmask = ""
for ra in result: for ra in result:
if ra[0] > 1: if ra[0] > 1:
outmask += 'x' outmask += 'x'
else: else:
outmask += str(ra[0]) outmask += str(ra[0])
return outmask return outmask
def get_type(self): def get_type(self):
return ", ".join([x for x in self.match_types if self.__dict__[x]]) return ", ".join([x for x in self.match_types if self.__dict__[x]])
def __str__(self):
return str(self.word)

View File

@@ -1 +1 @@
COPY "{{tablename}}" ({{fieldslist}}) FROM '{{csvname}}' DELIMITER '{{delim}}' NULL 'NULL' COPY "{{tablename}}" ({{fieldslist}}) FROM '{{csvname}}' DELIMITER '{{delim}}' NULL 'NULL'

View File

@@ -2,4 +2,4 @@ DROP TABLE IF EXISTS "{{tablename}}_TEMP";
CREATE TEMP TABLE "{{tablename}}_TEMP" ON COMMIT DROP AS SELECT * CREATE TEMP TABLE "{{tablename}}_TEMP" ON COMMIT DROP AS SELECT *
FROM "{{tablename}}" WITH NO DATA; FROM "{{tablename}}" WITH NO DATA;
COPY "{{tablename}}_TEMP" ({{fieldslist}}) FROM '{{csvname}}' DELIMITER '{{delim}}' NULL 'NULL'; COPY "{{tablename}}_TEMP" ({{fieldslist}}) FROM '{{csvname}}' DELIMITER '{{delim}}' NULL 'NULL';
DELETE FROM "{{tablename}}" WHERE {{uniquekey}} IN (SELECT {{uniquekey}} FROM "{{tablename}}_TEMP"); DELETE FROM "{{tablename}}" WHERE {{uniquekey}} IN (SELECT {{uniquekey}} FROM "{{tablename}}_TEMP");

View File

@@ -7,4 +7,4 @@ FROM
"{{tablename}}_TEMP" ON CONFLICT ({{uniquekey}}) DO UPDATE SET {{updaterule}}; "{{tablename}}_TEMP" ON CONFLICT ({{uniquekey}}) DO UPDATE SET {{updaterule}};
% if tablename=="ADDROBJ": % if tablename=="ADDROBJ":
DELETE FROM "{{tablename}}" WHERE ACTSTATUS = FALSE OR NEXTID IS NOT NULL; DELETE FROM "{{tablename}}" WHERE ACTSTATUS = FALSE OR NEXTID IS NOT NULL;
% end % end

View File

@@ -2,4 +2,4 @@ CREATE INDEX "sphinx_ind_aolevel" ON "ADDROBJ" USING btree ("aolevel");
CREATE INDEX "sphinx_ind_parentguid" ON "ADDROBJ" USING btree ("parentguid"); CREATE INDEX "sphinx_ind_parentguid" ON "ADDROBJ" USING btree ("parentguid");
CREATE INDEX "sphinx_ind_livestatus" ON "ADDROBJ" USING btree ("livestatus"); CREATE INDEX "sphinx_ind_livestatus" ON "ADDROBJ" USING btree ("livestatus");
CREATE INDEX "sphinx_ind_aoguid" ON "ADDROBJ" USING btree ("aoguid"); CREATE INDEX "sphinx_ind_aoguid" ON "ADDROBJ" USING btree ("aoguid");
CREATE INDEX "AOTRIG_word_idx" ON "AOTRIG" USING btree ("word"); CREATE INDEX "AOTRIG_word_idx" ON "AOTRIG" USING btree ("word");

View File

@@ -37,4 +37,4 @@ CREATE TABLE "AOTRIG" (
CONSTRAINT "id_aotrig" PRIMARY KEY ("id") CONSTRAINT "id_aotrig" PRIMARY KEY ("id")
) )
WITH (OIDS =FALSE WITH (OIDS =FALSE
); );

View File

@@ -27,11 +27,11 @@ def main():
p.add_option('--source', '-s', default="http", p.add_option('--source', '-s', default="http",
help="Create/update DB from source. Value: \"http\" or absolute path to folder") help="Create/update DB from source. Value: \"http\" or absolute path to folder")
p.add_option('--sphinx-configure', '-c', action="store_true", dest="sphinx", default="False", p.add_option('--sphinx-configure', '-c', action="store_true", dest="sphinx", default="False",
help="Configure sphinx. Creates sphinx.conf in working direcory") help="Configure sphinx. Creates sphinx.conf specified in '--output-conf'")
p.add_option('--indexer-path', '-i', p.add_option('--indexer-path', '-i',
help="Path to sphinx indexer binary. Must be specified for '--sphinx-configure'") help="Path to sphinx indexer binary. Required for '--sphinx-configure'")
p.add_option('--output-conf', '-o', p.add_option('--output-conf', '-o',
help="Output config filename. Must be specified for '--sphinx-configure'") help="Output config filename. Required for '--sphinx-configure'")
p.add_option('--test', '-t', action="store_true", dest="test", p.add_option('--test', '-t', action="store_true", dest="test",
help="Test") help="Test")
@@ -54,7 +54,7 @@ def main():
# 4 Debug purposes.. # 4 Debug purposes..
if options.test: if options.test:
sph = SphinxSearch() sph = SphinxSearch()
sph.find('гор Горно-алтайск проспект Ленина') sph.find('#москва$#северное тушино$$$ул#туристская')
if __name__ == '__main__': if __name__ == '__main__':
main() main()