diff --git a/aore/dbutils/dbimpl.py b/aore/dbutils/dbimpl.py index 4e1de9a..75b5281 100644 --- a/aore/dbutils/dbimpl.py +++ b/aore/dbutils/dbimpl.py @@ -24,7 +24,6 @@ class DBImpl: def execute(self, sql_query): try: cur = self.get_cursor() - print sql_query cur.execute(sql_query) self.transaction_commit() except: diff --git a/aore/fias/search.py b/aore/fias/search.py index fa9f7f1..5c9e998 100644 --- a/aore/fias/search.py +++ b/aore/fias/search.py @@ -1,25 +1,35 @@ # -*- coding: utf-8 -*- - +import json import re import Levenshtein import psycopg2 -import sphinxapi +import aore.sphinxapi as sphinxapi -from aore.config import db as dbparams +from aore.config import db as dbparams, sphinx_index_sugg, sphinx_index_addjobj from aore.dbutils.dbimpl import DBImpl -from aore.fias.word import WordEntry +from aore.fias.wordentry import WordEntry from aore.miscutils.trigram import trigram class SphinxSearch: def __init__(self): self.delta_len = 2 + self.rating_limit_soft = 0.4 + self.rating_limit_hard = 0.82 + self.default_rating_delta = 2 + self.regression_coef = 0.04 + self.db = DBImpl(psycopg2, dbparams) self.client = sphinxapi.SphinxClient() self.client.SetServer("localhost", 9312) self.client.SetLimits(0, 10) + self.client1 = sphinxapi.SphinxClient() + self.client1.SetServer("localhost", 9312) + self.client1.SetLimits(0, 10) + self.client1.SetConnectTimeout(7.0) + def __configure(self, index_name, wlen=None): if index_name == "idx_fias_sugg": if wlen: @@ -28,15 +38,13 @@ class SphinxSearch: self.client.SetFilterRange("len", int(wlen) - self.delta_len, int(wlen) + self.delta_len) self.client.SetSelect("word, len, @weight+{}-abs(len-{}) AS krank".format(self.delta_len, wlen)) self.client.SetSortMode(sphinxapi.SPH_SORT_EXTENDED, "krank DESC") - else: - self.client.SetMatchMode(sphinxapi.MA) - def __get_suggest(self, word): + def __get_suggest(self, word, rating_limit, count): word_len = str(len(word) / 2) trigrammed_word = '"{}"/1'.format(trigram(word)) - self.__configure("idx_fias_sugg", word_len) - result = self.client.Query(trigrammed_word, 'idx_fias_sugg') + self.__configure(sphinx_index_sugg, word_len) + result = self.client.Query(trigrammed_word, sphinx_index_sugg) # Если по данному слову не найдено подсказок (а такое бывает?) # возвращаем [] @@ -44,29 +52,63 @@ class SphinxSearch: return [] maxrank = result['matches'][0]['attrs']['krank'] + maxleven = None + outlist = list() for match in result['matches']: - if maxrank - match['attrs']['krank'] < 2: - outlist.append([match['attrs']['word'], Levenshtein.jaro(word, match['attrs']['word'])]) + if len(outlist) >= count: + break; + + if maxrank - match['attrs']['krank'] < self.default_rating_delta: + jaro_rating = Levenshtein.jaro(word, match['attrs']['word']) + if not maxleven: + maxleven = jaro_rating - jaro_rating * self.regression_coef + if jaro_rating >= rating_limit and jaro_rating >= maxleven: + outlist.append([match['attrs']['word'], jaro_rating]) + outlist.sort(key=lambda x: x[1], reverse=True) - for x in outlist: - print x[0], x[1] return outlist def __split_phrase(self, phrase): phrase = unicode(phrase).replace('-', '').replace('@', '').lower() - return re.split(r"[ ,:.]+", phrase) + return re.split(r"[ ,:.#$]+", phrase) - def __process_words(self, words): + def __add_word_variations(self, word_entry): + if word_entry.MT_MANY_SUGG: + suggs = self.__get_suggest(word_entry.word, self.rating_limit_soft, 6) + for suggestion in suggs: + word_entry.add_variation(suggestion[0]) + if word_entry.MT_SOME_SUGG: + suggs = self.__get_suggest(word_entry.word, self.rating_limit_hard, 3) + for suggestion in suggs: + word_entry.add_variation(suggestion[0]) + if word_entry.MT_LAST_STAR: + word_entry.add_variation(word_entry.word+'*') + if word_entry.MT_AS_IS: + word_entry.add_variation(word_entry.word) + if word_entry.MT_ADD_SOCR: + word_entry.add_variation_socr() + + + def __get_word_entries(self, words): for word in words: - yield WordEntry(self.db, word) + if word != '': + we = WordEntry(self.db, word) + self.__add_word_variations(we) + yield we + def find(self, text): words = self.__split_phrase(text) - word_entries = self.__process_words(words) - for word_entry in word_entries: - print word_entry, word_entry.get_type() - # result = self.client.Query(text) - # print json.dumps(result) - # logging.info("12") + word_entries = self.__get_word_entries(words) + sentence = "{}".format(" MAYBE ".join(x.get_variations() for x in word_entries)) + #self.__configure(sphinx_index_addjobj) + self.client1.SetMatchMode(sphinxapi.SPH_MATCH_EXTENDED2) + self.client1.SetRankingMode(sphinxapi.SPH_RANK_SPH04) + #self.client1.SetF + rs = self.client1.Query(sentence, sphinx_index_addjobj) + print rs + for ma in rs['matches']: + print ma['attrs']['fullname'], ma['weight'] + print sentence diff --git a/aore/fias/word.py b/aore/fias/wordentry.py similarity index 77% rename from aore/fias/word.py rename to aore/fias/wordentry.py index cdcb8f6..dd33865 100644 --- a/aore/fias/word.py +++ b/aore/fias/wordentry.py @@ -41,33 +41,52 @@ class WordEntry: def __init__(self, db, word): self.db = db - self.word = word - self.ranks = self.__get_word_entity() + self.word = str(word) + self.variations = [] + self.scname = None + self.ranks = self.__get_ranks() + for x, y in self.match_types.iteritems(): self.__dict__[x] = False for z in y: self.__dict__[x] = self.__dict__[x] or re.search(z, self.ranks) is not None - def __get_word_entity(self): + if self.MT_LAST_STAR: + self.MT_AS_IS = False + + def add_variation_socr(self): + if self.scname: + self.add_variation(self.scname) + + def add_variation(self, variation_string): + self.variations.append(variation_string) + + def get_variations(self): + #if len(self.variations) == 1: + # return "\"{}\"".format(self.variations[0]) + return "({})".format(" | ".join(self.variations)) + + def __get_ranks(self): word_len = len(self.word) - sql_qry = "SELECT COUNT(*) FROM \"AOTRIG\" WHERE word LIKE '{}%' AND LENGTH(word) > {} " \ - "UNION ALL SELECT COUNT(*) FROM \"AOTRIG\" WHERE word='{}' " \ - "UNION ALL SELECT COUNT(*) FROM \"SOCRBASE\" WHERE socrname ILIKE '{}'" \ - "UNION ALL SELECT COUNT(*) FROM \"SOCRBASE\" WHERE scname ILIKE '{}'".format( + sql_qry = "SELECT COUNT(*), NULL FROM \"AOTRIG\" WHERE word LIKE '{}%' AND LENGTH(word) > {} " \ + "UNION ALL SELECT COUNT(*), NULL FROM \"AOTRIG\" WHERE word='{}' " \ + "UNION ALL SELECT COUNT(*), MAX(scname) FROM \"SOCRBASE\" WHERE socrname ILIKE '{}'" \ + "UNION ALL SELECT COUNT(*), NULL FROM \"SOCRBASE\" WHERE scname ILIKE '{}'".format( self.word, word_len, self.word, self.word, self.word) result = self.db.get_rows(sql_qry) + if not self.scname: + self.scname = result[2][1] + outmask = "" for ra in result: if ra[0] > 1: outmask += 'x' else: outmask += str(ra[0]) + return outmask def get_type(self): return ", ".join([x for x in self.match_types if self.__dict__[x]]) - - def __str__(self): - return str(self.word) diff --git a/aore/templates/postgre/bulk_create.sql b/aore/templates/postgre/bulk_create.sql index 23a3af6..f2d6043 100644 --- a/aore/templates/postgre/bulk_create.sql +++ b/aore/templates/postgre/bulk_create.sql @@ -1 +1 @@ -COPY "{{tablename}}" ({{fieldslist}}) FROM '{{csvname}}' DELIMITER '{{delim}}' NULL 'NULL' \ No newline at end of file +COPY "{{tablename}}" ({{fieldslist}}) FROM '{{csvname}}' DELIMITER '{{delim}}' NULL 'NULL' diff --git a/aore/templates/postgre/bulk_delete.sql b/aore/templates/postgre/bulk_delete.sql index 47e27fa..7c6f510 100644 --- a/aore/templates/postgre/bulk_delete.sql +++ b/aore/templates/postgre/bulk_delete.sql @@ -2,4 +2,4 @@ DROP TABLE IF EXISTS "{{tablename}}_TEMP"; CREATE TEMP TABLE "{{tablename}}_TEMP" ON COMMIT DROP AS SELECT * FROM "{{tablename}}" WITH NO DATA; COPY "{{tablename}}_TEMP" ({{fieldslist}}) FROM '{{csvname}}' DELIMITER '{{delim}}' NULL 'NULL'; -DELETE FROM "{{tablename}}" WHERE {{uniquekey}} IN (SELECT {{uniquekey}} FROM "{{tablename}}_TEMP"); \ No newline at end of file +DELETE FROM "{{tablename}}" WHERE {{uniquekey}} IN (SELECT {{uniquekey}} FROM "{{tablename}}_TEMP"); diff --git a/aore/templates/postgre/bulk_update.sql b/aore/templates/postgre/bulk_update.sql index 43a2d3e..ec1e171 100644 --- a/aore/templates/postgre/bulk_update.sql +++ b/aore/templates/postgre/bulk_update.sql @@ -7,4 +7,4 @@ FROM "{{tablename}}_TEMP" ON CONFLICT ({{uniquekey}}) DO UPDATE SET {{updaterule}}; % if tablename=="ADDROBJ": DELETE FROM "{{tablename}}" WHERE ACTSTATUS = FALSE OR NEXTID IS NOT NULL; -% end \ No newline at end of file +% end diff --git a/aore/templates/postgre/post_create.sql b/aore/templates/postgre/post_create.sql index 2c29013..a35b183 100644 --- a/aore/templates/postgre/post_create.sql +++ b/aore/templates/postgre/post_create.sql @@ -2,4 +2,4 @@ CREATE INDEX "sphinx_ind_aolevel" ON "ADDROBJ" USING btree ("aolevel"); CREATE INDEX "sphinx_ind_parentguid" ON "ADDROBJ" USING btree ("parentguid"); CREATE INDEX "sphinx_ind_livestatus" ON "ADDROBJ" USING btree ("livestatus"); CREATE INDEX "sphinx_ind_aoguid" ON "ADDROBJ" USING btree ("aoguid"); -CREATE INDEX "AOTRIG_word_idx" ON "AOTRIG" USING btree ("word"); \ No newline at end of file +CREATE INDEX "AOTRIG_word_idx" ON "AOTRIG" USING btree ("word"); diff --git a/aore/templates/postgre/pre_create.sql b/aore/templates/postgre/pre_create.sql index 152b64b..d38ea35 100644 --- a/aore/templates/postgre/pre_create.sql +++ b/aore/templates/postgre/pre_create.sql @@ -37,4 +37,4 @@ CREATE TABLE "AOTRIG" ( CONSTRAINT "id_aotrig" PRIMARY KEY ("id") ) WITH (OIDS =FALSE -); \ No newline at end of file +); diff --git a/manage.py b/manage.py index 79d4f06..5af8f68 100644 --- a/manage.py +++ b/manage.py @@ -27,11 +27,11 @@ def main(): p.add_option('--source', '-s', default="http", help="Create/update DB from source. Value: \"http\" or absolute path to folder") p.add_option('--sphinx-configure', '-c', action="store_true", dest="sphinx", default="False", - help="Configure sphinx. Creates sphinx.conf in working direcory") + help="Configure sphinx. Creates sphinx.conf specified in '--output-conf'") p.add_option('--indexer-path', '-i', - help="Path to sphinx indexer binary. Must be specified for '--sphinx-configure'") + help="Path to sphinx indexer binary. Required for '--sphinx-configure'") p.add_option('--output-conf', '-o', - help="Output config filename. Must be specified for '--sphinx-configure'") + help="Output config filename. Required for '--sphinx-configure'") p.add_option('--test', '-t', action="store_true", dest="test", help="Test") @@ -54,7 +54,7 @@ def main(): # 4 Debug purposes.. if options.test: sph = SphinxSearch() - sph.find('гор Горно-алтайск проспект Ленина') + sph.find('#москва$#северное тушино$$$ул#туристская') if __name__ == '__main__': main()