From 6c09dd2cdb0ceadfad535e1946c0b51da1d7c8a8 Mon Sep 17 00:00:00 2001
From: Jack Stdin <hellotan@live.ru>
Date: Sat, 16 Jan 2016 15:53:13 +0300
Subject: [PATCH] =?UTF-8?q?=D0=9D=D0=B0=D1=87=D0=B0=D0=BB=D0=BE=20=D1=80?=
 =?UTF-8?q?=D0=B0=D0=B7=D1=80=D0=B0=D0=B1=D0=BE=D1=82=D0=BA=D0=B8=20=D0=BB?=
 =?UTF-8?q?=D0=BE=D0=B3=D0=B8=D0=BA=D0=B8=20=D0=BE=D0=B1=D1=80=D0=B0=D0=B1?=
 =?UTF-8?q?=D0=BE=D1=82=D0=BA=D0=B8=20=D1=81=D0=BB=D0=BE=D0=B2.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 aore/dbutils/dbimpl.py                 |  1 +
 aore/dbutils/dbschemas.py              |  2 +-
 aore/fias/search.py                    | 49 +++--------------
 aore/fias/word.py                      | 73 ++++++++++++++++++++++++++
 aore/miscutils/sphinx.py               |  9 ++--
 aore/templates/postgre/post_create.sql |  3 +-
 aore/templates/postgre/pre_create.sql  | 11 ++++
 aore/templates/sphinx/idx_suggest.conf |  3 +-
 manage.py                              |  2 +-
 9 files changed, 104 insertions(+), 49 deletions(-)
 create mode 100644 aore/fias/word.py

diff --git a/aore/dbutils/dbimpl.py b/aore/dbutils/dbimpl.py
index 75b5281..4e1de9a 100644
--- a/aore/dbutils/dbimpl.py
+++ b/aore/dbutils/dbimpl.py
@@ -24,6 +24,7 @@ class DBImpl:
     def execute(self, sql_query):
         try:
             cur = self.get_cursor()
+            print sql_query
             cur.execute(sql_query)
             self.transaction_commit()
         except:
diff --git a/aore/dbutils/dbschemas.py b/aore/dbutils/dbschemas.py
index 02fdd28..45b5a8f 100644
--- a/aore/dbutils/dbschemas.py
+++ b/aore/dbutils/dbschemas.py
@@ -18,7 +18,7 @@ db_shemas['ADDROBJ'] = DbSchema("ADDROBJ",
 db_shemas['SOCRBASE'] = DbSchema("SOCRBASE", ["LEVEL", "SOCRNAME", "SCNAME", "KOD_T_ST"], "kod_t_st",
                                  "AddressObjectType")
 
-db_shemas['AOTRIG'] = DbSchema("AOTRIG", ["WORD", "TRIGRAMM"], "word",
+db_shemas['AOTRIG'] = DbSchema("AOTRIG", ["WORD", "TRIGRAMM", "FREQUENCY"], "word",
                                  None)
 
 allowed_tables = ["ADDROBJ", "SOCRBASE"]
diff --git a/aore/fias/search.py b/aore/fias/search.py
index 2ac06a6..fa9f7f1 100644
--- a/aore/fias/search.py
+++ b/aore/fias/search.py
@@ -8,6 +8,7 @@ import sphinxapi
 
 from aore.config import db as dbparams
 from aore.dbutils.dbimpl import DBImpl
+from aore.fias.word import WordEntry
 from aore.miscutils.trigram import trigram
 
 
@@ -30,44 +31,6 @@ class SphinxSearch:
             else:
                 self.client.SetMatchMode(sphinxapi.MA)
 
-    # Types =
-    class SRankType:
-        names = dict(
-            SRANK_EXACTLY_MISSPRINT=['00'],  # Точно - опечатка, нужно много подсказок, без word*
-            SRANK_EXACTLY_TYPING=['01', '11'],  # Точно - слово недопечатано, не надо подсказок, только word*
-            SRANK_PROBABLY_TYPING=['0*'],  # Возможно - слово недопечатано, немного подсказок и word*
-            SRANK_PROBABLY_FOUND=['10'],  # Возможно - слово введено точно, немного подсказок, без word*
-            SRANK_PROBABLY_COMPLEX=['1*'],
-            # Возможно, слово сложное, есть и точное совпадние, по маске Нужно немного подсказок и word*
-            SRANK_PROBABLY_SOCR=['1!']  # Возможно - сокращение, не трогаем вообще
-        )
-
-        def __init__(self, rtype):
-            self.rtype = rtype
-            for x, y in self.names.iteritems():
-                self.__dict__[x] = self.rtype in y
-
-        def __str__(self):
-            return ", ".join([x for x in self.names if self.__dict__[x]])
-
-    def __get_strong_and_uncomplete_ranks(self, word):
-        word_len = len(word)
-        sql_qry = "SELECT COUNT(*) FROM \"AOTRIG\" WHERE word LIKE '{}%' AND LENGTH(word) > {} " \
-                  "UNION ALL SELECT COUNT(*) FROM \"AOTRIG\" WHERE word='{}'".format(
-            word, word_len, word)
-
-        result = self.db.get_rows(sql_qry)
-        strong_rank = result[1][0]
-        uncomplete_rank = result[0][0]
-
-        if uncomplete_rank > 1000 and word_len < 4:
-            uncomplete_rank = '!'
-        else:
-            if uncomplete_rank > 1:
-                uncomplete_rank = '*'
-
-        return self.SRankType(str(strong_rank) + str(uncomplete_rank))
-
     def __get_suggest(self, word):
         word_len = str(len(word) / 2)
         trigrammed_word = '"{}"/1'.format(trigram(word))
@@ -95,13 +58,15 @@ class SphinxSearch:
         phrase = unicode(phrase).replace('-', '').replace('@', '').lower()
         return re.split(r"[ ,:.]+", phrase)
 
-    def __process_word(self, word):
-        print word, self.__get_strong_and_uncomplete_ranks(word)
+    def __process_words(self, words):
+        """Lazily produce a WordEntry (bound to self.db) for every item of `words`."""
+        return (WordEntry(self.db, item) for item in words)
 
     def find(self, text):
         words = self.__split_phrase(text)
-        for word in words:
-            self.__process_word(word)
+        word_entries = self.__process_words(words)
+        for word_entry in word_entries:
+            print word_entry, word_entry.get_type()
             # result = self.client.Query(text)
             # print json.dumps(result)
             # logging.info("12")
diff --git a/aore/fias/word.py b/aore/fias/word.py
new file mode 100644
index 0000000..cdcb8f6
--- /dev/null
+++ b/aore/fias/word.py
@@ -0,0 +1,73 @@
+# -*- coding: utf-8 -*-
+import re
+
+
+class WordEntry:
+    """A search word plus a mask describing how it matches the dictionaries.
+
+    Each position of `ranks` is '0' (no rows), '1' (one row) or 'x' (several):
+    1st - "AOTRIG" words that start with the word and are longer than it,
+    2nd - "AOTRIG" words equal to the word,
+    3rd - "SOCRBASE" rows whose socrname (full form) matches the word via ILIKE,
+    4th - "SOCRBASE" rows whose scname (short form) matches the word via ILIKE.
+    Every key of `match_types` becomes a boolean attribute of the instance.
+    """
+    # Planned handling by positions 1-2 (prefix match / exact match):
+    # 00 - nothing found: a typo or a junk word. Search with many suggestions.
+    # 01 - one exact match, no prefix matches. Keep the word as is.
+    # 10 - one prefix match, no exact one: unfinished word. '*' and a few suggestions.
+    # 11 - one prefix match and one exact match: found. Keep the word as is.
+    # x0 - many prefix matches, no exact one: unfinished word. A few suggestions and '*'.
+    # x1 - many prefix matches and one exact one: found something. As is and '*'.
+    # xx - many prefix matches and many exact ones. Keep as is and '*'.
+    # 0x, 1x - cannot happen: the words there are unique.
+    #
+    # Abbreviations work separately (FOR NOW), by positions 3-4 (full / short form):
+    # 00 - found nowhere: not an abbreviation (or a misspelled one). Do not process.
+    # 01 - one short form. Keep as is (short forms are in the dictionary anyway).
+    # 10, x0 - full form(s) found and no short one. Add the short form.
+    # 11 - one full and one short form (happens, e.g. 'сад'). Add as is.
+    # 0x, xx - many short forms. Do not process.
+    # 1x - one full form and many short ones: nonsense.
+    # x1 - many full forms and one short. TODO: unclear if it can happen. Do not process.
+    match_types = dict(
+        MT_MANY_SUGG=['0000'],
+        MT_SOME_SUGG=['10..', 'x0..'],
+        MT_LAST_STAR=['10..', 'x...'],
+        MT_AS_IS=['.1..', '...1', '...x'],
+        MT_ADD_SOCR=['..10', '..x0']
+    )
+
+    def __init__(self, db, word):
+        """Query `db` for the match mask of `word` and set the MT_* flags."""
+        self.db = db
+        self.word = word
+        self.ranks = self.__get_word_entity()
+
+        for name, masks in self.match_types.iteritems():
+            # Masks are 4-char regexes; against a 4-char `ranks` search() is a full match.
+            setattr(self, name, any(re.search(m, self.ranks) for m in masks))
+
+    def __get_word_entity(self):
+        """Return the mask ('0'/'1'/'x' per query) built from the COUNT(*) rows."""
+        # The word is user input: double single quotes so it cannot end the SQL
+        # literal (assumes standard_conforming_strings=on, the PostgreSQL default
+        # since 9.1). NOTE: '%' and '_' in the word still act as LIKE wildcards.
+        word = self.word.replace("'", "''")
+        # The mask relies on rows coming back in the order the SELECTs are written.
+        sql_qry = "SELECT COUNT(*) FROM \"AOTRIG\" WHERE word LIKE '{}%' AND LENGTH(word) > {} " \
+                  "UNION ALL SELECT COUNT(*) FROM \"AOTRIG\" WHERE word='{}' " \
+                  "UNION ALL SELECT COUNT(*) FROM \"SOCRBASE\" WHERE socrname ILIKE '{}' " \
+                  "UNION ALL SELECT COUNT(*) FROM \"SOCRBASE\" WHERE scname ILIKE '{}'".format(
+            word, len(self.word), word, word, word)
+
+        rows = self.db.get_rows(sql_qry)
+        return "".join('x' if row[0] > 1 else str(row[0]) for row in rows)
+
+    def get_type(self):
+        """Return the names of the match types that are set, joined with ', '."""
+        return ", ".join([x for x in self.match_types if self.__dict__[x]])
+
+    def __str__(self):
+        """Return the word as a byte string; a unicode word is encoded as UTF-8."""
+        return self.word.encode('utf-8') if isinstance(self.word, unicode) else str(self.word)
diff --git a/aore/miscutils/sphinx.py b/aore/miscutils/sphinx.py
index c4e2c07..fd54ad1 100644
--- a/aore/miscutils/sphinx.py
+++ b/aore/miscutils/sphinx.py
@@ -36,7 +36,7 @@ class SphinxHelper:
         out_fname = self.__create_main_config(config_filename)
 
         # Indexing both configs
-        run_index_cmd = "{} -c {} --all".format(self.index_binary, out_fname)
+        run_index_cmd = "{} -c {} --all --rotate".format(self.index_binary, out_fname)
         logging.info("Indexing main ({})...".format(out_fname))
         os.system(run_index_cmd)
         logging.info("All indexes were created.".format(out_fname))
@@ -81,12 +81,15 @@ class SphinxHelper:
                 if line == '':
                     break
 
-                keyword = line.split(' ')[0]
-                if not keyword:
+                splitting_seq = line.rstrip('\n').split(' ')
+                keyword = splitting_seq[0]
+                freq = splitting_seq[1] if len(splitting_seq) > 1 else ''  # '' is reported below, not IndexError
+                if not keyword or not freq:
                     raise BaseException("Cannot process {}".format(self.files['dict.txt']))
 
                 nodes.append(keyword)
                 nodes.append(trigram(keyword))
+                nodes.append(freq)
 
                 exit_file.write("\t".join(nodes) + "\n")
 
diff --git a/aore/templates/postgre/post_create.sql b/aore/templates/postgre/post_create.sql
index 49e52fb..2c29013 100644
--- a/aore/templates/postgre/post_create.sql
+++ b/aore/templates/postgre/post_create.sql
@@ -1,4 +1,5 @@
 CREATE INDEX "sphinx_ind_aolevel" ON "ADDROBJ" USING btree ("aolevel");
 CREATE INDEX "sphinx_ind_parentguid" ON "ADDROBJ" USING btree ("parentguid");
 CREATE INDEX "sphinx_ind_livestatus" ON "ADDROBJ" USING btree ("livestatus");
-CREATE INDEX "sphinx_ind_aoguid" ON "ADDROBJ" USING btree ("aoguid");
\ No newline at end of file
+CREATE INDEX "sphinx_ind_aoguid" ON "ADDROBJ" USING btree ("aoguid");
+CREATE INDEX "AOTRIG_word_idx" ON "AOTRIG" USING btree ("word");
\ No newline at end of file
diff --git a/aore/templates/postgre/pre_create.sql b/aore/templates/postgre/pre_create.sql
index aff9d28..152b64b 100644
--- a/aore/templates/postgre/pre_create.sql
+++ b/aore/templates/postgre/pre_create.sql
@@ -26,4 +26,15 @@ CREATE TABLE "SOCRBASE" (
   CONSTRAINT "id_socrbase" PRIMARY KEY ("id")
 )
 WITH (OIDS =FALSE
+);
+DROP TABLE IF EXISTS "AOTRIG";
+CREATE TABLE "AOTRIG" (
+  "id"        SERIAL4 NOT NULL,
+  "word"      VARCHAR(50),
+  "trigramm"  VARCHAR(180),
+  "frequency" INT4,
+  CONSTRAINT "word" UNIQUE ("word"),
+  CONSTRAINT "id_aotrig" PRIMARY KEY ("id")
+)
+WITH (OIDS =FALSE
 );
\ No newline at end of file
diff --git a/aore/templates/sphinx/idx_suggest.conf b/aore/templates/sphinx/idx_suggest.conf
index ce3e0f7..1249518 100644
--- a/aore/templates/sphinx/idx_suggest.conf
+++ b/aore/templates/sphinx/idx_suggest.conf
@@ -7,11 +7,12 @@ source {{index_name}}
     sql_db              = {{db_name}}
     sql_port            = {{db_port}}
 
-	sql_query				= SELECT id, trigramm, word, LENGTH(word) AS len FROM "AOTRIG"
+	sql_query				= SELECT id, trigramm, word, LENGTH(word) AS len, frequency FROM "AOTRIG"
 
     sql_field_string        = trigramm
 	sql_attr_uint			= len
 	sql_attr_string			= word
+	sql_attr_uint			= frequency
 }
 
 index {{index_name}}
diff --git a/manage.py b/manage.py
index f31daec..79d4f06 100644
--- a/manage.py
+++ b/manage.py
@@ -54,7 +54,7 @@ def main():
     # 4 Debug purposes..
     if options.test:
         sph = SphinxSearch()
-        sph.find('город Гавно д. пидарская, ул Кощеева')
+        sph.find('гор Горно-алтайск проспект Ленина')
 
 if __name__ == '__main__':
     main()