Add indexes and sphinx.conf (conf and query stolen from https://github.com/Yuego/django-fias)

This commit is contained in:
Jack Stdin 2016-01-14 01:36:34 +03:00
parent 67f6943dce
commit 759efc43ee
14 changed files with 216 additions and 57 deletions

View File

@ -66,6 +66,8 @@ class AoUpdater:
table_entry.operation_type = AoXmlTableEntry.OperationType.create
self.process_single_entry(table_entry.operation_type, table_entry)
self.db_handler.post_create()
logging.info("Create success")
def update(self, count=1):
@ -76,7 +78,7 @@ class AoUpdater:
for update_entry in self.updalist_generator:
counter += 1
if counter > count:
logging.warning("Maximum count of updates are processed - exit")
logging.warning("Maximum count of updates ({}) are processed - exit".format(count))
break
for table_entry in self.tablelist_generator(update_entry['url']):

View File

@ -12,12 +12,14 @@ DB_INSTANCES = dict(
user="postgres",
password="intercon",
database="postgres",
port=5432
),
production=dict(
host="localhost",
user="***",
password="***",
database="***",
port=5432
)
)
@ -26,10 +28,17 @@ UNRAR_PATHES = dict(
production="unrar"
)
SPHINX_VAR_DIRS = dict(
test="C:/Sphinx",
production="/var/sphinx"
)
# Uncomment if you want to specify config_type manually
# config_type = "test"
# Main section
sphinx_index_name="sph_addrobj"
sphinx_var_dir=SPHINX_VAR_DIRS[config_type]
db = DB_INSTANCES[config_type]
unrar = UNRAR_PATHES[config_type]
trashfolder = "files/"

View File

@ -3,6 +3,7 @@
import logging
import psycopg2
from bottle import template
from aore.aoutils.aoxmltableentry import AoXmlTableEntry
from aore.config import db as dbparams
@ -14,28 +15,13 @@ class DbHandler:
def __init__(self):
self.db = DBImpl(psycopg2, dbparams)
f = open("aore/templates/postgre/bulk_create.sql")
self.syntax_bulk_create = f.read()
f.close()
f = open("aore/templates/postgre/bulk_update.sql")
self.syntax_bulk_update = f.read()
f.close()
f = open("aore/templates/postgre/bulk_delete.sql")
self.syntax_bulk_delete = f.read()
f.close()
def bulk_csv(self, operation_type, table_name, processed_count, csv_file_name):
sql_query = None
# simply add new records
if operation_type == AoXmlTableEntry.OperationType.create:
sql_query = self.syntax_bulk_create \
.replace("%tab%", "\t") \
.replace("%tablename%", table_name) \
.replace("%fieldslist%", ", ".join(db_shemas[table_name].fields)) \
.replace("%csvname%", csv_file_name)
sql_query = template('aore/templates/postgre/bulk_create.sql', delim='\t', tablename=table_name,
fieldslist=", ".join(db_shemas[table_name].fields), csvname=csv_file_name)
# update table
if operation_type == AoXmlTableEntry.OperationType.update:
@ -45,41 +31,34 @@ class DbHandler:
fields_update_list += "{}=EXCLUDED.{}, ".format(field, field)
fields_update_list = fields_update_list[:-2]
sql_query = self.syntax_bulk_update \
.replace("%tab%", "\t") \
.replace("%tablename%", table_name) \
.replace("%fieldslist%", ", ".join(db_shemas[table_name].fields)) \
.replace("%csvname%", csv_file_name) \
.replace("%uniquekey%", db_shemas[table_name].unique_field) \
.replace("%updaterule%", fields_update_list)
if table_name == "ADDROBJ":
sql_query += "DELETE FROM \"%tablename%\" WHERE %filterrule%;" \
.replace("%tablename%", table_name) \
.replace("%filterrule%",
"ACTSTATUS = FALSE OR NEXTID IS NOT NULL")
sql_query = template('aore/templates/postgre/bulk_update.sql', delim='\t', tablename=table_name,
fieldslist=", ".join(db_shemas[table_name].fields), csvname=csv_file_name,
uniquekey=db_shemas[table_name].unique_field, updaterule=fields_update_list)
# delete records from table
if operation_type == AoXmlTableEntry.OperationType.delete:
sql_query = self.syntax_bulk_delete \
.replace("%tab%", "\t") \
.replace("%tablename%", table_name) \
.replace("%fieldslist%", ", ".join(db_shemas[table_name].fields)) \
.replace("%csvname%", csv_file_name) \
.replace("%uniquekey%", db_shemas[table_name].unique_field)
sql_query = template('aore/templates/postgre/bulk_delete.sql', delim='\t', tablename=table_name,
fieldslist=", ".join(db_shemas[table_name].fields), csvname=csv_file_name,
uniquekey=db_shemas[table_name].unique_field)
assert sql_query, "Invalid operation type: {}".format(operation_type)
self.db.execute(sql_query)
logging.info("Processed {} queries FROM {}".format(processed_count-1, csv_file_name))
logging.info("Processed {} queries FROM {}".format(processed_count - 1, csv_file_name))
def pre_create(self):
f = open("aore/templates/postgre/pre_create.sql")
sql_query = f.read()
f.close()
logging.info("Prepare to create DB structure...")
sql_query = template("aore/templates/postgre/pre_create.sql")
self.db.execute(sql_query)
def post_create(self):
logging.info("Indexing ADDROBJ...")
sql_query = template("aore/templates/postgre/post_create.sql")
self.db.execute(sql_query)
logging.info("Indexing done.")
def pre_update(self):
# TODO: update actions
pass

View File

38
aore/miscutils/sphinx.py Normal file
View File

@ -0,0 +1,38 @@
# -*- coding: utf-8 -*-
import logging
import os
from bottle import template
from aore.config import db as dbconfig, sphinx_index_name, sphinx_var_dir
def produce_sphinx_config(config_name):
    """Render the Sphinx search configuration and write it to *config_name*.

    Combines the data-source template (filled with DB credentials and the
    address-hierarchy SQL query) with the static indexer/searchd template.
    Prompts before overwriting an existing file; on success logs the
    indexer/searchd commands the operator should run next.
    """
    logging.info("Creating {}".format(config_name))
    # The SQL query is inlined into the generated config; newlines must be
    # escaped with a trailing backslash to form one logical sphinx.conf line.
    conf_data = template('aore/templates/sphinx/data.conf', db_host=dbconfig['host'], db_user=dbconfig['user'],
                         db_password=dbconfig['password'],
                         db_name=dbconfig['database'], db_port=dbconfig['port'],
                         sql_query=template('aore/templates/postgre/sphinx_query.sql').replace("\n", " \\\n"),
                         index_name=sphinx_index_name,
                         sphinx_var_path=sphinx_var_dir)
    conf_data += "\n" + template('aore/templates/sphinx/sphinx.conf', sphinx_var_path=sphinx_var_dir)

    if os.path.isfile(config_name):
        # Confirm before clobbering an existing config file.
        choice = raw_input(
            "WARNING! File {} already exists. It will be overwritten, "
            "all settings will be lost! Are you sure? [y/n]: ".format(
                config_name))
        if choice.lower() != 'y':
            logging.warning("Aborted.")
            return

    # 'with' guarantees the file handle is closed even if the write fails.
    with open(config_name, "w") as conf_file:
        conf_file.write(conf_data)

    logging.info("Success! Re-index db: \n"
                 "\t$indexer -c {} --all --rotate\n"
                 "and then re/start your Sphinx:\n"
                 "\t$/etc/init.d/sphinxsearch stop\n"
                 "\t$/etc/init.d/sphinxsearch start".format(config_name))

View File

@ -1 +1 @@
COPY "%tablename%" (%fieldslist%) FROM '%csvname%' DELIMITER '%tab%' NULL 'NULL'
COPY "{{tablename}}" ({{fieldslist}}) FROM '{{csvname}}' DELIMITER '{{delim}}' NULL 'NULL'

View File

@ -1,5 +1,5 @@
DROP TABLE IF EXISTS "%tablename%_TEMP";
CREATE TEMP TABLE "%tablename%_TEMP" ON COMMIT DROP AS SELECT *
FROM "%tablename%" WITH NO DATA;
COPY "%tablename%_TEMP" (%fieldslist%) FROM '%csvname%' DELIMITER '%tab%' NULL 'NULL';
DELETE FROM "%tablename%" WHERE %uniquekey% IN (SELECT %uniquekey% FROM "%tablename%_TEMP");
-- Bulk-delete template (bottle SimpleTemplate syntax: {{...}} placeholders).
-- Strategy: load the incoming CSV batch into a temp clone of the target
-- table, then delete every real row whose unique key appears in the batch.
DROP TABLE IF EXISTS "{{tablename}}_TEMP";
CREATE TEMP TABLE "{{tablename}}_TEMP" ON COMMIT DROP AS SELECT *
FROM "{{tablename}}" WITH NO DATA;
-- NOTE: COPY runs server-side, so 'csvname' must be a path readable by the
-- database server process.
COPY "{{tablename}}_TEMP" ({{fieldslist}}) FROM '{{csvname}}' DELIMITER '{{delim}}' NULL 'NULL';
DELETE FROM "{{tablename}}" WHERE {{uniquekey}} IN (SELECT {{uniquekey}} FROM "{{tablename}}_TEMP");

View File

@ -1,7 +1,10 @@
DROP TABLE IF EXISTS "%tablename%_TEMP";
CREATE TEMP TABLE "%tablename%_TEMP" ON COMMIT DROP AS SELECT *
FROM "%tablename%" WITH NO DATA;
COPY "%tablename%_TEMP" (%fieldslist%) FROM '%csvname%' DELIMITER '%tab%' NULL 'NULL';
INSERT INTO "%tablename%" (%fieldslist%) SELECT %fieldslist%
DROP TABLE IF EXISTS "{{tablename}}_TEMP";
CREATE TEMP TABLE "{{tablename}}_TEMP" ON COMMIT DROP AS SELECT *
FROM "{{tablename}}" WITH NO DATA;
COPY "{{tablename}}_TEMP" ({{fieldslist}}) FROM '{{csvname}}' DELIMITER '{{delim}}' NULL 'NULL';
INSERT INTO "{{tablename}}" ({{fieldslist}}) SELECT {{fieldslist}}
FROM
"%tablename%_TEMP" ON CONFLICT (%uniquekey%) DO UPDATE SET %updaterule%;
"{{tablename}}_TEMP" ON CONFLICT ({{uniquekey}}) DO UPDATE SET {{updaterule}};
% if tablename=="ADDROBJ":
DELETE FROM "{{tablename}}" WHERE ACTSTATUS = FALSE OR NEXTID IS NOT NULL;
% end

View File

@ -0,0 +1,4 @@
-- Indexes on ADDROBJ that support the Sphinx indexing query:
-- the recursive hierarchy walk joins on parentguid/aoguid and filters
-- on aolevel and livestatus.
CREATE INDEX "sphinx_ind_aolevel" ON "ADDROBJ" USING btree ("aolevel");
CREATE INDEX "sphinx_ind_parentguid" ON "ADDROBJ" USING btree ("parentguid");
CREATE INDEX "sphinx_ind_livestatus" ON "ADDROBJ" USING btree ("livestatus");
CREATE INDEX "sphinx_ind_aoguid" ON "ADDROBJ" USING btree ("aoguid");

View File

@ -1,5 +1,5 @@
DROP TABLE IF EXISTS "public"."ADDROBJ";
CREATE TABLE "public"."ADDROBJ" (
DROP TABLE IF EXISTS "ADDROBJ";
CREATE TABLE "ADDROBJ" (
"id" SERIAL4 NOT NULL,
"aoid" UUID NOT NULL,
"aoguid" UUID,
@ -15,8 +15,8 @@ CREATE TABLE "public"."ADDROBJ" (
)
WITH (OIDS =FALSE
);
DROP TABLE IF EXISTS "public"."SOCRBASE";
CREATE TABLE "public"."SOCRBASE" (
DROP TABLE IF EXISTS "SOCRBASE";
CREATE TABLE "SOCRBASE" (
"id" SERIAL4 NOT NULL,
"level" INT2,
"scname" VARCHAR(10),

View File

@ -0,0 +1,14 @@
WITH RECURSIVE PATH (cnt, aoid, aoguid, aolevel, fullname) AS (
SELECT ao.id as cnt, ao.aoid, ao.aoguid, ao.aolevel,
ao.shortname || ' ' || ao.formalname AS fullname
FROM "ADDROBJ" AS ao
WHERE aolevel = 1 AND livestatus = TRUE
UNION
SELECT child.id as cnt, child.aoid, child.aoguid, child.aolevel,
PATH.fullname || ', ' || child.shortname || ' ' || child.formalname AS fullname
FROM "ADDROBJ" AS child
, PATH
WHERE child.parentguid = PATH.aoguid AND livestatus = TRUE
)
SELECT * FROM PATH WHERE AOLEVEL NOT IN (1,3)

View File

@ -0,0 +1,41 @@
source src_{{index_name}}
{
type = pgsql
sql_host = {{db_host}}
sql_user = {{db_user}}
sql_pass = {{db_password}}
sql_db = {{db_name}}
sql_port = {{db_port}}
sql_query = {{!sql_query}}
sql_field_string = fullname
sql_attr_string = aoid
sql_attr_string = aoguid
sql_attr_uint = aolevel
}
index index_{{ index_name }}
{
docinfo = extern
morphology = stem_ru
min_stemming_len = 2
stopwords =
min_word_len = 2
charset_type = utf-8
min_prefix_len = 1
min_infix_len = 0
enable_star = 1
# strip html by default
html_strip = 1
ignore_chars = @, -
charset_table = 0..9, A..Z->a..z, _, a..z, \
U+0401->U+0435, U+0451->U+0435, \
U+410..U+42F->U+430..U+44F, U+430..U+44F
source = src_{{index_name}}
path = {{sphinx_var_path}}/data/index_{{index_name}}
}

View File

@ -0,0 +1,63 @@
indexer
{
# memory limit, in bytes, kilobytes (16384K) or megabytes (256M)
# optional, default is 32M, max is 2047M, recommended is 256M to 1024M
mem_limit = 256M
# maximum IO calls per second (for I/O throttling)
# optional, default is 0 (unlimited)
#
# max_iops = 40
# maximum IO call size, bytes (for I/O throttling)
# optional, default is 0 (unlimited)
#
max_iosize = 524288
}
searchd
{
listen = 127.0.0.1:9312
# required by RT-indexes
workers = threads
# log file, searchd run info is logged here
# optional, default is 'searchd.log'
log = {{sphinx_var_path}}/log/searchd.log
# query log file, all search queries are logged here
# optional, default is empty (do not log queries)
query_log = {{sphinx_var_path}}/log/query.log
# client read timeout, seconds
# optional, default is 5
read_timeout = 5
# maximum amount of children to fork (concurrent searches to run)
# optional, default is 0 (unlimited)
max_children = 30
# PID file, searchd process ID file name
# mandatory
pid_file = {{sphinx_var_path}}/run/searchd.pid
# max amount of matches the daemon ever keeps in RAM, per-index
# WARNING, THERE'S ALSO PER-QUERY LIMIT, SEE SetLimits() API CALL
# default is 1000 (just like Google)
max_matches = 1000
# seamless rotate, prevents rotate stalls if precaching huge datasets
# optional, default is 1
seamless_rotate = 1
# whether to forcibly preopen all indexes on startup
# optional, default is 0 (do not preopen)
preopen_indexes = 0
# whether to unlink .old index copies on successful rotation.
# optional, default is 1 (do unlink)
unlink_old = 1
}

View File

@ -3,6 +3,7 @@
import optparse
from aore.aoutils.aoupdater import AoUpdater
from aore.miscutils.sphinx import produce_sphinx_config
def update_base(xml_source, updates_count):
@ -24,6 +25,8 @@ def main():
help="Count of updates to process, only for '--database update' option")
p.add_option('--source', '-s', default="http",
help="Create/update DB from source. Value: \"http\" or absolute path to folder")
p.add_option('--sphinx-configure', '-c', action="store", type="string",
help="Get Sphinx config. Value: /path/to/sphinx.conf")
options, arguments = p.parse_args()
if options.database:
@ -34,6 +37,9 @@ def main():
if options.database == "update":
update_base(options.source, int(options.update_count))
if options.sphinx_configure:
produce_sphinx_config(options.sphinx_configure)
if __name__ == '__main__':
main()