Add indexes and sphinx.conf (conf and query adapted from https://github.com/Yuego/django-fias)

Jack Stdin committed 2016-01-14 01:36:34 +03:00
parent 67f6943dce
commit 759efc43ee
14 changed files with 216 additions and 57 deletions

aore/aoutils/aoupdater.py
View File

@@ -66,6 +66,8 @@ class AoUpdater:
             table_entry.operation_type = AoXmlTableEntry.OperationType.create
             self.process_single_entry(table_entry.operation_type, table_entry)
+        self.db_handler.post_create()
         logging.info("Create success")

     def update(self, count=1):
@@ -76,7 +78,7 @@ class AoUpdater:
         for update_entry in self.updalist_generator:
             counter += 1
             if counter > count:
-                logging.warning("Maximum count of updates are processed - exit")
+                logging.warning("Maximum count of updates ({}) are processed - exit".format(count))
                 break
             for table_entry in self.tablelist_generator(update_entry['url']):

View File

@@ -12,12 +12,14 @@ DB_INSTANCES = dict(
         user="postgres",
         password="intercon",
         database="postgres",
+        port=5432
     ),

     production=dict(
         host="localhost",
         user="***",
         password="***",
         database="***",
+        port=5432
     )
 )
@@ -26,10 +28,17 @@ UNRAR_PATHES = dict(
     production="unrar"
 )

+SPHINX_VAR_DIRS = dict(
+    test="C:/Sphinx",
+    production="/var/sphinx"
+)
+
 # Uncomment if you want to specify config_type manually
 # config_type = "test"

 # Main section
+sphinx_index_name="sph_addrobj"
+sphinx_var_dir=SPHINX_VAR_DIRS[config_type]
 db = DB_INSTANCES[config_type]
 unrar = UNRAR_PATHES[config_type]
 trashfolder = "files/"

View File

@@ -3,6 +3,7 @@
 import logging
 import psycopg2
+from bottle import template

 from aore.aoutils.aoxmltableentry import AoXmlTableEntry
 from aore.config import db as dbparams
@@ -14,28 +15,13 @@ class DbHandler:
     def __init__(self):
         self.db = DBImpl(psycopg2, dbparams)
-
-        f = open("aore/templates/postgre/bulk_create.sql")
-        self.syntax_bulk_create = f.read()
-        f.close()
-
-        f = open("aore/templates/postgre/bulk_update.sql")
-        self.syntax_bulk_update = f.read()
-        f.close()
-
-        f = open("aore/templates/postgre/bulk_delete.sql")
-        self.syntax_bulk_delete = f.read()
-        f.close()

     def bulk_csv(self, operation_type, table_name, processed_count, csv_file_name):
         sql_query = None

         # simple add new reocrds
         if operation_type == AoXmlTableEntry.OperationType.create:
-            sql_query = self.syntax_bulk_create \
-                .replace("%tab%", "\t") \
-                .replace("%tablename%", table_name) \
-                .replace("%fieldslist%", ", ".join(db_shemas[table_name].fields)) \
-                .replace("%csvname%", csv_file_name)
+            sql_query = template('aore/templates/postgre/bulk_create.sql', delim='\t', tablename=table_name,
+                                 fieldslist=", ".join(db_shemas[table_name].fields), csvname=csv_file_name)

         # update table
         if operation_type == AoXmlTableEntry.OperationType.update:
@@ -45,41 +31,34 @@ class DbHandler:
                 fields_update_list += "{}=EXCLUDED.{}, ".format(field, field)
             fields_update_list = fields_update_list[:-2]

-            sql_query = self.syntax_bulk_update \
-                .replace("%tab%", "\t") \
-                .replace("%tablename%", table_name) \
-                .replace("%fieldslist%", ", ".join(db_shemas[table_name].fields)) \
-                .replace("%csvname%", csv_file_name) \
-                .replace("%uniquekey%", db_shemas[table_name].unique_field) \
-                .replace("%updaterule%", fields_update_list)
-
-            if table_name == "ADDROBJ":
-                sql_query += "DELETE FROM \"%tablename%\" WHERE %filterrule%;" \
-                    .replace("%tablename%", table_name) \
-                    .replace("%filterrule%",
-                             "ACTSTATUS = FALSE OR NEXTID IS NOT NULL")
+            sql_query = template('aore/templates/postgre/bulk_update.sql', delim='\t', tablename=table_name,
+                                 fieldslist=", ".join(db_shemas[table_name].fields), csvname=csv_file_name,
+                                 uniquekey=db_shemas[table_name].unique_field, updaterule=fields_update_list)

         # delete records from table
         if operation_type == AoXmlTableEntry.OperationType.delete:
-            sql_query = self.syntax_bulk_delete \
-                .replace("%tab%", "\t") \
-                .replace("%tablename%", table_name) \
-                .replace("%fieldslist%", ", ".join(db_shemas[table_name].fields)) \
-                .replace("%csvname%", csv_file_name) \
-                .replace("%uniquekey%", db_shemas[table_name].unique_field)
+            sql_query = template('aore/templates/postgre/bulk_delete.sql', delim='\t', tablename=table_name,
+                                 fieldslist=", ".join(db_shemas[table_name].fields), csvname=csv_file_name,
+                                 uniquekey=db_shemas[table_name].unique_field)

         assert sql_query, "Invalid operation type: {}".format(operation_type)
         self.db.execute(sql_query)
-        logging.info("Processed {} queries FROM {}".format(processed_count-1, csv_file_name))
+        logging.info("Processed {} queries FROM {}".format(processed_count - 1, csv_file_name))

     def pre_create(self):
-        f = open("aore/templates/postgre/pre_create.sql")
-        sql_query = f.read()
-        f.close()
+        logging.info("Prepare to create DB structure...")
+        sql_query = template("aore/templates/postgre/pre_create.sql")
         self.db.execute(sql_query)

+    def post_create(self):
+        logging.info("Indexing ADDROBJ...")
+        sql_query = template("aore/templates/postgre/post_create.sql")
+        self.db.execute(sql_query)
+        logging.info("Indexing done.")
+
     def pre_update(self):
         # TODO: update actions
         pass
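For reference, this is how bottle's SimpleTemplate engine fills the {{...}} placeholders that replace the old %placeholder% strings; the table, field list and CSV path below are made-up example values:

from bottle import template

# A string containing "{{" is treated by bottle as inline template source
# rather than a file name, so the same syntax as the .sql templates applies.
src = "COPY \"{{tablename}}\" ({{fieldslist}}) FROM '{{csvname}}' DELIMITER '{{delim}}' NULL 'NULL'"
print(template(src, tablename="ADDROBJ", fieldslist="aoid, aoguid, formalname",
               csvname="/tmp/addrobj.csv", delim="\t"))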

aore/miscutils/sphinx.py (new file, +38 lines)
View File

@@ -0,0 +1,38 @@
# -*- coding: utf-8 -*-
import logging
import os

from bottle import template

from aore.config import db as dbconfig, sphinx_index_name, sphinx_var_dir


def produce_sphinx_config(config_name):
    logging.info("Creating {}".format(config_name))
    conf_data = template('aore/templates/sphinx/data.conf', db_host=dbconfig['host'], db_user=dbconfig['user'],
                         db_password=dbconfig['password'],
                         db_name=dbconfig['database'], db_port=dbconfig['port'],
                         sql_query=template('aore/templates/postgre/sphinx_query.sql').replace("\n", " \\\n"),
                         index_name=sphinx_index_name,
                         sphinx_var_path=sphinx_var_dir)
    conf_data += "\n" + template('aore/templates/sphinx/sphinx.conf', sphinx_var_path=sphinx_var_dir)

    if os.path.isfile(config_name):
        choice = raw_input(
            "WARNING! File {} already exists. It will be overwritten, "
            "all settings will be lost! Are you sure? [y/n]: ".format(
                config_name))
        if choice.lower() != 'y':
            logging.warning("Aborted.")
            return

    conf_file = open(config_name, "w")
    conf_file.write(conf_data)
    conf_file.close()

    logging.info("Success! Re-index db: \n"
                 "\t$indexer -c {} --all --rotate\n"
                 "and then re/start your Sphinx:\n"
                 "\t$/etc/init.d/sphinxsearch stop\n"
                 "\t$/etc/init.d/sphinxsearch start".format(config_name))

aore/templates/postgre/bulk_create.sql
View File

@@ -1 +1 @@
-COPY "%tablename%" (%fieldslist%) FROM '%csvname%' DELIMITER '%tab%' NULL 'NULL'
+COPY "{{tablename}}" ({{fieldslist}}) FROM '{{csvname}}' DELIMITER '{{delim}}' NULL 'NULL'

aore/templates/postgre/bulk_delete.sql
View File

@@ -1,5 +1,5 @@
-DROP TABLE IF EXISTS "%tablename%_TEMP";
-CREATE TEMP TABLE "%tablename%_TEMP" ON COMMIT DROP AS SELECT *
-FROM "%tablename%" WITH NO DATA;
-COPY "%tablename%_TEMP" (%fieldslist%) FROM '%csvname%' DELIMITER '%tab%' NULL 'NULL';
-DELETE FROM "%tablename%" WHERE %uniquekey% IN (SELECT %uniquekey% FROM "%tablename%_TEMP");
+DROP TABLE IF EXISTS "{{tablename}}_TEMP";
+CREATE TEMP TABLE "{{tablename}}_TEMP" ON COMMIT DROP AS SELECT *
+FROM "{{tablename}}" WITH NO DATA;
+COPY "{{tablename}}_TEMP" ({{fieldslist}}) FROM '{{csvname}}' DELIMITER '{{delim}}' NULL 'NULL';
+DELETE FROM "{{tablename}}" WHERE {{uniquekey}} IN (SELECT {{uniquekey}} FROM "{{tablename}}_TEMP");

aore/templates/postgre/bulk_update.sql
View File

@@ -1,7 +1,10 @@
-DROP TABLE IF EXISTS "%tablename%_TEMP";
-CREATE TEMP TABLE "%tablename%_TEMP" ON COMMIT DROP AS SELECT *
-FROM "%tablename%" WITH NO DATA;
-COPY "%tablename%_TEMP" (%fieldslist%) FROM '%csvname%' DELIMITER '%tab%' NULL 'NULL';
-INSERT INTO "%tablename%" (%fieldslist%) SELECT %fieldslist%
-FROM
-"%tablename%_TEMP" ON CONFLICT (%uniquekey%) DO UPDATE SET %updaterule%;
+DROP TABLE IF EXISTS "{{tablename}}_TEMP";
+CREATE TEMP TABLE "{{tablename}}_TEMP" ON COMMIT DROP AS SELECT *
+FROM "{{tablename}}" WITH NO DATA;
+COPY "{{tablename}}_TEMP" ({{fieldslist}}) FROM '{{csvname}}' DELIMITER '{{delim}}' NULL 'NULL';
+INSERT INTO "{{tablename}}" ({{fieldslist}}) SELECT {{fieldslist}}
+FROM
+"{{tablename}}_TEMP" ON CONFLICT ({{uniquekey}}) DO UPDATE SET {{updaterule}};
+% if tablename=="ADDROBJ":
+DELETE FROM "{{tablename}}" WHERE ACTSTATUS = FALSE OR NEXTID IS NOT NULL;
+% end
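The % if / % end pair is a bottle SimpleTemplate code block: the extra DELETE is emitted only when the template is rendered for ADDROBJ, replacing the string concatenation removed from DbHandler above. A rendering sketch with made-up field values (run from the project root so bottle can find the template, just as DbHandler does):

from bottle import template

sql = template('aore/templates/postgre/bulk_update.sql', delim='\t',
               tablename='ADDROBJ', fieldslist='aoid, aoguid',
               csvname='/tmp/addrobj.csv', uniquekey='aoid',
               updaterule='aoguid=EXCLUDED.aoguid')
print(sql)
# For tablename='ADDROBJ' the output ends with:
#   DELETE FROM "ADDROBJ" WHERE ACTSTATUS = FALSE OR NEXTID IS NOT NULL;
# For any other table that final statement is omitted.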

aore/templates/postgre/post_create.sql (new file)
View File

@@ -0,0 +1,4 @@
CREATE INDEX "sphinx_ind_aolevel" ON "ADDROBJ" USING btree ("aolevel");
CREATE INDEX "sphinx_ind_parentguid" ON "ADDROBJ" USING btree ("parentguid");
CREATE INDEX "sphinx_ind_livestatus" ON "ADDROBJ" USING btree ("livestatus");
CREATE INDEX "sphinx_ind_aoguid" ON "ADDROBJ" USING btree ("aoguid");
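These btree indexes line up with the columns the Sphinx source query below filters and joins on (aolevel, livestatus, parentguid) plus the aoguid lookup key; building them in post_create(), only after the bulk COPY of the create run, follows the usual practice of loading data first and indexing afterwards.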

aore/templates/postgre/pre_create.sql
View File

@@ -1,5 +1,5 @@
-DROP TABLE IF EXISTS "public"."ADDROBJ";
-CREATE TABLE "public"."ADDROBJ" (
+DROP TABLE IF EXISTS "ADDROBJ";
+CREATE TABLE "ADDROBJ" (
 "id" SERIAL4 NOT NULL,
 "aoid" UUID NOT NULL,
 "aoguid" UUID,
@@ -15,8 +15,8 @@ CREATE TABLE "public"."ADDROBJ" (
 )
 WITH (OIDS =FALSE
 );
-DROP TABLE IF EXISTS "public"."SOCRBASE";
-CREATE TABLE "public"."SOCRBASE" (
+DROP TABLE IF EXISTS "SOCRBASE";
+CREATE TABLE "SOCRBASE" (
 "id" SERIAL4 NOT NULL,
 "level" INT2,
 "scname" VARCHAR(10),

aore/templates/postgre/sphinx_query.sql (new file)
View File

@@ -0,0 +1,14 @@
WITH RECURSIVE PATH (cnt, aoid, aoguid, aolevel, fullname) AS (
SELECT ao.id as cnt, ao.aoid, ao.aoguid, ao.aolevel,
ao.shortname || ' ' || ao.formalname AS fullname
FROM "ADDROBJ" AS ao
WHERE aolevel = 1 AND livestatus = TRUE
UNION
SELECT child.id as cnt, child.aoid, child.aoguid, child.aolevel,
PATH.fullname || ', ' || child.shortname || ' ' || child.formalname AS fullname
FROM "ADDROBJ" AS child
, PATH
WHERE child.parentguid = PATH.aoguid AND livestatus = TRUE
)
SELECT * FROM PATH WHERE AOLEVEL NOT IN (1,3)
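In words: the recursive CTE starts at the top-level records (aolevel = 1, livestatus = TRUE), walks down through parentguid and concatenates "shortname formalname" fragments into a comma-separated fullname, then drops levels 1 and 3 from the final result. A quick way to eyeball its output outside the indexer, assuming a populated ADDROBJ table and the project root as working directory:

import psycopg2
from bottle import template
from aore.config import db

# Connect with the per-environment settings selected by config_type;
# psycopg2 expects the database name under dbname=.
conn = psycopg2.connect(host=db["host"], user=db["user"], password=db["password"],
                        dbname=db["database"], port=db["port"])
cur = conn.cursor()
cur.execute(template('aore/templates/postgre/sphinx_query.sql'))
for cnt, aoid, aoguid, aolevel, fullname in cur.fetchmany(10):
    print("{} {}".format(aolevel, fullname))
conn.close()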

aore/templates/sphinx/data.conf (new file)
View File

@@ -0,0 +1,41 @@
source src_{{index_name}}
{
type = pgsql
sql_host = {{db_host}}
sql_user = {{db_user}}
sql_pass = {{db_password}}
sql_db = {{db_name}}
sql_port = {{db_port}}
sql_query = {{!sql_query}}
sql_field_string = fullname
sql_attr_string = aoid
sql_attr_string = aoguid
sql_attr_uint = aolevel
}
index index_{{ index_name }}
{
docinfo = extern
morphology = stem_ru
min_stemming_len = 2
stopwords =
min_word_len = 2
charset_type = utf-8
min_prefix_len = 1
min_infix_len = 0
enable_star = 1
# strip html by default
html_strip = 1
ignore_chars = @, -
charset_table = 0..9, A..Z->a..z, _, a..z, \
U+0401->U+0435, U+0451->U+0435, \
U+410..U+42F->U+430..U+44F, U+430..U+44F
source = src_{{index_name}}
path = {{sphinx_var_path}}/data/index_{{index_name}}
}
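Note the bang in {{!sql_query}}: it disables bottle's default HTML-escaping, which matters because the embedded query contains single quotes and || concatenation that must pass through verbatim. A tiny demonstration of the difference:

from bottle import template

print(template("escaped: {{q}}", q="shortname || ' ' || formalname"))
# escaped: shortname || &#039; &#039; || formalname
print(template("raw: {{!q}}", q="shortname || ' ' || formalname"))
# raw: shortname || ' ' || formalname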

aore/templates/sphinx/sphinx.conf (new file)
View File

@@ -0,0 +1,63 @@
indexer
{
# memory limit, in bytes, kilobytes (16384K) or megabytes (256M)
# optional, default is 32M, max is 2047M, recommended is 256M to 1024M
mem_limit = 256M
# maximum IO calls per second (for I/O throttling)
# optional, default is 0 (unlimited)
#
# max_iops = 40
# maximum IO call size, bytes (for I/O throttling)
# optional, default is 0 (unlimited)
#
max_iosize = 524288
}
searchd
{
listen = 127.0.0.1:9312
# required by RT-indexes
workers = threads
# log file, searchd run info is logged here
# optional, default is 'searchd.log'
log = {{sphinx_var_path}}/log/searchd.log
# query log file, all search queries are logged here
# optional, default is empty (do not log queries)
query_log = {{sphinx_var_path}}/log/query.log
# client read timeout, seconds
# optional, default is 5
read_timeout = 5
# maximum amount of children to fork (concurrent searches to run)
# optional, default is 0 (unlimited)
max_children = 30
# PID file, searchd process ID file name
# mandatory
pid_file = {{sphinx_var_path}}/run/searchd.pid
# max amount of matches the daemon ever keeps in RAM, per-index
# WARNING, THERE'S ALSO PER-QUERY LIMIT, SEE SetLimits() API CALL
# default is 1000 (just like Google)
max_matches = 1000
# seamless rotate, prevents rotate stalls if precaching huge datasets
# optional, default is 1
seamless_rotate = 1
# whether to forcibly preopen all indexes on startup
# optional, default is 0 (do not preopen)
preopen_indexes = 0
# whether to unlink .old index copies on successful rotation.
# optional, default is 1 (do unlink)
unlink_old = 1
}

View File

@@ -3,6 +3,7 @@
 import optparse

 from aore.aoutils.aoupdater import AoUpdater
+from aore.miscutils.sphinx import produce_sphinx_config


 def update_base(xml_source, updates_count):
@@ -24,6 +25,8 @@ def main():
                  help="Count of updates to process, only for '--database update' option")
     p.add_option('--source', '-s', default="http",
                  help="Create/update DB from source. Value: \"http\" or absolute path to folder")
+    p.add_option('--sphinx-configure', '-c', action="store", type="string",
+                 help="Get Sphinx config. Value: /path/to/sphinx.conf")

     options, arguments = p.parse_args()
     if options.database:
@@ -34,6 +37,9 @@ def main():
         if options.database == "update":
             update_base(options.source, int(options.update_count))

+    if options.sphinx_configure:
+        produce_sphinx_config(options.sphinx_configure)
+

 if __name__ == '__main__':
     main()
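With this wired in, the Sphinx config can be produced from the same command-line entry point that creates and updates the database, e.g. by passing -c /path/to/sphinx.conf (value format as given in the option's help text); the indexer and sphinxsearch commands printed by produce_sphinx_config then finish the setup.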