Добавлена возможность обновления (создания) конкретной версии

This commit is contained in:
Jack Stdin 2016-01-17 21:08:01 +03:00
parent 4d565e5808
commit 0bd79b1311
14 changed files with 176 additions and 102 deletions

View File

@ -1,11 +1,28 @@
# py-fias # py-fias
WSGI application that can serve FIAS (Russian Address Object DB) WSGI application that can serve FIAS (Russian Address Object DB)
Простое приложение для работы с БД ФИАС, написано для Python 2.7 Простое приложение для работы с БД ФИАС, написано для Python 2.7
## Установка ## Установка
Протестирована работа на следующих ОС: Windows (8.1) и Debian Jessie Протестирована работа на следующих ОС: Windows (8.1) и Debian Jessie
Предполагается, что у Вас уже установлена БД PostgreSql версии 9.5, интерпретатор Python 2.7 ### Зависимости
Для работы приложения необходимо достаточное кол-во RAM (1Gb+) и 4.5Gb места на диске
(3-3.5Gb для скачивания архива с базой и 300-400Mb для индексов Sphinx). Также необходимы root права
(или Администратора, для OS Windows), для работы демона Sphinx и предварительной установки.
Предварительно нужно установить и настроить:
1. Python 2.7 [Windows](https://www.python.org/downloads/windows/), [Debian](https://www.python.org/downloads/source/)
(`sudo apt-get install python2.7 python2.7-dev`), pip
2. PostgreSql 9.5 и выше (из-за _ON CONFLICT_)
3. Sphinx 2.2.3 и новее (из-за синтаксиса _MAYBE_)
### Windows
1. Установить sphinxapi последней версии:
`python -m pip install https://github.com/Romamo/sphinxapi/zipball/master`
1. Windows
1. Установим *уйню...

View File

@ -1,39 +0,0 @@
# -*- coding: utf-8 -*-
from pysimplesoap.client import SoapClient
class Importer:
    """Thin client for the FIAS public download SOAP service.

    Produces generators of ``dict(intver=..., strver=..., url=...)`` entries
    describing either the latest full snapshot or the pending delta updates.
    """

    def __init__(self):
        """Create the SOAP client bound to the FIAS DownloadService endpoint."""
        self.client = SoapClient(
            location="http://fias.nalog.ru/WebServices/Public/DownloadService.asmx",
            action='http://fias.nalog.ru/WebServices/Public/DownloadService.asmx/',
            namespace="http://fias.nalog.ru/WebServices/Public/DownloadService.asmx",
            soap_ns='soap', trace=False, ns=False)

    def get_current_fias_version(self):
        """Return the version considered installed (currently a hard-coded stub)."""
        return 224  # TODO FIXIT

    def get_full(self):
        """Yield a single entry describing the latest full database snapshot.

        Yields:
            dict with ``intver`` (int), ``strver`` (str) and ``url`` (str,
            taken from ``FiasCompleteXmlUrl``).

        Raises:
            AssertionError: if the service response is empty, or if the
                offered version is not newer than the installed one.
        """
        response = self.client.GetLastDownloadFileInfo()
        assert response, "Response is null"

        info = response.GetLastDownloadFileInfoResponse.GetLastDownloadFileInfoResult
        offered_version = int(info.VersionId)
        # The database is up to date when the service offers nothing newer
        # than what is installed; the comparison is done on ints, exactly as
        # get_updates() does.
        assert offered_version > self.get_current_fias_version(), "DB is already up-to-date"

        yield dict(intver=offered_version, strver=str(info.TextVersion),
                   url=str(info.FiasCompleteXmlUrl))

    def get_updates(self):
        """Yield one entry per delta update newer than the installed version.

        Yields:
            dict with ``intver`` (int), ``strver`` (str) and ``url`` (str,
            taken from ``FiasDeltaXmlUrl``).

        Raises:
            AssertionError: if the service response is empty.
        """
        response = self.client.GetAllDownloadFileInfo()
        assert response, "Response is null"

        current_fias_version = self.get_current_fias_version()
        file_infos = response.GetAllDownloadFileInfoResponse.GetAllDownloadFileInfoResult.DownloadFileInfo
        for file_info in file_infos:
            version = int(file_info.VersionId)
            if version > current_fias_version:
                yield dict(intver=version, strver=str(file_info.TextVersion),
                           url=str(file_info.FiasDeltaXmlUrl))

17
aore/fias/fiasfactory.py Normal file
View File

@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-
from aore.fias.search import SphinxSearch
class FiasFactory:
    """Facade over the Sphinx-backed FIAS address search."""

    def __init__(self):
        """Create the underlying searcher."""
        self.searcher = SphinxSearch()

    def find(self, text, strong=False, out_format="simple"):
        """Search address objects matching ``text``.

        Args:
            text: the search string.
            strong: strict search when true; otherwise a "soft" search that
                tolerates mistakes and typos.
            out_format: "full" (detailed, for every sub-item) or "simple"
                (only the string and AOID). Accepted for API stability but
                not applied here: the searcher's results are returned as-is.

        Returns:
            The value returned by ``self.searcher.find(text, strong)``, or an
            empty list when the search fails for any reason.
        """
        try:
            return self.searcher.find(text, strong)
        except (KeyboardInterrupt, SystemExit):
            # Never turn an interpreter shutdown request into "no results".
            raise
        except BaseException:
            # Kept broad on purpose: this is a best-effort lookup, and the
            # search layer may signal unprocessable input with a plain
            # BaseException, which `except Exception` would not catch.
            return []

View File

@ -4,7 +4,7 @@ import re
import Levenshtein import Levenshtein
import psycopg2 import psycopg2
import aore.sphinxapi as sphinxapi import sphinxapi
from aore.config import db as dbparams, sphinx_index_sugg, sphinx_index_addjobj from aore.config import db as dbparams, sphinx_index_sugg, sphinx_index_addjobj
from aore.dbutils.dbimpl import DBImpl from aore.dbutils.dbimpl import DBImpl
@ -80,12 +80,12 @@ class SphinxSearch:
phrase = unicode(phrase).replace('-', '').replace('@', '').lower() phrase = unicode(phrase).replace('-', '').replace('@', '').lower()
return re.split(r"[ ,:.#$]+", phrase) return re.split(r"[ ,:.#$]+", phrase)
def __add_word_variations(self, word_entry): def __add_word_variations(self, word_entry, strong):
if word_entry.MT_MANY_SUGG: if word_entry.MT_MANY_SUGG and not strong:
suggs = self.__get_suggest(word_entry.word, self.rating_limit_soft, 6) suggs = self.__get_suggest(word_entry.word, self.rating_limit_soft, 6)
for suggestion in suggs: for suggestion in suggs:
word_entry.add_variation(suggestion[0]) word_entry.add_variation(suggestion[0])
if word_entry.MT_SOME_SUGG: if word_entry.MT_SOME_SUGG and not strong:
suggs = self.__get_suggest(word_entry.word, self.rating_limit_hard, 3) suggs = self.__get_suggest(word_entry.word, self.rating_limit_hard, 3)
for suggestion in suggs: for suggestion in suggs:
word_entry.add_variation(suggestion[0]) word_entry.add_variation(suggestion[0])
@ -96,16 +96,18 @@ class SphinxSearch:
if word_entry.MT_ADD_SOCR: if word_entry.MT_ADD_SOCR:
word_entry.add_variation_socr() word_entry.add_variation_socr()
def __get_word_entries(self, words): def __get_word_entries(self, words, strong):
for word in words: for word in words:
if word != '': if word != '':
we = WordEntry(self.db, word) we = WordEntry(self.db, word)
self.__add_word_variations(we) self.__add_word_variations(we, strong)
if we.get_variations() == "()":
raise BaseException("Cannot process sentence.")
yield we yield we
def find(self, text): def find(self, text, strong):
words = self.__split_phrase(text) words = self.__split_phrase(text)
word_entries = self.__get_word_entries(words) word_entries = self.__get_word_entries(words, strong)
sentence = "{}".format(" MAYBE ".join(x.get_variations() for x in word_entries)) sentence = "{}".format(" MAYBE ".join(x.get_variations() for x in word_entries))
self.__configure(sphinx_index_addjobj) self.__configure(sphinx_index_addjobj)
@ -114,4 +116,4 @@ class SphinxSearch:
results = [] results = []
for ma in rs['matches']: for ma in rs['matches']:
results.append([ma['attrs']['aoid'], ma['attrs']['fullname'], ma['weight']]) results.append([ma['attrs']['aoid'], ma['attrs']['fullname'], ma['weight']])
print results return results

View File

@ -5,9 +5,9 @@ import os
from bottle import template from bottle import template
from aore.aoutils.aoxmltableentry import AoXmlTableEntry from aore.updater.aoxmltableentry import AoXmlTableEntry
from aore.updater.dbhandler import DbHandler
from aore.config import db as dbconfig, sphinx_index_addjobj, sphinx_var_dir, trashfolder, sphinx_index_sugg from aore.config import db as dbconfig, sphinx_index_addjobj, sphinx_var_dir, trashfolder, sphinx_index_sugg
from aore.dbutils.dbhandler import DbHandler
from trigram import trigram from trigram import trigram

View File

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import os import os
from aore.aoutils.aoxmltableentry import AoXmlTableEntry from aore.updater.aoxmltableentry import AoXmlTableEntry
from aore.config import trashfolder from aore.config import trashfolder
from aore.dbutils.dbschemas import db_shemas from aore.dbutils.dbschemas import db_shemas
from xmlparser import XMLParser from xmlparser import XMLParser
@ -18,18 +18,12 @@ class AoDataParser:
self.pagesize = pagesize self.pagesize = pagesize
self.currentpage = 0 self.currentpage = 0
self.counter = 0 self.counter = 0
self.addrobj_filter = self.datasource.table_name == 'ADDROBJ' and self.datasource.operation_type == AoXmlTableEntry.OperationType.create
self.base_filename = "" self.base_filename = ""
self.csv_file = None self.csv_file = None
self.data_bereit_callback = None self.data_bereit_callback = None
def import_update(self, attr): def import_update(self, attr):
# Addrobj anvanced filter
if self.addrobj_filter:
if attr['ACTSTATUS'] == '0' or 'NEXTID' in attr:
return
if self.counter > self.pagesize: if self.counter > self.pagesize:
# Send old file to DB engine # Send old file to DB engine
if self.csv_file: if self.csv_file:

View File

@ -5,7 +5,7 @@ import logging
import psycopg2 import psycopg2
from bottle import template from bottle import template
from aore.aoutils.aoxmltableentry import AoXmlTableEntry from aore.updater.aoxmltableentry import AoXmlTableEntry
from aore.config import db as dbparams from aore.config import db as dbparams
from aore.dbutils.dbimpl import DBImpl from aore.dbutils.dbimpl import DBImpl
from aore.dbutils.dbschemas import db_shemas from aore.dbutils.dbschemas import db_shemas

View File

@ -0,0 +1,26 @@
# -*- coding: utf-8 -*-
from pysimplesoap.client import SoapClient
class SoapReceiver:
    """Client for the FIAS public download SOAP service.

    Exposes the installed version number and the list of downloadable
    versions together with their delta and complete archive URLs.
    """

    def __init__(self):
        """Bind a SOAP client to the FIAS DownloadService endpoint."""
        service_url = "http://fias.nalog.ru/WebServices/Public/DownloadService.asmx"
        self.client = SoapClient(
            location=service_url,
            # The SOAP action is the service URL with a trailing slash.
            action=service_url + '/',
            namespace=service_url,
            soap_ns='soap',
            trace=False,
            ns=False)

    def get_current_fias_version(self):
        """Return the installed FIAS version (hard-coded for now)."""
        # TODO: replace this stub with the real installed version.
        installed_version = 224
        return installed_version

    def get_update_list(self):
        """Yield one dict per version listed by the service.

        Each dict has the keys ``intver`` (int), ``strver`` (str),
        ``delta_url`` (str) and ``complete_url`` (str).

        Raises:
            AssertionError: if the service returns an empty response.
        """
        response = self.client.GetAllDownloadFileInfo()
        assert response, "Response is null"

        result = response.GetAllDownloadFileInfoResponse.GetAllDownloadFileInfoResult
        for entry in result.DownloadFileInfo:
            yield {
                'intver': int(entry.VersionId),
                'strver': str(entry.TextVersion),
                'delta_url': str(entry.FiasDeltaXmlUrl),
                'complete_url': str(entry.FiasCompleteXmlUrl),
            }

View File

@ -3,15 +3,15 @@
import logging import logging
from os import walk, path from os import walk, path
from aore.aoutils.aodataparser import AoDataParser from aore.updater.aodataparser import AoDataParser
from aore.aoutils.aorar import AoRar from aore.updater.aorar import AoRar
from aore.aoutils.aoxmltableentry import AoXmlTableEntry from aore.updater.aoxmltableentry import AoXmlTableEntry
from aore.aoutils.importer import Importer from aore.updater.dbhandler import DbHandler
from aore.dbutils.dbhandler import DbHandler from aore.updater.soapreceiver import SoapReceiver
from aore.dbutils.dbschemas import allowed_tables from aore.dbutils.dbschemas import allowed_tables
class AoUpdater: class Updater:
# Source: "http", directory (as a full path to unpacked xmls) # Source: "http", directory (as a full path to unpacked xmls)
def __init__(self, source="http"): def __init__(self, source="http"):
self.db_handler = DbHandler() self.db_handler = DbHandler()
@ -31,7 +31,7 @@ class AoUpdater:
def __get_updates_from_folder(self, foldername): def __get_updates_from_folder(self, foldername):
# TODO: Вычислять версию, если берем данные из каталога # TODO: Вычислять версию, если берем данные из каталога
yield dict(intver=0, textver="Unknown", url=foldername) yield dict(intver=0, textver="Unknown", delta_url=foldername, complete_url=foldername)
def __get_updates_from_rar(self, url): def __get_updates_from_rar(self, url):
aorar = AoRar() aorar = AoRar()
@ -39,14 +39,11 @@ class AoUpdater:
for table_entry in aorar.get_table_entries(fname, allowed_tables): for table_entry in aorar.get_table_entries(fname, allowed_tables):
yield table_entry yield table_entry
def __init_update_entries(self, full_base): def __init_update_entries(self, updates_generator):
if self.mode == "http": if self.mode == "http":
assert updates_generator
self.tablelist_generator = self.__get_updates_from_rar self.tablelist_generator = self.__get_updates_from_rar
imp = Importer() self.updalist_generator = updates_generator
if full_base:
self.updalist_generator = imp.get_full()
else:
self.updalist_generator = imp.get_updates()
else: else:
assert path.isdir(self.mode), "Invalid directory {}".format(self.mode) assert path.isdir(self.mode), "Invalid directory {}".format(self.mode)
self.updalist_generator = self.__get_updates_from_folder(self.mode) self.updalist_generator = self.__get_updates_from_folder(self.mode)
@ -56,12 +53,13 @@ class AoUpdater:
aoparser = AoDataParser(table_xmlentry, chunck_size) aoparser = AoDataParser(table_xmlentry, chunck_size)
aoparser.parse(lambda x, y: self.db_handler.bulk_csv(operation_type, table_xmlentry.table_name, x, y)) aoparser.parse(lambda x, y: self.db_handler.bulk_csv(operation_type, table_xmlentry.table_name, x, y))
def create(self): def create(self, updates_generator):
self.__init_update_entries(True) self.__init_update_entries(updates_generator)
self.db_handler.pre_create() self.db_handler.pre_create()
for update_entry in self.updalist_generator: for update_entry in self.updalist_generator:
for table_entry in self.tablelist_generator(update_entry['url']): logging.info("Processing update #{}".format(update_entry['intver']))
for table_entry in self.tablelist_generator(update_entry['complete_url']):
if table_entry.operation_type == AoXmlTableEntry.OperationType.update: if table_entry.operation_type == AoXmlTableEntry.OperationType.update:
table_entry.operation_type = AoXmlTableEntry.OperationType.create table_entry.operation_type = AoXmlTableEntry.OperationType.create
self.process_single_entry(table_entry.operation_type, table_entry) self.process_single_entry(table_entry.operation_type, table_entry)
@ -70,18 +68,13 @@ class AoUpdater:
logging.info("Create success") logging.info("Create success")
def update(self, count=1): def update(self, updates_generator):
self.__init_update_entries(False) self.__init_update_entries(updates_generator)
self.db_handler.pre_update() self.db_handler.pre_update()
counter = 0 for update_entry in self.updates_generator:
for update_entry in self.updalist_generator: logging.info("Processing update #{}".format(update_entry['intver']))
counter += 1 for table_entry in self.tablelist_generator(update_entry['delta_url']):
if counter > count:
logging.warning("Maximum count of updates ({}) are processed - exit".format(count))
break
for table_entry in self.tablelist_generator(update_entry['url']):
self.process_single_entry(table_entry.operation_type, table_entry) self.process_single_entry(table_entry.operation_type, table_entry)
logging.info("Update success") logging.info("Update success")

View File

@ -2,30 +2,82 @@
import optparse import optparse
from aore.aoutils.aoupdater import AoUpdater
from aore.miscutils.sphinx import SphinxHelper
from aore.fias.search import SphinxSearch from aore.fias.search import SphinxSearch
from aore.miscutils.sphinx import SphinxHelper
from aore.updater.updater import Updater
from aore.updater.soapreceiver import SoapReceiver
def print_fias_versions():
    """Print the installed FIAS version and the versions listed by the service.

    One line is printed per entry returned by ``SoapReceiver.get_update_list()``;
    the entry whose number equals the installed version is prefixed with ``*``.
    """
    imp = SoapReceiver()
    current_version = imp.get_current_fias_version()
    all_versions = imp.get_update_list()

    print("Installed version: {}".format(current_version))
    print("Avaliable updates:")
    print("Number\t\tDate")
    for upd in all_versions:
        mark_current = '*' if int(upd['intver']) == current_version else ' '
        # Single-argument print(...) behaves the same as the print statement
        # on Python 2 and matches the other print calls in this function.
        print("{}{}\t\t{}".format(mark_current, upd['intver'], upd['strver']))
def parse_update_str(updates_str):
    """Parse a user-supplied version selector into a list of version numbers.

    Accepted forms (whitespace and letter case are ignored):
        "all"           -> None (no restriction)
        "111"           -> [111]
        "111-113"       -> [111, 112, 113] (both bounds included)
        "111,222,333"   -> [111, 222, 333]
    Forms may be combined with commas; empty entries (e.g. a trailing comma)
    are skipped.

    Args:
        updates_str: the selector string.

    Returns:
        None for "all", otherwise a list of ints in the order given.

    Raises:
        ValueError: if an entry is not a number or a "low-high" range.
    """
    normalized = updates_str.lower().replace(' ', '')
    if normalized == "all":
        return None

    out_list = []
    for u_entry in normalized.split(','):
        if not u_entry:
            continue
        if '-' in u_entry:
            u_range = u_entry.split('-')
            # range() excludes its stop value, so add one to keep the upper
            # bound the user typed.
            out_list.extend(range(int(u_range[0]), int(u_range[1]) + 1))
        else:
            out_list.append(int(u_entry))

    return out_list
def get_allowed_updates(updates_str, mode="create"):
    """Yield the service version entries that should be processed.

    Args:
        updates_str: version selector understood by ``parse_update_str``
            ("all", "111", "111-222", "111,222,333").
        mode: "create" to build a database from one snapshot; any other
            value (e.g. "update") selects a series of updates. Callers that
            perform an update must pass it explicitly, because the default
            is "create".

    Yields:
        Entries from ``SoapReceiver.get_update_list()``:
        - "create" without an explicit version: only the entry with the
          highest ``intver`` (the latest snapshot);
        - otherwise: every entry newer than the installed version and, when
          versions were given explicitly, listed among them, in ascending
          ``intver`` order.

    Raises:
        AssertionError: in "create" mode when more than one version is given.
    """
    imp = SoapReceiver()
    current_version = imp.get_current_fias_version()
    all_versions = list(imp.get_update_list())

    user_defined_list = parse_update_str(updates_str)

    if mode == "create":
        if not user_defined_list:
            # No explicit version: use the newest snapshot and stop here.
            # Without this return the code would fall through and call
            # len() on None.
            if all_versions:
                yield max(all_versions, key=lambda x: x['intver'])
            return
        # A database is created from exactly one snapshot.
        assert len(user_defined_list) == 1, "Only one version can be specified for create"

    # NOTE(review): an explicitly requested version that is not newer than
    # the installed one is filtered out here as well - confirm this is the
    # intended behaviour for "create".
    out_list = [
        uv for uv in all_versions
        if uv['intver'] > current_version
        and (not user_defined_list or uv['intver'] in user_defined_list)
    ]
    out_list.sort(key=lambda x: x['intver'])

    for ol_entry in out_list:
        yield ol_entry
def main(): def main():
# Parse options # Parse options
p = optparse.OptionParser() p = optparse.OptionParser()
p.add_option('--database', '-b', action="store", type="string", p.add_option('--database', '-b', action="store", type="string",
help="Manage database. Value: create - create new DB, update - update existing DB without loose the data") help="Manage database. Values: "
p.add_option('--update-count', '-u', default=1, type="int", "create - create new DB, "
help="Count of updates to process, only for '--database update' option") "update - update existing DB without loose the data")
p.add_option('--update-version', '-u', default="all", type="string",
help="Valid for updating via HTTP. "
"Versions of updates to process. Can be 111 or 111-222 or 111,222,333."
"For '--database-create' only one value is necessary. If not specified, "
"all updates will be processed (for '--database update') or last DB snapshot "
"(for '--database create')")
p.add_option('--show-versions', '-v', action="store_true", dest="show_versions", default=False,
help="Show allowed fias versions")
p.add_option('--source', '-s', default="http", p.add_option('--source', '-s', default="http",
help="Create/update DB from source. Value: \"http\" or absolute path to folder") help="Create/update DB from source. Value: 'http' or absolute path to folder containing XMLs")
p.add_option('--sphinx-configure', '-c', action="store_true", dest="sphinx", default="False", p.add_option('--sphinx-configure', '-c', action="store_true", dest="sphinx", default="False",
help="Configure sphinx. Creates sphinx.conf specified in '--output-conf'") help="Configure sphinx. Creates sphinx.conf specified in '--output-conf'")
p.add_option('--indexer-path', '-i', p.add_option('--indexer-path', '-i',
@ -37,14 +89,25 @@ def main():
options, arguments = p.parse_args() options, arguments = p.parse_args()
# Show FIAS updates
if options.show_versions:
print_fias_versions()
return
# Manage DB # Manage DB
if options.database: if options.database:
# create new database # create new database
aoupdater = Updater(options.source)
allowed_updates = None
if options.source == "http":
allowed_updates = get_allowed_updates(options.update_version)
if options.database == "create": if options.database == "create":
create_base(options.source) aoupdater.create(allowed_updates)
# update database # update database
if options.database == "update": if options.database == "update":
update_base(options.source, int(options.update_count)) aoupdater.update(allowed_updates)
# Manage Sphinx # Manage Sphinx
if options.sphinx and options.indexer_path and options.output_conf: if options.sphinx and options.indexer_path and options.output_conf:
@ -54,7 +117,8 @@ def main():
# 4 Debug purposes.. # 4 Debug purposes..
if options.test: if options.test:
sph = SphinxSearch() sph = SphinxSearch()
sph.find('кридовая паскаул') sph.find('кедровая пасраул')
if __name__ == '__main__': if __name__ == '__main__':
main() main()