Добавлена возможность обновления (создания) конкретной версии

This commit is contained in:
Jack Stdin 2016-01-17 21:08:01 +03:00
parent 4d565e5808
commit 0bd79b1311
14 changed files with 176 additions and 102 deletions

View File

@ -1,11 +1,28 @@
# py-fias
WSGI application that can serve FIAS (Russian Address Object DB)
Простое приложение для работы с БД ФИАС, написано для Python 2.7
## Установка
Протестирована работа на следующих ОС: Windows (8.1) и Debian Jessie
Предполагается, что у Вас уже установлены СУБД PostgreSQL версии 9.5 и интерпретатор Python 2.7
### Зависимости
1. Windows
1. Установим *уйню...
Для работы приложения необходимо достаточное кол-во RAM (1Gb+) и 4.5Gb места на диске
(3-3.5Gb для скачивания архива с базой и 300-400Mb для индексов Sphinx). Также необходимы root права
(или Администратора, для OS Windows), для работы демона Sphinx и предварительной установки.
Предварительно нужно установить и настроить:
1. Python 2.7 [Windows](https://www.python.org/downloads/windows/), [Debian](https://www.python.org/downloads/source/)
(`sudo apt-get install python2.7 python2.7-dev`), pip
2. PostgreSQL 9.5 и выше (из-за _ON CONFLICT_)
3. Sphinx 2.2.3 и новее (из-за синтаксиса _MAYBE_)
### Windows
1. Установить sphinxapi последней версии:
`python -m pip install https://github.com/Romamo/sphinxapi/zipball/master`

View File

@ -1,39 +0,0 @@
# -*- coding: utf-8 -*-
from pysimplesoap.client import SoapClient
class Importer:
    """Thin client for the FIAS DownloadService SOAP endpoint.

    Both public generators yield dicts with the keys ``intver`` (int),
    ``strver`` (str) and ``url`` (str).
    """

    def __init__(self):
        self.client = SoapClient(
            location="http://fias.nalog.ru/WebServices/Public/DownloadService.asmx",
            action='http://fias.nalog.ru/WebServices/Public/DownloadService.asmx/',
            namespace="http://fias.nalog.ru/WebServices/Public/DownloadService.asmx",
            soap_ns='soap', trace=False, ns=False)

    def get_current_fias_version(self):
        """Return the locally installed FIAS version (currently hard-coded)."""
        return 224  # TODO FIXIT

    def get_full(self):
        """Yield the descriptor of the latest complete FIAS snapshot.

        Raises:
            AssertionError: if the service returns an empty response or the
                remote version is not newer than the installed one.
        """
        response = self.client.GetLastDownloadFileInfo()
        assert response, "Response is null"

        info = response.GetLastDownloadFileInfoResponse.GetLastDownloadFileInfoResult
        remote_version = int(info.VersionId)
        # The DB is up to date when the remote version is NOT newer, so the
        # download may proceed only if remote > installed.  The comparison is
        # done on ints, matching get_updates().
        assert remote_version > self.get_current_fias_version(), "DB is already up-to-date"

        yield dict(intver=remote_version, strver=str(info.TextVersion),
                   url=str(info.FiasCompleteXmlUrl))

    def get_updates(self):
        """Yield descriptors of every delta update newer than the installed version.

        Raises:
            AssertionError: if the service returns an empty response.
        """
        response = self.client.GetAllDownloadFileInfo()
        assert response, "Response is null"

        current_fias_version = self.get_current_fias_version()
        result = response.GetAllDownloadFileInfoResponse.GetAllDownloadFileInfoResult
        for info in result.DownloadFileInfo:
            version = int(info.VersionId)
            if version > current_fias_version:
                yield dict(intver=version, strver=str(info.TextVersion),
                           url=str(info.FiasDeltaXmlUrl))

17
aore/fias/fiasfactory.py Normal file
View File

@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-
from aore.fias.search import SphinxSearch
class FiasFactory:
    """Facade over the Sphinx-based address search."""

    def __init__(self):
        self.searcher = SphinxSearch()

    def find(self, text, strong=False, out_format="simple"):
        """Search address objects matching ``text``.

        Args:
            text: search string.
            strong: strict search when True; otherwise a "soft" search that
                tolerates mistakes and typos.
            out_format: "full" (detailed, per sub-item) or "simple" (only the
                string and AOID).  Accepted for API compatibility; this method
                does not act on it yet.

        Returns:
            Whatever the searcher returns for the query, or an empty list if
            the search failed.
        """
        try:
            # Hand the matches back to the caller instead of dropping them.
            return self.searcher.find(text, strong)
        except (KeyboardInterrupt, SystemExit):
            # Never swallow interpreter-exit requests.
            raise
        except BaseException:
            # The searcher reports an unprocessable phrase by raising a bare
            # BaseException, so ``except Exception`` would not be enough here.
            return []

View File

@ -4,7 +4,7 @@ import re
import Levenshtein
import psycopg2
import aore.sphinxapi as sphinxapi
import sphinxapi
from aore.config import db as dbparams, sphinx_index_sugg, sphinx_index_addjobj
from aore.dbutils.dbimpl import DBImpl
@ -80,12 +80,12 @@ class SphinxSearch:
phrase = unicode(phrase).replace('-', '').replace('@', '').lower()
return re.split(r"[ ,:.#$]+", phrase)
def __add_word_variations(self, word_entry):
if word_entry.MT_MANY_SUGG:
def __add_word_variations(self, word_entry, strong):
if word_entry.MT_MANY_SUGG and not strong:
suggs = self.__get_suggest(word_entry.word, self.rating_limit_soft, 6)
for suggestion in suggs:
word_entry.add_variation(suggestion[0])
if word_entry.MT_SOME_SUGG:
if word_entry.MT_SOME_SUGG and not strong:
suggs = self.__get_suggest(word_entry.word, self.rating_limit_hard, 3)
for suggestion in suggs:
word_entry.add_variation(suggestion[0])
@ -96,16 +96,18 @@ class SphinxSearch:
if word_entry.MT_ADD_SOCR:
word_entry.add_variation_socr()
def __get_word_entries(self, words):
def __get_word_entries(self, words, strong):
for word in words:
if word != '':
we = WordEntry(self.db, word)
self.__add_word_variations(we)
self.__add_word_variations(we, strong)
if we.get_variations() == "()":
raise BaseException("Cannot process sentence.")
yield we
def find(self, text):
def find(self, text, strong):
words = self.__split_phrase(text)
word_entries = self.__get_word_entries(words)
word_entries = self.__get_word_entries(words, strong)
sentence = "{}".format(" MAYBE ".join(x.get_variations() for x in word_entries))
self.__configure(sphinx_index_addjobj)
@ -114,4 +116,4 @@ class SphinxSearch:
results = []
for ma in rs['matches']:
results.append([ma['attrs']['aoid'], ma['attrs']['fullname'], ma['weight']])
print results
return results

View File

@ -5,9 +5,9 @@ import os
from bottle import template
from aore.aoutils.aoxmltableentry import AoXmlTableEntry
from aore.updater.aoxmltableentry import AoXmlTableEntry
from aore.updater.dbhandler import DbHandler
from aore.config import db as dbconfig, sphinx_index_addjobj, sphinx_var_dir, trashfolder, sphinx_index_sugg
from aore.dbutils.dbhandler import DbHandler
from trigram import trigram

View File

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
import os
from aore.aoutils.aoxmltableentry import AoXmlTableEntry
from aore.updater.aoxmltableentry import AoXmlTableEntry
from aore.config import trashfolder
from aore.dbutils.dbschemas import db_shemas
from xmlparser import XMLParser
@ -18,18 +18,12 @@ class AoDataParser:
self.pagesize = pagesize
self.currentpage = 0
self.counter = 0
self.addrobj_filter = self.datasource.table_name == 'ADDROBJ' and self.datasource.operation_type == AoXmlTableEntry.OperationType.create
self.base_filename = ""
self.csv_file = None
self.data_bereit_callback = None
def import_update(self, attr):
# Addrobj anvanced filter
if self.addrobj_filter:
if attr['ACTSTATUS'] == '0' or 'NEXTID' in attr:
return
if self.counter > self.pagesize:
# Send old file to DB engine
if self.csv_file:

View File

@ -5,7 +5,7 @@ import logging
import psycopg2
from bottle import template
from aore.aoutils.aoxmltableentry import AoXmlTableEntry
from aore.updater.aoxmltableentry import AoXmlTableEntry
from aore.config import db as dbparams
from aore.dbutils.dbimpl import DBImpl
from aore.dbutils.dbschemas import db_shemas

View File

@ -0,0 +1,26 @@
# -*- coding: utf-8 -*-
from pysimplesoap.client import SoapClient
class SoapReceiver:
    """Client for the FIAS DownloadService SOAP endpoint."""

    def __init__(self):
        self.client = SoapClient(
            soap_ns='soap', trace=False, ns=False,
            location="http://fias.nalog.ru/WebServices/Public/DownloadService.asmx",
            action='http://fias.nalog.ru/WebServices/Public/DownloadService.asmx/',
            namespace="http://fias.nalog.ru/WebServices/Public/DownloadService.asmx")

    def get_current_fias_version(self):
        """Return the locally installed FIAS version (currently hard-coded)."""
        return 224  # TODO FIXIT

    def get_update_list(self):
        """Yield one descriptor per FIAS version published by the service.

        Each descriptor is a dict with the keys ``intver`` (int), ``strver``
        (str), ``delta_url`` (str) and ``complete_url`` (str).

        Raises:
            AssertionError: if the service returns an empty response.
        """
        response = self.client.GetAllDownloadFileInfo()
        assert response, "Response is null"

        listing = response.GetAllDownloadFileInfoResponse.GetAllDownloadFileInfoResult
        for file_info in listing.DownloadFileInfo:
            yield {
                'intver': int(file_info.VersionId),
                'strver': str(file_info.TextVersion),
                'delta_url': str(file_info.FiasDeltaXmlUrl),
                'complete_url': str(file_info.FiasCompleteXmlUrl),
            }

View File

@ -3,15 +3,15 @@
import logging
from os import walk, path
from aore.aoutils.aodataparser import AoDataParser
from aore.aoutils.aorar import AoRar
from aore.aoutils.aoxmltableentry import AoXmlTableEntry
from aore.aoutils.importer import Importer
from aore.dbutils.dbhandler import DbHandler
from aore.updater.aodataparser import AoDataParser
from aore.updater.aorar import AoRar
from aore.updater.aoxmltableentry import AoXmlTableEntry
from aore.updater.dbhandler import DbHandler
from aore.updater.soapreceiver import SoapReceiver
from aore.dbutils.dbschemas import allowed_tables
class AoUpdater:
class Updater:
# Source: "http", directory (as a full path to unpacked xmls)
def __init__(self, source="http"):
self.db_handler = DbHandler()
@ -31,7 +31,7 @@ class AoUpdater:
def __get_updates_from_folder(self, foldername):
# TODO: Вычислять версию, если берем данные из каталога
yield dict(intver=0, textver="Unknown", url=foldername)
yield dict(intver=0, textver="Unknown", delta_url=foldername, complete_url=foldername)
def __get_updates_from_rar(self, url):
aorar = AoRar()
@ -39,14 +39,11 @@ class AoUpdater:
for table_entry in aorar.get_table_entries(fname, allowed_tables):
yield table_entry
def __init_update_entries(self, full_base):
def __init_update_entries(self, updates_generator):
if self.mode == "http":
assert updates_generator
self.tablelist_generator = self.__get_updates_from_rar
imp = Importer()
if full_base:
self.updalist_generator = imp.get_full()
else:
self.updalist_generator = imp.get_updates()
self.updalist_generator = updates_generator
else:
assert path.isdir(self.mode), "Invalid directory {}".format(self.mode)
self.updalist_generator = self.__get_updates_from_folder(self.mode)
@ -56,12 +53,13 @@ class AoUpdater:
aoparser = AoDataParser(table_xmlentry, chunck_size)
aoparser.parse(lambda x, y: self.db_handler.bulk_csv(operation_type, table_xmlentry.table_name, x, y))
def create(self):
self.__init_update_entries(True)
def create(self, updates_generator):
self.__init_update_entries(updates_generator)
self.db_handler.pre_create()
for update_entry in self.updalist_generator:
for table_entry in self.tablelist_generator(update_entry['url']):
logging.info("Processing update #{}".format(update_entry['intver']))
for table_entry in self.tablelist_generator(update_entry['complete_url']):
if table_entry.operation_type == AoXmlTableEntry.OperationType.update:
table_entry.operation_type = AoXmlTableEntry.OperationType.create
self.process_single_entry(table_entry.operation_type, table_entry)
@ -70,18 +68,13 @@ class AoUpdater:
logging.info("Create success")
def update(self, updates_generator):
    """Apply delta updates to an existing database.

    Args:
        updates_generator: iterable of update descriptors (dicts providing
            at least ``intver`` and ``delta_url``).  Used when the updater
            works over HTTP; in directory mode the entries come from the
            configured folder instead.
    """
    self.__init_update_entries(updates_generator)
    self.db_handler.pre_update()

    # __init_update_entries() stores the selected entries in
    # self.updalist_generator (the same attribute create() iterates);
    # there is no self.updates_generator attribute.
    for update_entry in self.updalist_generator:
        logging.info("Processing update #{}".format(update_entry['intver']))
        for table_entry in self.tablelist_generator(update_entry['delta_url']):
            self.process_single_entry(table_entry.operation_type, table_entry)

    logging.info("Update success")

View File

@ -2,30 +2,82 @@
import optparse
from aore.aoutils.aoupdater import AoUpdater
from aore.miscutils.sphinx import SphinxHelper
from aore.fias.search import SphinxSearch
from aore.miscutils.sphinx import SphinxHelper
from aore.updater.updater import Updater
from aore.updater.soapreceiver import SoapReceiver
def update_base(xml_source, updates_count):
    """Apply ``updates_count`` updates to the database from ``xml_source``."""
    AoUpdater(xml_source).update(updates_count)
def print_fias_versions():
    """Print the installed FIAS version and every version the service offers.

    The installed version, if present in the list, is marked with ``*``.
    """
    receiver = SoapReceiver()
    installed = receiver.get_current_fias_version()
    available = receiver.get_update_list()

    print("Installed version: {}".format(installed))
    print("Avaliable updates:")
    print("Number\t\tDate")
    for entry in available:
        marker = '*' if int(entry['intver']) == installed else ' '
        # Single-argument print(...) gives identical output on Python 2 and 3.
        print("{}{}\t\t{}".format(marker, entry['intver'], entry['strver']))
def create_base(xml_source):
    """Create a new database from ``xml_source``."""
    AoUpdater(xml_source).create()
def parse_update_str(updates_str):
    """Parse a version selector such as "111", "111-222" or "111,222,333".

    Args:
        updates_str: comma-separated list of version numbers and/or
            ``first-last`` ranges, or the word "all".  Spaces and letter
            case are ignored.

    Returns:
        None for "all", otherwise a list of ints.  Ranges are inclusive on
        both ends, so "111-113" gives [111, 112, 113].

    Raises:
        ValueError: if an entry is not a valid integer or range.
    """
    normalized = updates_str.lower().replace(' ', '')
    if normalized == "all":
        return None

    versions = []
    for entry in normalized.split(','):
        if not entry:
            # Tolerate stray commas ("111,,222", "111,").
            continue
        if '-' in entry:
            bounds = entry.split('-')
            first, last = sorted((int(bounds[0]), int(bounds[1])))
            # range() excludes its stop value; +1 keeps the upper bound.
            versions.extend(range(first, last + 1))
        else:
            versions.append(int(entry))
    return versions
def get_allowed_updates(updates_str, mode="create"):
    """Yield the FIAS update descriptors selected by ``updates_str``.

    Args:
        updates_str: version selector understood by parse_update_str().
        mode: "create" or "update".  In "create" mode at most one version
            may be requested; with no explicit version the newest published
            snapshot is yielded.

    Yields:
        Descriptor dicts as produced by SoapReceiver.get_update_list(),
        ordered by ``intver`` ascending.

    Raises:
        ValueError: if more than one version is requested in "create" mode.
    """
    imp = SoapReceiver()
    current_version = imp.get_current_fias_version()
    all_versions = list(imp.get_update_list())
    user_defined_list = parse_update_str(updates_str)

    if mode == "create":
        if not user_defined_list:
            # Newest snapshot by version number; do not rely on the order in
            # which the service lists versions, and yield nothing if the
            # service returned no versions at all.
            if all_versions:
                yield max(all_versions, key=lambda x: x['intver'])
            return
        if len(user_defined_list) != 1:
            raise ValueError("Exactly one version must be specified for 'create' mode")

    # NOTE(review): the "newer than installed" filter is kept for "create"
    # with an explicit version as well -- confirm that this is intended.
    out_list = [
        uv for uv in all_versions
        if uv['intver'] > current_version
        and (not user_defined_list or uv['intver'] in user_defined_list)
    ]
    out_list.sort(key=lambda x: x['intver'])
    for ol_entry in out_list:
        yield ol_entry
def main():
# Parse options
p = optparse.OptionParser()
p.add_option('--database', '-b', action="store", type="string",
help="Manage database. Value: create - create new DB, update - update existing DB without loose the data")
p.add_option('--update-count', '-u', default=1, type="int",
help="Count of updates to process, only for '--database update' option")
help="Manage database. Values: "
"create - create new DB, "
"update - update existing DB without loose the data")
p.add_option('--update-version', '-u', default="all", type="string",
help="Valid for updating via HTTP. "
"Versions of updates to process. Can be 111 or 111-222 or 111,222,333."
"For '--database-create' only one value is necessary. If not specified, "
"all updates will be processed (for '--database update') or last DB snapshot "
"(for '--database create')")
p.add_option('--show-versions', '-v', action="store_true", dest="show_versions", default=False,
help="Show allowed fias versions")
p.add_option('--source', '-s', default="http",
help="Create/update DB from source. Value: \"http\" or absolute path to folder")
help="Create/update DB from source. Value: 'http' or absolute path to folder containing XMLs")
p.add_option('--sphinx-configure', '-c', action="store_true", dest="sphinx", default="False",
help="Configure sphinx. Creates sphinx.conf specified in '--output-conf'")
p.add_option('--indexer-path', '-i',
@ -37,14 +89,25 @@ def main():
options, arguments = p.parse_args()
# Show FIAS updates
if options.show_versions:
print_fias_versions()
return
# Manage DB
if options.database:
# create new database
aoupdater = Updater(options.source)
allowed_updates = None
if options.source == "http":
allowed_updates = get_allowed_updates(options.update_version)
if options.database == "create":
create_base(options.source)
aoupdater.create(allowed_updates)
# update database
if options.database == "update":
update_base(options.source, int(options.update_count))
aoupdater.update(allowed_updates)
# Manage Sphinx
if options.sphinx and options.indexer_path and options.output_conf:
@ -54,7 +117,8 @@ def main():
# 4 Debug purposes..
if options.test:
sph = SphinxSearch()
sph.find('кридовая паскаул')
sph.find('кедровая пасраул')
if __name__ == '__main__':
main()