From 0bd79b1311fafbc31dcf1df5a1ee5eb251337dd2 Mon Sep 17 00:00:00 2001 From: Jack Stdin Date: Sun, 17 Jan 2016 21:08:01 +0300 Subject: [PATCH] =?UTF-8?q?=D0=94=D0=BE=D0=B1=D0=B0=D0=B2=D0=BB=D0=B5?= =?UTF-8?q?=D0=BD=D0=B0=20=D0=B2=D0=BE=D0=B7=D0=BC=D0=BE=D0=B6=D0=BD=D0=BE?= =?UTF-8?q?=D1=81=D1=82=D1=8C=20=D0=BE=D0=B1=D0=BD=D0=BE=D0=B2=D0=BB=D0=B5?= =?UTF-8?q?=D0=BD=D0=B8=D1=8F=20(=D1=81=D0=BE=D0=B7=D0=B4=D0=B0=D0=BD?= =?UTF-8?q?=D0=B8=D1=8F)=20=D0=BA=D0=BE=D0=BD=D0=BA=D1=80=D0=B5=D1=82?= =?UTF-8?q?=D0=BD=D0=BE=D0=B9=20=D0=B2=D0=B5=D1=80=D1=81=D0=B8=D0=B8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 23 ++++- aore/aoutils/importer.py | 39 -------- aore/fias/fiasfactory.py | 17 ++++ aore/fias/search.py | 20 ++-- aore/miscutils/sphinx.py | 4 +- aore/{aoutils => updater}/__init__.py | 0 aore/{aoutils => updater}/aodataparser.py | 8 +- aore/{aoutils => updater}/aorar.py | 0 aore/{aoutils => updater}/aoxmltableentry.py | 0 aore/{dbutils => updater}/dbhandler.py | 2 +- aore/updater/soapreceiver.py | 26 +++++ .../aoupdater.py => updater/updater.py} | 45 ++++----- aore/{aoutils => updater}/xmlparser.py | 0 manage.py | 94 ++++++++++++++++--- 14 files changed, 176 insertions(+), 102 deletions(-) delete mode 100644 aore/aoutils/importer.py create mode 100644 aore/fias/fiasfactory.py rename aore/{aoutils => updater}/__init__.py (100%) rename aore/{aoutils => updater}/aodataparser.py (86%) rename aore/{aoutils => updater}/aorar.py (100%) rename aore/{aoutils => updater}/aoxmltableentry.py (100%) rename aore/{dbutils => updater}/dbhandler.py (97%) create mode 100644 aore/updater/soapreceiver.py rename aore/{aoutils/aoupdater.py => updater/updater.py} (72%) rename aore/{aoutils => updater}/xmlparser.py (100%) diff --git a/README.md b/README.md index 0a55797..b0596aa 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,28 @@ # py-fias WSGI application they can serve FIAS (Russian Address Object DB) + Простое приложение для работы с БД ФИАС, написано для Python 2.7 ## Установка Протестирована работа на следующих ОС: Windows (8.1) и Debian Jessie -Предполагается, что у Вас уже установлена БД PostgreSql версии 9.5, интерпретатор Python 2.7 +### Зависимости -1. Windows - 1. Установим *уйню... \ No newline at end of file +Для работы приложения необходимо достаточное кол-во RAM (1Gb+) и 4.5Gb места на диске +(3-3.5Gb для скачивания архива с базой и 300-400Mb для индексов Sphinx). Также необходимы root права +(или Администратора, для OS Windows), для работы демона Sphinx и предварительной установки. + +Предварительно нужно установить и настроить: + +1. Python 2.7 [Windows](https://www.python.org/downloads/windows/), [Debian](https://www.python.org/downloads/source/) +(`sudo apt-get install python2.7 python2.7-dev`), pip + +2. PostgreSql 9.5 и выше (из-за _ON CONFLICT_) + +3. Sphinx 2.2.3 и новее (из-за синтаксиса _MAYBE_) + +### Windows +1. Установить sphinxapi последней версии: + +`python -m pip install https://github.com/Romamo/sphinxapi/zipball/master` + \ No newline at end of file diff --git a/aore/aoutils/importer.py b/aore/aoutils/importer.py deleted file mode 100644 index 828648f..0000000 --- a/aore/aoutils/importer.py +++ /dev/null @@ -1,39 +0,0 @@ -# -*- coding: utf-8 -*- - -from pysimplesoap.client import SoapClient - - -class Importer: - def __init__(self): - self.client = SoapClient( - location="http://fias.nalog.ru/WebServices/Public/DownloadService.asmx", - action='http://fias.nalog.ru/WebServices/Public/DownloadService.asmx/', - namespace="http://fias.nalog.ru/WebServices/Public/DownloadService.asmx", - soap_ns='soap', trace=False, ns=False) - - def get_current_fias_version(self): - return 224 # TODO FIXIT - - def get_full(self): - response = self.client.GetLastDownloadFileInfo() - - assert response, "Response is null" - downloadfileinfo = response.GetLastDownloadFileInfoResponse.GetLastDownloadFileInfoResult - - assert downloadfileinfo.VersionId < self.get_current_fias_version(), "DB is already up-to-date" - - yield dict(intver=int(downloadfileinfo.VersionId), strver=str(downloadfileinfo.TextVersion), - url=str(downloadfileinfo.FiasCompleteXmlUrl)) - - # return (intver, strver, url) - def get_updates(self): - response = self.client.GetAllDownloadFileInfo() - - assert response, "Response is null" - - current_fias_version = self.get_current_fias_version() - - for DownloadFileInfo in response.GetAllDownloadFileInfoResponse.GetAllDownloadFileInfoResult.DownloadFileInfo: - if int(DownloadFileInfo.VersionId) > current_fias_version: - yield dict(intver=int(DownloadFileInfo.VersionId), strver=str(DownloadFileInfo.TextVersion), - url=str(DownloadFileInfo.FiasDeltaXmlUrl)) diff --git a/aore/fias/fiasfactory.py b/aore/fias/fiasfactory.py new file mode 100644 index 0000000..7f9791d --- /dev/null +++ b/aore/fias/fiasfactory.py @@ -0,0 +1,17 @@ +# -*- coding: utf-8 -*- +from aore.fias.search import SphinxSearch + + +class FiasFactory: + def __init__(self): + self.searcher = SphinxSearch() + + # text - строка поиска + # strong - строгий поиск или "мягкий" (с допущением ошибок, опечаток) + # out_format - "full" or "simple" - полный (подробно для каждого подпункта) или простой (только строка и AOID) + def find(self, text, strong=False, out_format="simple"): + try: + results = self.searcher.find(text, strong) + + except: + return [] diff --git a/aore/fias/search.py b/aore/fias/search.py index 293097d..a6171c2 100644 --- a/aore/fias/search.py +++ b/aore/fias/search.py @@ -4,7 +4,7 @@ import re import Levenshtein import psycopg2 -import aore.sphinxapi as sphinxapi +import sphinxapi from aore.config import db as dbparams, sphinx_index_sugg, sphinx_index_addjobj from aore.dbutils.dbimpl import DBImpl @@ -80,12 +80,12 @@ class SphinxSearch: phrase = unicode(phrase).replace('-', '').replace('@', '').lower() return re.split(r"[ ,:.#$]+", phrase) - def __add_word_variations(self, word_entry): - if word_entry.MT_MANY_SUGG: + def __add_word_variations(self, word_entry, strong): + if word_entry.MT_MANY_SUGG and not strong: suggs = self.__get_suggest(word_entry.word, self.rating_limit_soft, 6) for suggestion in suggs: word_entry.add_variation(suggestion[0]) - if word_entry.MT_SOME_SUGG: + if word_entry.MT_SOME_SUGG and not strong: suggs = self.__get_suggest(word_entry.word, self.rating_limit_hard, 3) for suggestion in suggs: word_entry.add_variation(suggestion[0]) @@ -96,16 +96,18 @@ class SphinxSearch: if word_entry.MT_ADD_SOCR: word_entry.add_variation_socr() - def __get_word_entries(self, words): + def __get_word_entries(self, words, strong): for word in words: if word != '': we = WordEntry(self.db, word) - self.__add_word_variations(we) + self.__add_word_variations(we, strong) + if we.get_variations() == "()": + raise BaseException("Cannot process sentence.") yield we - def find(self, text): + def find(self, text, strong): words = self.__split_phrase(text) - word_entries = self.__get_word_entries(words) + word_entries = self.__get_word_entries(words, strong) sentence = "{}".format(" MAYBE ".join(x.get_variations() for x in word_entries)) self.__configure(sphinx_index_addjobj) @@ -114,4 +116,4 @@ class SphinxSearch: results = [] for ma in rs['matches']: results.append([ma['attrs']['aoid'], ma['attrs']['fullname'], ma['weight']]) - print results + return results diff --git a/aore/miscutils/sphinx.py b/aore/miscutils/sphinx.py index fd54ad1..f511660 100644 --- a/aore/miscutils/sphinx.py +++ b/aore/miscutils/sphinx.py @@ -5,9 +5,9 @@ import os from bottle import template -from aore.aoutils.aoxmltableentry import AoXmlTableEntry +from aore.updater.aoxmltableentry import AoXmlTableEntry +from aore.updater.dbhandler import DbHandler from aore.config import db as dbconfig, sphinx_index_addjobj, sphinx_var_dir, trashfolder, sphinx_index_sugg -from aore.dbutils.dbhandler import DbHandler from trigram import trigram diff --git a/aore/aoutils/__init__.py b/aore/updater/__init__.py similarity index 100% rename from aore/aoutils/__init__.py rename to aore/updater/__init__.py diff --git a/aore/aoutils/aodataparser.py b/aore/updater/aodataparser.py similarity index 86% rename from aore/aoutils/aodataparser.py rename to aore/updater/aodataparser.py index cae19c6..1126e66 100644 --- a/aore/aoutils/aodataparser.py +++ b/aore/updater/aodataparser.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- import os -from aore.aoutils.aoxmltableentry import AoXmlTableEntry +from aore.updater.aoxmltableentry import AoXmlTableEntry from aore.config import trashfolder from aore.dbutils.dbschemas import db_shemas from xmlparser import XMLParser @@ -18,18 +18,12 @@ class AoDataParser: self.pagesize = pagesize self.currentpage = 0 self.counter = 0 - self.addrobj_filter = self.datasource.table_name == 'ADDROBJ' and self.datasource.operation_type == AoXmlTableEntry.OperationType.create self.base_filename = "" self.csv_file = None self.data_bereit_callback = None def import_update(self, attr): - # Addrobj anvanced filter - if self.addrobj_filter: - if attr['ACTSTATUS'] == '0' or 'NEXTID' in attr: - return - if self.counter > self.pagesize: # Send old file to DB engine if self.csv_file: diff --git a/aore/aoutils/aorar.py b/aore/updater/aorar.py similarity index 100% rename from aore/aoutils/aorar.py rename to aore/updater/aorar.py diff --git a/aore/aoutils/aoxmltableentry.py b/aore/updater/aoxmltableentry.py similarity index 100% rename from aore/aoutils/aoxmltableentry.py rename to aore/updater/aoxmltableentry.py diff --git a/aore/dbutils/dbhandler.py b/aore/updater/dbhandler.py similarity index 97% rename from aore/dbutils/dbhandler.py rename to aore/updater/dbhandler.py index b877cad..755d020 100644 --- a/aore/dbutils/dbhandler.py +++ b/aore/updater/dbhandler.py @@ -5,7 +5,7 @@ import logging import psycopg2 from bottle import template -from aore.aoutils.aoxmltableentry import AoXmlTableEntry +from aore.updater.aoxmltableentry import AoXmlTableEntry from aore.config import db as dbparams from aore.dbutils.dbimpl import DBImpl from aore.dbutils.dbschemas import db_shemas diff --git a/aore/updater/soapreceiver.py b/aore/updater/soapreceiver.py new file mode 100644 index 0000000..9265cb3 --- /dev/null +++ b/aore/updater/soapreceiver.py @@ -0,0 +1,26 @@ +# -*- coding: utf-8 -*- + +from pysimplesoap.client import SoapClient + + +class SoapReceiver: + def __init__(self): + self.client = SoapClient( + location="http://fias.nalog.ru/WebServices/Public/DownloadService.asmx", + action='http://fias.nalog.ru/WebServices/Public/DownloadService.asmx/', + namespace="http://fias.nalog.ru/WebServices/Public/DownloadService.asmx", + soap_ns='soap', trace=False, ns=False) + + def get_current_fias_version(self): + return 224 # TODO FIXIT + + # return (intver, strver, url) + def get_update_list(self): + response = self.client.GetAllDownloadFileInfo() + + assert response, "Response is null" + + for DownloadFileInfo in response.GetAllDownloadFileInfoResponse.GetAllDownloadFileInfoResult.DownloadFileInfo: + yield dict(intver=int(DownloadFileInfo.VersionId), strver=str(DownloadFileInfo.TextVersion), + delta_url=str(DownloadFileInfo.FiasDeltaXmlUrl), + complete_url=str(DownloadFileInfo.FiasCompleteXmlUrl)) diff --git a/aore/aoutils/aoupdater.py b/aore/updater/updater.py similarity index 72% rename from aore/aoutils/aoupdater.py rename to aore/updater/updater.py index 28eabf5..dc0baab 100644 --- a/aore/aoutils/aoupdater.py +++ b/aore/updater/updater.py @@ -3,15 +3,15 @@ import logging from os import walk, path -from aore.aoutils.aodataparser import AoDataParser -from aore.aoutils.aorar import AoRar -from aore.aoutils.aoxmltableentry import AoXmlTableEntry -from aore.aoutils.importer import Importer -from aore.dbutils.dbhandler import DbHandler +from aore.updater.aodataparser import AoDataParser +from aore.updater.aorar import AoRar +from aore.updater.aoxmltableentry import AoXmlTableEntry +from aore.updater.dbhandler import DbHandler +from aore.updater.soapreceiver import SoapReceiver from aore.dbutils.dbschemas import allowed_tables -class AoUpdater: +class Updater: # Source: "http", directory (as a full path to unpacked xmls) def __init__(self, source="http"): self.db_handler = DbHandler() @@ -31,7 +31,7 @@ class AoUpdater: def __get_updates_from_folder(self, foldername): # TODO: Вычислять версию, если берем данные из каталога - yield dict(intver=0, textver="Unknown", url=foldername) + yield dict(intver=0, textver="Unknown", delta_url=foldername, complete_url=foldername) def __get_updates_from_rar(self, url): aorar = AoRar() @@ -39,14 +39,11 @@ class AoUpdater: for table_entry in aorar.get_table_entries(fname, allowed_tables): yield table_entry - def __init_update_entries(self, full_base): + def __init_update_entries(self, updates_generator): if self.mode == "http": + assert updates_generator self.tablelist_generator = self.__get_updates_from_rar - imp = Importer() - if full_base: - self.updalist_generator = imp.get_full() - else: - self.updalist_generator = imp.get_updates() + self.updalist_generator = updates_generator else: assert path.isdir(self.mode), "Invalid directory {}".format(self.mode) self.updalist_generator = self.__get_updates_from_folder(self.mode) @@ -56,12 +53,13 @@ class AoUpdater: aoparser = AoDataParser(table_xmlentry, chunck_size) aoparser.parse(lambda x, y: self.db_handler.bulk_csv(operation_type, table_xmlentry.table_name, x, y)) - def create(self): - self.__init_update_entries(True) + def create(self, updates_generator): + self.__init_update_entries(updates_generator) self.db_handler.pre_create() for update_entry in self.updalist_generator: - for table_entry in self.tablelist_generator(update_entry['url']): + logging.info("Processing update #{}".format(update_entry['intver'])) + for table_entry in self.tablelist_generator(update_entry['complete_url']): if table_entry.operation_type == AoXmlTableEntry.OperationType.update: table_entry.operation_type = AoXmlTableEntry.OperationType.create self.process_single_entry(table_entry.operation_type, table_entry) @@ -70,18 +68,13 @@ class AoUpdater: logging.info("Create success") - def update(self, count=1): - self.__init_update_entries(False) + def update(self, updates_generator): + self.__init_update_entries(updates_generator) self.db_handler.pre_update() - counter = 0 - for update_entry in self.updalist_generator: - counter += 1 - if counter > count: - logging.warning("Maximum count of updates ({}) are processed - exit".format(count)) - break - - for table_entry in self.tablelist_generator(update_entry['url']): + for update_entry in self.updates_generator: + logging.info("Processing update #{}".format(update_entry['intver'])) + for table_entry in self.tablelist_generator(update_entry['delta_url']): self.process_single_entry(table_entry.operation_type, table_entry) logging.info("Update success") diff --git a/aore/aoutils/xmlparser.py b/aore/updater/xmlparser.py similarity index 100% rename from aore/aoutils/xmlparser.py rename to aore/updater/xmlparser.py diff --git a/manage.py b/manage.py index e0fe823..cbe6549 100644 --- a/manage.py +++ b/manage.py @@ -2,30 +2,82 @@ import optparse -from aore.aoutils.aoupdater import AoUpdater -from aore.miscutils.sphinx import SphinxHelper from aore.fias.search import SphinxSearch +from aore.miscutils.sphinx import SphinxHelper +from aore.updater.updater import Updater +from aore.updater.soapreceiver import SoapReceiver -def update_base(xml_source, updates_count): - aoupdater = AoUpdater(xml_source) - aoupdater.update(updates_count) +def print_fias_versions(): + imp = SoapReceiver() + current_version = imp.get_current_fias_version() + all_versions = imp.get_update_list() + + print("Installed version: {}".format(current_version)) + print("Avaliable updates:") + print("Number\t\tDate") + for upd in all_versions: + mark_current = (' ', '*')[int(upd['intver']) == current_version] + print "{}{}\t\t{}".format(mark_current, upd['intver'], upd['strver']) -def create_base(xml_source): - aoupdater = AoUpdater(xml_source) - aoupdater.create() +def parse_update_str(updates_str): + if updates_str == "all": + return None + + upd_list = updates_str.lower().replace(' ','').split(',') + out_list = [] + + for u_entry in upd_list: + if '-' in u_entry: + u_range = u_entry.split('-') + out_list += range(int(u_range[0]), int(u_range[1])) + else: + out_list.append(int(u_entry)) + + return out_list + + +def get_allowed_updates(updates_str, mode = "create"): + imp = SoapReceiver() + current_version = imp.get_current_fias_version() + all_versions = [x for x in imp.get_update_list()] + + user_defined_list = parse_update_str(updates_str) + out_list = [] + + if mode == "create" and not user_defined_list: + yield all_versions[-1] + + assert (mode == "create" and len(user_defined_list) == 1) + + for uv in all_versions: + uv_ver = uv['intver'] + if uv_ver > current_version and (not user_defined_list or uv_ver in user_defined_list): + out_list.append(uv) + + out_list.sort(key=lambda x: x['intver']) + for ol_entry in out_list: + yield ol_entry def main(): # Parse options p = optparse.OptionParser() p.add_option('--database', '-b', action="store", type="string", - help="Manage database. Value: create - create new DB, update - update existing DB without loose the data") - p.add_option('--update-count', '-u', default=1, type="int", - help="Count of updates to process, only for '--database update' option") + help="Manage database. Values: " + "create - create new DB, " + "update - update existing DB without loose the data") + p.add_option('--update-version', '-u', default="all", type="string", + help="Valid for updating via HTTP. " + "Versions of updates to process. Can be 111 or 111-222 or 111,222,333." + "For '--database-create' only one value is necessary. If not specified, " + "all updates will be processed (for '--database update') or last DB snapshot " + "(for '--database create')") + p.add_option('--show-versions', '-v', action="store_true", dest="show_versions", default=False, + help="Show allowed fias versions") p.add_option('--source', '-s', default="http", - help="Create/update DB from source. Value: \"http\" or absolute path to folder") + help="Create/update DB from source. Value: 'http' or absolute path to folder containing XMLs") p.add_option('--sphinx-configure', '-c', action="store_true", dest="sphinx", default="False", help="Configure sphinx. Creates sphinx.conf specified in '--output-conf'") p.add_option('--indexer-path', '-i', @@ -37,14 +89,25 @@ def main(): options, arguments = p.parse_args() + # Show FIAS updates + if options.show_versions: + print_fias_versions() + return + # Manage DB if options.database: # create new database + aoupdater = Updater(options.source) + allowed_updates = None + if options.source == "http": + allowed_updates = get_allowed_updates(options.update_version) + if options.database == "create": - create_base(options.source) + aoupdater.create(allowed_updates) + # update database if options.database == "update": - update_base(options.source, int(options.update_count)) + aoupdater.update(allowed_updates) # Manage Sphinx if options.sphinx and options.indexer_path and options.output_conf: @@ -54,7 +117,8 @@ def main(): # 4 Debug purposes.. if options.test: sph = SphinxSearch() - sph.find('кридовая паскаул') + sph.find('кедровая пасраул') + if __name__ == '__main__': main()