|
|
|
@ -2,13 +2,16 @@ import logging
|
|
|
|
from pathlib import Path
|
|
|
|
from pathlib import Path
|
|
|
|
from typing import Dict
|
|
|
|
from typing import Dict
|
|
|
|
|
|
|
|
|
|
|
|
from actesdataset import EtreeXMLDataSet
|
|
|
|
from actesdataset import EtreeXMLDataSet, XMLDataSetCollection
|
|
|
|
|
|
|
|
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def parse_xml_collection(datasets: Dict[str, EtreeXMLDataSet]) -> Dict[str, EtreeXMLDataSet]:
|
|
|
|
def parse_xml_collection(datasetcollection: XMLDataSetCollection) -> Dict[str, EtreeXMLDataSet]:
|
|
|
|
"node function entry point, performs batch processing"
|
|
|
|
"node function entry point, performs batch processing"
|
|
|
|
|
|
|
|
# collection mapping
|
|
|
|
|
|
|
|
datasets = datasetcollection.datasets
|
|
|
|
|
|
|
|
outputfolderpath = datasetcollection.outputfolderpath
|
|
|
|
output_datasets = dict()
|
|
|
|
output_datasets = dict()
|
|
|
|
for dataset_filenamestem, dataset in datasets.items():
|
|
|
|
for dataset_filenamestem, dataset in datasets.items():
|
|
|
|
# a manual load is required here, because
|
|
|
|
# a manual load is required here, because
|
|
|
|
@ -18,9 +21,9 @@ def parse_xml_collection(datasets: Dict[str, EtreeXMLDataSet]) -> Dict[str, Etre
|
|
|
|
logger.info(f"dataset {descr} loaded")
|
|
|
|
logger.info(f"dataset {descr} loaded")
|
|
|
|
output_source_doc = dataset.transform()
|
|
|
|
output_source_doc = dataset.transform()
|
|
|
|
# set dataset's output filepath
|
|
|
|
# set dataset's output filepath
|
|
|
|
# output_filepath = _outputfolderpath
|
|
|
|
# output_filepath = dataset.filepath.replace("01_raw", "02_intermediate")
|
|
|
|
output_filepath = dataset.filepath.replace("01_raw", "02_intermediate")
|
|
|
|
output_filepath = outputfolderpath / Path(dataset_filenamestem).with_suffix(".pseudoxml")
|
|
|
|
output_xmldataset = EtreeXMLDataSet(output_filepath)
|
|
|
|
output_xmldataset = EtreeXMLDataSet(str(output_filepath))
|
|
|
|
# let's create subfolders now, if they don't exist
|
|
|
|
# let's create subfolders now, if they don't exist
|
|
|
|
output_filepath = Path(output_filepath)
|
|
|
|
output_filepath = Path(output_filepath)
|
|
|
|
output_xmldataset_dir = output_filepath.parent
|
|
|
|
output_xmldataset_dir = output_filepath.parent
|
|
|
|
|