|
|
|
|
@ -5,7 +5,8 @@ from typing import Dict
|
|
|
|
|
from kedro.framework.session import KedroSession
|
|
|
|
|
|
|
|
|
|
from actesdataset import EtreeXMLDataSet, BsXMLDataSet, JSONDataSet
|
|
|
|
|
from actesdataset import (XMLDataSetCollection, JSONDataSetCollection)
|
|
|
|
|
from actesdataset import (XMLDataSetCollection, BsXMLDataSetCollection)
|
|
|
|
|
# JSONDataSetCollection)
|
|
|
|
|
# FullJSONDataSetCollection)
|
|
|
|
|
|
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
@ -14,14 +15,15 @@ with KedroSession.create() as session:
|
|
|
|
|
context = session.load_context()
|
|
|
|
|
catalog = context.get_catalog()
|
|
|
|
|
|
|
|
|
|
def parse_xml_collection(datasetcol: XMLDataSetCollection) -> Dict[str, EtreeXMLDataSet]:
|
|
|
|
|
def parse_xml_collection(datasetcol: XMLDataSetCollection) -> XMLDataSetCollection:
|
|
|
|
|
"node function entry point, performs batch processing"
|
|
|
|
|
datasets = datasetcol.datasets
|
|
|
|
|
housename = datasetcol._housename
|
|
|
|
|
# outputfolderpath = f"data/02_intermediate/houses/{housename}/xml"
|
|
|
|
|
output_catalog = catalog[housename + '_xmlcontent']
|
|
|
|
|
outputfolderpath = output_catalog['folderpath']
|
|
|
|
|
output_datasets = dict()
|
|
|
|
|
# output_datasets = dict()
|
|
|
|
|
output_datasets = XMLDataSetCollection(housename, str(outputfolderpath))
|
|
|
|
|
for dataset_filenamestem, dataset in datasets.items():
|
|
|
|
|
# a manual load is required here, because
|
|
|
|
|
# the dataset **is not** registered in kedro's catalog
|
|
|
|
|
@ -35,11 +37,11 @@ def parse_xml_collection(datasetcol: XMLDataSetCollection) -> Dict[str, EtreeXML
|
|
|
|
|
output_xmldataset_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
# save on file
|
|
|
|
|
output_xmldataset._save(output_source_doc)
|
|
|
|
|
output_datasets[dataset_filenamestem] = output_xmldataset
|
|
|
|
|
output_datasets.datasets[dataset_filenamestem] = output_xmldataset
|
|
|
|
|
return output_datasets
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def parse_json_collection(datasetcol: JSONDataSetCollection) -> Dict[str, BsXMLDataSet]:
|
|
|
|
|
def parse_json_collection(datasetcol: BsXMLDataSetCollection) -> Dict[str, BsXMLDataSet]:
|
|
|
|
|
"node function entry point, performs batch processing"
|
|
|
|
|
datasets = datasetcol.datasets
|
|
|
|
|
housename = datasetcol._housename
|
|
|
|
|
|