From 83f39b8986910dcdc4732d9179fa5a7da5a4fdd0 Mon Sep 17 00:00:00 2001 From: gwen Date: Sat, 8 Jul 2023 16:38:35 +0200 Subject: [PATCH] output dataset --- actes-princiers/conf/base/catalog.yml | 6 +++--- .../pipelines/xml_processing/nodes.py | 14 ++++++-------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/actes-princiers/conf/base/catalog.yml b/actes-princiers/conf/base/catalog.yml index b2047a8..1e410ca 100644 --- a/actes-princiers/conf/base/catalog.yml +++ b/actes-princiers/conf/base/catalog.yml @@ -6,19 +6,19 @@ bourbon: housename: bourbon folderpath: data/01_raw/houses/bourbon -# output dataset +# output (write) dataset bourbon_xmlcontent: type: actesdataset.XMLDataSetCollection housename: bourbon folderpath: data/02_intermediate/houses/bourbon/xml -# input (read only dataset) +# input (read only) dataset bourbon_json: type: actesdataset.BsXMLDataSetCollection housename: bourbon folderpath: data/01_raw/houses/bourbon -# output dataset +# output (write) dataset bourbon_jsonoutput: type: actesdataset.JSONDataSetCollection housename: bourbon diff --git a/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py b/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py index 92df7ec..38a9b88 100755 --- a/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py +++ b/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py @@ -5,9 +5,8 @@ from typing import Dict from kedro.framework.session import KedroSession from actesdataset import EtreeXMLDataSet, BsXMLDataSet, JSONDataSet -from actesdataset import (XMLDataSetCollection, BsXMLDataSetCollection) -# JSONDataSetCollection) -# FullJSONDataSetCollection) +from actesdataset import (XMLDataSetCollection, BsXMLDataSetCollection, + JSONDataSetCollection) logger = logging.getLogger(__name__) @@ -22,7 +21,6 @@ def parse_xml_collection(datasetcol: XMLDataSetCollection) -> XMLDataSetCollecti # outputfolderpath = f"data/02_intermediate/houses/{housename}/xml" output_catalog = catalog[housename + '_xmlcontent'] outputfolderpath = output_catalog['folderpath'] -# output_datasets = dict() output_datasets = XMLDataSetCollection(housename, str(outputfolderpath)) for dataset_filenamestem, dataset in datasets.items(): # a manual load is required here, because @@ -41,13 +39,13 @@ def parse_xml_collection(datasetcol: XMLDataSetCollection) -> XMLDataSetCollecti return output_datasets -def make_json_collection(datasetcol: BsXMLDataSetCollection) -> Dict[str, BsXMLDataSet]: +def make_json_collection(datasetcol: BsXMLDataSetCollection) -> JSONDataSetCollection: "node function entry point, performs batch processing" datasets = datasetcol.datasets housename = datasetcol._housename output_catalog = catalog[housename + '_jsonoutput'] outputfolderpath = output_catalog['folderpath'] - output_datasets = dict() + output_datasets = JSONDataSetCollection(housename, str(outputfolderpath)) for dataset_filenamestem, dataset in datasets.items(): # a manual load is required here, because # the dataset **is not** registered in kedro's catalog @@ -55,13 +53,13 @@ def make_json_collection(datasetcol: BsXMLDataSetCollection) -> Dict[str, BsXMLD output_source_doc = dataset.transform() # set dataset's output filepath output_filepath = outputfolderpath / Path(dataset_filenamestem).with_suffix(".json") - output_xmldataset = BsXMLDataSet(str(output_filepath)) + output_xmldataset = JSONDataSet(str(output_filepath)) # let's create subfolders, if they don't exist output_xmldataset_dir = output_filepath.parent output_xmldataset_dir.mkdir(parents=True, exist_ok=True) # save on file output_xmldataset._save(output_source_doc) - output_datasets[dataset_filenamestem] = output_xmldataset + output_datasets.datasets[dataset_filenamestem] = output_xmldataset return output_datasets #def add_xmlcontent_tojson(jsondoc: JSONDataSetCollection, xmlcontent: XMLDataSetCollection) -> Dict[str, JSONDataSet]: