From 1991698e5cc0f5bd2bc571ab5bc1b9fc77194811 Mon Sep 17 00:00:00 2001 From: gwen Date: Thu, 6 Jul 2023 15:08:58 +0200 Subject: [PATCH] output - node --- actes-princiers/conf/base/catalog.yml | 36 +++++++++---------- .../pipelines/xml_processing/nodes.py | 13 ++++--- .../pipelines/xml_processing/pipeline.py | 26 +++++++------- actes-princiers/src/actesdataset.py | 7 ++-- 4 files changed, 44 insertions(+), 38 deletions(-) diff --git a/actes-princiers/conf/base/catalog.yml b/actes-princiers/conf/base/catalog.yml index e3e110d..4825ff3 100644 --- a/actes-princiers/conf/base/catalog.yml +++ b/actes-princiers/conf/base/catalog.yml @@ -6,7 +6,7 @@ bourbon: folderpath: data/01_raw/houses/bourbon outputfolderpath: data/02_intermediate/houses/bourbon/xml -bourbon_content: +bourbon_xmlcontent: type: actesdataset.XMLDataSetCollection housename: bourbon folderpath: data/02_intermediate/houses/bourbon/xml @@ -19,25 +19,25 @@ bourbon_content: # ________________________________________________________________________ -berry: - type: actesdataset.XMLDataSetCollection - housename: berry - folderpath: data/01_raw/houses/berry +#berry: +# type: actesdataset.XMLDataSetCollection +# housename: berry +# folderpath: data/01_raw/houses/berry -berry_content: - type: actesdataset.XMLDataSetCollection - housename: berry - folderpath: data/02_intermediate/houses/berry +#berry_content: +# type: actesdataset.XMLDataSetCollection +# housename: berry +# folderpath: data/02_intermediate/houses/berry -# ________________________________________________________________________ +## ________________________________________________________________________ -anjou: - type: actesdataset.XMLDataSetCollection - housename: berry - folderpath: data/01_raw/houses/anjou +#anjou: +# type: actesdataset.XMLDataSetCollection +# housename: berry +# folderpath: data/01_raw/houses/anjou -anjou_content: - type: actesdataset.XMLDataSetCollection - housename: berry - folderpath: data/02_intermediate/houses/anjou +#anjou_content: +# type: actesdataset.XMLDataSetCollection +# housename: berry +# folderpath: data/02_intermediate/houses/anjou diff --git a/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py b/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py index 147c04d..64fd0c4 100755 --- a/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py +++ b/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py @@ -2,13 +2,16 @@ import logging from pathlib import Path from typing import Dict -from actesdataset import EtreeXMLDataSet +from actesdataset import EtreeXMLDataSet, XMLDataSetCollection logger = logging.getLogger(__name__) -def parse_xml_collection(datasets: Dict[str, EtreeXMLDataSet]) -> Dict[str, EtreeXMLDataSet]: +def parse_xml_collection(datasetcollection: XMLDataSetCollection) -> Dict[str, EtreeXMLDataSet]: "node function entry point, performs batch processing" + # collection mapping + datasets = datasetcollection.datasets + outputfolderpath = datasetcollection.outputfolderpath output_datasets = dict() for dataset_filenamestem, dataset in datasets.items(): # a manual load is required here, because @@ -18,9 +21,9 @@ def parse_xml_collection(datasets: Dict[str, EtreeXMLDataSet]) -> Dict[str, Etre logger.info(f"dataset {descr} loaded") output_source_doc = dataset.transform() # set dataset's output filepath -# output_filepath = _outputfolderpath - output_filepath = dataset.filepath.replace("01_raw", "02_intermediate") - output_xmldataset = EtreeXMLDataSet(output_filepath) +# output_filepath = dataset.filepath.replace("01_raw", "02_intermediate") + output_filepath = outputfolderpath / Path(dataset_filenamestem).with_suffix(".pseudoxml") + output_xmldataset = EtreeXMLDataSet(str(output_filepath)) # let's create subfolders now, if they don't exist output_filepath = Path(output_filepath) output_xmldataset_dir = output_filepath.parent diff --git a/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py b/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py index 4e5e9e0..bc04674 100755 --- a/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py +++ b/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py @@ -9,21 +9,21 @@ def create_pipeline(**kwargs) -> Pipeline: node( func=parse_xml_collection, inputs=["bourbon"], - outputs="bourbon_content", + outputs="bourbon_xmlcontent", name="bourbon_ds_collection", ), - node( - func=parse_xml_collection, - inputs=["berry"], - outputs="berry_content", - name="berry_ds_collection", - ), - node( - func=parse_xml_collection, - inputs=["anjou"], - outputs="anjou_content", - name="anjou_ds_collection", - ), +# node( +# func=parse_xml_collection, +# inputs=["berry"], +# outputs="berry_content", +# name="berry_ds_collection", +# ), +# node( +# func=parse_xml_collection, +# inputs=["anjou"], +# outputs="anjou_content", +# name="anjou_ds_collection", +# ), ] ) diff --git a/actes-princiers/src/actesdataset.py b/actes-princiers/src/actesdataset.py index 7d7840a..da6a7d9 100644 --- a/actes-princiers/src/actesdataset.py +++ b/actes-princiers/src/actesdataset.py @@ -125,7 +125,8 @@ class XMLDataSetCollection(AbstractDataSet): outputfolderpath: Optional[str]=None) -> None: self._housename = housename self._folderpath = Path(folderpath) - self._outputfolderpath = outputfolderpath + if outputfolderpath is not None: + self.outputfolderpath = Path(outputfolderpath) def _load(self) -> dict[str, EtreeXMLDataSet]: "kedro's API loader method" @@ -133,7 +134,9 @@ class XMLDataSetCollection(AbstractDataSet): for filepath in sorted(self._folderpath.glob("*.xml")): self.datasets[filepath.stem] = EtreeXMLDataSet( filepath=str(filepath)) - return self.datasets + # return self.datasets + # we need the object itself during transformation + return self def _save(self, data) -> None: """kedro's API saver method