output - node

3 years ago · 1991698e5c
parent 4c687c25dd
commit 1991698e5c
4 changed files with 44 additions and 38 deletions
--- a/actes-princiers/conf/base/catalog.yml
+++ b/actes-princiers/conf/base/catalog.yml
@ -6,7 +6,7 @@ bourbon:
  folderpath: data/01_raw/houses/bourbon
  outputfolderpath: data/02_intermediate/houses/bourbon/xml 

-bourbon_content:
+bourbon_xmlcontent:
  type: actesdataset.XMLDataSetCollection
  housename: bourbon
  folderpath: data/02_intermediate/houses/bourbon/xml
@ -19,25 +19,25 @@ bourbon_content:

 # ________________________________________________________________________

-berry:
-  type: actesdataset.XMLDataSetCollection
-  housename: berry
-  folderpath: data/01_raw/houses/berry
+#berry:
+#  type: actesdataset.XMLDataSetCollection
+#  housename: berry
+#  folderpath: data/01_raw/houses/berry

-berry_content:
-  type: actesdataset.XMLDataSetCollection
-  housename: berry
-  folderpath: data/02_intermediate/houses/berry
+#berry_content:
+#  type: actesdataset.XMLDataSetCollection
+#  housename: berry
+#  folderpath: data/02_intermediate/houses/berry

-# ________________________________________________________________________
+## ________________________________________________________________________

-anjou:
-  type: actesdataset.XMLDataSetCollection
-  housename: berry
-  folderpath: data/01_raw/houses/anjou
+#anjou:
+#  type: actesdataset.XMLDataSetCollection
+#  housename: anjou
+#  folderpath: data/01_raw/houses/anjou

-anjou_content:
-  type: actesdataset.XMLDataSetCollection
-  housename: berry
-  folderpath: data/02_intermediate/houses/anjou
+#anjou_content:
+#  type: actesdataset.XMLDataSetCollection
+#  housename: anjou
+#  folderpath: data/02_intermediate/houses/anjou

--- a/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py
+++ b/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py
@ -2,13 +2,16 @@ import logging
 from pathlib import Path
 from typing import Dict

-from actesdataset import EtreeXMLDataSet
+from actesdataset import EtreeXMLDataSet, XMLDataSetCollection

 logger = logging.getLogger(__name__)


-def parse_xml_collection(datasets: Dict[str, EtreeXMLDataSet]) -> Dict[str, EtreeXMLDataSet]:
+def parse_xml_collection(datasetcollection: XMLDataSetCollection) -> Dict[str, EtreeXMLDataSet]:
    "node function entry point, performs batch processing"
+    # unpack the collection: its per-file datasets and its output folder
+    datasets = datasetcollection.datasets
+    outputfolderpath = datasetcollection.outputfolderpath
    output_datasets = dict()
    for dataset_filenamestem, dataset in datasets.items():
        # a manual load is required here, because
@ -18,9 +21,9 @@ def parse_xml_collection(datasets: Dict[str, EtreeXMLDataSet]) -> Dict[str, Etre
        logger.info(f"dataset {descr} loaded")
        output_source_doc = dataset.transform() 
        # set dataset's output filepath
-#        output_filepath = _outputfolderpath
-        output_filepath = dataset.filepath.replace("01_raw", "02_intermediate")
-        output_xmldataset = EtreeXMLDataSet(output_filepath)
+#        output_filepath = dataset.filepath.replace("01_raw", "02_intermediate")
+        output_filepath = outputfolderpath / Path(dataset_filenamestem).with_suffix(".pseudoxml")
+        output_xmldataset = EtreeXMLDataSet(str(output_filepath))
        # let's create subfolders now, if they don't exist
        output_filepath = Path(output_filepath)
        output_xmldataset_dir = output_filepath.parent
--- a/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py
+++ b/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py
@ -9,21 +9,21 @@ def create_pipeline(**kwargs) -> Pipeline:
            node(
                func=parse_xml_collection,
                inputs=["bourbon"],
-                outputs="bourbon_content",
+                outputs="bourbon_xmlcontent",
                name="bourbon_ds_collection",
            ),
-            node(
-                func=parse_xml_collection,
-                inputs=["berry"],
-                outputs="berry_content",
-                name="berry_ds_collection",
-            ),
-            node(
-                func=parse_xml_collection,
-                inputs=["anjou"],
-                outputs="anjou_content",
-                name="anjou_ds_collection",
-            ),
+#            node(
+#                func=parse_xml_collection,
+#                inputs=["berry"],
+#                outputs="berry_content",
+#                name="berry_ds_collection",
+#            ),
+#            node(
+#                func=parse_xml_collection,
+#                inputs=["anjou"],
+#                outputs="anjou_content",
+#                name="anjou_ds_collection",
+#            ),

        ]
    )
--- a/actes-princiers/src/actesdataset.py
+++ b/actes-princiers/src/actesdataset.py
@ -125,7 +125,8 @@ class XMLDataSetCollection(AbstractDataSet):
        outputfolderpath: Optional[str]=None) -> None:
        self._housename = housename
        self._folderpath = Path(folderpath)
-        self._outputfolderpath = outputfolderpath
+        if outputfolderpath is not None:
+            self.outputfolderpath = Path(outputfolderpath)
            
    def _load(self) -> dict[str, EtreeXMLDataSet]:
        "kedro's API loader method"
@ -133,7 +134,9 @@ class XMLDataSetCollection(AbstractDataSet):
        for filepath in sorted(self._folderpath.glob("*.xml")):
            self.datasets[filepath.stem] = EtreeXMLDataSet(
                filepath=str(filepath))
-        return self.datasets
+        # return self.datasets
+        # return the collection itself: the node reads both .datasets and .outputfolderpath
+        return self

    def _save(self, data) -> None:
        """kedro's API saver method