full json

3 years ago · cf0d386ebf
parent d695c8b8f0
commit cf0d386ebf
3 changed files with 18 additions and 10 deletions
--- a/actes-princiers/conf/base/catalog.yml
+++ b/actes-princiers/conf/base/catalog.yml
@ -1,22 +1,26 @@
 # ________________________________________________________________________

+# input (read only) dataset
 bourbon:
  type: actesdataset.XMLDataSetCollection
  housename: bourbon
  folderpath: data/01_raw/houses/bourbon

+# output dataset
 bourbon_xmlcontent:
  type: actesdataset.XMLDataSetCollection
  housename: bourbon
  folderpath: data/02_intermediate/houses/bourbon/xml

+# input (read only) dataset
 bourbon_json:
-  type: actesdataset.JSONDataSetCollection
+  type: actesdataset.BsXMLDataSetCollection
  housename: bourbon
  folderpath: data/01_raw/houses/bourbon

+# output dataset
 bourbon_jsonoutput:
-  type: actesdataset.FullJSONDataSetCollection
+  type: actesdataset.JSONDataSetCollection
  housename: bourbon
  folderpath: data/02_intermediate/houses/bourbon/json

--- a/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py
+++ b/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py
@ -5,7 +5,8 @@ from typing import Dict
 from kedro.framework.session import KedroSession

 from actesdataset import EtreeXMLDataSet, BsXMLDataSet, JSONDataSet
-from actesdataset import (XMLDataSetCollection, JSONDataSetCollection)
+from actesdataset import (XMLDataSetCollection, BsXMLDataSetCollection)
+# JSONDataSetCollection)
 #                         FullJSONDataSetCollection)

 logger = logging.getLogger(__name__)
@ -14,14 +15,15 @@ with KedroSession.create() as session:
    context = session.load_context()
    catalog = context.get_catalog()

-def parse_xml_collection(datasetcol: XMLDataSetCollection) -> Dict[str, EtreeXMLDataSet]:
+def parse_xml_collection(datasetcol: XMLDataSetCollection) -> XMLDataSetCollection:
    "node function entry point, performs batch processing"
    datasets = datasetcol.datasets
    housename = datasetcol._housename
 #    outputfolderpath = f"data/02_intermediate/houses/{housename}/xml"    
    output_catalog = catalog[housename + '_xmlcontent']
    outputfolderpath = output_catalog['folderpath']
-    output_datasets = dict()
+#    output_datasets = dict()
+    output_datasets = XMLDataSetCollection(housename, str(outputfolderpath))
    for dataset_filenamestem, dataset in datasets.items():
        # a manual load is required here, because
        # the dataset **is not** registered in kedro's catalog
@ -35,11 +37,11 @@ def parse_xml_collection(datasetcol: XMLDataSetCollection) -> Dict[str, EtreeXML
        output_xmldataset_dir.mkdir(parents=True, exist_ok=True)
        # save on file
        output_xmldataset._save(output_source_doc)
-        output_datasets[dataset_filenamestem] = output_xmldataset
+        output_datasets.datasets[dataset_filenamestem] = output_xmldataset
    return output_datasets


-def parse_json_collection(datasetcol: JSONDataSetCollection) -> Dict[str, BsXMLDataSet]:
+def parse_json_collection(datasetcol: BsXMLDataSetCollection) -> Dict[str, BsXMLDataSet]:
    "node function entry point, performs batch processing"
    datasets = datasetcol.datasets
    housename = datasetcol._housename
--- a/actes-princiers/src/actesdataset.py
+++ b/actes-princiers/src/actesdataset.py
@ -132,6 +132,8 @@ class DataSetCollection(AbstractDataSet):
        folderpath: str) -> None:
        self._housename = housename
        self._folderpath = Path(folderpath)
+        # the collection's datasets -- key: file name stem, value: dataset object
+        self.datasets = dict()
                    
    def _save(self, data) -> None:
        """kedro's API saver method
@ -147,17 +149,17 @@ class DataSetCollection(AbstractDataSet):
        return dict(name=self._housename, 
                    folderpath=str(self._folderpath))

+
 class XMLDataSetCollection(DataSetCollection):
    def _load(self) -> dict[str, EtreeXMLDataSet]:
        "kedro's API loader method"
-        self.datasets = dict()
        for filepath in sorted(self._folderpath.glob("*.xml")):
            self.datasets[filepath.stem] = EtreeXMLDataSet(
                filepath=str(filepath))
        return self


-class JSONDataSetCollection(DataSetCollection):
+class BsXMLDataSetCollection(DataSetCollection):
    def _load(self) -> dict[str, BsXMLDataSet]:
        "kedro's API loader method"
        self.datasets = dict()
@ -183,7 +185,7 @@ class JSONDataSet: #(AbstractDataSet):
        return dict(filepath=self._filepath)


-class FullJSONDataSetCollection(DataSetCollection):
+class JSONDataSetCollection(DataSetCollection):
    def _load(self) -> dict[str, JSONDataSet]:
        "kedro's API loader method"
        self.datasets = dict()