full json

develop
gwen 3 years ago
parent 20cec1e2bd
commit b67739dce8

@ -6,24 +6,38 @@ bourbon:
housename: bourbon
folderpath: data/01_raw/houses/bourbon
# output (write) dataset
# output (write) **pseudo xml** dataset
bourbon_xmlcontent:
type: actesdataset.XMLDataSetCollection
housename: bourbon
folderpath: data/02_intermediate/houses/bourbon/xml
# input (read) **pseudo xml** dataset
bourbon_pseudoxmlcontent:
type: actesdataset.TextDataSetCollection
housename: bourbon
folderpath: data/02_intermediate/houses/bourbon/xml
# input (read only) dataset
bourbon_json:
type: actesdataset.BsXMLDataSetCollection
housename: bourbon
folderpath: data/01_raw/houses/bourbon
# output (write) dataset
# input (read) and output (write) dataset
bourbon_jsonoutput:
type: actesdataset.JSONDataSetCollection
housename: bourbon
folderpath: data/02_intermediate/houses/bourbon/json
# output (write) dataset
bourbon_fulljsonoutput:
type: actesdataset.JSONDataSetCollection
housename: bourbon
folderpath: data/02_intermediate/houses/bourbon/fulljson
## ________________________________________________________________________
#berry:

@ -22,11 +22,11 @@ class ProjectContext(KedroContext):
houses = self.config_loader.get("houses*")
return houses['raw_datapath']
def get_catalog(self):
    """Entry point for loading the YAML-defined data catalogs."""
    # the config loader resolves the glob pattern against the conf folders
    return self.config_loader.get('catalog*')
# def get_catalog(self):
# "catalog loader entry point"
# # loading yaml defined catalogs
# catalog = self.config_loader.get('catalog*')
# return catalog
# def _get_catalog(self, *args, **kwargs):
# "catalog loader entry point"

@ -6,7 +6,7 @@ from kedro.framework.session import KedroSession
from actesdataset import EtreeXMLDataSet, BsXMLDataSet, JSONDataSet
from actesdataset import (XMLDataSetCollection, BsXMLDataSetCollection,
JSONDataSetCollection)
JSONDataSetCollection, TextDataSetCollection)
logger = logging.getLogger(__name__)
@ -60,15 +60,29 @@ def make_json_collection(datasetcol: BsXMLDataSetCollection) -> JSONDataSetColle
output_datasets.datasets[dataset_filenamestem] = output_xmldataset
return output_datasets
#def add_xmlcontent_tojson(jsondoc: JSONDataSetCollection, xmlcontent: XMLDataSetCollection) -> Dict[str, JSONDataSet]:
# logger.info("9999999999999999999999" + str(xmlcontent.datasets.keys()))
# json_datasets = jsondoc.datasets
## xmlcontent._load()
# logger.info(str(xmlcontent))
# xmlcontent = xmlcontent.datasets
# for dataset_filenamestem, dataset in json_datasets.items():
# document = dataset._load()
# document['xmlcontent'] = xmlcontent[dataset_filenamestem].source_doc
# return json_datasets
#
def add_xmlcontent_tojson(jsondoc: JSONDataSetCollection, xmlcontent: TextDataSetCollection) -> JSONDataSetCollection:
    """Merge the pseudo-xml text into each JSON document and persist the result.

    For every dataset in `jsondoc`, loads the JSON document, stores the
    matching pseudo-xml text under the 'xmlcontent' key, and saves the merged
    document as a new JSONDataSet in the house's `fulljson` output folder.

    Parameters:
        jsondoc: collection of per-document JSON datasets (read).
        xmlcontent: collection of pseudo-xml text datasets keyed by the same
            filename stems as `jsondoc` (read).

    Returns:
        The `<house>_fulljsonoutput` collection, populated with one saved
        JSONDataSet per input document.

    Raises:
        KeyError: if a JSON document has no matching pseudo-xml dataset.
    """
    jsondatasets = jsondoc.datasets
    housename = jsondoc._housename
    # NOTE(review): relies on a module-level `context` global to reach the
    # catalog — consider injecting the output collection as a node input.
    output_datasets = context.catalog.load(housename + '_fulljsonoutput')
    outputfolderpath = output_datasets._folderpath
    xmldatasets = xmlcontent.datasets
    for dataset_filenamestem, dataset in jsondatasets.items():
        document = dataset._load()
        output_filepath = outputfolderpath / Path(dataset_filenamestem).with_suffix(".json")
        # despite holding JSON, the target dataset mirrors the input stems
        output_jsondataset = JSONDataSet(str(output_filepath))
        # json dict update with xmlcontent — a missing stem is a hard error
        if dataset_filenamestem not in xmldatasets:
            raise KeyError(f"xmlcontent datasets does not have the key : {dataset_filenamestem}")
        document['xmlcontent'] = xmldatasets[dataset_filenamestem]._load()
        # let's create subfolders, if they don't exist
        output_filepath.parent.mkdir(parents=True, exist_ok=True)
        # save on file
        output_jsondataset._save(document)
        output_datasets.datasets[dataset_filenamestem] = output_jsondataset
    return output_datasets

@ -1,8 +1,8 @@
from kedro.pipeline import Pipeline, node, pipeline
from .nodes import (parse_xml_collection, make_json_collection)
# add_xmlcontent_tojson)
from .nodes import (parse_xml_collection, make_json_collection,
add_xmlcontent_tojson)
def create_pipeline(**kwargs) -> Pipeline:
@ -20,17 +20,17 @@ def create_pipeline(**kwargs) -> Pipeline:
outputs="bourbon_jsonoutput",
name="bourbon_json_ds_collection",
),
# node(
# func=add_xmlcontent_tojson,
# inputs=["bourbon_json", "bourbon_xmlcontent"],
# outputs="bourbon_fulljson",
# name="bourbon_fulljson_ds_collection",
# ),
node(
func=add_xmlcontent_tojson,
inputs=["bourbon_jsonoutput", "bourbon_pseudoxmlcontent"],
outputs="bourbon_fulljsonoutput",
name="bourbon_fulljson_ds_collection",
),
# node(
# func=parse_xml_collection,
# inputs="berry",
# outputs=None, #"berry_xmlcontent",
# outputs="berry_xmlcontent",
# name="berry_ds_collection",
# ),
# node(

@ -192,26 +192,36 @@ class JSONDataSetCollection(DataSetCollection):
filepath=str(filepath))
return self
#class TextDataSet:
# """loads/saves data from/to a text file using an underlying filesystem
# example usage
# >>> string_to_write = "This will go in a file."
# >>>
# >>> data_set = TextDataSet(filepath="test.md")
# >>> data_set.save(string_to_write)
# >>> reloaded = data_set.load()
# >>> assert string_to_write == reloaded
# """
# def __init__(self, filepath: str):
# self._filepath = filepath
#
# def _load(self) -> str:
# with open(self._filepath, 'r') as fhandle:
# return fhandle.read()
# def _save(self, data: str) -> None:
# with open(self._filepath, 'w') as fhandle:
# fhandle.write(data)
# def _describe(self) -> Dict[str, Any]:
# return dict(filepath=self._filepath)
class TextDataSet:
    """Loads/saves data from/to a text file on the local filesystem.

    example usage

    >>> data_set = TextDataSet(filepath="test.md")
    >>> data_set._save("This will go in a file.")
    >>> data_set._load()
    'This will go in a file.'
    """

    def __init__(self, filepath: str):
        # path of the backing text file
        self._filepath = filepath

    def _load(self) -> str:
        "kedro's API loader method: return the whole file content as one string"
        # explicit utf-8: don't depend on the platform's default encoding
        with open(self._filepath, 'r', encoding='utf-8') as fhandle:
            return fhandle.read()

    def _save(self, data: str) -> None:
        "kedro's API saver method: overwrite the file with `data`"
        with open(self._filepath, 'w', encoding='utf-8') as fhandle:
            fhandle.write(data)

    def _describe(self) -> Dict[str, Any]:
        "kedro's API description hook"
        return dict(filepath=self._filepath)
class TextDataSetCollection(DataSetCollection):
    """Collection of TextDataSet instances, one per *.pseudoxml file in a folder."""

    def _load(self) -> "TextDataSetCollection":
        """kedro's API loader method.

        Scans the collection folder for *.pseudoxml files (sorted, for a
        deterministic order) and registers one lazy TextDataSet per file,
        keyed by filename stem. Returns the populated collection itself
        (the original annotation `dict[str, JSONDataSet]` was wrong).
        """
        self.datasets = dict()
        for filepath in sorted(self._folderpath.glob("*.pseudoxml")):
            # key by stem so json and pseudo-xml collections share keys
            self.datasets[filepath.stem] = TextDataSet(
                filepath=str(filepath))
        return self

Loading…
Cancel
Save