full json

develop
gwen 3 years ago
parent 20cec1e2bd
commit b67739dce8

@ -6,24 +6,38 @@ bourbon:
  housename: bourbon
  folderpath: data/01_raw/houses/bourbon

# output (write) **pseudo xml** dataset
bourbon_xmlcontent:
  type: actesdataset.XMLDataSetCollection
  housename: bourbon
  folderpath: data/02_intermediate/houses/bourbon/xml

# input (read) **pseudo xml** dataset
bourbon_pseudoxmlcontent:
  type: actesdataset.TextDataSetCollection
  housename: bourbon
  folderpath: data/02_intermediate/houses/bourbon/xml

# input (read only) dataset
bourbon_json:
  type: actesdataset.BsXMLDataSetCollection
  housename: bourbon
  folderpath: data/01_raw/houses/bourbon

# input (read) and output (write) dataset
bourbon_jsonoutput:
  type: actesdataset.JSONDataSetCollection
  housename: bourbon
  folderpath: data/02_intermediate/houses/bourbon/json

# output (write) dataset
bourbon_fulljsonoutput:
  type: actesdataset.JSONDataSetCollection
  housename: bourbon
  folderpath: data/02_intermediate/houses/bourbon/fulljson
## ________________________________________________________________________ ## ________________________________________________________________________
#berry: #berry:

@ -22,11 +22,11 @@ class ProjectContext(KedroContext):
houses = self.config_loader.get("houses*") houses = self.config_loader.get("houses*")
return houses['raw_datapath'] return houses['raw_datapath']
def get_catalog(self): # def get_catalog(self):
"catalog loader entry point" # "catalog loader entry point"
# loading yaml defined catalogs # # loading yaml defined catalogs
catalog = self.config_loader.get('catalog*') # catalog = self.config_loader.get('catalog*')
return catalog # return catalog
# def _get_catalog(self, *args, **kwargs): # def _get_catalog(self, *args, **kwargs):
# "catalog loader entry point" # "catalog loader entry point"

@ -6,7 +6,7 @@ from kedro.framework.session import KedroSession
from actesdataset import EtreeXMLDataSet, BsXMLDataSet, JSONDataSet from actesdataset import EtreeXMLDataSet, BsXMLDataSet, JSONDataSet
from actesdataset import (XMLDataSetCollection, BsXMLDataSetCollection, from actesdataset import (XMLDataSetCollection, BsXMLDataSetCollection,
JSONDataSetCollection) JSONDataSetCollection, TextDataSetCollection)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -60,15 +60,29 @@ def make_json_collection(datasetcol: BsXMLDataSetCollection) -> JSONDataSetColle
output_datasets.datasets[dataset_filenamestem] = output_xmldataset output_datasets.datasets[dataset_filenamestem] = output_xmldataset
return output_datasets return output_datasets
def add_xmlcontent_tojson(jsondoc: JSONDataSetCollection, xmlcontent: TextDataSetCollection) -> JSONDataSetCollection:
    """Merge the pseudo-XML text content into each JSON document of a collection.

    For every dataset in *jsondoc*, loads its JSON document, injects the
    matching pseudo-XML text under the 'xmlcontent' key, and saves the result
    as a new JSONDataSet under the house's full-JSON output folder.

    Args:
        jsondoc: collection of JSON datasets keyed by filename stem.
        xmlcontent: collection of pseudo-XML text datasets keyed by the
            same filename stems.

    Returns:
        The '<house>_fulljsonoutput' collection populated with the written
        JSONDataSet entries.

    Raises:
        KeyError: when a JSON dataset has no matching pseudo-XML dataset.
    """
    "adds xmlcontent to the json"
    jsondatasets = jsondoc.datasets
    housename = jsondoc._housename
    # NOTE(review): relies on a module-global `context` (KedroSession context)
    # to resolve the output catalog entry — confirm it is defined at module level.
    output_datasets = context.catalog.load(housename + '_fulljsonoutput')
    outputfolderpath = output_datasets._folderpath
    xmldatasets = xmlcontent.datasets
    for dataset_filenamestem, dataset in jsondatasets.items():
        document = dataset._load()
        output_filepath = outputfolderpath / Path(dataset_filenamestem).with_suffix(".json")
        output_xmldataset = JSONDataSet(str(output_filepath))
        # json dict update with xmlcontent; fail loudly on a missing stem so a
        # silent partial merge never reaches the output folder
        if dataset_filenamestem in xmldatasets:
            document['xmlcontent'] = xmldatasets[dataset_filenamestem]._load()
        else:
            raise KeyError(f"xmlcontent datasets does not have the key : {dataset_filenamestem}")
        # let's create subfolders, if they don't exist
        output_filepath.parent.mkdir(parents=True, exist_ok=True)
        # save on file
        output_xmldataset._save(document)
        output_datasets.datasets[dataset_filenamestem] = output_xmldataset
    return output_datasets

@ -1,8 +1,8 @@
from kedro.pipeline import Pipeline, node, pipeline from kedro.pipeline import Pipeline, node, pipeline
from .nodes import (parse_xml_collection, make_json_collection) from .nodes import (parse_xml_collection, make_json_collection,
# add_xmlcontent_tojson) add_xmlcontent_tojson)
def create_pipeline(**kwargs) -> Pipeline: def create_pipeline(**kwargs) -> Pipeline:
@ -20,17 +20,17 @@ def create_pipeline(**kwargs) -> Pipeline:
outputs="bourbon_jsonoutput", outputs="bourbon_jsonoutput",
name="bourbon_json_ds_collection", name="bourbon_json_ds_collection",
), ),
# node( node(
# func=add_xmlcontent_tojson, func=add_xmlcontent_tojson,
# inputs=["bourbon_json", "bourbon_xmlcontent"], inputs=["bourbon_jsonoutput", "bourbon_pseudoxmlcontent"],
# outputs="bourbon_fulljson", outputs="bourbon_fulljsonoutput",
# name="bourbon_fulljson_ds_collection", name="bourbon_fulljson_ds_collection",
# ), ),
# node( # node(
# func=parse_xml_collection, # func=parse_xml_collection,
# inputs="berry", # inputs="berry",
# outputs=None, #"berry_xmlcontent", # outputs="berry_xmlcontent",
# name="berry_ds_collection", # name="berry_ds_collection",
# ), # ),
# node( # node(

@ -192,26 +192,36 @@ class JSONDataSetCollection(DataSetCollection):
filepath=str(filepath)) filepath=str(filepath))
return self return self
class TextDataSet:
    """Loads/saves data from/to a text file using an underlying filesystem.

    Minimal dataset following kedro's ``_load``/``_save``/``_describe``
    protocol (the public ``load``/``save`` wrappers are provided by kedro's
    AbstractDataSet when registered — here the underscored methods are the
    actual API).

    Example usage:

    >>> string_to_write = "This will go in a file."
    >>>
    >>> data_set = TextDataSet(filepath="test.md")
    >>> data_set._save(string_to_write)
    >>> reloaded = data_set._load()
    >>> assert string_to_write == reloaded
    """

    def __init__(self, filepath: str):
        # plain path string; resolution is left to open()
        self._filepath = filepath

    def _load(self) -> str:
        """Read and return the whole file as text."""
        with open(self._filepath, 'r') as fhandle:
            return fhandle.read()

    def _save(self, data: str) -> None:
        """Overwrite the file with *data*."""
        with open(self._filepath, 'w') as fhandle:
            fhandle.write(data)

    def _describe(self) -> Dict[str, Any]:
        """Return the dataset's identifying attributes (kedro convention)."""
        return dict(filepath=self._filepath)
class TextDataSetCollection(DataSetCollection):
    """Collection of TextDataSet entries discovered in a folder of *.pseudoxml files."""

    def _load(self) -> "TextDataSetCollection":
        """kedro's API loader method.

        Scans ``self._folderpath`` for ``*.pseudoxml`` files (sorted for a
        deterministic order) and registers one TextDataSet per file, keyed by
        the filename stem.

        Returns:
            self, with ``self.datasets`` populated.
        """
        # NOTE(review): original annotation said ``dict[str, JSONDataSet]`` but
        # the method returns the collection itself and stores TextDataSet values.
        self.datasets = dict()
        for filepath in sorted(self._folderpath.glob("*.pseudoxml")):
            self.datasets[filepath.stem] = TextDataSet(
                filepath=str(filepath))
        return self

Loading…
Cancel
Save