add json output

develop
gwen 3 years ago
parent 021dcab8fb
commit c5238793c4

@ -11,7 +11,12 @@ bourbon_xmlcontent:
folderpath: data/02_intermediate/houses/bourbon/xml folderpath: data/02_intermediate/houses/bourbon/xml
bourbon_json: bourbon_json:
type: actesdataset.XMLDataSetCollection type: actesdataset.JSONDataSetCollection
housename: bourbon
folderpath: data/01_raw/houses/bourbon
bourbon_jsonoutput:
type: actesdataset.JSONDataSetCollection
housename: bourbon housename: bourbon
folderpath: data/02_intermediate/houses/bourbon/json folderpath: data/02_intermediate/houses/bourbon/json

@ -4,7 +4,8 @@ from typing import Dict
from kedro.framework.session import KedroSession from kedro.framework.session import KedroSession
from actesdataset import EtreeXMLDataSet, XMLDataSetCollection from actesdataset import EtreeXMLDataSet, BsXMLDataSet
from actesdataset import XMLDataSetCollection, JSONDataSetCollection
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -24,7 +25,6 @@ def parse_xml_collection(datasetcol: XMLDataSetCollection) -> Dict[str, EtreeXML
# a manual load is required here, because # a manual load is required here, because
# the dataset **is not** registered in kedro's catalog # the dataset **is not** registered in kedro's catalog
dataset._load() dataset._load()
descr = dataset._describe()
output_source_doc = dataset.transform() output_source_doc = dataset.transform()
# set dataset's output filepath # set dataset's output filepath
output_filepath = outputfolderpath / Path(dataset_filenamestem).with_suffix(".pseudoxml") output_filepath = outputfolderpath / Path(dataset_filenamestem).with_suffix(".pseudoxml")
@ -37,3 +37,28 @@ def parse_xml_collection(datasetcol: XMLDataSetCollection) -> Dict[str, EtreeXML
output_datasets[dataset_filenamestem] = output_xmldataset output_datasets[dataset_filenamestem] = output_xmldataset
return output_datasets return output_datasets
def parse_json_collection(datasetcol: JSONDataSetCollection) -> Dict[str, BsXMLDataSet]:
    """Node function entry point, performs batch processing.

    For every dataset in the collection: manually load it, transform it,
    and save the result as a ``.json`` file under the house's configured
    output folder.

    Args:
        datasetcol: the loaded collection; exposes ``datasets`` (a mapping
            of filename stem -> dataset) and the house name.

    Returns:
        Mapping of filename stem -> the saved output dataset.
    """
    datasets = datasetcol.datasets
    housename = datasetcol._housename
    # NOTE(review): `catalog` is not defined in this module's visible scope —
    # presumably injected/imported elsewhere; confirm it resolves at runtime.
    output_catalog = catalog[housename + '_jsonoutput']
    outputfolderpath = output_catalog['folderpath']
    output_datasets = {}
    for dataset_filenamestem, dataset in datasets.items():
        # lazy %-args: the argument is only rendered when INFO is enabled
        logger.info("-------------- parse_json ------------%s", type(dataset))
        # a manual load is required here, because
        # the dataset **is not** registered in kedro's catalog
        dataset._load()
        output_source_doc = dataset.transform()
        # set dataset's output filepath
        output_filepath = outputfolderpath / Path(dataset_filenamestem).with_suffix(".json")
        output_dataset = BsXMLDataSet(str(output_filepath))
        # let's create subfolders, if they don't exist
        output_filepath.parent.mkdir(parents=True, exist_ok=True)
        # save on file (BsXMLDataSet._save serializes `data` as JSON)
        output_dataset._save(output_source_doc)
        output_datasets[dataset_filenamestem] = output_dataset
    return output_datasets

@ -1,7 +1,7 @@
from kedro.pipeline import Pipeline, node, pipeline from kedro.pipeline import Pipeline, node, pipeline
from .nodes import parse_xml_collection from .nodes import parse_xml_collection, parse_json_collection
def create_pipeline(**kwargs) -> Pipeline: def create_pipeline(**kwargs) -> Pipeline:
@ -13,12 +13,12 @@ def create_pipeline(**kwargs) -> Pipeline:
outputs="bourbon_xmlcontent", outputs="bourbon_xmlcontent",
name="bourbon_ds_collection", name="bourbon_ds_collection",
), ),
# node( node(
# func=parse_json_collection, func=parse_json_collection,
# inputs="bourbon", inputs="bourbon_json",
# outputs="bourbon_json", outputs="bourbon_jsonoutput",
# name="bourbon_json_ds_collection", name="bourbon_json_ds_collection",
# ), ),
# node( # node(
# func=parse_xml_collection, # func=parse_xml_collection,
# inputs="berry", # inputs="berry",

@ -68,17 +68,18 @@ class BsXMLDataSet(XMLDataSet):
def _load(self): def _load(self):
"from the xml file, loads a internal xml repr (with bsoup)" "from the xml file, loads a internal xml repr (with bsoup)"
logger.info("------------------------- bsoup loader -------")
with open(self._filepath, 'r', encoding="utf-8") as fhandle: with open(self._filepath, 'r', encoding="utf-8") as fhandle:
self.soup = BeautifulSoup(fhandle, 'xml') self.soup = BeautifulSoup(fhandle, 'xml')
## xml.prettify() is the bsoup str(source_doc) ## xml.prettify() is the bsoup str(source_doc)
# FIXME def _save(self, data: Dict) -> None:
# def _save(self, data: Dict) -> None: "kedro's API-like saver"
# "kedro's API-like saver" with open(self._filepath, 'w') as fp:
# with open(self._filepath, 'w') as fp: json.dump(data, fp, sort_keys=True, indent=4)
# json.dump(data, fp, sort_keys=True, indent=4)
def transform(self): def transform(self):
logger.info("---------------- transform --------------")
#soup = make_soup(os.path.join(folder, acte)) #soup = make_soup(os.path.join(folder, acte))
# 1.1/ Get all data from XML (9). counter is the id (= numb_acte) # 1.1/ Get all data from XML (9). counter is the id (= numb_acte)
numb = self.soup.TEI["xml:id"] # /TEI[@xml:id] is always the acte's ID numb = self.soup.TEI["xml:id"] # /TEI[@xml:id] is always the acte's ID
@ -114,9 +115,8 @@ class BsXMLDataSet(XMLDataSet):
# "diplo_type_acte": diplo_query[0] # "diplo_type_acte": diplo_query[0]
} }
class DataSetCollection(AbstractDataSet):
class XMLDataSetCollection(AbstractDataSet): """Stores instances of ``DataSetCollection``
"""Stores instances of ``XMLDataSet``
implementations to provide ``_load`` and ``_save`` capabilities. implementations to provide ``_load`` and ``_save`` capabilities.
""" """
def __init__(self, def __init__(self,
@ -125,15 +125,6 @@ class XMLDataSetCollection(AbstractDataSet):
self._housename = housename self._housename = housename
self._folderpath = Path(folderpath) self._folderpath = Path(folderpath)
def _load(self) -> dict[str, EtreeXMLDataSet]:
"kedro's API loader method"
self.datasets = dict()
for filepath in sorted(self._folderpath.glob("*.xml")):
self.datasets[filepath.stem] = EtreeXMLDataSet(
filepath=str(filepath))
# return self.datasets
return self
def _save(self, data) -> None: def _save(self, data) -> None:
"""kedro's API saver method """kedro's API saver method
@ -145,7 +136,27 @@ class XMLDataSetCollection(AbstractDataSet):
def _describe(self) -> dict[str, Any]: def _describe(self) -> dict[str, Any]:
"kedro's API repr()" "kedro's API repr()"
return dict(name=self._housename, folderpath=self._folderpath) return dict(name=self._housename,
folderpath=str(self._folderpath))
class XMLDataSetCollection(DataSetCollection):
    """Collection whose ``_load`` discovers ``*.xml`` files under the
    configured folder and wraps each one in an ``EtreeXMLDataSet``."""

    def _load(self) -> "XMLDataSetCollection":
        """kedro's API loader method.

        Populates ``self.datasets`` (filename stem -> ``EtreeXMLDataSet``)
        and returns the collection itself — the original annotation claimed
        a ``dict`` return, but callers receive ``self`` and read
        ``.datasets`` from it.
        """
        self.datasets = {
            filepath.stem: EtreeXMLDataSet(filepath=str(filepath))
            for filepath in sorted(self._folderpath.glob("*.xml"))
        }
        return self
class JSONDataSetCollection(DataSetCollection):
    """Collection whose ``_load`` discovers source files and wraps each in
    a ``BsXMLDataSet``.

    NOTE(review): this globs ``*.xml`` on purpose — the collection reads
    raw XML sources which are later transformed and saved as JSON; confirm
    this is the intended behavior.
    """

    def _load(self) -> "JSONDataSetCollection":
        """kedro's API loader method.

        Populates ``self.datasets`` (filename stem -> ``BsXMLDataSet``)
        and returns the collection itself — the original annotation claimed
        a ``dict`` return, but the method returns ``self``.
        """
        self.datasets = {
            filepath.stem: BsXMLDataSet(filepath=str(filepath))
            for filepath in sorted(self._folderpath.glob("*.xml"))
        }
        return self
#class TextDataSet: #class TextDataSet:
# """loads/saves data from/to a text file using an underlying filesystem # """loads/saves data from/to a text file using an underlying filesystem

Loading…
Cancel
Save