diff --git a/actes-princiers/conf/base/catalog.yml b/actes-princiers/conf/base/catalog.yml index 8d305a3..2692997 100644 --- a/actes-princiers/conf/base/catalog.yml +++ b/actes-princiers/conf/base/catalog.yml @@ -11,7 +11,12 @@ bourbon_xmlcontent: folderpath: data/02_intermediate/houses/bourbon/xml bourbon_json: - type: actesdataset.XMLDataSetCollection + type: actesdataset.JSONDataSetCollection + housename: bourbon + folderpath: data/01_raw/houses/bourbon + +bourbon_jsonoutput: + type: actesdataset.JSONDataSetCollection housename: bourbon folderpath: data/02_intermediate/houses/bourbon/json diff --git a/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py b/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py index a2fa204..e4841a2 100755 --- a/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py +++ b/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py @@ -4,7 +4,8 @@ from typing import Dict from kedro.framework.session import KedroSession -from actesdataset import EtreeXMLDataSet, XMLDataSetCollection +from actesdataset import EtreeXMLDataSet, BsXMLDataSet +from actesdataset import XMLDataSetCollection, JSONDataSetCollection logger = logging.getLogger(__name__) @@ -24,7 +25,6 @@ def parse_xml_collection(datasetcol: XMLDataSetCollection) -> Dict[str, EtreeXML # a manual load is required here, because # the dataset **is not** registered in kedro's catalog dataset._load() - descr = dataset._describe() output_source_doc = dataset.transform() # set dataset's output filepath output_filepath = outputfolderpath / Path(dataset_filenamestem).with_suffix(".pseudoxml") @@ -37,3 +37,28 @@ def parse_xml_collection(datasetcol: XMLDataSetCollection) -> Dict[str, EtreeXML output_datasets[dataset_filenamestem] = output_xmldataset return output_datasets + +def parse_json_collection(datasetcol: JSONDataSetCollection) -> Dict[str, BsXMLDataSet]: + "node function entry point, performs batch processing" + datasets = 
datasetcol.datasets + housename = datasetcol._housename + output_catalog = catalog[housename + '_jsonoutput'] + outputfolderpath = output_catalog['folderpath'] + output_datasets = dict() + for dataset_filenamestem, dataset in datasets.items(): + logger.info("-------------- parse_json ------------" + str(type(dataset))) + # a manual load is required here, because + # the dataset **is not** registered in kedro's catalog + dataset._load() + output_source_doc = dataset.transform() + # set dataset's output filepath + output_filepath = outputfolderpath / Path(dataset_filenamestem).with_suffix(".json") + output_xmldataset = BsXMLDataSet(str(output_filepath)) + # let's create subfolders, if they don't exist + output_xmldataset_dir = output_filepath.parent + output_xmldataset_dir.mkdir(parents=True, exist_ok=True) + # save on file + output_xmldataset._save(output_source_doc) + output_datasets[dataset_filenamestem] = output_xmldataset + return output_datasets + diff --git a/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py b/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py index 03827c5..ad3d63a 100755 --- a/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py +++ b/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py @@ -1,7 +1,7 @@ from kedro.pipeline import Pipeline, node, pipeline -from .nodes import parse_xml_collection +from .nodes import parse_xml_collection, parse_json_collection def create_pipeline(**kwargs) -> Pipeline: @@ -13,12 +13,12 @@ def create_pipeline(**kwargs) -> Pipeline: outputs="bourbon_xmlcontent", name="bourbon_ds_collection", ), -# node( -# func=parse_json_collection, -# inputs="bourbon", -# outputs="bourbon_json", -# name="bourbon_json_ds_collection", -# ), + node( + func=parse_json_collection, + inputs="bourbon_json", + outputs="bourbon_jsonoutput", + name="bourbon_json_ds_collection", + ), # node( # func=parse_xml_collection, # inputs="berry", diff --git 
a/actes-princiers/src/actesdataset.py b/actes-princiers/src/actesdataset.py index ce73e00..3b97c3f 100644 --- a/actes-princiers/src/actesdataset.py +++ b/actes-princiers/src/actesdataset.py @@ -68,17 +68,18 @@ class BsXMLDataSet(XMLDataSet): def _load(self): "from the xml file, loads a internal xml repr (with bsoup)" + logger.info("------------------------- bsoup loader -------") with open(self._filepath, 'r', encoding="utf-8") as fhandle: self.soup = BeautifulSoup(fhandle, 'xml') ## xml.prettify() is the bsoup str(source_doc) -# FIXME -# def _save(self, data: Dict) -> None: -# "kedro's API-like saver" -# with open(self._filepath, 'w') as fp: -# json.dump(data, fp, sort_keys=True, indent=4) + def _save(self, data: Dict) -> None: + "kedro's API-like saver" + with open(self._filepath, 'w') as fp: + json.dump(data, fp, sort_keys=True, indent=4) def transform(self): + logger.info("---------------- transform --------------") #soup = make_soup(os.path.join(folder, acte)) # 1.1/ Get all data from XML (9). counter is the id (= numb_acte) numb = self.soup.TEI["xml:id"] # /TEI[@xml:id] is always the acte's ID @@ -114,9 +115,8 @@ class BsXMLDataSet(XMLDataSet): # "diplo_type_acte": diplo_query[0] } - -class XMLDataSetCollection(AbstractDataSet): - """Stores instances of ``XMLDataSet`` +class DataSetCollection(AbstractDataSet): + """Stores instances of ``AbstractDataSet`` implementations to provide ``_load`` and ``_save`` capabilities. 
""" def __init__(self, @@ -125,15 +125,6 @@ class XMLDataSetCollection(AbstractDataSet): self._housename = housename self._folderpath = Path(folderpath) - def _load(self) -> dict[str, EtreeXMLDataSet]: - "kedro's API loader method" - self.datasets = dict() - for filepath in sorted(self._folderpath.glob("*.xml")): - self.datasets[filepath.stem] = EtreeXMLDataSet( - filepath=str(filepath)) -# return self.datasets - return self - def _save(self, data) -> None: """kedro's API saver method @@ -145,7 +136,27 @@ class XMLDataSetCollection(AbstractDataSet): def _describe(self) -> dict[str, Any]: "kedro's API repr()" - return dict(name=self._housename, folderpath=self._folderpath) + return dict(name=self._housename, + folderpath=str(self._folderpath)) + +class XMLDataSetCollection(DataSetCollection): + def _load(self) -> dict[str, EtreeXMLDataSet]: + "kedro's API loader method" + self.datasets = dict() + for filepath in sorted(self._folderpath.glob("*.xml")): + self.datasets[filepath.stem] = EtreeXMLDataSet( + filepath=str(filepath)) + return self + + +class JSONDataSetCollection(DataSetCollection): + def _load(self) -> dict[str, BsXMLDataSet]: + "kedro's API loader method" + self.datasets = dict() + for filepath in sorted(self._folderpath.glob("*.xml")): + self.datasets[filepath.stem] = BsXMLDataSet( + filepath=str(filepath)) + return self #class TextDataSet: # """loads/saves data from/to a text file using an underlying filesystem