From cc19bd09696814742c582324ea4d3d9367592428 Mon Sep 17 00:00:00 2001 From: gwen Date: Mon, 3 Jul 2023 15:22:15 +0200 Subject: [PATCH] docstrings and typing --- actes-princiers/conf/base/catalog.yml | 25 +++++++++++++++++-- .../pipelines/xml_processing/nodes.py | 10 ++++---- .../pipelines/xml_processing/pipeline.py | 13 ++++++++++ actes-princiers/src/actesdataset.py | 14 +++++++++-- 4 files changed, 53 insertions(+), 9 deletions(-) diff --git a/actes-princiers/conf/base/catalog.yml b/actes-princiers/conf/base/catalog.yml index 1b3b718..709b692 100644 --- a/actes-princiers/conf/base/catalog.yml +++ b/actes-princiers/conf/base/catalog.yml @@ -1,15 +1,36 @@ # ________________________________________________________________________ -# reading raw bourbon dataset + bourbon: type: actesdataset.XMLDataSetCollection housename: bourbon folderpath: data/01_raw/houses/bourbon -# writing bourbon xmlcontent document attribute bourbon_content: type: actesdataset.XMLDataSetCollection housename: bourbon folderpath: data/02_intermediate/houses/bourbon + # ________________________________________________________________________ +berry: + type: actesdataset.XMLDataSetCollection + housename: berry + folderpath: data/01_raw/houses/berry + +berry_content: + type: actesdataset.XMLDataSetCollection + housename: berry + folderpath: data/02_intermediate/houses/berry + +# ________________________________________________________________________ + +anjou: + type: actesdataset.XMLDataSetCollection + housename: anjou + folderpath: data/01_raw/houses/anjou + +anjou_content: + type: actesdataset.XMLDataSetCollection + housename: anjou + folderpath: data/02_intermediate/houses/anjou diff --git a/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py b/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py index dd450db..2cb74b2 100755 --- a/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py +++ 
b/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py @@ -10,26 +10,26 @@ logger = logging.getLogger(__name__) def transform(source_doc: etree._ElementTree, xlststylesheet: str) -> str: + "performs XML transformation on each dataset" xslt_doc = etree.parse(xlststylesheet) xslt_transformer = etree.XSLT(xslt_doc) return str(xslt_transformer(source_doc)) def parse_xml_collection(datasets: Dict[str, XMLDataSet], param: str) -> Dict[str, XMLDataSet]: + "node function entry point, performs batch processing" output_datasets = dict() for dataset_filenamestem, dataset in datasets.items(): - # manually loading the dataset because the collection **is not** - # registered in the catalog + # a manual load is required here, because + # the dataset **is not** registered in kedro's catalog dataset._load() - # transformation on each dataset output_source_doc = transform(dataset.get_source_doc(), param) # set dataset's output filepath output_filepath = dataset.get_filepath().replace("01_raw", "02_intermediate") output_xmldataset = XMLDataSet(output_filepath) output_xmldataset.set_source_doc(output_source_doc) output_datasets[dataset_filenamestem] = output_xmldataset - # 02_intermediate : - # let's create subfolders if they don't exist + # let's create subfolders now, if they don't exist output_filepath = Path(output_filepath) output_xmldataset_dir = output_filepath.parent output_xmldataset_dir.mkdir(parents=True, exist_ok=True) diff --git a/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py b/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py index df06283..9fa811f 100755 --- a/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py +++ b/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py @@ -12,6 +12,19 @@ def create_pipeline(**kwargs) -> Pipeline: outputs="bourbon_content", name="bourbon_ds_collection", ), + node( + func=parse_xml_collection, + inputs=["berry", 
"params:xsltstylesheet"], + outputs="berry_content", + name="berry_ds_collection", + ), + node( + func=parse_xml_collection, + inputs=["anjou", "params:xsltstylesheet"], + outputs="anjou_content", + name="anjou_ds_collection", + ), + ] ) diff --git a/actes-princiers/src/actesdataset.py b/actes-princiers/src/actesdataset.py index d1df114..62e8c0c 100644 --- a/actes-princiers/src/actesdataset.py +++ b/actes-princiers/src/actesdataset.py @@ -1,12 +1,11 @@ +import logging import json from typing import Dict, Any from pathlib import Path -import logging from lxml import etree from kedro.io import AbstractDataSet, DataSetError - from kedro.framework.session import KedroSession logger = logging.getLogger(__name__) @@ -18,9 +17,11 @@ class XMLDataSet: self._filepath = filepath def get_filepath(self) -> str: + "xml file's long filename getters" return self._filepath def get_source_doc(self) -> str: + "XML source_doc (xml as a string) getter" if hasattr(self, 'source_doc'): return self.source_doc else: @@ -28,9 +29,11 @@ class XMLDataSet: raise AttributeError(f"XMLDataSet bject {attr_error_msg} has no attribute named : 'source_doc'") def set_source_doc(self, source_doc: str) -> None: + "XML source_doc (xml as a string) setter" self.source_doc = source_doc def _transform_source_doc(self) -> etree._ElementTree: + "xml transformer (with element tree)" # removing namespace query = "descendant-or-self::*[namespace-uri()!='']" for element in self.source_doc.xpath(query): @@ -40,15 +43,18 @@ class XMLDataSet: return self.source_doc def _load(self) -> etree._ElementTree: + "kedro's API-like loader" self.source_doc = etree.parse(self._filepath) self._transform_source_doc() return self.source_doc def _save(self, data:str) -> None: + "kedro's API-like saver" with open(self._filepath, 'w') as fhandle: fhandle.write(data) def _describe(self) -> Dict[str, Any]: + "kedro's API-like repr()" return dict(filepath=self._filepath) @@ -63,6 +69,7 @@ class XMLDataSetCollection(AbstractDataSet): 
self._folderpath = Path(folderpath) def get_datasets(self) -> Dict[str, Any]: + "datasets mapper getter" if hasattr(self, 'datasets'): return self.datasets else: @@ -70,6 +77,7 @@ class XMLDataSetCollection(AbstractDataSet): raise AttributeError(f"Object {attr_error_msg} has no attribute named : 'datasets'") def _load(self) -> dict[str, XMLDataSet]: + "kedro's API loader" self.datasets = dict() for filepath in sorted(self._folderpath.glob("*.xml")): self.datasets[filepath.stem] = XMLDataSet( @@ -77,10 +85,12 @@ class XMLDataSetCollection(AbstractDataSet): return self.datasets def _save(self, datasets: dict[str, XMLDataSet]) -> None: + "kedro's API saver" for stemfilename, dataset in datasets.items(): dataset._save(dataset.get_source_doc()) def _describe(self) -> dict[str, Any]: + "kedro's API repr()" return dict(name=self._housename, folderpath=self._folderpath)