diff --git a/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py b/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py index 48e43f5..dd450db 100755 --- a/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py +++ b/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py @@ -1,34 +1,34 @@ import logging from pathlib import Path +from typing import Dict -from lxml import etree +from lxml import etree from actesdataset import XMLDataSet logger = logging.getLogger(__name__) -def transform(source_doc, xlststylesheet): - # +def transform(source_doc: etree._ElementTree, xlststylesheet: str) -> str: xslt_doc = etree.parse(xlststylesheet) xslt_transformer = etree.XSLT(xslt_doc) return str(xslt_transformer(source_doc)) -def parse_xml_collection(datasets, param): - # FIXME set signature - # datasets -> dict - # param -> str +def parse_xml_collection(datasets: Dict[str, XMLDataSet], param: str) -> Dict[str, XMLDataSet]: output_datasets = dict() -# datasets = bourbon.get_datasets() for dataset_filenamestem, dataset in datasets.items(): - # manually loading the dataset + # manually loading the dataset because the collection **is not** + # registered in the catalog dataset._load() + # transformation on each dataset output_source_doc = transform(dataset.get_source_doc(), param) + # set dataset's output filepath output_filepath = dataset.get_filepath().replace("01_raw", "02_intermediate") output_xmldataset = XMLDataSet(output_filepath) output_xmldataset.set_source_doc(output_source_doc) output_datasets[dataset_filenamestem] = output_xmldataset + # 02_intermediate : # let's create subfolders if they don't exist output_filepath = Path(output_filepath) output_xmldataset_dir = output_filepath.parent diff --git a/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py b/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py index d23c6e9..df06283 100755 --- a/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py +++ b/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py @@ -1,5 +1,4 @@ from kedro.pipeline import Pipeline, node, pipeline -from kedro.framework.session import KedroSession from .nodes import parse_xml_collection diff --git a/actes-princiers/src/actesdataset.py b/actes-princiers/src/actesdataset.py index 1942de0..d1df114 100644 --- a/actes-princiers/src/actesdataset.py +++ b/actes-princiers/src/actesdataset.py @@ -14,33 +14,32 @@ logger = logging.getLogger(__name__) class XMLDataSet: "lxml.etree._ElementTree loader" - # FIXME set the typing signature - def __init__(self, filepath: str): + def __init__(self, filepath: str) -> None: self._filepath = filepath - def get_filepath(self): + def get_filepath(self) -> str: return self._filepath - def get_source_doc(self): + def get_source_doc(self) -> str: if hasattr(self, 'source_doc'): return self.source_doc else: attr_error_msg = str(self._describe()) raise AttributeError(f"XMLDataSet bject {attr_error_msg} has no attribute named : 'source_doc'") - def set_source_doc(self, source_doc): + def set_source_doc(self, source_doc: str) -> None: self.source_doc = source_doc - def _transform_source_doc(self): - # remove namespace : + def _transform_source_doc(self) -> etree._ElementTree: + # removing namespace query = "descendant-or-self::*[namespace-uri()!='']" for element in self.source_doc.xpath(query): - #replace element name with its local name + #replacing element name with its local name element.tag = etree.QName(element).localname etree.cleanup_namespaces(self.source_doc) return self.source_doc - def _load(self): + def _load(self) -> etree._ElementTree: self.source_doc = etree.parse(self._filepath) self._transform_source_doc() return self.source_doc @@ -55,104 +54,34 @@ class XMLDataSet: class XMLDataSetCollection(AbstractDataSet): """Stores instances of ``XMLDataSet`` - implementations to provide ``load`` and ``save`` capabilities. - anywhere in the program. To use a ``DataCatalog``, you need to - instantiate it with a file system folder path, it "reflects" - this file system of XML files. - It loads a dictionary of XML data sets. - - Args: - data_sets: A dictionary of data set names and data set instances. - - Example:: - - >>> from .actesdatasets import XMLDataSet, XMLCatalogReflector - >>> - >>> cars = XMLDataSet(filepath="cars.xml") - >>> io = XMLCatalogReflector(housename='bourbon', folderpath='/tmp/mydir', data_sets={'cars': cars}) -# filepath, load_args=None, save_args=None): + implementations to provide ``_load`` and ``_save`` capabilities. """ - # FIXME set the typing signature def __init__(self, housename: str, - folderpath: str): + folderpath: str) -> None: self._housename = housename self._folderpath = Path(folderpath) - def get_datasets(self): + def get_datasets(self) -> Dict[str, Any]: if hasattr(self, 'datasets'): return self.datasets else: attr_error_msg = str(self._describe()) raise AttributeError(f"Object {attr_error_msg} has no attribute named : 'datasets'") - # FIXME : set the signature - def _load(self): - ":return: dict[str, XMLDataSet]" + def _load(self) -> dict[str, XMLDataSet]: self.datasets = dict() for filepath in sorted(self._folderpath.glob("*.xml")): self.datasets[filepath.stem] = XMLDataSet( filepath=str(filepath)) return self.datasets - # FIXME : set the signature - def _save(self, datasets): - # faire une méthode save et pas _save + def _save(self, datasets: dict[str, XMLDataSet]) -> None: for stemfilename, dataset in datasets.items(): - # FIXME XXX -> pas besoin refaire un get_source_doc !!!!!! dataset._save(dataset.get_source_doc()) - def _describe(self): + def _describe(self) -> dict[str, Any]: return dict(name=self._housename, folderpath=self._folderpath) - -# def load(self, name: str) -> Any: -# """Loads a registered data set. - -# Args: -# name: A data set to be loaded. -# version: Optional argument for concrete data version to be loaded. -# Works only with versioned datasets. - -# Returns: -# The loaded data as configured. -# """ -# return result -# -## def save(self, name: str, data: Any) -> None: -## """Save data to a registered data set. - -## Args: -## name: A data set to be saved to. -## data: A data object to be saved as configured in the registered -## data set. - -## Raises: -## DatasetNotFoundError: When a data set with the given name -## has not yet been registered. - -## Example: -## :: - -## >>> import pandas as pd -## >>> -## >>> from kedro.extras.datasets.pandas import CSVDataSet -## >>> -## >>> cars = CSVDataSet(filepath="cars.csv", -## >>> load_args=None, -## >>> save_args={"index": False}) -## >>> io = DataCatalog(data_sets={'cars': cars}) -## >>> -## >>> df = pd.DataFrame({'col1': [1, 2], -## >>> 'col2': [4, 5], -## >>> 'col3': [5, 6]}) -## >>> io.save("cars", df) -## """ -## dataset = self._get_dataset(name) -### self._print("Saving data to '%s' (%s)...", name, type(dataset).__name__) -## dataset.save(data) - -# def _describe(self) -> Dict[str, Any]: -# return dict(filepath=self._housename) #class JSONDataSet(AbstractDataSet):