set typing signature

develop
gwen 3 years ago
parent c47620101c
commit 5de2279d28

@ -1,5 +1,6 @@
import logging
from pathlib import Path
from typing import Dict
from lxml import etree
@ -8,27 +9,26 @@ from actesdataset import XMLDataSet
logger = logging.getLogger(__name__)
def transform(source_doc, xlststylesheet):
#<class 'lxml.etree._XSLTResultTree'>
def transform(source_doc: etree._ElementTree, xlststylesheet: str) -> str:
xslt_doc = etree.parse(xlststylesheet)
xslt_transformer = etree.XSLT(xslt_doc)
return str(xslt_transformer(source_doc))
def parse_xml_collection(datasets, param):
# FIXME set signature
# datasets -> dict
# param -> str
def parse_xml_collection(datasets: Dict[str, XMLDataSet], param: str) -> Dict[str, XMLDataSet]:
output_datasets = dict()
# datasets = bourbon.get_datasets()
for dataset_filenamestem, dataset in datasets.items():
# manually loading the dataset
# manually loading the dataset because the collection **is not**
# registered in the catalog
dataset._load()
# transformation on each dataset
output_source_doc = transform(dataset.get_source_doc(), param)
# set dataset's output filepath
output_filepath = dataset.get_filepath().replace("01_raw", "02_intermediate")
output_xmldataset = XMLDataSet(output_filepath)
output_xmldataset.set_source_doc(output_source_doc)
output_datasets[dataset_filenamestem] = output_xmldataset
# 02_intermediate :
# let's create subfolders if they don't exist
output_filepath = Path(output_filepath)
output_xmldataset_dir = output_filepath.parent

@ -1,5 +1,4 @@
from kedro.pipeline import Pipeline, node, pipeline
from kedro.framework.session import KedroSession
from .nodes import parse_xml_collection

@ -14,33 +14,32 @@ logger = logging.getLogger(__name__)
class XMLDataSet:
"lxml.etree._ElementTree loader"
# FIXME set the typing signature
def __init__(self, filepath: str):
def __init__(self, filepath: str) -> None:
self._filepath = filepath
def get_filepath(self):
def get_filepath(self) -> str:
return self._filepath
def get_source_doc(self):
def get_source_doc(self) -> str:
if hasattr(self, 'source_doc'):
return self.source_doc
else:
attr_error_msg = str(self._describe())
raise AttributeError(f"XMLDataSet bject {attr_error_msg} has no attribute named : 'source_doc'")
def set_source_doc(self, source_doc):
def set_source_doc(self, source_doc: str) -> None:
self.source_doc = source_doc
def _transform_source_doc(self):
# remove namespace :
def _transform_source_doc(self) -> etree._ElementTree:
# removing namespace
query = "descendant-or-self::*[namespace-uri()!='']"
for element in self.source_doc.xpath(query):
#replace element name with its local name
#replacing element name with its local name
element.tag = etree.QName(element).localname
etree.cleanup_namespaces(self.source_doc)
return self.source_doc
def _load(self):
def _load(self) -> etree._ElementTree:
self.source_doc = etree.parse(self._filepath)
self._transform_source_doc()
return self.source_doc
@ -55,105 +54,35 @@ class XMLDataSet:
class XMLDataSetCollection(AbstractDataSet):
"""Stores instances of ``XMLDataSet``
implementations to provide ``load`` and ``save`` capabilities.
anywhere in the program. To use a ``DataCatalog``, you need to
instantiate it with a file system folder path, it "reflects"
this file system of XML files.
It loads a dictionary of XML data sets.
Args:
data_sets: A dictionary of data set names and data set instances.
Example::
>>> from .actesdatasets import XMLDataSet, XMLCatalogReflector
>>>
>>> cars = XMLDataSet(filepath="cars.xml")
>>> io = XMLCatalogReflector(housename='bourbon', folderpath='/tmp/mydir', data_sets={'cars': cars})
# filepath, load_args=None, save_args=None):
implementations to provide ``_load`` and ``_save`` capabilities.
"""
# FIXME set the typing signature
def __init__(self,
housename: str,
folderpath: str):
folderpath: str) -> None:
self._housename = housename
self._folderpath = Path(folderpath)
def get_datasets(self):
def get_datasets(self) -> Dict[str, Any]:
if hasattr(self, 'datasets'):
return self.datasets
else:
attr_error_msg = str(self._describe())
raise AttributeError(f"Object {attr_error_msg} has no attribute named : 'datasets'")
# FIXME : set the signature
def _load(self):
":return: dict[str, XMLDataSet]"
def _load(self) -> dict[str, XMLDataSet]:
self.datasets = dict()
for filepath in sorted(self._folderpath.glob("*.xml")):
self.datasets[filepath.stem] = XMLDataSet(
filepath=str(filepath))
return self.datasets
# FIXME : set the signature
def _save(self, datasets):
# this should be a public save method rather than _save
def _save(self, datasets: dict[str, XMLDataSet]) -> None:
for stemfilename, dataset in datasets.items():
# FIXME XXX -> no need to call get_source_doc again here
dataset._save(dataset.get_source_doc())
def _describe(self):
def _describe(self) -> dict[str, Any]:
return dict(name=self._housename, folderpath=self._folderpath)
# def load(self, name: str) -> Any:
# """Loads a registered data set.
# Args:
# name: A data set to be loaded.
# version: Optional argument for concrete data version to be loaded.
# Works only with versioned datasets.
# Returns:
# The loaded data as configured.
# """
# return result
#
## def save(self, name: str, data: Any) -> None:
## """Save data to a registered data set.
## Args:
## name: A data set to be saved to.
## data: A data object to be saved as configured in the registered
## data set.
## Raises:
## DatasetNotFoundError: When a data set with the given name
## has not yet been registered.
## Example:
## ::
## >>> import pandas as pd
## >>>
## >>> from kedro.extras.datasets.pandas import CSVDataSet
## >>>
## >>> cars = CSVDataSet(filepath="cars.csv",
## >>> load_args=None,
## >>> save_args={"index": False})
## >>> io = DataCatalog(data_sets={'cars': cars})
## >>>
## >>> df = pd.DataFrame({'col1': [1, 2],
## >>> 'col2': [4, 5],
## >>> 'col3': [5, 6]})
## >>> io.save("cars", df)
## """
## dataset = self._get_dataset(name)
### self._print("Saving data to '%s' (%s)...", name, type(dataset).__name__)
## dataset.save(data)
# def _describe(self) -> Dict[str, Any]:
# return dict(filepath=self._housename)
#class JSONDataSet(AbstractDataSet):
# def __init__(self, filepath: str):

Loading…
Cancel
Save