set typing signature

develop
gwen 3 years ago
parent c47620101c
commit 5de2279d28

@ -1,34 +1,34 @@
import logging import logging
from pathlib import Path from pathlib import Path
from typing import Dict
from lxml import etree from lxml import etree
from actesdataset import XMLDataSet from actesdataset import XMLDataSet
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def transform(source_doc: etree._ElementTree, xlststylesheet: str) -> str:
    """Apply an XSLT stylesheet to an XML document and return the result as text.

    Args:
        source_doc: The lxml document to transform.
        xlststylesheet: Location of the XSLT stylesheet, handed to
            ``etree.parse``.

    Returns:
        The transformation result converted with ``str()``.
    """
    # Compile the stylesheet into a callable transformer first ...
    apply_stylesheet = etree.XSLT(etree.parse(xlststylesheet))
    # ... then run it; the result object is serialised by ``str()``.
    result_tree = apply_stylesheet(source_doc)
    return str(result_tree)
def parse_xml_collection(datasets, param): def parse_xml_collection(datasets: Dict[str, XMLDataSet], param: str) -> Dict[str, XMLDataSet]:
# FIXME set signature
# datasets -> dict
# param -> str
output_datasets = dict() output_datasets = dict()
# datasets = bourbon.get_datasets()
for dataset_filenamestem, dataset in datasets.items(): for dataset_filenamestem, dataset in datasets.items():
# manually loading the dataset # manually loading the dataset because the collection **is not**
# registered in the catalog
dataset._load() dataset._load()
# transformation on each dataset
output_source_doc = transform(dataset.get_source_doc(), param) output_source_doc = transform(dataset.get_source_doc(), param)
# set dataset's output filepath
output_filepath = dataset.get_filepath().replace("01_raw", "02_intermediate") output_filepath = dataset.get_filepath().replace("01_raw", "02_intermediate")
output_xmldataset = XMLDataSet(output_filepath) output_xmldataset = XMLDataSet(output_filepath)
output_xmldataset.set_source_doc(output_source_doc) output_xmldataset.set_source_doc(output_source_doc)
output_datasets[dataset_filenamestem] = output_xmldataset output_datasets[dataset_filenamestem] = output_xmldataset
# 02_intermediate :
# let's create subfolders if they don't exist # let's create subfolders if they don't exist
output_filepath = Path(output_filepath) output_filepath = Path(output_filepath)
output_xmldataset_dir = output_filepath.parent output_xmldataset_dir = output_filepath.parent

@ -1,5 +1,4 @@
from kedro.pipeline import Pipeline, node, pipeline from kedro.pipeline import Pipeline, node, pipeline
from kedro.framework.session import KedroSession
from .nodes import parse_xml_collection from .nodes import parse_xml_collection

@ -14,33 +14,32 @@ logger = logging.getLogger(__name__)
class XMLDataSet: class XMLDataSet:
"lxml.etree._ElementTree loader" "lxml.etree._ElementTree loader"
def __init__(self, filepath: str) -> None:
    """Record the location of the XML file; nothing is read at this point.

    Args:
        filepath: Path of the XML file, stored unchanged.
    """
    self._filepath = filepath
def get_filepath(self) -> str:
    """Return the file path this data set was created with."""
    return self._filepath
def get_source_doc(self) -> "etree._ElementTree | str":
    """Return the document currently held in ``source_doc``.

    The attribute is filled either by ``_load`` (a parsed lxml tree) or by
    ``set_source_doc`` (annotated as text), so callers may receive either
    kind of value. The annotation is a string so that it is not evaluated
    at definition time.

    Returns:
        Whatever was last stored in ``self.source_doc``.

    Raises:
        AttributeError: If no document has been loaded or set yet.
    """
    # Guard clause: fail with a descriptive message when nothing is stored.
    if not hasattr(self, 'source_doc'):
        attr_error_msg = str(self._describe())
        raise AttributeError(
            f"XMLDataSet object {attr_error_msg} has no attribute named : 'source_doc'"
        )
    return self.source_doc
def set_source_doc(self, source_doc: str) -> None:
    """Store *source_doc* on this data set, replacing any previous value.

    Args:
        source_doc: The document to keep in ``self.source_doc``.
    """
    self.source_doc = source_doc
def _transform_source_doc(self) -> etree._ElementTree:
    """Remove namespaces from the document held in ``self.source_doc``.

    The document is modified in place.

    Returns:
        ``self.source_doc`` after the namespaces have been removed.
    """
    tree = self.source_doc
    # Pick every element whose name belongs to a namespace.
    namespaced_elements = tree.xpath("descendant-or-self::*[namespace-uri()!='']")
    for node in namespaced_elements:
        # Keep only the local part of the qualified tag name.
        node.tag = etree.QName(node).localname
    # Clean up the namespace declarations left on the tree.
    etree.cleanup_namespaces(tree)
    return tree
def _load(self) -> etree._ElementTree:
    """Parse the XML file at ``self._filepath`` and strip its namespaces.

    Returns:
        The parsed document, which is also kept in ``self.source_doc``.
    """
    parsed_doc = etree.parse(self._filepath)
    self.source_doc = parsed_doc
    # Namespace removal works in place on ``self.source_doc``.
    self._transform_source_doc()
    return self.source_doc
@ -55,104 +54,34 @@ class XMLDataSet:
class XMLDataSetCollection(AbstractDataSet):
    """Collection of ``XMLDataSet`` objects backed by a folder of XML files.

    Provides ``_load`` and ``_save`` so that a whole folder can be handled
    as a single data set: loading yields a dictionary that maps each file
    stem to an ``XMLDataSet``.

    Args:
        housename: Name of the collection, reported by ``_describe``.
        folderpath: Folder that is scanned for ``*.xml`` files.
    """

    def __init__(self,
                 housename: str,
                 folderpath: str) -> None:
        self._housename = housename
        # Kept as a ``Path`` so that ``_load`` can glob it directly.
        self._folderpath = Path(folderpath)

    def get_datasets(self) -> dict[str, XMLDataSet]:
        """Return the dictionary built by the last call to ``_load``.

        Raises:
            AttributeError: If ``_load`` has not populated ``datasets`` yet.
        """
        if not hasattr(self, 'datasets'):
            attr_error_msg = str(self._describe())
            raise AttributeError(f"Object {attr_error_msg} has no attribute named : 'datasets'")
        return self.datasets

    def _load(self) -> dict[str, XMLDataSet]:
        """Create one ``XMLDataSet`` per ``*.xml`` file found in the folder.

        Only the top level of the folder is scanned, in sorted path order.
        The data sets are only instantiated here; their files are not
        parsed by this method.

        Returns:
            Mapping of file stem to ``XMLDataSet``, also kept in
            ``self.datasets``.
        """
        self.datasets = {
            filepath.stem: XMLDataSet(filepath=str(filepath))
            for filepath in sorted(self._folderpath.glob("*.xml"))
        }
        return self.datasets

    def _save(self, datasets: dict[str, XMLDataSet]) -> None:
        """Ask every data set in *datasets* to save its own document.

        Args:
            datasets: Mapping of file stem to ``XMLDataSet``; only the
                values are used.
        """
        for dataset in datasets.values():
            dataset._save(dataset.get_source_doc())

    def _describe(self) -> dict[str, Any]:
        """Return the collection name and folder used to identify it."""
        return dict(name=self._housename, folderpath=self._folderpath)
# def load(self, name: str) -> Any:
# """Loads a registered data set.
# Args:
# name: A data set to be loaded.
# version: Optional argument for concrete data version to be loaded.
# Works only with versioned datasets.
# Returns:
# The loaded data as configured.
# """
# return result
#
## def save(self, name: str, data: Any) -> None:
## """Save data to a registered data set.
## Args:
## name: A data set to be saved to.
## data: A data object to be saved as configured in the registered
## data set.
## Raises:
## DatasetNotFoundError: When a data set with the given name
## has not yet been registered.
## Example:
## ::
## >>> import pandas as pd
## >>>
## >>> from kedro.extras.datasets.pandas import CSVDataSet
## >>>
## >>> cars = CSVDataSet(filepath="cars.csv",
## >>> load_args=None,
## >>> save_args={"index": False})
## >>> io = DataCatalog(data_sets={'cars': cars})
## >>>
## >>> df = pd.DataFrame({'col1': [1, 2],
## >>> 'col2': [4, 5],
## >>> 'col3': [5, 6]})
## >>> io.save("cars", df)
## """
## dataset = self._get_dataset(name)
### self._print("Saving data to '%s' (%s)...", name, type(dataset).__name__)
## dataset.save(data)
# def _describe(self) -> Dict[str, Any]:
# return dict(filepath=self._housename)
#class JSONDataSet(AbstractDataSet): #class JSONDataSet(AbstractDataSet):

Loading…
Cancel
Save