|
|
|
|
@ -14,33 +14,32 @@ logger = logging.getLogger(__name__)
|
|
|
|
|
class XMLDataSet:
|
|
|
|
|
"lxml.etree._ElementTree loader"
|
|
|
|
|
|
|
|
|
|
def __init__(self, filepath: str) -> None:
    """Bind this data set to one XML file.

    Args:
        filepath: Path of the XML file. It is stored unchanged and handed
            to ``etree.parse`` when ``_load`` is called.
    """
    self._filepath = filepath
|
|
|
|
|
|
|
|
|
|
def get_filepath(self) -> str:
    """Return the file path given to the constructor."""
    return self._filepath
|
|
|
|
|
|
|
|
|
|
def get_source_doc(self) -> "etree._ElementTree":
    """Return the document currently stored in ``self.source_doc``.

    The attribute only exists once ``_load`` or ``set_source_doc`` has
    assigned it.

    Returns:
        The stored document (an ``etree._ElementTree`` when it comes from
        ``_load``).

    Raises:
        AttributeError: If ``source_doc`` has not been set on this instance.
    """
    if hasattr(self, "source_doc"):
        return self.source_doc
    # Include the data set description so the failing instance is identifiable.
    raise AttributeError(
        f"XMLDataSet object {self._describe()!s} "
        "has no attribute named : 'source_doc'"
    )
|
|
|
|
|
|
|
|
|
|
def set_source_doc(self, source_doc: "etree._ElementTree") -> None:
    """Store ``source_doc`` on the instance as ``self.source_doc``.

    Args:
        source_doc: The document to keep. ``_transform_source_doc`` calls
            ``.xpath`` on it, so it is expected to be an lxml tree.
    """
    self.source_doc = source_doc
|
|
|
|
|
|
|
|
|
|
def _transform_source_doc(self) -> etree._ElementTree:
    """Strip XML namespaces from ``self.source_doc`` in place.

    Every element whose namespace URI is non-empty gets its tag replaced by
    its local name; ``etree.cleanup_namespaces`` is then run on the tree.

    Returns:
        ``self.source_doc``, the same object that was modified.
    """
    # Select every element that lives in a namespace.
    query = "descendant-or-self::*[namespace-uri()!='']"
    for element in self.source_doc.xpath(query):
        # Replace the qualified tag with its local name.
        element.tag = etree.QName(element).localname
    etree.cleanup_namespaces(self.source_doc)
    return self.source_doc
|
|
|
|
|
|
|
|
|
|
def _load(self) -> etree._ElementTree:
    """Parse the XML file at ``self._filepath`` and strip its namespaces.

    The parsed tree is stored in ``self.source_doc`` and then passed through
    ``_transform_source_doc``.

    Returns:
        The transformed tree held in ``self.source_doc``.
    """
    self.source_doc = etree.parse(self._filepath)
    self._transform_source_doc()
    return self.source_doc
|
|
|
|
|
@ -55,104 +54,34 @@ class XMLDataSet:
|
|
|
|
|
|
|
|
|
|
class XMLDataSetCollection(AbstractDataSet):
    """Collection of ``XMLDataSet`` instances, one per XML file of a folder.

    ``_load`` creates one ``XMLDataSet`` for every ``*.xml`` file found
    directly inside ``folderpath`` and keys it by the file stem. ``_save``
    delegates to the ``_save`` method of each data set it is given.

    Args:
        housename: Name of the collection; reported by ``_describe``.
        folderpath: Folder that contains the XML files.

    Example::

        >>> collection = XMLDataSetCollection(housename='bourbon',
        ...                                   folderpath='/tmp/mydir')
        >>> datasets = collection._load()
    """

    def __init__(self,
                 housename: str,
                 folderpath: str) -> None:
        self._housename = housename
        # Kept as a Path so that _load can glob the folder.
        self._folderpath = Path(folderpath)

    def get_datasets(self) -> dict[str, XMLDataSet]:
        """Return the data sets built by the last ``_load`` call.

        Raises:
            AttributeError: If ``datasets`` has not been set on this
                instance (``_load`` is what assigns it).
        """
        if hasattr(self, "datasets"):
            return self.datasets
        raise AttributeError(
            f"Object {self._describe()!s} has no attribute named : 'datasets'"
        )

    def _load(self) -> dict[str, XMLDataSet]:
        """Build one ``XMLDataSet`` per ``*.xml`` file of the folder.

        Files are visited in sorted path order. The XML files themselves are
        not parsed here; only the ``XMLDataSet`` wrappers are created.

        Returns:
            Mapping of file stem to ``XMLDataSet``, also stored in
            ``self.datasets``.
        """
        self.datasets = {
            xml_path.stem: XMLDataSet(filepath=str(xml_path))
            for xml_path in sorted(self._folderpath.glob("*.xml"))
        }
        return self.datasets

    def _save(self, datasets: dict[str, XMLDataSet]) -> None:
        """Save every data set of ``datasets`` through its own ``_save``.

        Args:
            datasets: Mapping of name to ``XMLDataSet``; only the values are
                used.
        """
        for dataset in datasets.values():
            # FIXME: fetching the document again via get_source_doc()
            # should not be necessary here.
            dataset._save(dataset.get_source_doc())

    def _describe(self) -> dict[str, Any]:
        """Return the collection name and folder path as a dictionary."""
        return {"name": self._housename, "folderpath": self._folderpath}
|
|
|
|
|
|
|
|
|
|
# def load(self, name: str) -> Any:
|
|
|
|
|
# """Loads a registered data set.
|
|
|
|
|
|
|
|
|
|
# Args:
|
|
|
|
|
# name: A data set to be loaded.
|
|
|
|
|
# version: Optional argument for concrete data version to be loaded.
|
|
|
|
|
# Works only with versioned datasets.
|
|
|
|
|
|
|
|
|
|
# Returns:
|
|
|
|
|
# The loaded data as configured.
|
|
|
|
|
# """
|
|
|
|
|
# return result
|
|
|
|
|
#
|
|
|
|
|
## def save(self, name: str, data: Any) -> None:
|
|
|
|
|
## """Save data to a registered data set.
|
|
|
|
|
|
|
|
|
|
## Args:
|
|
|
|
|
## name: A data set to be saved to.
|
|
|
|
|
## data: A data object to be saved as configured in the registered
|
|
|
|
|
## data set.
|
|
|
|
|
|
|
|
|
|
## Raises:
|
|
|
|
|
## DatasetNotFoundError: When a data set with the given name
|
|
|
|
|
## has not yet been registered.
|
|
|
|
|
|
|
|
|
|
## Example:
|
|
|
|
|
## ::
|
|
|
|
|
|
|
|
|
|
## >>> import pandas as pd
|
|
|
|
|
## >>>
|
|
|
|
|
## >>> from kedro.extras.datasets.pandas import CSVDataSet
|
|
|
|
|
## >>>
|
|
|
|
|
## >>> cars = CSVDataSet(filepath="cars.csv",
|
|
|
|
|
## >>> load_args=None,
|
|
|
|
|
## >>> save_args={"index": False})
|
|
|
|
|
## >>> io = DataCatalog(data_sets={'cars': cars})
|
|
|
|
|
## >>>
|
|
|
|
|
## >>> df = pd.DataFrame({'col1': [1, 2],
|
|
|
|
|
## >>> 'col2': [4, 5],
|
|
|
|
|
## >>> 'col3': [5, 6]})
|
|
|
|
|
## >>> io.save("cars", df)
|
|
|
|
|
## """
|
|
|
|
|
## dataset = self._get_dataset(name)
|
|
|
|
|
### self._print("Saving data to '%s' (%s)...", name, type(dataset).__name__)
|
|
|
|
|
## dataset.save(data)
|
|
|
|
|
|
|
|
|
|
# def _describe(self) -> Dict[str, Any]:
|
|
|
|
|
# return dict(filepath=self._housename)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#class JSONDataSet(AbstractDataSet):
|
|
|
|
|
|