You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

111 lines
3.6 KiB
Python

import logging
import json
from typing import Dict, Any
from pathlib import Path
from lxml import etree
from kedro.io import AbstractDataSet, DataSetError
from kedro.framework.session import KedroSession
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
class XMLDataSet:
    """Load, hold and save a single XML document located at ``filepath``.

    The document lives in the ``source_doc`` attribute, which does not exist
    until either ``_load`` or ``set_source_doc`` has been called. After
    ``_load`` it is an ``lxml.etree._ElementTree`` with namespaces stripped;
    ``set_source_doc`` stores whatever object it is given.
    """

    def __init__(self, filepath: str) -> None:
        """Remember the path of the XML file; nothing is read here."""
        self._filepath = filepath

    def get_filepath(self) -> str:
        """Return the file path given at construction time."""
        return self._filepath

    def get_source_doc(self) -> Any:
        """Return the current ``source_doc``.

        Returns:
            The parsed tree after ``_load``, or the object last passed to
            ``set_source_doc``.

        Raises:
            AttributeError: if ``source_doc`` has never been set.
        """
        try:
            return self.source_doc
        except AttributeError:
            description = str(self._describe())
            # ``from None`` hides the bare attribute lookup failure so callers
            # only see the descriptive message below.
            raise AttributeError(
                f"XMLDataSet object {description} has no attribute named : 'source_doc'"
            ) from None

    def set_source_doc(self, source_doc: Any) -> None:
        """Store ``source_doc`` (an XML string or a parsed tree) on the instance."""
        self.source_doc = source_doc

    def _transform_source_doc(self) -> "etree._ElementTree":
        """Strip namespaces from the elements of ``source_doc`` in place.

        Every element whose namespace URI is non-empty gets its tag replaced
        by its local name; namespace declarations left unused are then
        cleaned up.

        Returns:
            The same (mutated) ``source_doc`` object.
        """
        namespaced_elements = "descendant-or-self::*[namespace-uri()!='']"
        for element in self.source_doc.xpath(namespaced_elements):
            # Keep only the local part of the qualified tag name.
            element.tag = etree.QName(element).localname
        etree.cleanup_namespaces(self.source_doc)
        return self.source_doc

    def _load(self) -> "etree._ElementTree":
        """Parse the file at ``filepath``, strip namespaces, return the tree.

        The parsed tree is also kept on the instance as ``source_doc``.
        """
        self.source_doc = etree.parse(self._filepath)
        return self._transform_source_doc()

    def _save(self, data: Any) -> None:
        """Write ``data`` to the file at ``filepath``.

        Args:
            data: either XML text (``str``), written as UTF-8, or an lxml
                element / element tree (e.g. the object produced by
                ``_load``), which is serialised as UTF-8 with an XML
                declaration.
        """
        if isinstance(data, str):
            # Explicit encoding: do not depend on the platform's locale.
            with open(self._filepath, "w", encoding="utf-8") as fhandle:
                fhandle.write(data)
            return
        # Anything else is handed to lxml; ``tostring`` with a byte encoding
        # returns ``bytes``, hence the binary file mode.
        payload = etree.tostring(data, encoding="utf-8", xml_declaration=True)
        with open(self._filepath, "wb") as fhandle:
            fhandle.write(payload)

    def _describe(self) -> Dict[str, Any]:
        """Return a dict describing this dataset (its file path)."""
        return {"filepath": self._filepath}
class XMLDataSetCollection(AbstractDataSet):
    """Kedro dataset exposing each ``*.xml`` file of a folder as an ``XMLDataSet``.

    ``_load`` builds a mapping from file stem to an ``XMLDataSet`` pointing at
    that file (the files are not parsed at this point). ``_save`` asks every
    dataset of a mapping to write its own ``source_doc`` to its own file path.
    """

    def __init__(self, housename: str, folderpath: str) -> None:
        """Store the collection name and the folder to scan.

        Args:
            housename: name reported by ``_describe``.
            folderpath: folder searched for ``*.xml`` files; kept as a ``Path``.
        """
        self._housename = housename
        self._folderpath = Path(folderpath)

    def get_datasets(self) -> Dict[str, Any]:
        """Return the mapping built by the last ``_load``.

        Raises:
            AttributeError: if ``_load`` has not been called yet.
        """
        try:
            return self.datasets
        except AttributeError:
            description = str(self._describe())
            raise AttributeError(
                f"Object {description} has no attribute named : 'datasets'"
            ) from None

    def _load(self) -> Dict[str, XMLDataSet]:
        """Map the stem of every ``*.xml`` file in the folder to an ``XMLDataSet``.

        Files are visited in sorted path order; only direct children of the
        folder are matched. The mapping is also kept as ``self.datasets``.
        """
        self.datasets = {
            xml_path.stem: XMLDataSet(filepath=str(xml_path))
            for xml_path in sorted(self._folderpath.glob("*.xml"))
        }
        return self.datasets

    def _save(self, datasets: Dict[str, XMLDataSet]) -> None:
        """Write each dataset's current ``source_doc`` to that dataset's file.

        The mapping keys are not used: every dataset already knows its path.

        Raises:
            AttributeError: propagated from ``get_source_doc`` when a dataset
                has no ``source_doc`` yet.
        """
        for dataset in datasets.values():
            dataset._save(dataset.get_source_doc())

    def _describe(self) -> Dict[str, Any]:
        """Return a dict with the collection name and its folder path."""
        return dict(name=self._housename, folderpath=self._folderpath)
#class JSONDataSet(AbstractDataSet):
# def __init__(self, filepath: str):
# self._filepath = filepath
# def _load(self) -> Dict:
# with open(self._filepath, 'r') as fp:
# return json.load(fp)
# def _save(self, data: Dict) -> None:
# with open(self._filepath, 'w') as fp:
# json.dump(data, fp, sort_keys=True, indent=4)
# def _describe(self) -> Dict[str, Any]:
# return dict(filepath=self._filepath)