|
|
|
@ -12,17 +12,40 @@ from kedro.framework.session import KedroSession
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# FIXME: inherit from abc (see how kedro's own code does it)
|
|
|
|
|
|
|
|
class XMLDataSet:
|
|
|
|
class XMLDataSet:
|
|
|
|
"Abstract base class for an XML dataset loader"
|
|
|
|
"Abstract base class for an XML dataset loader"
|
|
|
|
|
|
|
|
|
|
|
|
def __init__(self, filepath: str) -> None:
    """Record the XML file path and start with nothing loaded.

    Args:
        filepath: path of the XML file this dataset refers to.
    """
    # _dom: xml etree internal representation; _str: xml as a str output.
    # Both stay None until something assigns them.
    self._dom = self._str = None
    self._filepath = filepath
|
|
|
|
|
|
|
|
|
|
|
|
def get_filepath(self) -> str:
|
|
|
|
def _load(self):
|
|
|
|
"xml file's long filename getters"
|
|
|
|
"kedro's API-like loader"
|
|
|
|
return self._filepath
|
|
|
|
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _save(self, data: str) -> None:
    """kedro's API-like saver; this version is a no-op placeholder.

    Args:
        data: XML content to save (ignored here).
    """
    return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@property
def tostring(self) -> str:
    """Return the XML document as a string.

    Returns:
        The value currently stored in ``self._str``.

    Raises:
        AttributeError: if ``self._str`` is still ``None``.
    """
    # FIXME: load _dom first, then generate the string here
    xml_text = self._str
    if xml_text is None:
        attr_error_msg = str(self._describe())
        raise AttributeError(f"XMLDataSet dom object {attr_error_msg} has not been loaded yet")
    return xml_text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@property
def filepath(self) -> str:
    """Return the XML file path stored in ``self._filepath``."""
    stored_path = self._filepath
    return stored_path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# FIXME: to be removed
|
|
|
|
def get_source_doc(self) -> str:
|
|
|
|
def get_source_doc(self) -> str:
|
|
|
|
"XML source_doc (xml as a string) getter"
|
|
|
|
"XML source_doc (xml as a string) getter"
|
|
|
|
if hasattr(self, 'source_doc'):
|
|
|
|
if hasattr(self, 'source_doc'):
|
|
|
|
@ -31,6 +54,7 @@ class XMLDataSet:
|
|
|
|
attr_error_msg = str(self._describe())
|
|
|
|
attr_error_msg = str(self._describe())
|
|
|
|
raise AttributeError(f"XMLDataSet bject {attr_error_msg} has no attribute named : 'source_doc'")
|
|
|
|
raise AttributeError(f"XMLDataSet bject {attr_error_msg} has no attribute named : 'source_doc'")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# FIXME: TO BE REMOVED
|
|
|
|
def set_source_doc(self, source_doc: str) -> None:
|
|
|
|
def set_source_doc(self, source_doc: str) -> None:
|
|
|
|
"XML source_doc (xml as a string) setter"
|
|
|
|
"XML source_doc (xml as a string) setter"
|
|
|
|
self.source_doc = source_doc
|
|
|
|
self.source_doc = source_doc
|
|
|
|
@ -42,6 +66,9 @@ class XMLDataSet:
|
|
|
|
|
|
|
|
|
|
|
|
class EtreeXMLDataSet(XMLDataSet):
|
|
|
|
class EtreeXMLDataSet(XMLDataSet):
|
|
|
|
"XMLDataSet loader with lxml.etree (lxml.etree._ElementTree)"
|
|
|
|
"XMLDataSet loader with lxml.etree (lxml.etree._ElementTree)"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def __str__(self):
    """Return the XML document as a string, via the ``tostring`` property.

    NOTE(review): this used to read ``self.str``, but no ``str`` attribute is
    defined in the visible code — only ``_str`` and the ``tostring`` property.
    Confirm ``tostring`` is the intended source.
    """
    return self.tostring
|
|
|
|
|
|
|
|
|
|
|
|
def _transform_source_doc(self) -> etree._ElementTree:
|
|
|
|
def _transform_source_doc(self) -> etree._ElementTree:
|
|
|
|
"xml transformer (with element tree)"
|
|
|
|
"xml transformer (with element tree)"
|
|
|
|
@ -52,12 +79,10 @@ class EtreeXMLDataSet(XMLDataSet):
|
|
|
|
#replacing element name with its local name
|
|
|
|
#replacing element name with its local name
|
|
|
|
element.tag = etree.QName(element).localname
|
|
|
|
element.tag = etree.QName(element).localname
|
|
|
|
etree.cleanup_namespaces(self.source_doc)
|
|
|
|
etree.cleanup_namespaces(self.source_doc)
|
|
|
|
return self.source_doc
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _load(self) -> etree._ElementTree:
|
|
|
|
def _load(self):
|
|
|
|
"kedro's API-like loader"
|
|
|
|
"kedro's API-like loader"
|
|
|
|
self._transform_source_doc()
|
|
|
|
self._transform_source_doc()
|
|
|
|
return self.source_doc
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _save(self, data:str) -> None:
|
|
|
|
def _save(self, data:str) -> None:
|
|
|
|
"kedro's API-like saver"
|
|
|
|
"kedro's API-like saver"
|
|
|
|
@ -67,17 +92,27 @@ class EtreeXMLDataSet(XMLDataSet):
|
|
|
|
|
|
|
|
|
|
|
|
class BsXMLDataSet(XMLDataSet):
|
|
|
|
class BsXMLDataSet(XMLDataSet):
|
|
|
|
"XMLDataSet loaded with BeautifulSoup"
|
|
|
|
"XMLDataSet loaded with BeautifulSoup"
|
|
|
|
|
|
|
|
def _load(self) -> str:
|
|
|
|
def _load(self) -> etree._ElementTree:
|
|
|
|
|
|
|
|
"kedro's API-like loader"
|
|
|
|
"kedro's API-like loader"
|
|
|
|
self._transform_source_doc()
|
|
|
|
self.source_doc = self._load_soup()
|
|
|
|
return self.source_doc
|
|
|
|
return self.source_doc
|
|
|
|
|
|
|
|
|
|
|
|
def _load_soup(self):
|
|
|
|
def _load_soup(self):
|
|
|
|
"""open a xml file and return a BeautifulSoup object"""
|
|
|
|
"""open a xml file and return a BeautifulSoup object"""
|
|
|
|
with open(self._filepath, 'r', encoding="utf-8") as opening:
|
|
|
|
with open(self._filepath, 'r', encoding="utf-8") as opening:
|
|
|
|
xml = BeautifulSoup(opening, 'xml')
|
|
|
|
xml = BeautifulSoup(opening, 'xml')
|
|
|
|
return xml
|
|
|
|
self.internal_xml = xml
|
|
|
|
|
|
|
|
## xml.prettify() -> str (source_doc)
|
|
|
|
|
|
|
|
return xml.prettify()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_internal_xml(self):
    """Return the BeautifulSoup internal DOM stored on ``self.internal_xml``.

    Raises:
        AttributeError: if no ``internal_xml`` attribute exists on the instance.
    """
    if not hasattr(self, 'internal_xml'):
        attr_error_msg = str(self._describe())
        raise AttributeError(f"XMLDataSet object {attr_error_msg} has no attribute named : 'internal_xml'")
    return self.internal_xml
|
|
|
|
|
|
|
|
|
|
|
|
def _save(self, data:str) -> None:
|
|
|
|
def _save(self, data:str) -> None:
|
|
|
|
"kedro's API-like saver"
|
|
|
|
"kedro's API-like saver"
|
|
|
|
@ -142,7 +177,7 @@ class XMLDataSetCollection(AbstractDataSet):
|
|
|
|
raise AttributeError(f"Object {attr_error_msg} has no attribute named : 'datasets'")
|
|
|
|
raise AttributeError(f"Object {attr_error_msg} has no attribute named : 'datasets'")
|
|
|
|
|
|
|
|
|
|
|
|
def _load(self) -> dict[str, EtreeXMLDataSet]:
|
|
|
|
def _load(self) -> dict[str, EtreeXMLDataSet]:
|
|
|
|
"kedro's API loader"
|
|
|
|
"kedro's API loader method"
|
|
|
|
self.datasets = dict()
|
|
|
|
self.datasets = dict()
|
|
|
|
for filepath in sorted(self._folderpath.glob("*.xml")):
|
|
|
|
for filepath in sorted(self._folderpath.glob("*.xml")):
|
|
|
|
self.datasets[filepath.stem] = EtreeXMLDataSet(
|
|
|
|
self.datasets[filepath.stem] = EtreeXMLDataSet(
|
|
|
|
@ -150,7 +185,7 @@ class XMLDataSetCollection(AbstractDataSet):
|
|
|
|
return self.datasets
|
|
|
|
return self.datasets
|
|
|
|
|
|
|
|
|
|
|
|
def _save(self, datasets: dict[str, EtreeXMLDataSet]) -> None:
    """kedro's API saver method: save every dataset of the collection.

    Args:
        datasets: datasets to save; only the values are used, the keys are
            ignored.
    """
    # Each dataset saves its own source document.
    for dataset in datasets.values():
        dataset._save(dataset.get_source_doc())
|
|
|
|
|
|
|
|
|
|
|
|
|