passage au wrapper _dom

develop
gwen 3 years ago
parent 567aeed0a1
commit 5b2894256e

Binary file not shown.

After

Width:  |  Height:  |  Size: 77 KiB

@ -25,7 +25,7 @@ def parse_xml_collection(datasets: Dict[str, EtreeXMLDataSet], param: str) -> Di
dataset._load() dataset._load()
output_source_doc = transform(dataset.get_source_doc(), param) output_source_doc = transform(dataset.get_source_doc(), param)
# set dataset's output filepath # set dataset's output filepath
output_filepath = dataset.get_filepath().replace("01_raw", "02_intermediate") output_filepath = dataset.filepath.replace("01_raw", "02_intermediate")
output_xmldataset = EtreeXMLDataSet(output_filepath) output_xmldataset = EtreeXMLDataSet(output_filepath)
output_xmldataset.set_source_doc(output_source_doc) output_xmldataset.set_source_doc(output_source_doc)
output_datasets[dataset_filenamestem] = output_xmldataset output_datasets[dataset_filenamestem] = output_xmldataset

@ -12,17 +12,40 @@ from kedro.framework.session import KedroSession
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# FIXME: inherit from abc (see how kedro's own code does it)
class XMLDataSet: class XMLDataSet:
"Abstract base class for an XML dataset loader" "Abstract base class for an XML dataset loader"
def __init__(self, filepath: str) -> None: def __init__(self, filepath: str) -> None:
self._filepath = filepath self._filepath = filepath
# xml etree internal representation
self._dom = None
# xml as an str output
self._str = None
def get_filepath(self) -> str: def _load(self):
"xml file's long filename getters" "kedro's API-like loader"
return self._filepath pass
def _save(self, data:str) -> None:
"kedro's API-like saver (placeholder in this base class: `data` is ignored)"
# Intentionally a no-op here.
pass
@property
def tostring(self) -> str:
    """Return the XML document as a string.

    Reads the value stored in ``self._str``.

    Raises:
        AttributeError: if ``_str`` is missing or still ``None``, i.e. no
            string representation has been loaded yet.
    """
    # FIXME: load the _dom first, then generate the string here
    # A default of None makes a missing attribute take the same descriptive
    # error path as an unset one, instead of a bare AttributeError.
    xml_str = getattr(self, "_str", None)
    if xml_str is None:
        attr_error_msg = str(self._describe())
        raise AttributeError(f"XMLDataSet dom object {attr_error_msg} has not been loaded yet")
    return xml_str
@property
def filepath(self) -> str:
"Return the XML file path stored in `self._filepath`"
return self._filepath
# FIXME: to be removed
def get_source_doc(self) -> str: def get_source_doc(self) -> str:
"XML source_doc (xml as a string) getter" "XML source_doc (xml as a string) getter"
if hasattr(self, 'source_doc'): if hasattr(self, 'source_doc'):
@ -31,6 +54,7 @@ class XMLDataSet:
attr_error_msg = str(self._describe()) attr_error_msg = str(self._describe())
raise AttributeError(f"XMLDataSet bject {attr_error_msg} has no attribute named : 'source_doc'") raise AttributeError(f"XMLDataSet bject {attr_error_msg} has no attribute named : 'source_doc'")
# FIXME: to be removed
def set_source_doc(self, source_doc: str) -> None: def set_source_doc(self, source_doc: str) -> None:
"XML source_doc (xml as a string) setter" "XML source_doc (xml as a string) setter"
self.source_doc = source_doc self.source_doc = source_doc
@ -42,6 +66,9 @@ class XMLDataSet:
class EtreeXMLDataSet(XMLDataSet): class EtreeXMLDataSet(XMLDataSet):
"XMLDataSet loader with lxml.etree (lxml.etree._ElementTree)" "XMLDataSet loader with lxml.etree (lxml.etree._ElementTree)"
def __str__(self):
    """Return the dataset's XML content as a string.

    Delegates to the ``tostring`` property, so it raises AttributeError when
    no string representation has been loaded yet.
    """
    # NOTE(review): the original returned `self.str`, which no visible code
    # defines; `tostring` is the string accessor declared on XMLDataSet.
    # Confirm no `str` attribute exists elsewhere in the class.
    return self.tostring
def _transform_source_doc(self) -> etree._ElementTree: def _transform_source_doc(self) -> etree._ElementTree:
"xml transformer (with element tree)" "xml transformer (with element tree)"
@ -52,12 +79,10 @@ class EtreeXMLDataSet(XMLDataSet):
#replacing element name with its local name #replacing element name with its local name
element.tag = etree.QName(element).localname element.tag = etree.QName(element).localname
etree.cleanup_namespaces(self.source_doc) etree.cleanup_namespaces(self.source_doc)
return self.source_doc
def _load(self) -> etree._ElementTree: def _load(self):
"kedro's API-like loader" "kedro's API-like loader"
self._transform_source_doc() self._transform_source_doc()
return self.source_doc
def _save(self, data:str) -> None: def _save(self, data:str) -> None:
"kedro's API-like saver" "kedro's API-like saver"
@ -67,17 +92,27 @@ class EtreeXMLDataSet(XMLDataSet):
class BsXMLDataSet(XMLDataSet): class BsXMLDataSet(XMLDataSet):
"XMLDataSet loaded with BeautifulSoup" "XMLDataSet loaded with BeautifulSoup"
def _load(self) -> str:
def _load(self) -> etree._ElementTree:
"kedro's API-like loader" "kedro's API-like loader"
self._transform_source_doc() self.source_doc = self._load_soup()
return self.source_doc return self.source_doc
def _load_soup(self): def _load_soup(self):
"""open a xml file and return a BeautifulSoup object""" """open a xml file and return a BeautifulSoup object"""
with open(self._filepath, 'r', encoding="utf-8") as opening: with open(self._filepath, 'r', encoding="utf-8") as opening:
xml = BeautifulSoup(opening, 'xml') xml = BeautifulSoup(opening, 'xml')
return xml self.internal_xml = xml
## xml.prettify() -> str (source_doc)
return xml.prettify()
def get_internal_xml(self):
    """Return the BeautifulSoup internal DOM stored in ``self.internal_xml``.

    Raises:
        AttributeError: if ``internal_xml`` has not been set on this instance.
    """
    # Guard clause: the original if/else always returned or raised, which
    # left its trailing `return` unreachable.
    if not hasattr(self, 'internal_xml'):
        attr_error_msg = str(self._describe())
        raise AttributeError(f"XMLDataSet bject {attr_error_msg} has no attribute named : 'internal_xml'")
    return self.internal_xml
def _save(self, data:str) -> None: def _save(self, data:str) -> None:
"kedro's API-like saver" "kedro's API-like saver"
@ -142,7 +177,7 @@ class XMLDataSetCollection(AbstractDataSet):
raise AttributeError(f"Object {attr_error_msg} has no attribute named : 'datasets'") raise AttributeError(f"Object {attr_error_msg} has no attribute named : 'datasets'")
def _load(self) -> dict[str, EtreeXMLDataSet]: def _load(self) -> dict[str, EtreeXMLDataSet]:
"kedro's API loader" "kedro's API loader method"
self.datasets = dict() self.datasets = dict()
for filepath in sorted(self._folderpath.glob("*.xml")): for filepath in sorted(self._folderpath.glob("*.xml")):
self.datasets[filepath.stem] = EtreeXMLDataSet( self.datasets[filepath.stem] = EtreeXMLDataSet(
@ -150,7 +185,7 @@ class XMLDataSetCollection(AbstractDataSet):
return self.datasets return self.datasets
def _save(self, datasets: dict[str, EtreeXMLDataSet]) -> None: def _save(self, datasets: dict[str, EtreeXMLDataSet]) -> None:
"kedro's API saver" "kedro's API saver method"
for stemfilename, dataset in datasets.items(): for stemfilename, dataset in datasets.items():
dataset._save(dataset.get_source_doc()) dataset._save(dataset.get_source_doc())

Loading…
Cancel
Save