diff --git a/actes-princiers/docs/source/img/kedro-viz.png b/actes-princiers/docs/source/img/kedro-viz.png new file mode 100644 index 0000000..29b6615 Binary files /dev/null and b/actes-princiers/docs/source/img/kedro-viz.png differ diff --git a/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py b/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py index 20ceda8..167245a 100755 --- a/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py +++ b/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py @@ -25,7 +25,7 @@ def parse_xml_collection(datasets: Dict[str, EtreeXMLDataSet], param: str) -> Di dataset._load() output_source_doc = transform(dataset.get_source_doc(), param) # set dataset's output filepath - output_filepath = dataset.get_filepath().replace("01_raw", "02_intermediate") + output_filepath = dataset.filepath.replace("01_raw", "02_intermediate") output_xmldataset = EtreeXMLDataSet(output_filepath) output_xmldataset.set_source_doc(output_source_doc) output_datasets[dataset_filenamestem] = output_xmldataset diff --git a/actes-princiers/src/actesdataset.py b/actes-princiers/src/actesdataset.py index 213c1d7..ad03151 100644 --- a/actes-princiers/src/actesdataset.py +++ b/actes-princiers/src/actesdataset.py @@ -12,17 +12,40 @@ from kedro.framework.session import KedroSession logger = logging.getLogger(__name__) -# FIXME hériter de abc (cf dans le code de kedro) class XMLDataSet: "Abstract base class for an XML dataset loader" - + def __init__(self, filepath: str) -> None: self._filepath = filepath + # xml etree internal representation + self._dom = None + # xml as an str output + self._str = None - def get_filepath(self) -> str: - "xml file's long filename getters" - return self._filepath + def _load(self): + "kedro's API-like loader" + pass + def _save(self, data:str) -> None: + "kedro's API-like saver" + pass + + @property + def tostring(self) -> str: + "XML source_doc (xml as a string) getter" + # FIXME : charger le _dom d'abord, puis génerer le str ici + if getattr(self, '_str') is not None: + return self._str + else: + attr_error_msg = str(self._describe()) + raise AttributeError(f"XMLDataSet dom object {attr_error_msg} has not been loaded yet") + + @property + def filepath(self) -> str: + "xml file's filename getters" + return self._filepath + + # FIXME à supprimer def get_source_doc(self) -> str: "XML source_doc (xml as a string) getter" if hasattr(self, 'source_doc'): @@ -31,6 +54,7 @@ class XMLDataSet: attr_error_msg = str(self._describe()) raise AttributeError(f"XMLDataSet bject {attr_error_msg} has no attribute named : 'source_doc'") + # FIXME : À SUPPRIMER def set_source_doc(self, source_doc: str) -> None: "XML source_doc (xml as a string) setter" self.source_doc = source_doc @@ -42,6 +66,9 @@ class XMLDataSet: class EtreeXMLDataSet(XMLDataSet): "XMLDataSet loader with lxml.etree (lxml.etree._ElementTree)" + + def __str__(self): + return self.str def _transform_source_doc(self) -> etree._ElementTree: "xml transformer (with element tree)" @@ -52,12 +79,10 @@ class EtreeXMLDataSet(XMLDataSet): #replacing element name with its local name element.tag = etree.QName(element).localname etree.cleanup_namespaces(self.source_doc) - return self.source_doc - def _load(self) -> etree._ElementTree: + def _load(self): "kedro's API-like loader" self._transform_source_doc() - return self.source_doc def _save(self, data:str) -> None: "kedro's API-like saver" @@ -67,17 +92,27 @@ class EtreeXMLDataSet(XMLDataSet): class BsXMLDataSet(XMLDataSet): "XMLDataSet loaded with BeautifulSoup" - - def _load(self) -> etree._ElementTree: + def _load(self) -> str: "kedro's API-like loader" - self._transform_source_doc() + self.source_doc = self._load_soup() return self.source_doc def _load_soup(self): """open a xml file and return a BeautifulSoup object""" with open(self._filepath, 'r', encoding="utf-8") as opening: xml = BeautifulSoup(opening, 'xml') - return xml + self.internal_xml = xml + ## xml.prettify() -> str (source_doc) + return xml.prettify() + + def get_internal_xml(self): + "beautiful soup internal DOM" + if hasattr(self, 'internal_xml'): + return self.internal_xml + else: + attr_error_msg = str(self._describe()) + raise AttributeError(f"XMLDataSet bject {attr_error_msg} has no attribute named : 'internal_xml'") + return self.internal_xml def _save(self, data:str) -> None: "kedro's API-like saver" @@ -142,7 +177,7 @@ class XMLDataSetCollection(AbstractDataSet): raise AttributeError(f"Object {attr_error_msg} has no attribute named : 'datasets'") def _load(self) -> dict[str, EtreeXMLDataSet]: - "kedro's API loader" + "kedro's API loader method" self.datasets = dict() for filepath in sorted(self._folderpath.glob("*.xml")): self.datasets[filepath.stem] = EtreeXMLDataSet( @@ -150,7 +185,7 @@ class XMLDataSetCollection(AbstractDataSet): return self.datasets def _save(self, datasets: dict[str, EtreeXMLDataSet]) -> None: - "kedro's API saver" + "kedro's API saver method" for stemfilename, dataset in datasets.items(): dataset._save(dataset.get_source_doc())