diff --git a/actes-princiers/conf/base/catalog.yml b/actes-princiers/conf/base/catalog.yml
index 4825ff3..50dccd0 100644
--- a/actes-princiers/conf/base/catalog.yml
+++ b/actes-princiers/conf/base/catalog.yml
@@ -4,7 +4,6 @@ bourbon:
   type: actesdataset.XMLDataSetCollection
   housename: bourbon
   folderpath: data/01_raw/houses/bourbon
-  outputfolderpath: data/02_intermediate/houses/bourbon/xml
 
 bourbon_xmlcontent:
   type: actesdataset.XMLDataSetCollection
diff --git a/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py b/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py
index 5904028..be5901b 100755
--- a/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py
+++ b/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py
@@ -2,16 +2,21 @@ import logging
 from pathlib import Path
 from typing import Dict
 
+from kedro.framework.session import KedroSession
+
 from actesdataset import EtreeXMLDataSet, XMLDataSetCollection
 
 logger = logging.getLogger(__name__)
 
+with KedroSession.create() as session:
+    context = session.load_context()
+    catalog = context.get_catalog()
+    # bourbon = catalog['bourbon_xmlcontent']
+    # logger.info("+++++++++++++++++++" + bourbon['folderpath'])
+    outputfolderpath = catalog['bourbon_xmlcontent']['folderpath']
 
-def parse_xml_collection(datasetcollection: XMLDataSetCollection) -> Dict[str, EtreeXMLDataSet]:
+def parse_xml_collection(datasets: Dict[str, EtreeXMLDataSet]) -> Dict[str, EtreeXMLDataSet]:
     "node function entry point, performs batch processing"
-    # collection mapping
-    datasets = datasetcollection.datasets
-    outputfolderpath = datasetcollection.outputfolderpath
     output_datasets = dict()
     for dataset_filenamestem, dataset in datasets.items():
         # a manual load is required here, because
diff --git a/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py b/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py
index 614daf3..706c58b 100755
--- a/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py
+++ b/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py
@@ -1,19 +1,8 @@
-import logging
 from kedro.pipeline import Pipeline, node, pipeline
 
 from .nodes import parse_xml_collection
 
-from kedro.framework.session import KedroSession
-
-logger = logging.getLogger(__name__)
-
-with KedroSession.create() as session:
-    context = session.load_context()
-    catalog = context.get_catalog()
-    bourbon = catalog['bourbon_xmlcontent']
-    logger.info("+++++++++++++++++++" + bourbon['folderpath'])
-
 
 def create_pipeline(**kwargs) -> Pipeline:
     return pipeline(
diff --git a/actes-princiers/src/actesdataset.py b/actes-princiers/src/actesdataset.py
index da6a7d9..7680eec 100644
--- a/actes-princiers/src/actesdataset.py
+++ b/actes-princiers/src/actesdataset.py
@@ -1,6 +1,6 @@
 import logging
 import json
-from typing import Dict, Any, Optional
+from typing import Dict, Any
 from pathlib import Path
 
 from lxml import etree
@@ -121,12 +121,9 @@ class XMLDataSetCollection(AbstractDataSet):
     """
     def __init__(self,
                  housename: str,
-                 folderpath: str,
-                 outputfolderpath: Optional[str]=None) -> None:
+                 folderpath: str) -> None:
         self._housename = housename
         self._folderpath = Path(folderpath)
-        if outputfolderpath is not None:
-            self.outputfolderpath = Path(outputfolderpath)
 
     def _load(self) -> dict[str, EtreeXMLDataSet]:
         "kedro's API loader method"
@@ -134,9 +131,7 @@ class XMLDataSetCollection(AbstractDataSet):
         for filepath in sorted(self._folderpath.glob("*.xml")):
             self.datasets[filepath.stem] = EtreeXMLDataSet(
                 filepath=str(filepath))
-        # return self.datasets
-        # we need the object itself during transformation
-        return self
+        return self.datasets
 
     def _save(self, data) -> None:
         """kedro's API saver method
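
A minimal sketch of the contract after this change (not part of the diff; the import paths follow the file layout above, the rest is illustrative): XMLDataSetCollection._load() now returns a plain dict of EtreeXMLDataSet objects keyed by file name stem, and parse_xml_collection consumes that dict directly instead of the collection object, with the output folder path resolved once at import time in nodes.py via KedroSession.

    # hypothetical usage sketch, not code from the repository
    from actesdataset import XMLDataSetCollection
    from actes_princiers.pipelines.xml_processing.nodes import parse_xml_collection

    collection = XMLDataSetCollection(
        housename="bourbon",
        folderpath="data/01_raw/houses/bourbon",
    )
    # in a pipeline run Kedro invokes _load() through the "bourbon" catalog entry
    datasets = collection._load()            # dict[str, EtreeXMLDataSet]
    parsed = parse_xml_collection(datasets)  # dict of processed datasets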