develop
gwen 3 years ago
parent 55b6990228
commit 762cc74876

@ -4,7 +4,6 @@ bourbon:
type: actesdataset.XMLDataSetCollection type: actesdataset.XMLDataSetCollection
housename: bourbon housename: bourbon
folderpath: data/01_raw/houses/bourbon folderpath: data/01_raw/houses/bourbon
outputfolderpath: data/02_intermediate/houses/bourbon/xml
bourbon_xmlcontent: bourbon_xmlcontent:
type: actesdataset.XMLDataSetCollection type: actesdataset.XMLDataSetCollection

@ -2,16 +2,21 @@ import logging
from pathlib import Path from pathlib import Path
from typing import Dict from typing import Dict
from kedro.framework.session import KedroSession
from actesdataset import EtreeXMLDataSet, XMLDataSetCollection from actesdataset import EtreeXMLDataSet, XMLDataSetCollection
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
with KedroSession.create() as session:
context = session.load_context()
catalog = context.get_catalog()
# bourbon = catalog['bourbon_xmlcontent']
# logger.info("+++++++++++++++++++" + bourbon['folderpath'])
outputfolderpath = catalog['bourbon_xmlcontent']['folderpath']
def parse_xml_collection(datasetcollection: XMLDataSetCollection) -> Dict[str, EtreeXMLDataSet]: def parse_xml_collection(datasets: Dict[str, EtreeXMLDataSet]) -> Dict[str, EtreeXMLDataSet]:
"node function entry point, performs batch processing" "node function entry point, performs batch processing"
# collection mapping
datasets = datasetcollection.datasets
outputfolderpath = datasetcollection.outputfolderpath
output_datasets = dict() output_datasets = dict()
for dataset_filenamestem, dataset in datasets.items(): for dataset_filenamestem, dataset in datasets.items():
# a manual load is required here, because # a manual load is required here, because

@ -1,19 +1,8 @@
import logging
from kedro.pipeline import Pipeline, node, pipeline from kedro.pipeline import Pipeline, node, pipeline
from .nodes import parse_xml_collection from .nodes import parse_xml_collection
from kedro.framework.session import KedroSession
logger = logging.getLogger(__name__)
with KedroSession.create() as session:
context = session.load_context()
catalog = context.get_catalog()
bourbon = catalog['bourbon_xmlcontent']
logger.info("+++++++++++++++++++" + bourbon['folderpath'])
def create_pipeline(**kwargs) -> Pipeline: def create_pipeline(**kwargs) -> Pipeline:
return pipeline( return pipeline(

@ -1,6 +1,6 @@
import logging import logging
import json import json
from typing import Dict, Any, Optional from typing import Dict, Any
from pathlib import Path from pathlib import Path
from lxml import etree from lxml import etree
@ -121,12 +121,9 @@ class XMLDataSetCollection(AbstractDataSet):
""" """
def __init__(self, def __init__(self,
housename: str, housename: str,
folderpath: str, folderpath: str) -> None:
outputfolderpath: Optional[str]=None) -> None:
self._housename = housename self._housename = housename
self._folderpath = Path(folderpath) self._folderpath = Path(folderpath)
if outputfolderpath is not None:
self.outputfolderpath = Path(outputfolderpath)
def _load(self) -> dict[str, EtreeXMLDataSet]: def _load(self) -> dict[str, EtreeXMLDataSet]:
"kedro's API loader method" "kedro's API loader method"
@ -134,9 +131,7 @@ class XMLDataSetCollection(AbstractDataSet):
for filepath in sorted(self._folderpath.glob("*.xml")): for filepath in sorted(self._folderpath.glob("*.xml")):
self.datasets[filepath.stem] = EtreeXMLDataSet( self.datasets[filepath.stem] = EtreeXMLDataSet(
filepath=str(filepath)) filepath=str(filepath))
# return self.datasets return self.datasets
# we need the object itself during transformation
return self
def _save(self, data) -> None: def _save(self, data) -> None:
"""kedro's API saver method """kedro's API saver method

Loading…
Cancel
Save