develop
gwen 3 years ago
parent 55b6990228
commit 762cc74876

@ -4,7 +4,6 @@ bourbon:
type: actesdataset.XMLDataSetCollection
housename: bourbon
folderpath: data/01_raw/houses/bourbon
outputfolderpath: data/02_intermediate/houses/bourbon/xml
bourbon_xmlcontent:
type: actesdataset.XMLDataSetCollection

@ -2,16 +2,21 @@ import logging
from pathlib import Path
from typing import Dict
from kedro.framework.session import KedroSession
from actesdataset import EtreeXMLDataSet, XMLDataSetCollection
logger = logging.getLogger(__name__)
with KedroSession.create() as session:
context = session.load_context()
catalog = context.get_catalog()
# bourbon = catalog['bourbon_xmlcontent']
# logger.info("+++++++++++++++++++" + bourbon['folderpath'])
outputfolderpath = catalog['bourbon_xmlcontent']['folderpath']
def parse_xml_collection(datasetcollection: XMLDataSetCollection) -> Dict[str, EtreeXMLDataSet]:
def parse_xml_collection(datasets: Dict[str, EtreeXMLDataSet]) -> Dict[str, EtreeXMLDataSet]:
"node function entry point, performs batch processing"
# collection mapping
datasets = datasetcollection.datasets
outputfolderpath = datasetcollection.outputfolderpath
output_datasets = dict()
for dataset_filenamestem, dataset in datasets.items():
# a manual load is required here, because

@ -1,19 +1,8 @@
import logging
from kedro.pipeline import Pipeline, node, pipeline
from .nodes import parse_xml_collection
from kedro.framework.session import KedroSession
logger = logging.getLogger(__name__)
with KedroSession.create() as session:
context = session.load_context()
catalog = context.get_catalog()
bourbon = catalog['bourbon_xmlcontent']
logger.info("+++++++++++++++++++" + bourbon['folderpath'])
def create_pipeline(**kwargs) -> Pipeline:
return pipeline(

@ -1,6 +1,6 @@
import logging
import json
from typing import Dict, Any, Optional
from typing import Dict, Any
from pathlib import Path
from lxml import etree
@ -121,12 +121,9 @@ class XMLDataSetCollection(AbstractDataSet):
"""
def __init__(self,
housename: str,
folderpath: str,
outputfolderpath: Optional[str]=None) -> None:
folderpath: str) -> None:
self._housename = housename
self._folderpath = Path(folderpath)
if outputfolderpath is not None:
self.outputfolderpath = Path(outputfolderpath)
def _load(self) -> dict[str, EtreeXMLDataSet]:
"kedro's API loader method"
@ -134,9 +131,7 @@ class XMLDataSetCollection(AbstractDataSet):
for filepath in sorted(self._folderpath.glob("*.xml")):
self.datasets[filepath.stem] = EtreeXMLDataSet(
filepath=str(filepath))
# return self.datasets
# we need the object itself during transformation
return self
return self.datasets
def _save(self, data) -> None:
"""kedro's API saver method

Loading…
Cancel
Save