output - node

develop
gwen 3 years ago
parent 4c687c25dd
commit 1991698e5c

@ -6,7 +6,7 @@ bourbon:
folderpath: data/01_raw/houses/bourbon
outputfolderpath: data/02_intermediate/houses/bourbon/xml
bourbon_content:
bourbon_xmlcontent:
type: actesdataset.XMLDataSetCollection
housename: bourbon
folderpath: data/02_intermediate/houses/bourbon/xml
@ -19,25 +19,25 @@ bourbon_content:
# ________________________________________________________________________
berry:
type: actesdataset.XMLDataSetCollection
housename: berry
folderpath: data/01_raw/houses/berry
#berry:
# type: actesdataset.XMLDataSetCollection
# housename: berry
# folderpath: data/01_raw/houses/berry
berry_content:
type: actesdataset.XMLDataSetCollection
housename: berry
folderpath: data/02_intermediate/houses/berry
#berry_content:
# type: actesdataset.XMLDataSetCollection
# housename: berry
# folderpath: data/02_intermediate/houses/berry
# ________________________________________________________________________
## ________________________________________________________________________
anjou:
type: actesdataset.XMLDataSetCollection
housename: berry
folderpath: data/01_raw/houses/anjou
#anjou:
# type: actesdataset.XMLDataSetCollection
# housename: berry
# folderpath: data/01_raw/houses/anjou
anjou_content:
type: actesdataset.XMLDataSetCollection
housename: berry
folderpath: data/02_intermediate/houses/anjou
#anjou_content:
# type: actesdataset.XMLDataSetCollection
# housename: berry
# folderpath: data/02_intermediate/houses/anjou

@ -2,13 +2,16 @@ import logging
from pathlib import Path
from typing import Dict
from actesdataset import EtreeXMLDataSet
from actesdataset import EtreeXMLDataSet, XMLDataSetCollection
logger = logging.getLogger(__name__)
def parse_xml_collection(datasets: Dict[str, EtreeXMLDataSet]) -> Dict[str, EtreeXMLDataSet]:
def parse_xml_collection(datasetcollection: XMLDataSetCollection) -> Dict[str, EtreeXMLDataSet]:
"node function entry point, performs batch processing"
# collection mapping
datasets = datasetcollection.datasets
outputfolderpath = datasetcollection.outputfolderpath
output_datasets = dict()
for dataset_filenamestem, dataset in datasets.items():
# a manual load is required here, because
@ -18,9 +21,9 @@ def parse_xml_collection(datasets: Dict[str, EtreeXMLDataSet]) -> Dict[str, Etre
logger.info(f"dataset {descr} loaded")
output_source_doc = dataset.transform()
# set dataset's output filepath
# output_filepath = _outputfolderpath
output_filepath = dataset.filepath.replace("01_raw", "02_intermediate")
output_xmldataset = EtreeXMLDataSet(output_filepath)
# output_filepath = dataset.filepath.replace("01_raw", "02_intermediate")
output_filepath = outputfolderpath / Path(dataset_filenamestem).with_suffix(".pseudoxml")
output_xmldataset = EtreeXMLDataSet(str(output_filepath))
# let's create subfolders now, if they don't exist
output_filepath = Path(output_filepath)
output_xmldataset_dir = output_filepath.parent

@ -9,21 +9,21 @@ def create_pipeline(**kwargs) -> Pipeline:
node(
func=parse_xml_collection,
inputs=["bourbon"],
outputs="bourbon_content",
outputs="bourbon_xmlcontent",
name="bourbon_ds_collection",
),
node(
func=parse_xml_collection,
inputs=["berry"],
outputs="berry_content",
name="berry_ds_collection",
),
node(
func=parse_xml_collection,
inputs=["anjou"],
outputs="anjou_content",
name="anjou_ds_collection",
),
# node(
# func=parse_xml_collection,
# inputs=["berry"],
# outputs="berry_content",
# name="berry_ds_collection",
# ),
# node(
# func=parse_xml_collection,
# inputs=["anjou"],
# outputs="anjou_content",
# name="anjou_ds_collection",
# ),
]
)

@ -125,7 +125,8 @@ class XMLDataSetCollection(AbstractDataSet):
outputfolderpath: Optional[str]=None) -> None:
self._housename = housename
self._folderpath = Path(folderpath)
self._outputfolderpath = outputfolderpath
if outputfolderpath is not None:
self.outputfolderpath = Path(outputfolderpath)
def _load(self) -> dict[str, EtreeXMLDataSet]:
"kedro's API loader method"
@ -133,7 +134,9 @@ class XMLDataSetCollection(AbstractDataSet):
for filepath in sorted(self._folderpath.glob("*.xml")):
self.datasets[filepath.stem] = EtreeXMLDataSet(
filepath=str(filepath))
return self.datasets
# return self.datasets
# we need the object itself during transformation
return self
def _save(self, data) -> None:
"""kedro's API saver method

Loading…
Cancel
Save