output - node

develop
gwen 3 years ago
parent 4c687c25dd
commit 1991698e5c

@@ -6,7 +6,7 @@ bourbon:
folderpath: data/01_raw/houses/bourbon folderpath: data/01_raw/houses/bourbon
outputfolderpath: data/02_intermediate/houses/bourbon/xml outputfolderpath: data/02_intermediate/houses/bourbon/xml
bourbon_content: bourbon_xmlcontent:
type: actesdataset.XMLDataSetCollection type: actesdataset.XMLDataSetCollection
housename: bourbon housename: bourbon
folderpath: data/02_intermediate/houses/bourbon/xml folderpath: data/02_intermediate/houses/bourbon/xml
@@ -19,25 +19,25 @@ bourbon_content:
# ________________________________________________________________________ # ________________________________________________________________________
berry: #berry:
type: actesdataset.XMLDataSetCollection # type: actesdataset.XMLDataSetCollection
housename: berry # housename: berry
folderpath: data/01_raw/houses/berry # folderpath: data/01_raw/houses/berry
berry_content: #berry_content:
type: actesdataset.XMLDataSetCollection # type: actesdataset.XMLDataSetCollection
housename: berry # housename: berry
folderpath: data/02_intermediate/houses/berry # folderpath: data/02_intermediate/houses/berry
# ________________________________________________________________________ ## ________________________________________________________________________
anjou: #anjou:
type: actesdataset.XMLDataSetCollection # type: actesdataset.XMLDataSetCollection
housename: berry # housename: berry
folderpath: data/01_raw/houses/anjou # folderpath: data/01_raw/houses/anjou
anjou_content: #anjou_content:
type: actesdataset.XMLDataSetCollection # type: actesdataset.XMLDataSetCollection
housename: berry # housename: berry
folderpath: data/02_intermediate/houses/anjou # folderpath: data/02_intermediate/houses/anjou

@@ -2,13 +2,16 @@ import logging
from pathlib import Path from pathlib import Path
from typing import Dict from typing import Dict
from actesdataset import EtreeXMLDataSet from actesdataset import EtreeXMLDataSet, XMLDataSetCollection
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def parse_xml_collection(datasets: Dict[str, EtreeXMLDataSet]) -> Dict[str, EtreeXMLDataSet]: def parse_xml_collection(datasetcollection: XMLDataSetCollection) -> Dict[str, EtreeXMLDataSet]:
"node function entry point, performs batch processing" "node function entry point, performs batch processing"
# collection mapping
datasets = datasetcollection.datasets
outputfolderpath = datasetcollection.outputfolderpath
output_datasets = dict() output_datasets = dict()
for dataset_filenamestem, dataset in datasets.items(): for dataset_filenamestem, dataset in datasets.items():
# a manual load is required here, because # a manual load is required here, because
@@ -18,9 +21,9 @@ def parse_xml_collection(datasets: Dict[str, EtreeXMLDataSet]) -> Dict[str, Etre
logger.info(f"dataset {descr} loaded") logger.info(f"dataset {descr} loaded")
output_source_doc = dataset.transform() output_source_doc = dataset.transform()
# set dataset's output filepath # set dataset's output filepath
# output_filepath = _outputfolderpath # output_filepath = dataset.filepath.replace("01_raw", "02_intermediate")
output_filepath = dataset.filepath.replace("01_raw", "02_intermediate") output_filepath = outputfolderpath / Path(dataset_filenamestem).with_suffix(".pseudoxml")
output_xmldataset = EtreeXMLDataSet(output_filepath) output_xmldataset = EtreeXMLDataSet(str(output_filepath))
# let's create subfolders now, if they don't exist # let's create subfolders now, if they don't exist
output_filepath = Path(output_filepath) output_filepath = Path(output_filepath)
output_xmldataset_dir = output_filepath.parent output_xmldataset_dir = output_filepath.parent

@@ -9,21 +9,21 @@ def create_pipeline(**kwargs) -> Pipeline:
node( node(
func=parse_xml_collection, func=parse_xml_collection,
inputs=["bourbon"], inputs=["bourbon"],
outputs="bourbon_content", outputs="bourbon_xmlcontent",
name="bourbon_ds_collection", name="bourbon_ds_collection",
), ),
node( # node(
func=parse_xml_collection, # func=parse_xml_collection,
inputs=["berry"], # inputs=["berry"],
outputs="berry_content", # outputs="berry_content",
name="berry_ds_collection", # name="berry_ds_collection",
), # ),
node( # node(
func=parse_xml_collection, # func=parse_xml_collection,
inputs=["anjou"], # inputs=["anjou"],
outputs="anjou_content", # outputs="anjou_content",
name="anjou_ds_collection", # name="anjou_ds_collection",
), # ),
] ]
) )

@@ -125,7 +125,8 @@ class XMLDataSetCollection(AbstractDataSet):
outputfolderpath: Optional[str]=None) -> None: outputfolderpath: Optional[str]=None) -> None:
self._housename = housename self._housename = housename
self._folderpath = Path(folderpath) self._folderpath = Path(folderpath)
self._outputfolderpath = outputfolderpath if outputfolderpath is not None:
self.outputfolderpath = Path(outputfolderpath)
def _load(self) -> dict[str, EtreeXMLDataSet]: def _load(self) -> dict[str, EtreeXMLDataSet]:
"kedro's API loader method" "kedro's API loader method"
@@ -133,7 +134,9 @@ class XMLDataSetCollection(AbstractDataSet):
for filepath in sorted(self._folderpath.glob("*.xml")): for filepath in sorted(self._folderpath.glob("*.xml")):
self.datasets[filepath.stem] = EtreeXMLDataSet( self.datasets[filepath.stem] = EtreeXMLDataSet(
filepath=str(filepath)) filepath=str(filepath))
return self.datasets # return self.datasets
# we need the object itself during transformation
return self
def _save(self, data) -> None: def _save(self, data) -> None:
"""kedro's API saver method """kedro's API saver method

Loading…
Cancel
Save