add output datasets

develop
gwen 3 years ago
parent 29fab8bc02
commit 4c687c25dd

@ -4,12 +4,12 @@ bourbon:
type: actesdataset.XMLDataSetCollection type: actesdataset.XMLDataSetCollection
housename: bourbon housename: bourbon
folderpath: data/01_raw/houses/bourbon folderpath: data/01_raw/houses/bourbon
outputfolderpath: data/02_intermediate/houses/bourbon/xml
# FIXME change the path to data/02_intermediate/houses/bourbon/xml
bourbon_content: bourbon_content:
type: actesdataset.XMLDataSetCollection type: actesdataset.XMLDataSetCollection
housename: bourbon housename: bourbon
folderpath: data/02_intermediate/houses/bourbon folderpath: data/02_intermediate/houses/bourbon/xml
#bourbon_json: #bourbon_json:
# type: actesdataset.XMLDataSetCollection # type: actesdataset.XMLDataSetCollection

@ -7,7 +7,7 @@ from actesdataset import EtreeXMLDataSet
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def parse_xml_collection(datasets: Dict[str, EtreeXMLDataSet], param: str) -> Dict[str, EtreeXMLDataSet]: def parse_xml_collection(datasets: Dict[str, EtreeXMLDataSet]) -> Dict[str, EtreeXMLDataSet]:
"node function entry point, performs batch processing" "node function entry point, performs batch processing"
output_datasets = dict() output_datasets = dict()
for dataset_filenamestem, dataset in datasets.items(): for dataset_filenamestem, dataset in datasets.items():
@ -18,6 +18,7 @@ def parse_xml_collection(datasets: Dict[str, EtreeXMLDataSet], param: str) -> Di
logger.info(f"dataset {descr} loaded") logger.info(f"dataset {descr} loaded")
output_source_doc = dataset.transform() output_source_doc = dataset.transform()
# set dataset's output filepath # set dataset's output filepath
# output_filepath = _outputfolderpath
output_filepath = dataset.filepath.replace("01_raw", "02_intermediate") output_filepath = dataset.filepath.replace("01_raw", "02_intermediate")
output_xmldataset = EtreeXMLDataSet(output_filepath) output_xmldataset = EtreeXMLDataSet(output_filepath)
# let's create subfolders now, if they don't exist # let's create subfolders now, if they don't exist

@ -8,19 +8,19 @@ def create_pipeline(**kwargs) -> Pipeline:
[ [
node( node(
func=parse_xml_collection, func=parse_xml_collection,
inputs=["bourbon", "params:xsltstylesheet"], inputs=["bourbon"],
outputs="bourbon_content", outputs="bourbon_content",
name="bourbon_ds_collection", name="bourbon_ds_collection",
), ),
node( node(
func=parse_xml_collection, func=parse_xml_collection,
inputs=["berry", "params:xsltstylesheet"], inputs=["berry"],
outputs="berry_content", outputs="berry_content",
name="berry_ds_collection", name="berry_ds_collection",
), ),
node( node(
func=parse_xml_collection, func=parse_xml_collection,
inputs=["anjou", "params:xsltstylesheet"], inputs=["anjou"],
outputs="anjou_content", outputs="anjou_content",
name="anjou_ds_collection", name="anjou_ds_collection",
), ),

@ -1,6 +1,6 @@
import logging import logging
import json import json
from typing import Dict, Any from typing import Dict, Any, Optional
from pathlib import Path from pathlib import Path
from lxml import etree from lxml import etree
@ -121,9 +121,11 @@ class XMLDataSetCollection(AbstractDataSet):
""" """
def __init__(self, def __init__(self,
housename: str, housename: str,
folderpath: str) -> None: folderpath: str,
outputfolderpath: Optional[str]=None) -> None:
self._housename = housename self._housename = housename
self._folderpath = Path(folderpath) self._folderpath = Path(folderpath)
self._outputfolderpath = outputfolderpath
def _load(self) -> dict[str, EtreeXMLDataSet]: def _load(self) -> dict[str, EtreeXMLDataSet]:
"kedro's API loader method" "kedro's API loader method"

Loading…
Cancel
Save