add output datasets

develop
gwen 3 years ago
parent 29fab8bc02
commit 4c687c25dd

@ -4,12 +4,12 @@ bourbon:
type: actesdataset.XMLDataSetCollection
housename: bourbon
folderpath: data/01_raw/houses/bourbon
outputfolderpath: data/02_intermediate/houses/bourbon/xml
# FIXME change the path to data/02_intermediate/houses/bourbon/xml
bourbon_content:
type: actesdataset.XMLDataSetCollection
housename: bourbon
folderpath: data/02_intermediate/houses/bourbon
folderpath: data/02_intermediate/houses/bourbon/xml
#bourbon_json:
# type: actesdataset.XMLDataSetCollection

@ -7,7 +7,7 @@ from actesdataset import EtreeXMLDataSet
logger = logging.getLogger(__name__)
def parse_xml_collection(datasets: Dict[str, EtreeXMLDataSet], param: str) -> Dict[str, EtreeXMLDataSet]:
def parse_xml_collection(datasets: Dict[str, EtreeXMLDataSet]) -> Dict[str, EtreeXMLDataSet]:
"node function entry point, performs batch processing"
output_datasets = dict()
for dataset_filenamestem, dataset in datasets.items():
@ -18,6 +18,7 @@ def parse_xml_collection(datasets: Dict[str, EtreeXMLDataSet], param: str) -> Di
logger.info(f"dataset {descr} loaded")
output_source_doc = dataset.transform()
# set dataset's output filepath
# output_filepath = _outputfolderpath
output_filepath = dataset.filepath.replace("01_raw", "02_intermediate")
output_xmldataset = EtreeXMLDataSet(output_filepath)
# let's create subfolders now, if they don't exist

@ -8,19 +8,19 @@ def create_pipeline(**kwargs) -> Pipeline:
[
node(
func=parse_xml_collection,
inputs=["bourbon", "params:xsltstylesheet"],
inputs=["bourbon"],
outputs="bourbon_content",
name="bourbon_ds_collection",
),
node(
func=parse_xml_collection,
inputs=["berry", "params:xsltstylesheet"],
inputs=["berry"],
outputs="berry_content",
name="berry_ds_collection",
),
node(
func=parse_xml_collection,
inputs=["anjou", "params:xsltstylesheet"],
inputs=["anjou"],
outputs="anjou_content",
name="anjou_ds_collection",
),

@ -1,6 +1,6 @@
import logging
import json
from typing import Dict, Any
from typing import Dict, Any, Optional
from pathlib import Path
from lxml import etree
@ -121,9 +121,11 @@ class XMLDataSetCollection(AbstractDataSet):
"""
def __init__(self,
housename: str,
folderpath: str) -> None:
folderpath: str,
outputfolderpath: Optional[str]=None) -> None:
self._housename = housename
self._folderpath = Path(folderpath)
self._outputfolderpath = outputfolderpath
def _load(self) -> dict[str, EtreeXMLDataSet]:
"kedro's API loader method"

Loading…
Cancel
Save