output dataset

develop
gwen 3 years ago
parent f54fb4ba1c
commit 83f39b8986

@ -6,19 +6,19 @@ bourbon:
housename: bourbon
folderpath: data/01_raw/houses/bourbon
# output dataset
# output (write) dataset
bourbon_xmlcontent:
type: actesdataset.XMLDataSetCollection
housename: bourbon
folderpath: data/02_intermediate/houses/bourbon/xml
# input (read only dataset)
# input (read only) dataset
bourbon_json:
type: actesdataset.BsXMLDataSetCollection
housename: bourbon
folderpath: data/01_raw/houses/bourbon
# output dataset
# output (write) dataset
bourbon_jsonoutput:
type: actesdataset.JSONDataSetCollection
housename: bourbon

@ -5,9 +5,8 @@ from typing import Dict
from kedro.framework.session import KedroSession
from actesdataset import EtreeXMLDataSet, BsXMLDataSet, JSONDataSet
from actesdataset import (XMLDataSetCollection, BsXMLDataSetCollection)
# JSONDataSetCollection)
# FullJSONDataSetCollection)
from actesdataset import (XMLDataSetCollection, BsXMLDataSetCollection,
JSONDataSetCollection)
logger = logging.getLogger(__name__)
@ -22,7 +21,6 @@ def parse_xml_collection(datasetcol: XMLDataSetCollection) -> XMLDataSetCollecti
# outputfolderpath = f"data/02_intermediate/houses/{housename}/xml"
output_catalog = catalog[housename + '_xmlcontent']
outputfolderpath = output_catalog['folderpath']
# output_datasets = dict()
output_datasets = XMLDataSetCollection(housename, str(outputfolderpath))
for dataset_filenamestem, dataset in datasets.items():
# a manual load is required here, because
@ -41,13 +39,13 @@ def parse_xml_collection(datasetcol: XMLDataSetCollection) -> XMLDataSetCollecti
return output_datasets
def make_json_collection(datasetcol: BsXMLDataSetCollection) -> Dict[str, BsXMLDataSet]:
def make_json_collection(datasetcol: BsXMLDataSetCollection) -> JSONDataSetCollection:
"node function entry point, performs batch processing"
datasets = datasetcol.datasets
housename = datasetcol._housename
output_catalog = catalog[housename + '_jsonoutput']
outputfolderpath = output_catalog['folderpath']
output_datasets = dict()
output_datasets = JSONDataSetCollection(housename, str(outputfolderpath))
for dataset_filenamestem, dataset in datasets.items():
# a manual load is required here, because
# the dataset **is not** registered in kedro's catalog
@ -55,13 +53,13 @@ def make_json_collection(datasetcol: BsXMLDataSetCollection) -> Dict[str, BsXMLD
output_source_doc = dataset.transform()
# set dataset's output filepath
output_filepath = outputfolderpath / Path(dataset_filenamestem).with_suffix(".json")
output_xmldataset = BsXMLDataSet(str(output_filepath))
output_xmldataset = JSONDataSet(str(output_filepath))
# let's create subfolders, if they don't exist
output_xmldataset_dir = output_filepath.parent
output_xmldataset_dir.mkdir(parents=True, exist_ok=True)
# save to file
output_xmldataset._save(output_source_doc)
output_datasets[dataset_filenamestem] = output_xmldataset
output_datasets.datasets[dataset_filenamestem] = output_xmldataset
return output_datasets
#def add_xmlcontent_tojson(jsondoc: JSONDataSetCollection, xmlcontent: XMLDataSetCollection) -> Dict[str, JSONDataSet]:

Loading…
Cancel
Save