output dataset

develop
gwen 3 years ago
parent f54fb4ba1c
commit 83f39b8986

@ -6,19 +6,19 @@ bourbon:
housename: bourbon housename: bourbon
folderpath: data/01_raw/houses/bourbon folderpath: data/01_raw/houses/bourbon
# output dataset # output (write) dataset
bourbon_xmlcontent: bourbon_xmlcontent:
type: actesdataset.XMLDataSetCollection type: actesdataset.XMLDataSetCollection
housename: bourbon housename: bourbon
folderpath: data/02_intermediate/houses/bourbon/xml folderpath: data/02_intermediate/houses/bourbon/xml
# input (read only dataset) # input (read only) dataset
bourbon_json: bourbon_json:
type: actesdataset.BsXMLDataSetCollection type: actesdataset.BsXMLDataSetCollection
housename: bourbon housename: bourbon
folderpath: data/01_raw/houses/bourbon folderpath: data/01_raw/houses/bourbon
# output dataset # output (write) dataset
bourbon_jsonoutput: bourbon_jsonoutput:
type: actesdataset.JSONDataSetCollection type: actesdataset.JSONDataSetCollection
housename: bourbon housename: bourbon

@ -5,9 +5,8 @@ from typing import Dict
from kedro.framework.session import KedroSession from kedro.framework.session import KedroSession
from actesdataset import EtreeXMLDataSet, BsXMLDataSet, JSONDataSet from actesdataset import EtreeXMLDataSet, BsXMLDataSet, JSONDataSet
from actesdataset import (XMLDataSetCollection, BsXMLDataSetCollection) from actesdataset import (XMLDataSetCollection, BsXMLDataSetCollection,
# JSONDataSetCollection) JSONDataSetCollection)
# FullJSONDataSetCollection)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -22,7 +21,6 @@ def parse_xml_collection(datasetcol: XMLDataSetCollection) -> XMLDataSetCollecti
# outputfolderpath = f"data/02_intermediate/houses/{housename}/xml" # outputfolderpath = f"data/02_intermediate/houses/{housename}/xml"
output_catalog = catalog[housename + '_xmlcontent'] output_catalog = catalog[housename + '_xmlcontent']
outputfolderpath = output_catalog['folderpath'] outputfolderpath = output_catalog['folderpath']
# output_datasets = dict()
output_datasets = XMLDataSetCollection(housename, str(outputfolderpath)) output_datasets = XMLDataSetCollection(housename, str(outputfolderpath))
for dataset_filenamestem, dataset in datasets.items(): for dataset_filenamestem, dataset in datasets.items():
# a manual load is required here, because # a manual load is required here, because
@ -41,13 +39,13 @@ def parse_xml_collection(datasetcol: XMLDataSetCollection) -> XMLDataSetCollecti
return output_datasets return output_datasets
def make_json_collection(datasetcol: BsXMLDataSetCollection) -> Dict[str, BsXMLDataSet]: def make_json_collection(datasetcol: BsXMLDataSetCollection) -> JSONDataSetCollection:
"node function entry point, performs batch processing" "node function entry point, performs batch processing"
datasets = datasetcol.datasets datasets = datasetcol.datasets
housename = datasetcol._housename housename = datasetcol._housename
output_catalog = catalog[housename + '_jsonoutput'] output_catalog = catalog[housename + '_jsonoutput']
outputfolderpath = output_catalog['folderpath'] outputfolderpath = output_catalog['folderpath']
output_datasets = dict() output_datasets = JSONDataSetCollection(housename, str(outputfolderpath))
for dataset_filenamestem, dataset in datasets.items(): for dataset_filenamestem, dataset in datasets.items():
# a manual load is required here, because # a manual load is required here, because
# the dataset **is not** registered in kedro's catalog # the dataset **is not** registered in kedro's catalog
@ -55,13 +53,13 @@ def make_json_collection(datasetcol: BsXMLDataSetCollection) -> Dict[str, BsXMLD
output_source_doc = dataset.transform() output_source_doc = dataset.transform()
# set dataset's output filepath # set dataset's output filepath
output_filepath = outputfolderpath / Path(dataset_filenamestem).with_suffix(".json") output_filepath = outputfolderpath / Path(dataset_filenamestem).with_suffix(".json")
output_xmldataset = BsXMLDataSet(str(output_filepath)) output_xmldataset = JSONDataSet(str(output_filepath))
# let's create subfolders, if they don't exist # let's create subfolders, if they don't exist
output_xmldataset_dir = output_filepath.parent output_xmldataset_dir = output_filepath.parent
output_xmldataset_dir.mkdir(parents=True, exist_ok=True) output_xmldataset_dir.mkdir(parents=True, exist_ok=True)
# save on file # save on file
output_xmldataset._save(output_source_doc) output_xmldataset._save(output_source_doc)
output_datasets[dataset_filenamestem] = output_xmldataset output_datasets.datasets[dataset_filenamestem] = output_xmldataset
return output_datasets return output_datasets
#def add_xmlcontent_tojson(jsondoc: JSONDataSetCollection, xmlcontent: XMLDataSetCollection) -> Dict[str, JSONDataSet]: #def add_xmlcontent_tojson(jsondoc: JSONDataSetCollection, xmlcontent: XMLDataSetCollection) -> Dict[str, JSONDataSet]:

Loading…
Cancel
Save