full json

develop
gwen 3 years ago
parent d695c8b8f0
commit cf0d386ebf

@ -1,22 +1,26 @@
# ________________________________________________________________________
# input (read only) dataset
bourbon:
type: actesdataset.XMLDataSetCollection
housename: bourbon
folderpath: data/01_raw/houses/bourbon
# output dataset
bourbon_xmlcontent:
type: actesdataset.XMLDataSetCollection
housename: bourbon
folderpath: data/02_intermediate/houses/bourbon/xml
# input (read only) dataset
bourbon_json:
type: actesdataset.JSONDataSetCollection
type: actesdataset.BsXMLDataSetCollection
housename: bourbon
folderpath: data/01_raw/houses/bourbon
# output dataset
bourbon_jsonoutput:
type: actesdataset.FullJSONDataSetCollection
type: actesdataset.JSONDataSetCollection
housename: bourbon
folderpath: data/02_intermediate/houses/bourbon/json

@ -5,7 +5,8 @@ from typing import Dict
from kedro.framework.session import KedroSession
from actesdataset import EtreeXMLDataSet, BsXMLDataSet, JSONDataSet
from actesdataset import (XMLDataSetCollection, JSONDataSetCollection)
from actesdataset import (XMLDataSetCollection, BsXMLDataSetCollection)
# JSONDataSetCollection)
# FullJSONDataSetCollection)
logger = logging.getLogger(__name__)
@ -14,14 +15,15 @@ with KedroSession.create() as session:
context = session.load_context()
catalog = context.get_catalog()
def parse_xml_collection(datasetcol: XMLDataSetCollection) -> Dict[str, EtreeXMLDataSet]:
def parse_xml_collection(datasetcol: XMLDataSetCollection) -> XMLDataSetCollection:
"node function entry point, performs batch processing"
datasets = datasetcol.datasets
housename = datasetcol._housename
# outputfolderpath = f"data/02_intermediate/houses/{housename}/xml"
output_catalog = catalog[housename + '_xmlcontent']
outputfolderpath = output_catalog['folderpath']
output_datasets = dict()
# output_datasets = dict()
output_datasets = XMLDataSetCollection(housename, str(outputfolderpath))
for dataset_filenamestem, dataset in datasets.items():
# a manual load is required here, because
# the dataset **is not** registered in kedro's catalog
@ -35,11 +37,11 @@ def parse_xml_collection(datasetcol: XMLDataSetCollection) -> Dict[str, EtreeXML
output_xmldataset_dir.mkdir(parents=True, exist_ok=True)
# save on file
output_xmldataset._save(output_source_doc)
output_datasets[dataset_filenamestem] = output_xmldataset
output_datasets.datasets[dataset_filenamestem] = output_xmldataset
return output_datasets
def parse_json_collection(datasetcol: JSONDataSetCollection) -> Dict[str, BsXMLDataSet]:
def parse_json_collection(datasetcol: BsXMLDataSetCollection) -> Dict[str, BsXMLDataSet]:
"node function entry point, performs batch processing"
datasets = datasetcol.datasets
housename = datasetcol._housename

@ -132,6 +132,8 @@ class DataSetCollection(AbstractDataSet):
folderpath: str) -> None:
self._housename = housename
self._folderpath = Path(folderpath)
# the collections key: file name, value: dataset object
self.datasets = dict()
def _save(self, data) -> None:
"""kedro's API saver method
@ -147,17 +149,17 @@ class DataSetCollection(AbstractDataSet):
return dict(name=self._housename,
folderpath=str(self._folderpath))
class XMLDataSetCollection(DataSetCollection):
    """Collection of the XML datasets found in one folder."""

    def _load(self) -> "XMLDataSetCollection":
        """kedro's API loader method.

        Rebuilds ``self.datasets`` with one ``EtreeXMLDataSet`` per ``*.xml``
        file found directly in ``self._folderpath``, keyed by the file name
        stem and inserted in sorted path order.

        Returns:
            The collection itself (not the bare mapping); the datasets are
            reachable through its ``datasets`` attribute.
        """
        # the mapping is rebuilt from scratch on every load, so entries
        # from a previous load do not linger
        self.datasets = {
            filepath.stem: EtreeXMLDataSet(filepath=str(filepath))
            for filepath in sorted(self._folderpath.glob("*.xml"))
        }
        return self
class JSONDataSetCollection(DataSetCollection):
class BsXMLDataSetCollection(DataSetCollection):
def _load(self) -> dict[str, BsXMLDataSet]:
"kedro's API loader method"
self.datasets = dict()
@ -183,7 +185,7 @@ class JSONDataSet: #(AbstractDataSet):
return dict(filepath=self._filepath)
class FullJSONDataSetCollection(DataSetCollection):
class JSONDataSetCollection(DataSetCollection):
def _load(self) -> dict[str, JSONDataSet]:
"kedro's API loader method"
self.datasets = dict()

Loading…
Cancel
Save