full json

develop
gwen 3 years ago
parent d695c8b8f0
commit cf0d386ebf

@ -1,22 +1,26 @@
# ________________________________________________________________________ # ________________________________________________________________________
# input (read only) dataset
bourbon: bourbon:
type: actesdataset.XMLDataSetCollection type: actesdataset.XMLDataSetCollection
housename: bourbon housename: bourbon
folderpath: data/01_raw/houses/bourbon folderpath: data/01_raw/houses/bourbon
# output dataset
bourbon_xmlcontent: bourbon_xmlcontent:
type: actesdataset.XMLDataSetCollection type: actesdataset.XMLDataSetCollection
housename: bourbon housename: bourbon
folderpath: data/02_intermediate/houses/bourbon/xml folderpath: data/02_intermediate/houses/bourbon/xml
# input (read only) dataset
bourbon_json: bourbon_json:
type: actesdataset.JSONDataSetCollection type: actesdataset.BsXMLDataSetCollection
housename: bourbon housename: bourbon
folderpath: data/01_raw/houses/bourbon folderpath: data/01_raw/houses/bourbon
# output dataset
bourbon_jsonoutput: bourbon_jsonoutput:
type: actesdataset.FullJSONDataSetCollection type: actesdataset.JSONDataSetCollection
housename: bourbon housename: bourbon
folderpath: data/02_intermediate/houses/bourbon/json folderpath: data/02_intermediate/houses/bourbon/json

@ -5,7 +5,8 @@ from typing import Dict
from kedro.framework.session import KedroSession from kedro.framework.session import KedroSession
from actesdataset import EtreeXMLDataSet, BsXMLDataSet, JSONDataSet from actesdataset import EtreeXMLDataSet, BsXMLDataSet, JSONDataSet
from actesdataset import (XMLDataSetCollection, JSONDataSetCollection) from actesdataset import (XMLDataSetCollection, BsXMLDataSetCollection)
# JSONDataSetCollection)
# FullJSONDataSetCollection) # FullJSONDataSetCollection)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -14,14 +15,15 @@ with KedroSession.create() as session:
context = session.load_context() context = session.load_context()
catalog = context.get_catalog() catalog = context.get_catalog()
def parse_xml_collection(datasetcol: XMLDataSetCollection) -> Dict[str, EtreeXMLDataSet]: def parse_xml_collection(datasetcol: XMLDataSetCollection) -> XMLDataSetCollection:
"node function entry point, performs batch processing" "node function entry point, performs batch processing"
datasets = datasetcol.datasets datasets = datasetcol.datasets
housename = datasetcol._housename housename = datasetcol._housename
# outputfolderpath = f"data/02_intermediate/houses/{housename}/xml" # outputfolderpath = f"data/02_intermediate/houses/{housename}/xml"
output_catalog = catalog[housename + '_xmlcontent'] output_catalog = catalog[housename + '_xmlcontent']
outputfolderpath = output_catalog['folderpath'] outputfolderpath = output_catalog['folderpath']
output_datasets = dict() # output_datasets = dict()
output_datasets = XMLDataSetCollection(housename, str(outputfolderpath))
for dataset_filenamestem, dataset in datasets.items(): for dataset_filenamestem, dataset in datasets.items():
# a manual load is required here, because # a manual load is required here, because
# the dataset **is not** registered in kedro's catalog # the dataset **is not** registered in kedro's catalog
@ -35,11 +37,11 @@ def parse_xml_collection(datasetcol: XMLDataSetCollection) -> Dict[str, EtreeXML
output_xmldataset_dir.mkdir(parents=True, exist_ok=True) output_xmldataset_dir.mkdir(parents=True, exist_ok=True)
# save on file # save on file
output_xmldataset._save(output_source_doc) output_xmldataset._save(output_source_doc)
output_datasets[dataset_filenamestem] = output_xmldataset output_datasets.datasets[dataset_filenamestem] = output_xmldataset
return output_datasets return output_datasets
def parse_json_collection(datasetcol: JSONDataSetCollection) -> Dict[str, BsXMLDataSet]: def parse_json_collection(datasetcol: BsXMLDataSetCollection) -> Dict[str, BsXMLDataSet]:
"node function entry point, performs batch processing" "node function entry point, performs batch processing"
datasets = datasetcol.datasets datasets = datasetcol.datasets
housename = datasetcol._housename housename = datasetcol._housename

@ -132,6 +132,8 @@ class DataSetCollection(AbstractDataSet):
folderpath: str) -> None: folderpath: str) -> None:
self._housename = housename self._housename = housename
self._folderpath = Path(folderpath) self._folderpath = Path(folderpath)
# the collection's contents -- key: file name, value: dataset object
self.datasets = dict()
def _save(self, data) -> None: def _save(self, data) -> None:
"""kedro's API saver method """kedro's API saver method
@ -147,17 +149,17 @@ class DataSetCollection(AbstractDataSet):
return dict(name=self._housename, return dict(name=self._housename,
folderpath=str(self._folderpath)) folderpath=str(self._folderpath))
class XMLDataSetCollection(DataSetCollection): class XMLDataSetCollection(DataSetCollection):
def _load(self) -> dict[str, EtreeXMLDataSet]: def _load(self) -> dict[str, EtreeXMLDataSet]:
"kedro's API loader method" "kedro's API loader method"
self.datasets = dict()
for filepath in sorted(self._folderpath.glob("*.xml")): for filepath in sorted(self._folderpath.glob("*.xml")):
self.datasets[filepath.stem] = EtreeXMLDataSet( self.datasets[filepath.stem] = EtreeXMLDataSet(
filepath=str(filepath)) filepath=str(filepath))
return self return self
class JSONDataSetCollection(DataSetCollection): class BsXMLDataSetCollection(DataSetCollection):
def _load(self) -> dict[str, BsXMLDataSet]: def _load(self) -> dict[str, BsXMLDataSet]:
"kedro's API loader method" "kedro's API loader method"
self.datasets = dict() self.datasets = dict()
@ -183,7 +185,7 @@ class JSONDataSet: #(AbstractDataSet):
return dict(filepath=self._filepath) return dict(filepath=self._filepath)
class FullJSONDataSetCollection(DataSetCollection): class JSONDataSetCollection(DataSetCollection):
def _load(self) -> dict[str, JSONDataSet]: def _load(self) -> dict[str, JSONDataSet]:
"kedro's API loader method" "kedro's API loader method"
self.datasets = dict() self.datasets = dict()

Loading…
Cancel
Save