full json

develop
gwen 3 years ago
parent 20cec1e2bd
commit b67739dce8

@ -6,24 +6,38 @@ bourbon:
housename: bourbon
folderpath: data/01_raw/houses/bourbon
# output (write) dataset
# output (write) **pseudo xml** dataset
bourbon_xmlcontent:
type: actesdataset.XMLDataSetCollection
housename: bourbon
folderpath: data/02_intermediate/houses/bourbon/xml
# input (read) **pseudo xml** dataset
bourbon_pseudoxmlcontent:
type: actesdataset.TextDataSetCollection
housename: bourbon
folderpath: data/02_intermediate/houses/bourbon/xml
# input (read only) dataset
bourbon_json:
type: actesdataset.BsXMLDataSetCollection
housename: bourbon
folderpath: data/01_raw/houses/bourbon
# output (write) dataset
# input (read) and output (write) dataset
bourbon_jsonoutput:
type: actesdataset.JSONDataSetCollection
housename: bourbon
folderpath: data/02_intermediate/houses/bourbon/json
# output (write) dataset
bourbon_fulljsonoutput:
type: actesdataset.JSONDataSetCollection
housename: bourbon
folderpath: data/02_intermediate/houses/bourbon/fulljson
## ________________________________________________________________________
#berry:

@ -22,11 +22,11 @@ class ProjectContext(KedroContext):
houses = self.config_loader.get("houses*")
return houses['raw_datapath']
def get_catalog(self):
    """Entry point for loading the YAML-defined data catalogs."""
    # the config loader resolves the glob pattern against the conf folders
    return self.config_loader.get('catalog*')
# def get_catalog(self):
# "catalog loader entry point"
# # loading yaml defined catalogs
# catalog = self.config_loader.get('catalog*')
# return catalog
# def _get_catalog(self, *args, **kwargs):
# "catalog loader entry point"

@ -6,7 +6,7 @@ from kedro.framework.session import KedroSession
from actesdataset import EtreeXMLDataSet, BsXMLDataSet, JSONDataSet
from actesdataset import (XMLDataSetCollection, BsXMLDataSetCollection,
JSONDataSetCollection)
JSONDataSetCollection, TextDataSetCollection)
logger = logging.getLogger(__name__)
@ -60,15 +60,29 @@ def make_json_collection(datasetcol: BsXMLDataSetCollection) -> JSONDataSetColle
output_datasets.datasets[dataset_filenamestem] = output_xmldataset
return output_datasets
#def add_xmlcontent_tojson(jsondoc: JSONDataSetCollection, xmlcontent: XMLDataSetCollection) -> Dict[str, JSONDataSet]:
# logger.info("9999999999999999999999" + str(xmlcontent.datasets.keys()))
# json_datasets = jsondoc.datasets
## xmlcontent._load()
# logger.info(str(xmlcontent))
# xmlcontent = xmlcontent.datasets
# for dataset_filenamestem, dataset in json_datasets.items():
# document = dataset._load()
# document['xmlcontent'] = xmlcontent[dataset_filenamestem].source_doc
# return json_datasets
#
def add_xmlcontent_tojson(jsondoc: JSONDataSetCollection, xmlcontent: TextDataSetCollection) -> JSONDataSetCollection:
    """Merge the pseudo-xml text into each JSON document and persist the result.

    For every dataset in `jsondoc`, loads the JSON document, stores the
    matching pseudo-xml text under the 'xmlcontent' key, and saves the merged
    document as a new JSONDataSet in the house's `fulljson` output folder.

    Parameters:
        jsondoc: collection of per-document JSON datasets (read).
        xmlcontent: collection of pseudo-xml text datasets keyed by the same
            filename stems as `jsondoc` (read).

    Returns:
        The `<house>_fulljsonoutput` collection, populated with one saved
        JSONDataSet per input document.

    Raises:
        KeyError: if a JSON document has no matching pseudo-xml dataset.
    """
    jsondatasets = jsondoc.datasets
    housename = jsondoc._housename
    # NOTE(review): relies on a module-level `context` global to reach the
    # catalog — consider injecting the output collection as a node input.
    output_datasets = context.catalog.load(housename + '_fulljsonoutput')
    outputfolderpath = output_datasets._folderpath
    xmldatasets = xmlcontent.datasets
    for dataset_filenamestem, dataset in jsondatasets.items():
        document = dataset._load()
        output_filepath = outputfolderpath / Path(dataset_filenamestem).with_suffix(".json")
        # despite holding JSON, the target dataset mirrors the input stems
        output_jsondataset = JSONDataSet(str(output_filepath))
        # json dict update with xmlcontent — a missing stem is a hard error
        if dataset_filenamestem not in xmldatasets:
            raise KeyError(f"xmlcontent datasets does not have the key : {dataset_filenamestem}")
        document['xmlcontent'] = xmldatasets[dataset_filenamestem]._load()
        # let's create subfolders, if they don't exist
        output_filepath.parent.mkdir(parents=True, exist_ok=True)
        # save on file
        output_jsondataset._save(document)
        output_datasets.datasets[dataset_filenamestem] = output_jsondataset
    return output_datasets

@ -1,8 +1,8 @@
from kedro.pipeline import Pipeline, node, pipeline
from .nodes import (parse_xml_collection, make_json_collection)
# add_xmlcontent_tojson)
from .nodes import (parse_xml_collection, make_json_collection,
add_xmlcontent_tojson)
def create_pipeline(**kwargs) -> Pipeline:
@ -20,17 +20,17 @@ def create_pipeline(**kwargs) -> Pipeline:
outputs="bourbon_jsonoutput",
name="bourbon_json_ds_collection",
),
# node(
# func=add_xmlcontent_tojson,
# inputs=["bourbon_json", "bourbon_xmlcontent"],
# outputs="bourbon_fulljson",
# name="bourbon_fulljson_ds_collection",
# ),
node(
func=add_xmlcontent_tojson,
inputs=["bourbon_jsonoutput", "bourbon_pseudoxmlcontent"],
outputs="bourbon_fulljsonoutput",
name="bourbon_fulljson_ds_collection",
),
# node(
# func=parse_xml_collection,
# inputs="berry",
# outputs=None, #"berry_xmlcontent",
# outputs="berry_xmlcontent",
# name="berry_ds_collection",
# ),
# node(

@ -192,26 +192,36 @@ class JSONDataSetCollection(DataSetCollection):
filepath=str(filepath))
return self
#class TextDataSet:
# """loads/saves data from/to a text file using an underlying filesystem
# example usage
# >>> string_to_write = "This will go in a file."
# >>>
# >>> data_set = TextDataSet(filepath="test.md")
# >>> data_set.save(string_to_write)
# >>> reloaded = data_set.load()
# >>> assert string_to_write == reloaded
# """
# def __init__(self, filepath: str):
# self._filepath = filepath
#
# def _load(self) -> str:
# with open(self._filepath, 'r') as fhandle:
# return fhandle.read()
# def _save(self, data: str) -> None:
# with open(self._filepath, 'w') as fhandle:
# fhandle.write(data)
# def _describe(self) -> Dict[str, Any]:
# return dict(filepath=self._filepath)
class TextDataSet:
    """Loads/saves data from/to a text file on the local filesystem.

    example usage

    >>> data_set = TextDataSet(filepath="test.md")
    >>> data_set._save("This will go in a file.")
    >>> data_set._load()
    'This will go in a file.'
    """

    def __init__(self, filepath: str):
        # path of the backing text file
        self._filepath = filepath

    def _load(self) -> str:
        "kedro's API loader method: return the whole file content as one string"
        # explicit utf-8: don't depend on the platform's default encoding
        with open(self._filepath, 'r', encoding='utf-8') as fhandle:
            return fhandle.read()

    def _save(self, data: str) -> None:
        "kedro's API saver method: overwrite the file with `data`"
        with open(self._filepath, 'w', encoding='utf-8') as fhandle:
            fhandle.write(data)

    def _describe(self) -> Dict[str, Any]:
        "kedro's API description hook"
        return dict(filepath=self._filepath)
class TextDataSetCollection(DataSetCollection):
    """Collection of TextDataSet instances, one per *.pseudoxml file in a folder."""

    def _load(self) -> "TextDataSetCollection":
        """kedro's API loader method.

        Scans the collection folder for *.pseudoxml files (sorted, for a
        deterministic order) and registers one lazy TextDataSet per file,
        keyed by filename stem. Returns the populated collection itself
        (the original annotation `dict[str, JSONDataSet]` was wrong).
        """
        self.datasets = dict()
        for filepath in sorted(self._folderpath.glob("*.pseudoxml")):
            # key by stem so json and pseudo-xml collections share keys
            self.datasets[filepath.stem] = TextDataSet(
                filepath=str(filepath))
        return self

Loading…
Cancel
Save