full json

develop
gwen 3 years ago
parent 20cec1e2bd
commit b67739dce8

@ -6,24 +6,38 @@ bourbon:
  housename: bourbon
  folderpath: data/01_raw/houses/bourbon

# output (write) **pseudo xml** dataset
bourbon_xmlcontent:
  type: actesdataset.XMLDataSetCollection
  housename: bourbon
  folderpath: data/02_intermediate/houses/bourbon/xml

# input (read) **pseudo xml** dataset
bourbon_pseudoxmlcontent:
  type: actesdataset.TextDataSetCollection
  housename: bourbon
  folderpath: data/02_intermediate/houses/bourbon/xml

# input (read only) dataset
bourbon_json:
  type: actesdataset.BsXMLDataSetCollection
  housename: bourbon
  folderpath: data/01_raw/houses/bourbon

# input (read) and output (write) dataset
bourbon_jsonoutput:
  type: actesdataset.JSONDataSetCollection
  housename: bourbon
  folderpath: data/02_intermediate/houses/bourbon/json

# output (write) dataset
bourbon_fulljsonoutput:
  type: actesdataset.JSONDataSetCollection
  housename: bourbon
  folderpath: data/02_intermediate/houses/bourbon/fulljson
## ________________________________________________________________________ ## ________________________________________________________________________
#berry: #berry:

@ -22,11 +22,11 @@ class ProjectContext(KedroContext):
houses = self.config_loader.get("houses*") houses = self.config_loader.get("houses*")
return houses['raw_datapath'] return houses['raw_datapath']
def get_catalog(self): # def get_catalog(self):
"catalog loader entry point" # "catalog loader entry point"
# loading yaml defined catalogs # # loading yaml defined catalogs
catalog = self.config_loader.get('catalog*') # catalog = self.config_loader.get('catalog*')
return catalog # return catalog
# def _get_catalog(self, *args, **kwargs): # def _get_catalog(self, *args, **kwargs):
# "catalog loader entry point" # "catalog loader entry point"

@ -6,7 +6,7 @@ from kedro.framework.session import KedroSession
from actesdataset import EtreeXMLDataSet, BsXMLDataSet, JSONDataSet from actesdataset import EtreeXMLDataSet, BsXMLDataSet, JSONDataSet
from actesdataset import (XMLDataSetCollection, BsXMLDataSetCollection, from actesdataset import (XMLDataSetCollection, BsXMLDataSetCollection,
JSONDataSetCollection) JSONDataSetCollection, TextDataSetCollection)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -60,15 +60,29 @@ def make_json_collection(datasetcol: BsXMLDataSetCollection) -> JSONDataSetColle
output_datasets.datasets[dataset_filenamestem] = output_xmldataset output_datasets.datasets[dataset_filenamestem] = output_xmldataset
return output_datasets return output_datasets
def add_xmlcontent_tojson(jsondoc: JSONDataSetCollection, xmlcontent: TextDataSetCollection) -> JSONDataSetCollection:
    """Merge the pseudo-XML text content into each JSON document of a collection.

    For every dataset in *jsondoc*, loads its JSON document, injects the
    matching pseudo-XML text under the 'xmlcontent' key, and saves the result
    as a new JSONDataSet under the house's full-JSON output folder.

    Args:
        jsondoc: collection of JSON datasets keyed by filename stem.
        xmlcontent: collection of pseudo-XML text datasets keyed by the
            same filename stems.

    Returns:
        The '<house>_fulljsonoutput' collection populated with the written
        JSONDataSet entries.

    Raises:
        KeyError: when a JSON dataset has no matching pseudo-XML dataset.
    """
    "adds xmlcontent to the json"
    jsondatasets = jsondoc.datasets
    housename = jsondoc._housename
    # NOTE(review): relies on a module-global `context` (KedroSession context)
    # to resolve the output catalog entry — confirm it is defined at module level.
    output_datasets = context.catalog.load(housename + '_fulljsonoutput')
    outputfolderpath = output_datasets._folderpath
    xmldatasets = xmlcontent.datasets
    for dataset_filenamestem, dataset in jsondatasets.items():
        document = dataset._load()
        output_filepath = outputfolderpath / Path(dataset_filenamestem).with_suffix(".json")
        output_xmldataset = JSONDataSet(str(output_filepath))
        # json dict update with xmlcontent; fail loudly on a missing stem so a
        # silent partial merge never reaches the output folder
        if dataset_filenamestem in xmldatasets:
            document['xmlcontent'] = xmldatasets[dataset_filenamestem]._load()
        else:
            raise KeyError(f"xmlcontent datasets does not have the key : {dataset_filenamestem}")
        # let's create subfolders, if they don't exist
        output_filepath.parent.mkdir(parents=True, exist_ok=True)
        # save on file
        output_xmldataset._save(document)
        output_datasets.datasets[dataset_filenamestem] = output_xmldataset
    return output_datasets

@ -1,8 +1,8 @@
from kedro.pipeline import Pipeline, node, pipeline from kedro.pipeline import Pipeline, node, pipeline
from .nodes import (parse_xml_collection, make_json_collection) from .nodes import (parse_xml_collection, make_json_collection,
# add_xmlcontent_tojson) add_xmlcontent_tojson)
def create_pipeline(**kwargs) -> Pipeline: def create_pipeline(**kwargs) -> Pipeline:
@ -20,17 +20,17 @@ def create_pipeline(**kwargs) -> Pipeline:
outputs="bourbon_jsonoutput", outputs="bourbon_jsonoutput",
name="bourbon_json_ds_collection", name="bourbon_json_ds_collection",
), ),
# node( node(
# func=add_xmlcontent_tojson, func=add_xmlcontent_tojson,
# inputs=["bourbon_json", "bourbon_xmlcontent"], inputs=["bourbon_jsonoutput", "bourbon_pseudoxmlcontent"],
# outputs="bourbon_fulljson", outputs="bourbon_fulljsonoutput",
# name="bourbon_fulljson_ds_collection", name="bourbon_fulljson_ds_collection",
# ), ),
# node( # node(
# func=parse_xml_collection, # func=parse_xml_collection,
# inputs="berry", # inputs="berry",
# outputs=None, #"berry_xmlcontent", # outputs="berry_xmlcontent",
# name="berry_ds_collection", # name="berry_ds_collection",
# ), # ),
# node( # node(

@ -192,26 +192,36 @@ class JSONDataSetCollection(DataSetCollection):
filepath=str(filepath)) filepath=str(filepath))
return self return self
class TextDataSet:
    """Loads/saves data from/to a text file using an underlying filesystem.

    Minimal dataset following kedro's ``_load``/``_save``/``_describe``
    protocol (the public ``load``/``save`` wrappers are provided by kedro's
    AbstractDataSet when registered — here the underscored methods are the
    actual API).

    Example usage:

    >>> string_to_write = "This will go in a file."
    >>>
    >>> data_set = TextDataSet(filepath="test.md")
    >>> data_set._save(string_to_write)
    >>> reloaded = data_set._load()
    >>> assert string_to_write == reloaded
    """

    def __init__(self, filepath: str):
        # plain path string; resolution is left to open()
        self._filepath = filepath

    def _load(self) -> str:
        """Read and return the whole file as text."""
        with open(self._filepath, 'r') as fhandle:
            return fhandle.read()

    def _save(self, data: str) -> None:
        """Overwrite the file with *data*."""
        with open(self._filepath, 'w') as fhandle:
            fhandle.write(data)

    def _describe(self) -> Dict[str, Any]:
        """Return the dataset's identifying attributes (kedro convention)."""
        return dict(filepath=self._filepath)
class TextDataSetCollection(DataSetCollection):
    """Collection of TextDataSet entries discovered in a folder of *.pseudoxml files."""

    def _load(self) -> "TextDataSetCollection":
        """kedro's API loader method.

        Scans ``self._folderpath`` for ``*.pseudoxml`` files (sorted for a
        deterministic order) and registers one TextDataSet per file, keyed by
        the filename stem.

        Returns:
            self, with ``self.datasets`` populated.
        """
        # NOTE(review): original annotation said ``dict[str, JSONDataSet]`` but
        # the method returns the collection itself and stores TextDataSet values.
        self.datasets = dict()
        for filepath in sorted(self._folderpath.glob("*.pseudoxml")):
            self.datasets[filepath.stem] = TextDataSet(
                filepath=str(filepath))
        return self

Loading…
Cancel
Save