diff --git a/actes-princiers/conf/base/catalog.yml b/actes-princiers/conf/base/catalog.yml index 1e410ca..84df6ef 100644 --- a/actes-princiers/conf/base/catalog.yml +++ b/actes-princiers/conf/base/catalog.yml @@ -6,24 +6,38 @@ bourbon: housename: bourbon folderpath: data/01_raw/houses/bourbon -# output (write) dataset +# output (write) **pseudo xml** dataset bourbon_xmlcontent: type: actesdataset.XMLDataSetCollection housename: bourbon folderpath: data/02_intermediate/houses/bourbon/xml +# input (read) **pseudo xml** dataset +bourbon_pseudoxmlcontent: + type: actesdataset.TextDataSetCollection + housename: bourbon + folderpath: data/02_intermediate/houses/bourbon/xml + # input (read only) dataset bourbon_json: type: actesdataset.BsXMLDataSetCollection housename: bourbon folderpath: data/01_raw/houses/bourbon -# output (write) dataset +# input (read) and output (write) dataset bourbon_jsonoutput: type: actesdataset.JSONDataSetCollection housename: bourbon folderpath: data/02_intermediate/houses/bourbon/json +# output (write) dataset +bourbon_fulljsonoutput: + type: actesdataset.JSONDataSetCollection + housename: bourbon + folderpath: data/02_intermediate/houses/bourbon/fulljson + + + ## ________________________________________________________________________ #berry: diff --git a/actes-princiers/src/actes_princiers/customcontext.py b/actes-princiers/src/actes_princiers/customcontext.py index 49763a3..173d518 100644 --- a/actes-princiers/src/actes_princiers/customcontext.py +++ b/actes-princiers/src/actes_princiers/customcontext.py @@ -22,11 +22,11 @@ class ProjectContext(KedroContext): houses = self.config_loader.get("houses*") return houses['raw_datapath'] - def get_catalog(self): - "catalog loader entry point" - # loading yaml defined catalogs - catalog = self.config_loader.get('catalog*') - return catalog +# def get_catalog(self): +# "catalog loader entry point" +# # loading yaml defined catalogs +# catalog = self.config_loader.get('catalog*') +# return catalog # 
def add_xmlcontent_tojson(jsondoc: JSONDataSetCollection, xmlcontent: TextDataSetCollection) -> JSONDataSetCollection:
    """Add the pseudo-xml text of each document to its json and save the result.

    For every dataset of ``jsondoc`` the json document is loaded, the text
    loaded from the dataset of ``xmlcontent`` stored under the same key is put
    under its ``'xmlcontent'`` key, and the result is written as a ``.json``
    file in the folder of the ``<housename>_fulljsonoutput`` catalog entry.

    NOTE(review): relies on a module-level ``context`` object exposing
    ``catalog.load``; it is not defined in the visible part of the file.

    Args:
        jsondoc: collection of json datasets to enrich.
        xmlcontent: collection of text datasets, keyed like ``jsondoc``.

    Returns:
        The ``<housename>_fulljsonoutput`` collection, with one saved
        ``JSONDataSet`` registered per input key.

    Raises:
        KeyError: if a key of ``jsondoc`` is missing from ``xmlcontent``.
    """
    jsondatasets = jsondoc.datasets
    xmldatasets = xmlcontent.datasets
    output_datasets = context.catalog.load(jsondoc._housename + '_fulljsonoutput')
    outputfolderpath = Path(output_datasets._folderpath)
    for dataset_filenamestem, dataset in jsondatasets.items():
        # fail before loading or writing anything for this document
        if dataset_filenamestem not in xmldatasets:
            raise KeyError(f"xmlcontent datasets does not have the key : {dataset_filenamestem}")
        # json dict update with xmlcontent
        document = dataset._load()
        document['xmlcontent'] = xmldatasets[dataset_filenamestem]._load()
        # append the suffix instead of using with_suffix(): with_suffix()
        # would replace everything after the last dot of a dotted stem
        # ("a.1" -> "a.json") and make distinct documents collide
        output_filepath = outputfolderpath / f"{dataset_filenamestem}.json"
        # let's create subfolders, if they don't exist
        output_filepath.parent.mkdir(parents=True, exist_ok=True)
        # save on file
        output_jsondataset = JSONDataSet(str(output_filepath))
        output_jsondataset._save(document)
        output_datasets.datasets[dataset_filenamestem] = output_jsondataset
    return output_datasets
class TextDataSet:
    """Loads/saves a string from/to a text file, always encoded as UTF-8.

    Example usage

    >>> string_to_write = "This will go in a file."
    >>>
    >>> data_set = TextDataSet(filepath="test.md")
    >>> data_set._save(string_to_write)
    >>> reloaded = data_set._load()
    >>> assert string_to_write == reloaded
    """
    def __init__(self, filepath: str):
        self._filepath = filepath

    def _load(self) -> str:
        "returns the whole content of the file"
        # explicit encoding: the default one depends on the platform locale
        with open(self._filepath, 'r', encoding='utf-8') as fhandle:
            return fhandle.read()

    def _save(self, data: str) -> None:
        "overwrites the file with `data`"
        with open(self._filepath, 'w', encoding='utf-8') as fhandle:
            fhandle.write(data)

    def _describe(self) -> Dict[str, Any]:
        "returns the dataset's parameters"
        return dict(filepath=self._filepath)


class TextDataSetCollection(DataSetCollection):
    "collection of the `*.pseudoxml` text files found in the collection folder"

    def _load(self) -> "TextDataSetCollection":
        """kedro's API loader method

        Registers one (not yet loaded) `TextDataSet` per `*.pseudoxml` file
        of the folder, keyed by the file name stem, and returns the
        collection itself.
        """
        self.datasets = {
            filepath.stem: TextDataSet(filepath=str(filepath))
            for filepath in sorted(self._folderpath.glob("*.pseudoxml"))
        }
        return self