|
|
|
|
@ -7,9 +7,8 @@ from kedro.pipeline import Pipeline, node, pipeline
|
|
|
|
|
|
|
|
|
|
from pathlib import Path

from actesdataset import XMLDataSet
|
|
|
|
|
|
|
|
|
|
#from actes_princiers.pipelines.xml_processing.nodes import parse_xsl
|
|
|
|
|
|
|
|
|
|
def tree(directory, relative_to=None):
    """Helper that returns a directory tree structure.

    Recursively scans *directory* for ``*.xml`` files and maps each
    file's stem to its path (as a string) relative to *relative_to*.

    :param directory: root directory to scan; a `pathlib.Path` or a
        plain string (the commented-out caller in this file passes
        ``str(data_root_path)``).
    :param relative_to: base the returned paths are made relative to.
        Defaults to *directory* itself — the previous code passed the
        bare ``None`` into ``Path.relative_to`` and raised ``TypeError``.
    :return: dict mapping file stem -> relative path string, in sorted
        path order.
    """
    directory = Path(directory)
    if relative_to is None:
        relative_to = directory
    trees = dict()
    for path in sorted(directory.rglob("*.xml")):
        trees[path.stem] = str(path.relative_to(relative_to))
    # NOTE(review): the original function's tail was cut off in the diff;
    # returning the accumulated mapping matches the docstring's contract.
    return trees
|
|
|
|
|
@ -46,97 +45,27 @@ def house_dataset_loader(catalog):
|
|
|
|
|
nodes_description.append(node_description)
|
|
|
|
|
return nodes_description
|
|
|
|
|
|
|
|
|
|
# TODO: next step — build the Pipeline from `nodes_description`
# (see the commented-out create_pipeline draft just below)
|
|
|
|
|
#def create_pipeline(**kwargs) -> Dict[str, Pipeline]:
|
|
|
|
|
# """
|
|
|
|
|
# :return: a mapping "pipeline_name", Pipeline() object
|
|
|
|
|
# """
|
|
|
|
|
# nodes_description = kwargs['nodes_description']
|
|
|
|
|
# dataset_pipeline = pipeline(nodes_factory(nodes_description))
|
|
|
|
|
# return {
|
|
|
|
|
# "__default__": Pipeline(
|
|
|
|
|
# dataset_pipeline
|
|
|
|
|
# )
|
|
|
|
|
# }
|
|
|
|
|
|
|
|
|
|
class ProjectContext(KedroContext):
    """Kedro context for the "actes princiers" project.

    Extends the stock catalog loading step so that the per-house XML
    datasets are registered programmatically; the resulting node
    descriptions are kept on ``self.nodes_description`` for the
    pipeline-creation stage.

    NOTE(review): a large body of commented-out experiments
    (``houses_data_catalog_loader``, ``houses_dataset_factory``,
    ``house_dataset_loader`` drafts) was removed here — it was dead
    code, full of FIXMEs, and contained at least one statement that
    would be a syntax error if ever uncommented
    (``output_dataset_name = + "_output"``). Recover it from version
    control if needed.
    """

    project_name = "actes princiers"
    project_version = "0.1"
    package_name = "actes_princiers"

    def get_houses_config(self):
        """Load the houses list from the generic configuration file
        (that is, the global houses ``houses.yaml``).

        :return: the list under the ``houses`` key of the matched
            configuration file(s).
        """
        houses_file = self.config_loader.get("houses*")
        # FIXME: cache this as an attribute on the context instead of
        # re-reading the configuration on every call.
        return houses_file['houses']

    def _get_catalog(self, *args, **kwargs):
        """Catalog loader entry point.

        First loads the YAML-defined catalogs through the parent class
        (a ``kedro.io.data_catalog.DataCatalog``), then registers the
        datasets discovered on disk and records their node descriptions
        for the later pipeline-creation stage.
        """
        # loading yaml defined catalogs
        catalog = super()._get_catalog(*args, **kwargs)
        # adding data sets; keep the node descriptions around so that
        # prepare_pipeline_creation() can hand them to the pipeline stage
        self.nodes_description = house_dataset_loader(catalog)
        return catalog

    def prepare_pipeline_creation(self):
        """Return the node descriptions recorded by ``_get_catalog``.

        NOTE(review): only meaningful after ``_get_catalog`` has run —
        before that the attribute does not exist and this raises
        ``AttributeError``.
        """
        return self.nodes_description
|
|
|
|
|
|
|
|
|
|
|