|
|
|
@ -2,7 +2,6 @@ from pathlib import Path
|
|
|
|
from typing import Dict
|
|
|
|
from typing import Dict
|
|
|
|
|
|
|
|
|
|
|
|
from kedro.framework.context import KedroContext
|
|
|
|
from kedro.framework.context import KedroContext
|
|
|
|
#from kedro.pipeline import Pipeline
|
|
|
|
|
|
|
|
from kedro.pipeline import Pipeline, node, pipeline
|
|
|
|
from kedro.pipeline import Pipeline, node, pipeline
|
|
|
|
|
|
|
|
|
|
|
|
from actesdataset import XMLDataSet
|
|
|
|
from actesdataset import XMLDataSet
|
|
|
|
@ -14,36 +13,6 @@ def tree(directory, relative_to=None):
|
|
|
|
trees[path.stem] = str(path.relative_to(relative_to))
|
|
|
|
trees[path.stem] = str(path.relative_to(relative_to))
|
|
|
|
return trees
|
|
|
|
return trees
|
|
|
|
|
|
|
|
|
|
|
|
def house_dataset_loader(catalog, data_root_path=None, relative_to=None):
    """Register every raw XML file as a catalog dataset and describe the nodes.

    Walks the raw-XML tree, adds one input ``XMLDataSet`` per file plus a
    matching ``*_output`` entry under ``02_intermediate`` (creating the output
    folders on disk as a side effect), and returns the per-dataset node
    descriptions consumed by the pipeline-creation stage.

    Args:
        catalog: A kedro ``DataCatalog`` the datasets are registered into.
        data_root_path: Root folder of the raw XML files.  Defaults to
            ``<cwd>/data/01_raw/xml`` (the previous hard-coded behaviour).
        relative_to: Base path dataset file paths are made relative to.
            Defaults to the current working directory.

    Returns:
        list[dict]: One ``{"inputs", "outputs", "name"}`` mapping per dataset.
    """
    # FIXME : root path should ultimately come from project config; the
    # defaults below only preserve the historical hard-coded behaviour.
    if data_root_path is None:
        data_root_path = Path.cwd() / 'data' / '01_raw' / 'xml'
    if relative_to is None:
        relative_to = Path.cwd()

    nodes_description = []
    for dataset_name, dataset_path in tree(data_root_path, relative_to=relative_to).items():
        # dataset_path e.g. : data/01_raw/xml/bourbon/brb_ch_i_1437_08_18a.xml
        catalog.add(data_set_name=dataset_name,
                    data_set=XMLDataSet(filepath=dataset_path),
                    replace=True)

        # adding an output catalog entry, mirroring the raw path under
        # 02_intermediate
        output_dataset_name = dataset_name + "_output"
        # FIXME : string-replace on the path is fragile — JE NE SUIS PAS
        # SATISFAIT (translated: "I am not satisfied" with this approach)
        output_dataset_path = Path(dataset_path.replace("01_raw", "02_intermediate"))
        # let's create subfolders if they don't exist, so saving cannot fail
        # on a missing directory
        output_dataset_path.parent.mkdir(parents=True, exist_ok=True)
        catalog.add(data_set_name=output_dataset_name,
                    data_set=XMLDataSet(filepath=output_dataset_path),
                    replace=True)

        # prepare information for the next stage (the pipeline stage)
        nodes_description.append(dict(inputs=dataset_name,
                                      outputs=output_dataset_name,
                                      name=dataset_name))

    return nodes_description
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ProjectContext(KedroContext):
|
|
|
|
class ProjectContext(KedroContext):
|
|
|
|
project_name = "actes princiers"
|
|
|
|
project_name = "actes princiers"
|
|
|
|
@ -63,9 +32,37 @@ class ProjectContext(KedroContext):
|
|
|
|
catalog = super()._get_catalog(*args, **kwargs)
|
|
|
|
catalog = super()._get_catalog(*args, **kwargs)
|
|
|
|
# kedro.io.data_catalog.DataCatalog
|
|
|
|
# kedro.io.data_catalog.DataCatalog
|
|
|
|
# adding data sets
|
|
|
|
# adding data sets
|
|
|
|
self.nodes_description = house_dataset_loader(catalog)
|
|
|
|
self.nodes_description = self._house_dataset_loader(catalog)
|
|
|
|
return catalog
|
|
|
|
return catalog
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _house_dataset_loader(self, catalog):
    """Register the raw XML files in *catalog* and return node descriptions.

    For each file found under ``<cwd>/data/01_raw/xml`` this adds an input
    ``XMLDataSet`` entry and a matching ``*_output`` entry pointing at
    ``02_intermediate`` (creating the output folders on disk), collecting
    the inputs/outputs/name mapping the pipeline stage will consume.
    """
    # XXX : get root path from config, not here
    root = Path.cwd() / 'data' / '01_raw' / 'xml'
    base = Path.cwd()

    descriptions = []
    for name, raw_path in tree(root, relative_to=base).items():
        # raw_path e.g. data/01_raw/xml/bourbon/brb_ch_i_1437_08_18a.xml
        catalog.add(data_set_name=name,
                    data_set=XMLDataSet(filepath=raw_path),
                    replace=True)

        # matching output catalog entry
        out_name = name + "_output"
        # XXX : make better
        out_path = Path(raw_path.replace("01_raw", "02_intermediate"))
        # make sure the intermediate subfolders exist before anything saves
        out_dir = out_path.parent
        out_dir.mkdir(parents=True, exist_ok=True)
        catalog.add(data_set_name=out_name,
                    data_set=XMLDataSet(filepath=out_path),
                    replace=True)

        # information needed by the next (pipeline-creation) stage
        descriptions.append(dict(inputs=name, outputs=out_name, name=name))

    return descriptions
|
|
|
|
|
|
|
|
|
|
|
|
def prepare_pipeline_creation(self):
    """Return the node descriptions collected while the catalog was built.

    ``self.nodes_description`` is populated as a side effect of
    ``_get_catalog``; callers use it to create the pipeline nodes.
    """
    descriptions = self.nodes_description
    return descriptions
|
|
|
|
|
|
|
|
|
|
|
|
|