refactoring

develop
gwen 3 years ago
parent 2bf13c78be
commit ec3ce5791b

@ -1 +1 @@
xlststylesheet: static/xsl/actes_princiers.xsl
xlststylesheet: templates/xsl/actes_princiers.xsl

@ -2,7 +2,6 @@ from pathlib import Path
from typing import Dict
from kedro.framework.context import KedroContext
#from kedro.pipeline import Pipeline
from kedro.pipeline import Pipeline, node, pipeline
from actesdataset import XMLDataSet
@ -14,36 +13,6 @@ def tree(directory, relative_to=None):
trees[path.stem] = str(path.relative_to(relative_to))
return trees
def house_dataset_loader(catalog):
    """Register one input and one output XML dataset per raw XML file.

    Scans ``data/01_raw/xml`` under the current working directory (via the
    module-level ``tree`` helper), adds a catalog entry for every file found,
    plus a mirrored ``<name>_output`` entry pointing at the matching path
    under ``02_intermediate`` (creating the output folders as needed).

    Args:
        catalog: a kedro ``DataCatalog`` to register the datasets into.

    Returns:
        A list of dicts — one per dataset — each with ``inputs``,
        ``outputs`` and ``name`` keys, ready to be turned into pipeline
        nodes at a later stage.
    """
    descriptions = []
    # FIXME : a custom DataSet Catalog that lists
    # input_catalog = catalog.load(house_name)
    # FIXME : set root path from config, not here
    # or make an autopath function helper
    cwd = Path.cwd()
    raw_root = cwd / 'data' / '01_raw' / 'xml'
    for name, raw_path in tree(raw_root, relative_to=cwd).items():
        # raw_path looks like: data/01_raw/xml/bourbon/brb_ch_i_1437_08_18a.xml
        catalog.add(data_set_name=name,
                    data_set=XMLDataSet(filepath=raw_path),
                    replace=True)
        # mirror the raw entry with an intermediate-stage output entry
        out_name = name + "_output"
        # FIXME : JE NE SUIS PAS SATISFAIT
        out_path = Path(raw_path.replace("01_raw", "02_intermediate"))
        # make sure the destination folder hierarchy exists
        out_path.parent.mkdir(parents=True, exist_ok=True)
        catalog.add(data_set_name=out_name,
                    data_set=XMLDataSet(filepath=out_path),
                    replace=True)
        # record what the pipeline-building stage will need
        descriptions.append(dict(inputs=name,
                                 outputs=out_name,
                                 name=name))
    return descriptions
class ProjectContext(KedroContext):
project_name = "actes princiers"
@ -63,9 +32,37 @@ class ProjectContext(KedroContext):
catalog = super()._get_catalog(*args, **kwargs)
# kedro.io.data_catalog.DataCatalog
# adding data sets
self.nodes_description = house_dataset_loader(catalog)
self.nodes_description = self._house_dataset_loader(catalog)
return catalog
def _house_dataset_loader(self, catalog):
    """Register one input and one output XML dataset per raw XML file.

    Scans ``data/01_raw/xml`` under the current working directory (via the
    module-level ``tree`` helper), adds a catalog entry for every file found,
    plus a mirrored ``<name>_output`` entry pointing at the matching path
    under ``02_intermediate`` (creating the output folders as needed).

    Args:
        catalog: a kedro ``DataCatalog`` to register the datasets into.

    Returns:
        A list of dicts — one per dataset — each with ``inputs``,
        ``outputs`` and ``name`` keys, consumed later by
        ``prepare_pipeline_creation``.
    """
    descriptions = []
    # XXX : get root path from config, not here
    cwd = Path.cwd()
    raw_root = cwd / 'data' / '01_raw' / 'xml'
    for name, raw_path in tree(raw_root, relative_to=cwd).items():
        # raw_path looks like: data/01_raw/xml/bourbon/brb_ch_i_1437_08_18a.xml
        catalog.add(data_set_name=name,
                    data_set=XMLDataSet(filepath=raw_path),
                    replace=True)
        # mirror the raw entry with an intermediate-stage output entry
        out_name = name + "_output"
        # XXX : make better
        out_path = Path(raw_path.replace("01_raw", "02_intermediate"))
        # make sure the destination folder hierarchy exists
        out_path.parent.mkdir(parents=True, exist_ok=True)
        catalog.add(data_set_name=out_name,
                    data_set=XMLDataSet(filepath=out_path),
                    replace=True)
        # record what the pipeline-building stage will need
        descriptions.append(dict(inputs=name,
                                 outputs=out_name,
                                 name=name))
    return descriptions
def prepare_pipeline_creation(self):
    """Hand back the node descriptions collected while building the catalog.

    Returns:
        The list of ``inputs``/``outputs``/``name`` dicts stored on
        ``self.nodes_description`` by ``_get_catalog``.
    """
    descriptions = self.nodes_description
    return descriptions

Loading…
Cancel
Save