diff --git a/actes-princiers/conf/base/catalog.yml b/actes-princiers/conf/base/catalog.yml
index d3517d7..cb3d4c9 100644
--- a/actes-princiers/conf/base/catalog.yml
+++ b/actes-princiers/conf/base/catalog.yml
@@ -47,6 +47,24 @@ preprocess_html:
filepath: data/02_intermediate/xml/Anjou/anj_is_i_1441_08_05a.html
# _________________________________________________________________________
+# same test with kedro.io.PartitionedDataSet
+
+load_full_xml_catalog:
+ type: PartitionedDataSet
+ path: data/01_raw/xml/Anjou/
+ dataset:
+ type: actesdataset.XMLDataSet
+ filename_suffix: '.xml'
+
+preprocess_full_catalog_html:
+ type: PartitionedDataSet
+ path: data/02_intermediate/xml/Anjou/
+ dataset:
+ type: actesdataset.XMLDataSet
+ filename_suffix: '.html'
+
+# _________________________________________________________________________
+
preprocessed_actors:
type: pandas.CSVDataSet
diff --git a/actes-princiers/docs/source/conf.py b/actes-princiers/docs/source/conf.py
index 6da395c..683f842 100644
--- a/actes-princiers/docs/source/conf.py
+++ b/actes-princiers/docs/source/conf.py
@@ -48,6 +48,7 @@ extensions = [
"sphinx.ext.coverage",
"sphinx.ext.ifconfig",
"sphinx.ext.viewcode",
+ "myst_parser",
#"nbsphinx",
#"sphinx_copybutton",
]
diff --git a/actes-princiers/notebooks/LoadDataCatalog.ipynb b/actes-princiers/notebooks/LoadDataCatalog.ipynb
index 3240455..ea08035 100644
--- a/actes-princiers/notebooks/LoadDataCatalog.ipynb
+++ b/actes-princiers/notebooks/LoadDataCatalog.ipynb
@@ -246,6 +246,14 @@
"cleaned_actors.iloc[9]"
]
},
+ {
+ "cell_type": "markdown",
+ "id": "ee287f62",
+ "metadata": {},
+ "source": [
+ "## Autres catalogues"
+ ]
+ },
{
"cell_type": "code",
"execution_count": 1,
@@ -262,8 +270,11 @@
" 'preprocessed_dataset_test',\n",
" 'load_xml',\n",
" 'preprocess_html',\n",
+ " 'load_full_xml_catalog',\n",
+ " 'preprocess_full_catalog_html',\n",
" 'preprocessed_actors',\n",
- " 'parameters']"
+ " 'parameters',\n",
+ " 'params:xlststylesheet']"
]
},
"execution_count": 1,
@@ -308,6 +319,122 @@
"source": [
"catalog.load(\"load_xml\")"
]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a46ddef9",
+ "metadata": {},
+ "source": [
+ "## PartitionedDataset catalogs"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "96a60999",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
[06/22/23 15:01:39] INFO Loading data from 'load_full_xml_catalog' (PartitionedDataSet)... data_catalog.py:345\n",
+ "
\n"
+ ],
+ "text/plain": [
+ "\u001b[2;36m[06/22/23 15:01:39]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loading data from \u001b[32m'load_full_xml_catalog'\u001b[0m \u001b[1m(\u001b[0mPartitionedDataSet\u001b[1m)\u001b[0m\u001b[33m...\u001b[0m \u001b]8;id=663642;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py\u001b\\\u001b[2mdata_catalog.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=709654;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py#345\u001b\\\u001b[2m345\u001b[0m\u001b]8;;\u001b\\\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ " INFO Loading data from 'load_full_xml_catalog' (PartitionedDataSet)... data_catalog.py:345\n",
+ "
\n"
+ ],
+ "text/plain": [
+ "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loading data from \u001b[32m'load_full_xml_catalog'\u001b[0m \u001b[1m(\u001b[0mPartitionedDataSet\u001b[1m)\u001b[0m\u001b[33m...\u001b[0m \u001b]8;id=916916;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py\u001b\\\u001b[2mdata_catalog.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=129179;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py#345\u001b\\\u001b[2m345\u001b[0m\u001b]8;;\u001b\\\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "{'anj_is_i_1441_08_05a': >,\n",
+ " 'anj_lo_i_1360_08a': >,\n",
+ " 'anj_lo_i_1371_07_08a': >,\n",
+ " 'anj_lo_ii_1401_04_28a': >,\n",
+ " 'anj_lo_ii_1402_11_07a': >,\n",
+ " 'anj_lo_ii_1405_05_02a': >,\n",
+ " 'anj_lo_ii_1406_01_26a': >,\n",
+ " 'anj_lo_ii_1406_04_15a': >,\n",
+ " 'anj_lo_ii_1409_08_07a': >,\n",
+ " 'anj_lo_ii_1409_12_12a': >,\n",
+ " 'anj_lo_ii_1413_03_01a': >,\n",
+ " 'anj_lo_iii_1420_11_04a': >,\n",
+ " 'anj_lo_iii_1422_02_09a': >,\n",
+ " 'anj_lo_iii_1424_03_31a': >,\n",
+ " 'anj_lo_iii_1424_03_31b': >,\n",
+ " 'anj_lo_iii_1428_06_07a': >,\n",
+ " 'anj_lo_iii_1428_06_07b': >,\n",
+ " 'anj_lo_iii_1432_10_27a': >,\n",
+ " 'anj_ma_i_1370_12_10a': >,\n",
+ " 'anj_re_i_1437_09_16a': >,\n",
+ " 'anj_re_i_1439_11_22a': >,\n",
+ " 'anj_re_i_1440_01_20a': >,\n",
+ " 'anj_re_i_1445a': >,\n",
+ " 'anj_re_i_1450_11_07a': >,\n",
+ " 'anj_re_i_1454_01_14a': >,\n",
+ " 'anj_re_i_1454_02_09a': >,\n",
+ " 'anj_re_i_1454_06_17a': >,\n",
+ " 'anj_re_i_1454_09_01a': >,\n",
+ " 'anj_re_i_1455_11_13a': >,\n",
+ " 'anj_re_i_1456_11_29a': >,\n",
+ " 'anj_re_i_1457_01_04a': >,\n",
+ " 'anj_re_i_1459_03_17a': >,\n",
+ " 'anj_re_i_1459_04_16a': >,\n",
+ " 'anj_re_i_1463_07_21a': >,\n",
+ " 'anj_re_i_1466_12_16a': >,\n",
+ " 'anj_re_i_1474_02_01a': >,\n",
+ " 'anj_re_i_1475_05_26a': >,\n",
+ " 'anj_yo_i_1418_12_20a': >,\n",
+ " 'anj_yo_i_1421_06_28a': >,\n",
+ " 'anj_yo_i_1442_02_24a': >}"
+ ]
+ },
+ "execution_count": 1,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "partitions = catalog.load('load_full_xml_catalog')\n",
+ "catalog.load('load_full_xml_catalog')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "bdc37079",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ">"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "partitions['anj_is_i_1441_08_05a']"
+ ]
}
],
"metadata": {
diff --git a/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py b/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py
index 345c84d..7aca6bf 100755
--- a/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py
+++ b/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py
@@ -1,14 +1,7 @@
from lxml import etree
-#from pathlib import Path
-## path and file configuration
-#_here = Path(__file__).resolve().parent
-#xsl_stylesheet = _here / "actes_princiers.xsl"
-
def parse_xsl(source_doc, xlststylesheet):
#
- #'write', ou 'write_output
- # FIXME recuperer la feuille de style xsl
xslt_doc = etree.parse(xlststylesheet)
xslt_transformer = etree.XSLT(xslt_doc)
return str(xslt_transformer(source_doc))
diff --git a/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py b/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py
index 95d60aa..8657e46 100755
--- a/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py
+++ b/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py
@@ -1,9 +1,110 @@
from kedro.pipeline import Pipeline, node, pipeline
-
from .nodes import parse_xsl
+from actesdataset import XMLDataSet
+
+from kedro.io import PartitionedDataSet
+
+#from kedro.framework.session import KedroSession
+#from kedro.context import KedroContext, load_context
+
+#with KedroSession.create() as session:
+# context = session.load_context()
+# catalog = context.catalog
+
+#catalog.add(data_set_name="mon_test_de_catalogue",
+# data_set=XMLDataSet(
+# filepath="data/02_intermediate/xml/Anjou/test.dat",
+# ),
+# replace=True,
+# )
+#print(catalog.list())
+
+#def from_dict(dico):
+
+# data_set = PartitionedDataSet(
+# path="df_with_partition",
+# dataset="pandas.CSVDataset",
+# filename_suffix=".csv"
+# )
+# return data_set.save(dict_df)
+
+
+
+#def parse_xsl_factory(dico):
+# pdataset = catalog.load("preprocess_full_catalog_html")
+# pdataset.save(dico)
+#    return pdataset
+
+
+#def create_pipeline(partitioned_dict):
+
+# datasets = list(partitioned_dict.keys())
+# for dataset in datasets :
+# yield Pipeline([
+# node(
+# func=parse_xsl,
+# inputs= dataset,
+# outputs="combined_data",
+# name="combined_data"
+# ),
+# ...#other nodes
+# ])
+
+
+##def preprocess_pipeline():
+## for catalog_name, catalog in catalog.load('load_full_xml_catalog').items():
+## yield node(func=parse_xsl,
+## inputs= [catalog_name, "params:xlststylesheet"],
+## outputs = ,
+## name = "preprocess_html" + catalog_name
+## )
+
+# load_full_xml_catalog
+# preprocess_full_catalog_html
+# inputs=["load_full_xml_catalog", "params:xlststylesheet"],
+# outputs="preprocess_full_catalog_html",
+
+def nodes_factory():
+ nodes = []
+    input_catalog = catalog.load("load_full_xml_catalog")  # NOTE(review): 'catalog' is not defined in this module — the KedroSession block above is commented out; confirm how it is injected
+ #output_catalog = catalog.load("preprocess_full_catalog_html")
+
+ for in_catalog_key, in_catalog_value in input_catalog.items():
+
+ # adding programmatically an input catalog entry
+ input_catalog_name = "load_full_xml_catalog" + in_catalog_key
+        # FIXME: retrieve the filepath from the PartitionedDataSet catalog entry
+        input_filepath = "data/01_raw/xml/Anjou/" + in_catalog_key + ".xml"
+ catalog.add(data_set_name=input_catalog_name,
+ data_set=XMLDataSet(filepath=input_filepath),
+ replace=True)
+# if input_catalog_name in catalog.list():
+# print("OK")
+# else:
+# print("NOK")
+ # adding programmatically an output catalog entry
+ output_catalog_name = "preprocess_full_catalog_html" + in_catalog_key
+        # FIXME: retrieve the filepath from the PartitionedDataSet catalog entry
+ output_filepath = "data/02_intermediate/xml/Anjou/" + in_catalog_key + ".html"
+ catalog.add(data_set_name=output_catalog_name,
+ data_set=XMLDataSet(filepath=output_filepath),
+ replace=True)
+ # constructing the node programmatically
+ nodes.append(node(
+ func=parse_xsl,
+ inputs=[input_catalog_name, "params:xlststylesheet"],
+ outputs=output_catalog_name,
+ name=in_catalog_key,
+ tags="xsl",
+ ))
+ # XXX
+# context.catalog = catalog
+ return nodes
+
+#nodes = nodes_factory()
-def create_pipeline(**kwargs) -> Pipeline:
+def create_pipeline(**kwargs):
return pipeline(
[
node(
diff --git a/actes-princiers/src/run.py b/actes-princiers/src/run.py
new file mode 100644
index 0000000..5e256b0
--- /dev/null
+++ b/actes-princiers/src/run.py
@@ -0,0 +1,36 @@
+"Application entry point"
+
+from pathlib import Path
+from typing import Dict
+
+from kedro.framework.context import KedroContext, load_package_context
+from kedro.pipeline import Pipeline
+
+from actes_princiers.pipeline_registry import register_pipelines
+#bnhm.pipeline import create_pipelines
+
+
+class ProjectContext(KedroContext):
+ project_name = "actes princiers"
+ project_version = "0.1"
+ package_name = "actes_princiers"
+
+ def _get_pipelines(self) -> Dict[str, Pipeline]:
+# return create_pipelines()
+ return register_pipelines()
+
+ def _get_catalog(self, *args, **kwargs):
+ catalog = super()._get_catalog(*args, **kwargs)
+ return catalog
+
+def run_package():
+ # Entry point for running a Kedro project packaged with `kedro package`
+ # using `python -m .run` command.
+ project_context = load_package_context(
+        project_path=Path.cwd(), package_name=ProjectContext.package_name
+ )
+ project_context.run()
+
+
+if __name__ == "__main__":
+ run_package()