From f4d14cd68c1184140f6920aef8b3a9164aaf695b Mon Sep 17 00:00:00 2001 From: gwen Date: Fri, 23 Jun 2023 16:05:48 +0200 Subject: [PATCH] =?UTF-8?q?PartitionedDataSet=20utilis=C3=A9=20pour=20tran?= =?UTF-8?q?sformer=20un=20lot=20de=20datas?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- actes-princiers/conf/base/catalog.yml | 18 +++ actes-princiers/docs/source/conf.py | 1 + .../notebooks/LoadDataCatalog.ipynb | 129 +++++++++++++++++- .../pipelines/xml_processing/nodes.py | 7 - .../pipelines/xml_processing/pipeline.py | 105 +++++++++++++- actes-princiers/src/run.py | 36 +++++ 6 files changed, 286 insertions(+), 10 deletions(-) create mode 100644 actes-princiers/src/run.py diff --git a/actes-princiers/conf/base/catalog.yml b/actes-princiers/conf/base/catalog.yml index d3517d7..cb3d4c9 100644 --- a/actes-princiers/conf/base/catalog.yml +++ b/actes-princiers/conf/base/catalog.yml @@ -47,6 +47,24 @@ preprocess_html: filepath: data/02_intermediate/xml/Anjou/anj_is_i_1441_08_05a.html # _________________________________________________________________________ +# same test with kedro.io.PartitionedDataSet + +load_full_xml_catalog: + type: PartitionedDataSet + path: data/01_raw/xml/Anjou/ + dataset: + type: actesdataset.XMLDataSet + filename_suffix: '.xml' + +preprocess_full_catalog_html: + type: PartitionedDataSet + path: data/02_intermediate/xml/Anjou/ + dataset: + type: actesdataset.XMLDataSet + filename_suffix: '.html' + +# _________________________________________________________________________ + preprocessed_actors: type: pandas.CSVDataSet diff --git a/actes-princiers/docs/source/conf.py b/actes-princiers/docs/source/conf.py index 6da395c..683f842 100644 --- a/actes-princiers/docs/source/conf.py +++ b/actes-princiers/docs/source/conf.py @@ -48,6 +48,7 @@ extensions = [ "sphinx.ext.coverage", "sphinx.ext.ifconfig", "sphinx.ext.viewcode", + "myst_parser", #"nbsphinx", #"sphinx_copybutton", ] diff --git a/actes-princiers/notebooks/LoadDataCatalog.ipynb b/actes-princiers/notebooks/LoadDataCatalog.ipynb index 3240455..ea08035 100644 --- a/actes-princiers/notebooks/LoadDataCatalog.ipynb +++ b/actes-princiers/notebooks/LoadDataCatalog.ipynb @@ -246,6 +246,14 @@ "cleaned_actors.iloc[9]" ] }, + { + "cell_type": "markdown", + "id": "ee287f62", + "metadata": {}, + "source": [ + "## Autres catalogues" + ] + }, { "cell_type": "code", "execution_count": 1, @@ -262,8 +270,11 @@ " 'preprocessed_dataset_test',\n", " 'load_xml',\n", " 'preprocess_html',\n", + " 'load_full_xml_catalog',\n", + " 'preprocess_full_catalog_html',\n", " 'preprocessed_actors',\n", - " 'parameters']" + " 'parameters',\n", + " 'params:xlststylesheet']" ] }, "execution_count": 1, @@ -308,6 +319,122 @@ "source": [ "catalog.load(\"load_xml\")" ] + }, + { + "cell_type": "markdown", + "id": "a46ddef9", + "metadata": {}, + "source": [ + "## PartitionedDataset catalogs" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "96a60999", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
[06/22/23 15:01:39] INFO     Loading data from 'load_full_xml_catalog' (PartitionedDataSet)...  data_catalog.py:345\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2;36m[06/22/23 15:01:39]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loading data from \u001b[32m'load_full_xml_catalog'\u001b[0m \u001b[1m(\u001b[0mPartitionedDataSet\u001b[1m)\u001b[0m\u001b[33m...\u001b[0m \u001b]8;id=663642;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py\u001b\\\u001b[2mdata_catalog.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=709654;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py#345\u001b\\\u001b[2m345\u001b[0m\u001b]8;;\u001b\\\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
                    INFO     Loading data from 'load_full_xml_catalog' (PartitionedDataSet)...  data_catalog.py:345\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loading data from \u001b[32m'load_full_xml_catalog'\u001b[0m \u001b[1m(\u001b[0mPartitionedDataSet\u001b[1m)\u001b[0m\u001b[33m...\u001b[0m \u001b]8;id=916916;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py\u001b\\\u001b[2mdata_catalog.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=129179;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py#345\u001b\\\u001b[2m345\u001b[0m\u001b]8;;\u001b\\\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "{'anj_is_i_1441_08_05a': >,\n", + " 'anj_lo_i_1360_08a': >,\n", + " 'anj_lo_i_1371_07_08a': >,\n", + " 'anj_lo_ii_1401_04_28a': >,\n", + " 'anj_lo_ii_1402_11_07a': >,\n", + " 'anj_lo_ii_1405_05_02a': >,\n", + " 'anj_lo_ii_1406_01_26a': >,\n", + " 'anj_lo_ii_1406_04_15a': >,\n", + " 'anj_lo_ii_1409_08_07a': >,\n", + " 'anj_lo_ii_1409_12_12a': >,\n", + " 'anj_lo_ii_1413_03_01a': >,\n", + " 'anj_lo_iii_1420_11_04a': >,\n", + " 'anj_lo_iii_1422_02_09a': >,\n", + " 'anj_lo_iii_1424_03_31a': >,\n", + " 'anj_lo_iii_1424_03_31b': >,\n", + " 'anj_lo_iii_1428_06_07a': >,\n", + " 'anj_lo_iii_1428_06_07b': >,\n", + " 'anj_lo_iii_1432_10_27a': >,\n", + " 'anj_ma_i_1370_12_10a': >,\n", + " 'anj_re_i_1437_09_16a': >,\n", + " 'anj_re_i_1439_11_22a': >,\n", + " 'anj_re_i_1440_01_20a': >,\n", + " 'anj_re_i_1445a': >,\n", + " 'anj_re_i_1450_11_07a': >,\n", + " 'anj_re_i_1454_01_14a': >,\n", + " 'anj_re_i_1454_02_09a': >,\n", + " 'anj_re_i_1454_06_17a': >,\n", + " 'anj_re_i_1454_09_01a': >,\n", + " 'anj_re_i_1455_11_13a': >,\n", + " 'anj_re_i_1456_11_29a': >,\n", + " 'anj_re_i_1457_01_04a': >,\n", + " 'anj_re_i_1459_03_17a': >,\n", + " 'anj_re_i_1459_04_16a': >,\n", + " 'anj_re_i_1463_07_21a': >,\n", + " 'anj_re_i_1466_12_16a': >,\n", + " 'anj_re_i_1474_02_01a': >,\n", + " 'anj_re_i_1475_05_26a': >,\n", + " 'anj_yo_i_1418_12_20a': >,\n", + " 'anj_yo_i_1421_06_28a': >,\n", + " 'anj_yo_i_1442_02_24a': >}" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "partitions = catalog.load('load_full_xml_catalog')\n", + "catalog.load('load_full_xml_catalog')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "bdc37079", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + ">" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "partitions['anj_is_i_1441_08_05a']" + ] } ], "metadata": { diff --git a/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py b/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py index 345c84d..7aca6bf 100755 --- a/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py +++ b/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py @@ -1,14 +1,7 @@ from lxml import etree -#from pathlib import Path -## path and file configuration -#_here = Path(__file__).resolve().parent -#xsl_stylesheet = _here / "actes_princiers.xsl" - def parse_xsl(source_doc, xlststylesheet): # - #'write', ou 'write_output - # FIXME recuperer la feuille de style xsl xslt_doc = etree.parse(xlststylesheet) xslt_transformer = etree.XSLT(xslt_doc) return str(xslt_transformer(source_doc)) diff --git a/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py b/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py index 95d60aa..8657e46 100755 --- a/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py +++ b/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py @@ -1,9 +1,110 @@ from kedro.pipeline import Pipeline, node, pipeline - from .nodes import parse_xsl +from actesdataset import XMLDataSet + +from kedro.io import PartitionedDataSet + +#from kedro.framework.session import KedroSession +#from kedro.context import KedroContext, load_context + +#with KedroSession.create() as session: +# context = session.load_context() +# catalog = context.catalog + +#catalog.add(data_set_name="mon_test_de_catalogue", +# data_set=XMLDataSet( +# filepath="data/02_intermediate/xml/Anjou/test.dat", +# ), +# replace=True, +# ) +#print(catalog.list()) + +#def from_dict(dico): + +# data_set = PartitionedDataSet( +# path="df_with_partition", +# dataset="pandas.CSVDataset", +# filename_suffix=".csv" +# ) +# return data_set.save(dict_df) + + + +#def parse_xsl_factory(dico): +# pdataset = catalog.load("preprocess_full_catalog_html") +# pdataset.save(dico) +# return pataset + + +#def create_pipeline(partitioned_dict): + +# datasets = list(partitioned_dict.keys()) +# for dataset in datasets : +# yield Pipeline([ +# node( +# func=parse_xsl, +# inputs= dataset, +# outputs="combined_data", +# name="combined_data" +# ), +# ...#other nodes +# ]) + + +##def preprocess_pipeline(): +## for catalog_name, catalog in catalog.load('load_full_xml_catalog').items(): +## yield node(func=parse_xsl, +## inputs= [catalog_name, "params:xlststylesheet"], +## outputs = , +## name = "preprocess_html" + catalog_name +## ) + +# load_full_xml_catalog +# preprocess_full_catalog_html +# inputs=["load_full_xml_catalog", "params:xlststylesheet"], +# outputs="preprocess_full_catalog_html", + +def nodes_factory(): + nodes = [] + input_catalog = catalog.load("load_full_xml_catalog") + #output_catalog = catalog.load("preprocess_full_catalog_html") + + for in_catalog_key, in_catalog_value in input_catalog.items(): + + # adding programmatically an input catalog entry + input_catalog_name = "load_full_xml_catalog" + in_catalog_key + # FIXME : à récuperer du catalogue Patitioned + input_filepath = "data/01_raw/xml/Anjou/" + in_catalog_key + ".html" + catalog.add(data_set_name=input_catalog_name, + data_set=XMLDataSet(filepath=input_filepath), + replace=True) +# if input_catalog_name in catalog.list(): +# print("OK") +# else: +# print("NOK") + # adding programmatically an output catalog entry + output_catalog_name = "preprocess_full_catalog_html" + in_catalog_key + # FIXME : à récuperer du catalogue Patitioned + output_filepath = "data/02_intermediate/xml/Anjou/" + in_catalog_key + ".html" + catalog.add(data_set_name=output_catalog_name, + data_set=XMLDataSet(filepath=output_filepath), + replace=True) + # constructing the node programmatically + nodes.append(node( + func=parse_xsl, + inputs=[input_catalog_name, "params:xlststylesheet"], + outputs=output_catalog_name, + name=in_catalog_key, + tags="xsl", + )) + # XXX +# context.catalog = catalog + return nodes + +#nodes = nodes_factory() -def create_pipeline(**kwargs) -> Pipeline: +def create_pipeline(**kwargs): return pipeline( [ node( diff --git a/actes-princiers/src/run.py b/actes-princiers/src/run.py new file mode 100644 index 0000000..5e256b0 --- /dev/null +++ b/actes-princiers/src/run.py @@ -0,0 +1,36 @@ +"Application entry point" + +from pathlib import Path +from typing import Dict + +from kedro.framework.context import KedroContext, load_package_context +from kedro.pipeline import Pipeline + +from actes_princiers.pipeline_registry import register_pipelines +#bnhm.pipeline import create_pipelines + + +class ProjectContext(KedroContext): + project_name = "actes princiers" + project_version = "0.1" + package_name = "actes_princiers" + + def _get_pipelines(self) -> Dict[str, Pipeline]: +# return create_pipelines() + return register_pipelines() + + def _get_catalog(self, *args, **kwargs): + catalog = super()._get_catalog(*args, **kwargs) + return catalog + +def run_package(): + # Entry point for running a Kedro project packaged with `kedro package` + # using `python -m .run` command. + project_context = load_package_context( + project_path=Path.cwd(), package_name=Path(__file__).resolve().parent.name + ) + project_context.run() + + +if __name__ == "__main__": + run_package()