PartitionedDataSet utilisé pour transformer un lot de données

develop
gwen 3 years ago
parent 5002efb631
commit f4d14cd68c

@ -47,6 +47,24 @@ preprocess_html:
filepath: data/02_intermediate/xml/Anjou/anj_is_i_1441_08_05a.html
# _________________________________________________________________________
# same test with kedro.io.PartitionedDataSet
load_full_xml_catalog:
type: PartitionedDataSet
path: data/01_raw/xml/Anjou/
dataset:
type: actesdataset.XMLDataSet
filename_suffix: '.xml'
preprocess_full_catalog_html:
type: PartitionedDataSet
path: data/02_intermediate/xml/Anjou/
dataset:
type: actesdataset.XMLDataSet
filename_suffix: '.html'
# _________________________________________________________________________
preprocessed_actors:
type: pandas.CSVDataSet

@ -48,6 +48,7 @@ extensions = [
"sphinx.ext.coverage",
"sphinx.ext.ifconfig",
"sphinx.ext.viewcode",
"myst_parser",
#"nbsphinx",
#"sphinx_copybutton",
]

@ -246,6 +246,14 @@
"cleaned_actors.iloc[9]"
]
},
{
"cell_type": "markdown",
"id": "ee287f62",
"metadata": {},
"source": [
"## Autres catalogues"
]
},
{
"cell_type": "code",
"execution_count": 1,
@ -262,8 +270,11 @@
" 'preprocessed_dataset_test',\n",
" 'load_xml',\n",
" 'preprocess_html',\n",
" 'load_full_xml_catalog',\n",
" 'preprocess_full_catalog_html',\n",
" 'preprocessed_actors',\n",
" 'parameters']"
" 'parameters',\n",
" 'params:xlststylesheet']"
]
},
"execution_count": 1,
@ -308,6 +319,122 @@
"source": [
"catalog.load(\"load_xml\")"
]
},
{
"cell_type": "markdown",
"id": "a46ddef9",
"metadata": {},
"source": [
"## PartitionedDataset catalogs"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "96a60999",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">[06/22/23 15:01:39] </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO </span> Loading data from <span style=\"color: #008000; text-decoration-color: #008000\">'load_full_xml_catalog'</span> <span style=\"font-weight: bold\">(</span>PartitionedDataSet<span style=\"font-weight: bold\">)</span><span style=\"color: #808000; text-decoration-color: #808000\">...</span> <a href=\"file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">data_catalog.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py#345\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">345</span></a>\n",
"</pre>\n"
],
"text/plain": [
"\u001b[2;36m[06/22/23 15:01:39]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loading data from \u001b[32m'load_full_xml_catalog'\u001b[0m \u001b[1m(\u001b[0mPartitionedDataSet\u001b[1m)\u001b[0m\u001b[33m...\u001b[0m \u001b]8;id=663642;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py\u001b\\\u001b[2mdata_catalog.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=709654;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py#345\u001b\\\u001b[2m345\u001b[0m\u001b]8;;\u001b\\\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO </span> Loading data from <span style=\"color: #008000; text-decoration-color: #008000\">'load_full_xml_catalog'</span> <span style=\"font-weight: bold\">(</span>PartitionedDataSet<span style=\"font-weight: bold\">)</span><span style=\"color: #808000; text-decoration-color: #808000\">...</span> <a href=\"file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">data_catalog.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py#345\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">345</span></a>\n",
"</pre>\n"
],
"text/plain": [
"\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loading data from \u001b[32m'load_full_xml_catalog'\u001b[0m \u001b[1m(\u001b[0mPartitionedDataSet\u001b[1m)\u001b[0m\u001b[33m...\u001b[0m \u001b]8;id=916916;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py\u001b\\\u001b[2mdata_catalog.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=129179;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py#345\u001b\\\u001b[2m345\u001b[0m\u001b]8;;\u001b\\\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"{'anj_is_i_1441_08_05a': <bound method AbstractDataSet.load of <actesdataset.XMLDataSet object at 0x7fa3f7e16df0>>,\n",
" 'anj_lo_i_1360_08a': <bound method AbstractDataSet.load of <actesdataset.XMLDataSet object at 0x7fa3f7dd9700>>,\n",
" 'anj_lo_i_1371_07_08a': <bound method AbstractDataSet.load of <actesdataset.XMLDataSet object at 0x7fa3f7dd96a0>>,\n",
" 'anj_lo_ii_1401_04_28a': <bound method AbstractDataSet.load of <actesdataset.XMLDataSet object at 0x7fa3f7dd9430>>,\n",
" 'anj_lo_ii_1402_11_07a': <bound method AbstractDataSet.load of <actesdataset.XMLDataSet object at 0x7fa3f7dd92b0>>,\n",
" 'anj_lo_ii_1405_05_02a': <bound method AbstractDataSet.load of <actesdataset.XMLDataSet object at 0x7fa3f7dd9340>>,\n",
" 'anj_lo_ii_1406_01_26a': <bound method AbstractDataSet.load of <actesdataset.XMLDataSet object at 0x7fa3f7dd93d0>>,\n",
" 'anj_lo_ii_1406_04_15a': <bound method AbstractDataSet.load of <actesdataset.XMLDataSet object at 0x7fa3f7dd94c0>>,\n",
" 'anj_lo_ii_1409_08_07a': <bound method AbstractDataSet.load of <actesdataset.XMLDataSet object at 0x7fa3f7dd94f0>>,\n",
" 'anj_lo_ii_1409_12_12a': <bound method AbstractDataSet.load of <actesdataset.XMLDataSet object at 0x7fa3f7dd9520>>,\n",
" 'anj_lo_ii_1413_03_01a': <bound method AbstractDataSet.load of <actesdataset.XMLDataSet object at 0x7fa3f7dd9610>>,\n",
" 'anj_lo_iii_1420_11_04a': <bound method AbstractDataSet.load of <actesdataset.XMLDataSet object at 0x7fa3f7dd9580>>,\n",
" 'anj_lo_iii_1422_02_09a': <bound method AbstractDataSet.load of <actesdataset.XMLDataSet object at 0x7fa3f7dd95b0>>,\n",
" 'anj_lo_iii_1424_03_31a': <bound method AbstractDataSet.load of <actesdataset.XMLDataSet object at 0x7fa3f7dd9d00>>,\n",
" 'anj_lo_iii_1424_03_31b': <bound method AbstractDataSet.load of <actesdataset.XMLDataSet object at 0x7fa3f7dd9c40>>,\n",
" 'anj_lo_iii_1428_06_07a': <bound method AbstractDataSet.load of <actesdataset.XMLDataSet object at 0x7fa3f7dd9970>>,\n",
" 'anj_lo_iii_1428_06_07b': <bound method AbstractDataSet.load of <actesdataset.XMLDataSet object at 0x7fa3f7dd9940>>,\n",
" 'anj_lo_iii_1432_10_27a': <bound method AbstractDataSet.load of <actesdataset.XMLDataSet object at 0x7fa3f7dd9880>>,\n",
" 'anj_ma_i_1370_12_10a': <bound method AbstractDataSet.load of <actesdataset.XMLDataSet object at 0x7fa3f7dd9a90>>,\n",
" 'anj_re_i_1437_09_16a': <bound method AbstractDataSet.load of <actesdataset.XMLDataSet object at 0x7fa3f7dd9a30>>,\n",
" 'anj_re_i_1439_11_22a': <bound method AbstractDataSet.load of <actesdataset.XMLDataSet object at 0x7fa3f7dd9af0>>,\n",
" 'anj_re_i_1440_01_20a': <bound method AbstractDataSet.load of <actesdataset.XMLDataSet object at 0x7fa3f7dd9730>>,\n",
" 'anj_re_i_1445a': <bound method AbstractDataSet.load of <actesdataset.XMLDataSet object at 0x7fa3f7dd9b20>>,\n",
" 'anj_re_i_1450_11_07a': <bound method AbstractDataSet.load of <actesdataset.XMLDataSet object at 0x7fa3f7dd99d0>>,\n",
" 'anj_re_i_1454_01_14a': <bound method AbstractDataSet.load of <actesdataset.XMLDataSet object at 0x7fa3f7dd99a0>>,\n",
" 'anj_re_i_1454_02_09a': <bound method AbstractDataSet.load of <actesdataset.XMLDataSet object at 0x7fa3f7dd97f0>>,\n",
" 'anj_re_i_1454_06_17a': <bound method AbstractDataSet.load of <actesdataset.XMLDataSet object at 0x7fa3f7dd9d30>>,\n",
" 'anj_re_i_1454_09_01a': <bound method AbstractDataSet.load of <actesdataset.XMLDataSet object at 0x7fa3f7dd9d90>>,\n",
" 'anj_re_i_1455_11_13a': <bound method AbstractDataSet.load of <actesdataset.XMLDataSet object at 0x7fa3f7dd9df0>>,\n",
" 'anj_re_i_1456_11_29a': <bound method AbstractDataSet.load of <actesdataset.XMLDataSet object at 0x7fa3f7dd9e50>>,\n",
" 'anj_re_i_1457_01_04a': <bound method AbstractDataSet.load of <actesdataset.XMLDataSet object at 0x7fa3f7dd9eb0>>,\n",
" 'anj_re_i_1459_03_17a': <bound method AbstractDataSet.load of <actesdataset.XMLDataSet object at 0x7fa3f7dd9f10>>,\n",
" 'anj_re_i_1459_04_16a': <bound method AbstractDataSet.load of <actesdataset.XMLDataSet object at 0x7fa3f7dd9f70>>,\n",
" 'anj_re_i_1463_07_21a': <bound method AbstractDataSet.load of <actesdataset.XMLDataSet object at 0x7fa3f7dd9fd0>>,\n",
" 'anj_re_i_1466_12_16a': <bound method AbstractDataSet.load of <actesdataset.XMLDataSet object at 0x7fa3f7ddb070>>,\n",
" 'anj_re_i_1474_02_01a': <bound method AbstractDataSet.load of <actesdataset.XMLDataSet object at 0x7fa3f7ddb0d0>>,\n",
" 'anj_re_i_1475_05_26a': <bound method AbstractDataSet.load of <actesdataset.XMLDataSet object at 0x7fa3f7ddb130>>,\n",
" 'anj_yo_i_1418_12_20a': <bound method AbstractDataSet.load of <actesdataset.XMLDataSet object at 0x7fa3f7ddb190>>,\n",
" 'anj_yo_i_1421_06_28a': <bound method AbstractDataSet.load of <actesdataset.XMLDataSet object at 0x7fa3f7ddb1f0>>,\n",
" 'anj_yo_i_1442_02_24a': <bound method AbstractDataSet.load of <actesdataset.XMLDataSet object at 0x7fa3f7ddb250>>}"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"partitions = catalog.load('load_full_xml_catalog')\n",
"catalog.load('load_full_xml_catalog')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "bdc37079",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<bound method AbstractDataSet.load of <actesdataset.XMLDataSet object at 0x7faad403c550>>"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"partitions['anj_is_i_1441_08_05a']"
]
}
],
"metadata": {

@ -1,14 +1,7 @@
from lxml import etree
#from pathlib import Path
## path and file configuration
#_here = Path(__file__).resolve().parent
#xsl_stylesheet = _here / "actes_princiers.xsl"
def parse_xsl(source_doc, xlststylesheet):
#<class 'lxml.etree._XSLTResultTree'>
#'write', ou 'write_output
# FIXME recuperer la feuille de style xsl
xslt_doc = etree.parse(xlststylesheet)
xslt_transformer = etree.XSLT(xslt_doc)
return str(xslt_transformer(source_doc))

@ -1,9 +1,110 @@
from kedro.pipeline import Pipeline, node, pipeline
from .nodes import parse_xsl
from actesdataset import XMLDataSet
from kedro.io import PartitionedDataSet
#from kedro.framework.session import KedroSession
#from kedro.context import KedroContext, load_context
#with KedroSession.create() as session:
# context = session.load_context()
# catalog = context.catalog
#catalog.add(data_set_name="mon_test_de_catalogue",
# data_set=XMLDataSet(
# filepath="data/02_intermediate/xml/Anjou/test.dat",
# ),
# replace=True,
# )
#print(catalog.list())
#def from_dict(dico):
# data_set = PartitionedDataSet(
# path="df_with_partition",
# dataset="pandas.CSVDataset",
# filename_suffix=".csv"
# )
# return data_set.save(dict_df)
#def parse_xsl_factory(dico):
# pdataset = catalog.load("preprocess_full_catalog_html")
# pdataset.save(dico)
# return pataset
#def create_pipeline(partitioned_dict):
# datasets = list(partitioned_dict.keys())
# for dataset in datasets :
# yield Pipeline([
# node(
# func=parse_xsl,
# inputs= dataset,
# outputs="combined_data",
# name="combined_data"
# ),
# ...#other nodes
# ])
##def preprocess_pipeline():
## for catalog_name, catalog in catalog.load('load_full_xml_catalog').items():
## yield node(func=parse_xsl,
## inputs= [catalog_name, "params:xlststylesheet"],
## outputs = ,
## name = "preprocess_html" + catalog_name
## )
# load_full_xml_catalog
# preprocess_full_catalog_html
# inputs=["load_full_xml_catalog", "params:xlststylesheet"],
# outputs="preprocess_full_catalog_html",
def nodes_factory():
nodes = []
input_catalog = catalog.load("load_full_xml_catalog")
#output_catalog = catalog.load("preprocess_full_catalog_html")
for in_catalog_key, in_catalog_value in input_catalog.items():
# adding programmatically an input catalog entry
input_catalog_name = "load_full_xml_catalog" + in_catalog_key
# FIXME : à récuperer du catalogue Patitioned
input_filepath = "data/01_raw/xml/Anjou/" + in_catalog_key + ".html"
catalog.add(data_set_name=input_catalog_name,
data_set=XMLDataSet(filepath=input_filepath),
replace=True)
# if input_catalog_name in catalog.list():
# print("OK")
# else:
# print("NOK")
# adding programmatically an output catalog entry
output_catalog_name = "preprocess_full_catalog_html" + in_catalog_key
# FIXME : à récuperer du catalogue Patitioned
output_filepath = "data/02_intermediate/xml/Anjou/" + in_catalog_key + ".html"
catalog.add(data_set_name=output_catalog_name,
data_set=XMLDataSet(filepath=output_filepath),
replace=True)
# constructing the node programmatically
nodes.append(node(
func=parse_xsl,
inputs=[input_catalog_name, "params:xlststylesheet"],
outputs=output_catalog_name,
name=in_catalog_key,
tags="xsl",
))
# XXX
# context.catalog = catalog
return nodes
#nodes = nodes_factory()
def create_pipeline(**kwargs) -> Pipeline:
def create_pipeline(**kwargs):
return pipeline(
[
node(

@ -0,0 +1,36 @@
"Application entry point"
from pathlib import Path
from typing import Dict
from kedro.framework.context import KedroContext, load_package_context
from kedro.pipeline import Pipeline
from actes_princiers.pipeline_registry import register_pipelines
#bnhm.pipeline import create_pipelines
class ProjectContext(KedroContext):
project_name = "actes princiers"
project_version = "0.1"
package_name = "actes_princiers"
def _get_pipelines(self) -> Dict[str, Pipeline]:
# return create_pipelines()
return register_pipelines()
def _get_catalog(self, *args, **kwargs):
catalog = super()._get_catalog(*args, **kwargs)
return catalog
def run_package():
# Entry point for running a Kedro project packaged with `kedro package`
# using `python -m <project_package>.run` command.
project_context = load_package_context(
project_path=Path.cwd(), package_name=Path(__file__).resolve().parent.name
)
project_context.run()
if __name__ == "__main__":
run_package()
Loading…
Cancel
Save