{ "cells": [ { "cell_type": "markdown", "id": "aeacd24e", "metadata": {}, "source": [ "# Catalogs\n", "\n", "## Chargement des actors" ] }, { "cell_type": "code", "execution_count": 4, "id": "ae9bc24c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "catalog" ] }, { "cell_type": "code", "execution_count": 38, "id": "dc290e93", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
[06/16/23 15:56:44] INFO     Loading data from 'actors' (CSVDataSet)...                         data_catalog.py:345\n",
       "
\n" ], "text/plain": [ "\u001b[2;36m[06/16/23 15:56:44]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loading data from \u001b[32m'actors'\u001b[0m \u001b[1m(\u001b[0mCSVDataSet\u001b[1m)\u001b[0m\u001b[33m...\u001b[0m \u001b]8;id=858812;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py\u001b\\\u001b[2mdata_catalog.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=44255;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py#345\u001b\\\u001b[2m345\u001b[0m\u001b]8;;\u001b\\\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NAMEROLEHOUSEDATE1DATE2DATE3
0Charles Ier de BourbonprinceBourbon14001434.01456.0
1Gort, ÉtiennesecretBourbon14251440.0NaN
2ErartsecretBerry14041405.0NaN
3Jean de BerryprinceBerry13371360.01416.0
4Agnès de BourgogneprinceBourbon14071434.01476.0
\n", "
" ], "text/plain": [ " NAME ROLE HOUSE DATE1 DATE2 DATE3\n", "0 Charles Ier de Bourbon prince Bourbon 1400 1434.0 1456.0\n", "1 Gort, Étienne secret Bourbon 1425 1440.0 NaN\n", "2 Erart secret Berry 1404 1405.0 NaN\n", "3 Jean de Berry prince Berry 1337 1360.0 1416.0\n", "4 Agnès de Bourgogne prince Bourbon 1407 1434.0 1476.0" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "catalog.load(\"actors\").head()" ] }, { "cell_type": "code", "execution_count": 5, "id": "eedbc7fb", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['actors', 'corpus-agnes-bourgogne', 'corpus-charles-i', 'parameters']" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "catalog.list()" ] }, { "cell_type": "code", "execution_count": 20, "id": "3168935f", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
[06/16/23 14:58:30] INFO     Loading data from 'actors' (CSVDataSet)...                         data_catalog.py:345\n",
       "
\n" ], "text/plain": [ "\u001b[2;36m[06/16/23 14:58:30]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loading data from \u001b[32m'actors'\u001b[0m \u001b[1m(\u001b[0mCSVDataSet\u001b[1m)\u001b[0m\u001b[33m...\u001b[0m \u001b]8;id=659228;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py\u001b\\\u001b[2mdata_catalog.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=160900;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py#345\u001b\\\u001b[2m345\u001b[0m\u001b]8;;\u001b\\\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "['NAME', 'ROLE', 'HOUSE', 'DATE1', 'DATE2', 'DATE3']" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "actors = catalog.load(\"actors\")\n", "actors.columns.tolist()" ] }, { "cell_type": "markdown", "id": "902dd387", "metadata": {}, "source": [ "## Nettoyage des valeurs non renseignées\n", "\n", "Ligne d'origine (ligne 9) : \n", "`\"René d'Anjou\";\"prince\";\"Anjou\";\"XXXX\";\"XXXX\";\"XXXX\"`\n" ] }, { "cell_type": "code", "execution_count": 37, "id": "24fc62ce", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "NAME Bernard d'Armagnac\n", "ROLE prince\n", "HOUSE Armagnac\n", "DATE1 NaN\n", "DATE2 NaN\n", "DATE3 NaN\n", "Name: 9, dtype: object" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import numpy as np\n", "\n", "# Map the \"XXXX\" placeholder to a real missing value. `np.nan` is the canonical\n", "# spelling; the `np.NaN` alias was removed in NumPy 2.0.\n", "cleaned_actors = actors.replace(\"XXXX\", np.nan)\n", "\n", "# NOTE(review): the markdown above quotes the René d'Anjou row, but position 9\n", "# displays Bernard d'Armagnac - confirm which row is meant.\n", "cleaned_actors.iloc[9]" ] }, { "cell_type": "markdown", "id": "ee287f62", "metadata": {}, "source": [ "## Autres catalogues" ] }, { "cell_type": "code", "execution_count": 1, "id": "053ed17c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['actors',\n", " 'corpus-agnes-bourgogne',\n", " 
'corpus-charles-i',\n", " 'dataset_test',\n", " 'preprocessed_dataset_test',\n", " 'load_xml',\n", " 'preprocess_html',\n", " 'load_full_xml_catalog',\n", " 'preprocess_full_catalog_html',\n", " 'preprocessed_actors',\n", " 'parameters',\n", " 'params:xlststylesheet']" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "catalog.list()" ] }, { "cell_type": "code", "execution_count": 2, "id": "660b898c", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
[06/20/23 16:44:19] INFO     Loading data from 'load_xml' (XMLDataSet)...                       data_catalog.py:345\n",
       "
\n" ], "text/plain": [ "\u001b[2;36m[06/20/23 16:44:19]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loading data from \u001b[32m'load_xml'\u001b[0m \u001b[1m(\u001b[0mXMLDataSet\u001b[1m)\u001b[0m\u001b[33m...\u001b[0m \u001b]8;id=813727;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py\u001b\\\u001b[2mdata_catalog.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=696103;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py#345\u001b\\\u001b[2m345\u001b[0m\u001b]8;;\u001b\\\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "catalog.load(\"load_xml\")" ] }, { "cell_type": "markdown", "id": "a46ddef9", "metadata": {}, "source": [ "## PartitionedDataset catalogs" ] }, { "cell_type": "code", "execution_count": 1, "id": "96a60999", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
[06/22/23 15:01:39] INFO     Loading data from 'load_full_xml_catalog' (PartitionedDataSet)...  data_catalog.py:345\n",
       "
\n" ], "text/plain": [ "\u001b[2;36m[06/22/23 15:01:39]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loading data from \u001b[32m'load_full_xml_catalog'\u001b[0m \u001b[1m(\u001b[0mPartitionedDataSet\u001b[1m)\u001b[0m\u001b[33m...\u001b[0m \u001b]8;id=663642;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py\u001b\\\u001b[2mdata_catalog.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=709654;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py#345\u001b\\\u001b[2m345\u001b[0m\u001b]8;;\u001b\\\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
                    INFO     Loading data from 'load_full_xml_catalog' (PartitionedDataSet)...  data_catalog.py:345\n",
       "
\n" ], "text/plain": [ "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loading data from \u001b[32m'load_full_xml_catalog'\u001b[0m \u001b[1m(\u001b[0mPartitionedDataSet\u001b[1m)\u001b[0m\u001b[33m...\u001b[0m \u001b]8;id=916916;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py\u001b\\\u001b[2mdata_catalog.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=129179;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py#345\u001b\\\u001b[2m345\u001b[0m\u001b]8;;\u001b\\\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "{'anj_is_i_1441_08_05a': >,\n", " 'anj_lo_i_1360_08a': >,\n", " 'anj_lo_i_1371_07_08a': >,\n", " 'anj_lo_ii_1401_04_28a': >,\n", " 'anj_lo_ii_1402_11_07a': >,\n", " 'anj_lo_ii_1405_05_02a': >,\n", " 'anj_lo_ii_1406_01_26a': >,\n", " 'anj_lo_ii_1406_04_15a': >,\n", " 'anj_lo_ii_1409_08_07a': >,\n", " 'anj_lo_ii_1409_12_12a': >,\n", " 'anj_lo_ii_1413_03_01a': >,\n", " 'anj_lo_iii_1420_11_04a': >,\n", " 'anj_lo_iii_1422_02_09a': >,\n", " 'anj_lo_iii_1424_03_31a': >,\n", " 'anj_lo_iii_1424_03_31b': >,\n", " 'anj_lo_iii_1428_06_07a': >,\n", " 'anj_lo_iii_1428_06_07b': >,\n", " 'anj_lo_iii_1432_10_27a': >,\n", " 'anj_ma_i_1370_12_10a': >,\n", " 'anj_re_i_1437_09_16a': >,\n", " 'anj_re_i_1439_11_22a': >,\n", " 'anj_re_i_1440_01_20a': >,\n", " 'anj_re_i_1445a': >,\n", " 'anj_re_i_1450_11_07a': >,\n", " 'anj_re_i_1454_01_14a': >,\n", " 'anj_re_i_1454_02_09a': >,\n", " 'anj_re_i_1454_06_17a': >,\n", " 'anj_re_i_1454_09_01a': >,\n", " 'anj_re_i_1455_11_13a': >,\n", " 'anj_re_i_1456_11_29a': >,\n", " 'anj_re_i_1457_01_04a': >,\n", " 'anj_re_i_1459_03_17a': >,\n", " 'anj_re_i_1459_04_16a': >,\n", " 'anj_re_i_1463_07_21a': >,\n", " 'anj_re_i_1466_12_16a': >,\n", " 'anj_re_i_1474_02_01a': >,\n", " 'anj_re_i_1475_05_26a': >,\n", " 
'anj_yo_i_1418_12_20a': >,\n", " 'anj_yo_i_1421_06_28a': >,\n", " 'anj_yo_i_1442_02_24a': >}" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Load once and reuse: the mapping is displayed from the variable instead of\n", "# triggering a second catalog load.\n", "partitions = catalog.load('load_full_xml_catalog')\n", "partitions" ] }, { "cell_type": "code", "execution_count": 4, "id": "bdc37079", "metadata": {}, "outputs": [ { "data": { "text/plain": [ ">" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "partitions['anj_is_i_1441_08_05a']" ] } ], "metadata": { "kernelspec": { "display_name": "Kedro (actes_princiers)", "language": "python", "name": "kedro_actes_princiers" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.2" } }, "nbformat": 4, "nbformat_minor": 5 }