{ "cells": [ { "cell_type": "markdown", "id": "aeacd24e", "metadata": {}, "source": [ "# Catalogs\n", "\n", "## Chargement des actors" ] }, { "cell_type": "code", "execution_count": 2, "id": "ae9bc24c", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮\n", "│ in <module>:1 │\n", "│ │\n", "│ ❱ 1 catalog │\n", "│ 2 dir(catalog) │\n", "│ 3 │\n", "╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n", "NameError: name 'catalog' is not defined\n", "\n" ], "text/plain": [ "\u001b[31m╭─\u001b[0m\u001b[31m──────────────────────────────\u001b[0m\u001b[31m \u001b[0m\u001b[1;31mTraceback \u001b[0m\u001b[1;2;31m(most recent call last)\u001b[0m\u001b[31m \u001b[0m\u001b[31m───────────────────────────────\u001b[0m\u001b[31m─╮\u001b[0m\n", "\u001b[31m│\u001b[0m in \u001b[92m
[06/30/23 17:50:49] INFO Loading data from 'xmlreflector' (XMLHousesReflector)... data_catalog.py:345\n", "\n" ], "text/plain": [ "\u001b[2;36m[06/30/23 17:50:49]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loading data from \u001b[32m'xmlreflector'\u001b[0m \u001b[1m(\u001b[0mXMLHousesReflector\u001b[1m)\u001b[0m\u001b[33m...\u001b[0m \u001b]8;id=287074;file:///home/gwen/.local/lib/python3.10/site-packages/kedro/io/data_catalog.py\u001b\\\u001b[2mdata_catalog.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=134334;file:///home/gwen/.local/lib/python3.10/site-packages/kedro/io/data_catalog.py#345\u001b\\\u001b[2m345\u001b[0m\u001b]8;;\u001b\\\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮\n", "│ /home/gwen/.local/lib/python3.10/site-packages/kedro/io/core.py:187 in load │\n", "│ │\n", "│ 184 │ │ self._logger.debug(\"Loading %s\", str(self)) │\n", "│ 185 │ │ │\n", "│ 186 │ │ try: │\n", "│ ❱ 187 │ │ │ return self._load() │\n", "│ 188 │ │ except DataSetError: │\n", "│ 189 │ │ │ raise │\n", "│ 190 │ │ except Exception as exc: │\n", "│ │\n", "│ /media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/actes-princiers/src/actesdataset │\n", "│ .py:62 in _load │\n", "│ │\n", "│ 59 │ │ self.filepath = filepath │\n", "│ 60 │ │\n", "│ 61 │ def _load(self): │\n", "│ ❱ 62 │ │ raise \"C'est chargé!\" │\n", "│ 63 │ │\n", "│ 64 │ def _save(self): │\n", "│ 65 │ │ raise NotImplementedError(\"Attention : dataset en lecture seule !\") │\n", "╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n", "TypeError: exceptions must derive from BaseException\n", "\n", "The above exception was the direct cause of the following exception:\n", "\n", "╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮\n", "│ /tmp/ipykernel_28884/4226322454.py:1 in <module> │\n", "│ │\n", "│ [Errno 2] No such file or directory: '/tmp/ipykernel_28884/4226322454.py' │\n", "│ │\n", "│ /home/gwen/.local/lib/python3.10/site-packages/kedro/io/data_catalog.py:349 in load │\n", "│ │\n", "│ 346 │ │ │ \"Loading data from '%s' (%s)...\", name, type(dataset).__name__ │\n", "│ 347 │ │ ) │\n", "│ 348 │ │ │\n", "│ ❱ 349 │ │ result = dataset.load() │\n", "│ 350 │ │ │\n", "│ 351 │ │ return result │\n", "│ 352 │\n", "│ │\n", "│ /home/gwen/.local/lib/python3.10/site-packages/kedro/io/core.py:196 in load │\n", "│ │\n", "│ 193 │ │ │ message = ( │\n", "│ 194 │ │ │ │ f\"Failed while loading data from data set {str(self)}.\\n{str(exc)}\" │\n", "│ 195 │ │ │ ) │\n", "│ ❱ 196 │ │ │ raise DataSetError(message) from exc 
│\n", "│ 197 │ │\n", "│ 198 │ def save(self, data: _DI) -> None: │\n", "│ 199 │ │ \"\"\"Saves data by delegation to the provided save method. │\n", "╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n", "DataSetError: Failed while loading data from data set XMLHousesReflector(name=my own dataset).\n", "exceptions must derive from BaseException\n", "\n" ], "text/plain": [ "\u001b[31m╭─\u001b[0m\u001b[31m────────────────────────────── \u001b[0m\u001b[1;31mTraceback \u001b[0m\u001b[1;2;31m(most recent call last)\u001b[0m\u001b[31m ───────────────────────────────\u001b[0m\u001b[31m─╮\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[2;33m/home/gwen/.local/lib/python3.10/site-packages/kedro/io/\u001b[0m\u001b[1;33mcore.py\u001b[0m:\u001b[94m187\u001b[0m in \u001b[92mload\u001b[0m \u001b[31m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[2m184 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[96mself\u001b[0m._logger.debug(\u001b[33m\"\u001b[0m\u001b[33mLoading \u001b[0m\u001b[33m%s\u001b[0m\u001b[33m\"\u001b[0m, \u001b[96mstr\u001b[0m(\u001b[96mself\u001b[0m)) \u001b[31m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[2m185 \u001b[0m\u001b[2m│ │ \u001b[0m \u001b[31m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[2m186 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mtry\u001b[0m: \u001b[31m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m187 \u001b[2m│ │ │ \u001b[0m\u001b[94mreturn\u001b[0m \u001b[96mself\u001b[0m._load() \u001b[31m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[2m188 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mexcept\u001b[0m DataSetError: \u001b[31m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[2m189 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mraise\u001b[0m \u001b[31m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[2m190 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mexcept\u001b[0m \u001b[96mException\u001b[0m \u001b[94mas\u001b[0m exc: \u001b[31m│\u001b[0m\n", "\u001b[31m│\u001b[0m 
\u001b[31m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[2;33m/media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/actes-princiers/src/\u001b[0m\u001b[1;33mactesdataset\u001b[0m \u001b[31m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[1;33m.py\u001b[0m:\u001b[94m62\u001b[0m in \u001b[92m_load\u001b[0m \u001b[31m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[2m 59 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[96mself\u001b[0m.filepath = filepath \u001b[31m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[2m 60 \u001b[0m\u001b[2m│ \u001b[0m \u001b[31m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[2m 61 \u001b[0m\u001b[2m│ \u001b[0m\u001b[94mdef\u001b[0m \u001b[92m_load\u001b[0m(\u001b[96mself\u001b[0m): \u001b[31m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m 62 \u001b[2m│ │ \u001b[0m\u001b[94mraise\u001b[0m \u001b[33m\"\u001b[0m\u001b[33mC\u001b[0m\u001b[33m'\u001b[0m\u001b[33mest chargé!\u001b[0m\u001b[33m\"\u001b[0m \u001b[31m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[2m 63 \u001b[0m\u001b[2m│ \u001b[0m \u001b[31m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[2m 64 \u001b[0m\u001b[2m│ \u001b[0m\u001b[94mdef\u001b[0m \u001b[92m_save\u001b[0m(\u001b[96mself\u001b[0m): \u001b[31m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[2m 65 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mraise\u001b[0m \u001b[96mNotImplementedError\u001b[0m(\u001b[33m\"\u001b[0m\u001b[33mAttention : dataset en lecture seule !\u001b[0m\u001b[33m\"\u001b[0m) \u001b[31m│\u001b[0m\n", "\u001b[31m╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n", "\u001b[1;91mTypeError: \u001b[0mexceptions must derive from BaseException\n", "\n", "\u001b[3mThe above exception was the direct cause of the following exception:\u001b[0m\n", "\n", "\u001b[31m╭─\u001b[0m\u001b[31m────────────────────────────── \u001b[0m\u001b[1;31mTraceback \u001b[0m\u001b[1;2;31m(most recent call last)\u001b[0m\u001b[31m 
───────────────────────────────\u001b[0m\u001b[31m─╮\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[2;33m/tmp/ipykernel_28884/\u001b[0m\u001b[1;33m4226322454.py\u001b[0m:\u001b[94m1\u001b[0m in \u001b[92m
[06/16/23 15:56:44] INFO Loading data from 'actors' (CSVDataSet)... data_catalog.py:345\n", "\n" ], "text/plain": [ "\u001b[2;36m[06/16/23 15:56:44]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loading data from \u001b[32m'actors'\u001b[0m \u001b[1m(\u001b[0mCSVDataSet\u001b[1m)\u001b[0m\u001b[33m...\u001b[0m \u001b]8;id=858812;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py\u001b\\\u001b[2mdata_catalog.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=44255;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py#345\u001b\\\u001b[2m345\u001b[0m\u001b]8;;\u001b\\\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
| \n", " | NAME | \n", "ROLE | \n", "HOUSE | \n", "DATE1 | \n", "DATE2 | \n", "DATE3 | \n", "
|---|---|---|---|---|---|---|
| 0 | \n", "Charles Ier de Bourbon | \n", "prince | \n", "Bourbon | \n", "1400 | \n", "1434.0 | \n", "1456.0 | \n", "
| 1 | \n", "Gort, Étienne | \n", "secret | \n", "Bourbon | \n", "1425 | \n", "1440.0 | \n", "NaN | \n", "
| 2 | \n", "Erart | \n", "secret | \n", "Berry | \n", "1404 | \n", "1405.0 | \n", "NaN | \n", "
| 3 | \n", "Jean de Berry | \n", "prince | \n", "Berry | \n", "1337 | \n", "1360.0 | \n", "1416.0 | \n", "
| 4 | \n", "Agnès de Bourgogne | \n", "prince | \n", "Bourbon | \n", "1407 | \n", "1434.0 | \n", "1476.0 | \n", "
[06/16/23 14:58:30] INFO Loading data from 'actors' (CSVDataSet)... data_catalog.py:345\n", "\n" ], "text/plain": [ "\u001b[2;36m[06/16/23 14:58:30]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loading data from \u001b[32m'actors'\u001b[0m \u001b[1m(\u001b[0mCSVDataSet\u001b[1m)\u001b[0m\u001b[33m...\u001b[0m \u001b]8;id=659228;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py\u001b\\\u001b[2mdata_catalog.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=160900;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py#345\u001b\\\u001b[2m345\u001b[0m\u001b]8;;\u001b\\\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "['NAME', 'ROLE', 'HOUSE', 'DATE1', 'DATE2', 'DATE3']" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "actors = catalog.load(\"actors\")\n", "actors.columns.tolist()" ] }, { "cell_type": "markdown", "id": "902dd387", "metadata": {}, "source": [ "## Nettoyage des valeurs non renseignées\n", "\n", "Ligne d'origine (ligne 9) : \n", "`\"René d'Anjou\";\"prince\";\"Anjou\";\"XXXX\";\"XXXX\";\"XXXX\"`\n" ] }, { "cell_type": "code", "execution_count": 37, "id": "24fc62ce", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "NAME Bernard d'Armagnac\n", "ROLE prince\n", "HOUSE Armagnac\n", "DATE1 NaN\n", "DATE2 NaN\n", "DATE3 NaN\n", "Name: 9, dtype: object" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import numpy as np\n", "\n", "# Turn the \"XXXX\" placeholder found in the raw CSV into a real missing value.\n", "# `np.nan` is the canonical spelling; the `np.NaN` alias was removed in NumPy 2.0.\n", "# `replace` returns a new frame, so `actors` itself is left untouched.\n", "cleaned_actors = actors.replace(\"XXXX\", np.nan)\n", "\n", "# Show one cleaned row to check the substitution.\n", "cleaned_actors.iloc[9]" ] }, { "cell_type": "markdown", "id": "ee287f62", "metadata": {}, "source": [ "## Autres catalogues" ] }, { "cell_type": "code", "execution_count": 1, "id": "053ed17c", "metadata": {}, "outputs": [ {
"data": { "text/plain": [ "['actors',\n", " 'corpus-agnes-bourgogne',\n", " 'corpus-charles-i',\n", " 'dataset_test',\n", " 'preprocessed_dataset_test',\n", " 'load_xml',\n", " 'preprocess_html',\n", " 'load_full_xml_catalog',\n", " 'preprocess_full_catalog_html',\n", " 'preprocessed_actors',\n", " 'parameters',\n", " 'params:xlststylesheet']" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "catalog.list()" ] }, { "cell_type": "code", "execution_count": 2, "id": "660b898c", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
[06/20/23 16:44:19] INFO Loading data from 'load_xml' (XMLDataSet)... data_catalog.py:345\n", "\n" ], "text/plain": [ "\u001b[2;36m[06/20/23 16:44:19]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loading data from \u001b[32m'load_xml'\u001b[0m \u001b[1m(\u001b[0mXMLDataSet\u001b[1m)\u001b[0m\u001b[33m...\u001b[0m \u001b]8;id=813727;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py\u001b\\\u001b[2mdata_catalog.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=696103;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py#345\u001b\\\u001b[2m345\u001b[0m\u001b]8;;\u001b\\\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "
[06/22/23 15:01:39] INFO Loading data from 'load_full_xml_catalog' (PartitionedDataSet)... data_catalog.py:345\n", "\n" ], "text/plain": [ "\u001b[2;36m[06/22/23 15:01:39]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loading data from \u001b[32m'load_full_xml_catalog'\u001b[0m \u001b[1m(\u001b[0mPartitionedDataSet\u001b[1m)\u001b[0m\u001b[33m...\u001b[0m \u001b]8;id=663642;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py\u001b\\\u001b[2mdata_catalog.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=709654;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py#345\u001b\\\u001b[2m345\u001b[0m\u001b]8;;\u001b\\\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
INFO Loading data from 'load_full_xml_catalog' (PartitionedDataSet)... data_catalog.py:345\n", "\n" ], "text/plain": [ "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loading data from \u001b[32m'load_full_xml_catalog'\u001b[0m \u001b[1m(\u001b[0mPartitionedDataSet\u001b[1m)\u001b[0m\u001b[33m...\u001b[0m \u001b]8;id=916916;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py\u001b\\\u001b[2mdata_catalog.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=129179;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py#345\u001b\\\u001b[2m345\u001b[0m\u001b]8;;\u001b\\\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "{'anj_is_i_1441_08_05a':