{
"cells": [
{
"cell_type": "markdown",
"id": "aeacd24e",
"metadata": {},
"source": [
"# Catalogs\n",
"\n",
"## Chargement des actors"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "ae9bc24c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<kedro.io.data_catalog.DataCatalog at 0x7fbafd365970>"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"catalog"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "dc290e93",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">[06/16/23 15:56:44] </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO </span> Loading data from <span style=\"color: #008000; text-decoration-color: #008000\">'actors'</span> <span style=\"font-weight: bold\">(</span>CSVDataSet<span style=\"font-weight: bold\">)</span><span style=\"color: #808000; text-decoration-color: #808000\">...</span> <a href=\"file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">data_catalog.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py#345\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">345</span></a>\n",
"</pre>\n"
],
"text/plain": [
"\u001b[2;36m[06/16/23 15:56:44]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loading data from \u001b[32m'actors'\u001b[0m \u001b[1m(\u001b[0mCSVDataSet\u001b[1m)\u001b[0m\u001b[33m...\u001b[0m \u001b]8;id=858812;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py\u001b\\\u001b[2mdata_catalog.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=44255;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py#345\u001b\\\u001b[2m345\u001b[0m\u001b]8;;\u001b\\\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>NAME</th>\n",
" <th>ROLE</th>\n",
" <th>HOUSE</th>\n",
" <th>DATE1</th>\n",
" <th>DATE2</th>\n",
" <th>DATE3</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Charles Ier de Bourbon</td>\n",
" <td>prince</td>\n",
" <td>Bourbon</td>\n",
" <td>1400</td>\n",
" <td>1434.0</td>\n",
" <td>1456.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Gort, Étienne</td>\n",
" <td>secret</td>\n",
" <td>Bourbon</td>\n",
" <td>1425</td>\n",
" <td>1440.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Erart</td>\n",
" <td>secret</td>\n",
" <td>Berry</td>\n",
" <td>1404</td>\n",
" <td>1405.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Jean de Berry</td>\n",
" <td>prince</td>\n",
" <td>Berry</td>\n",
" <td>1337</td>\n",
" <td>1360.0</td>\n",
" <td>1416.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Agnès de Bourgogne</td>\n",
" <td>prince</td>\n",
" <td>Bourbon</td>\n",
" <td>1407</td>\n",
" <td>1434.0</td>\n",
" <td>1476.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" NAME ROLE HOUSE DATE1 DATE2 DATE3\n",
"0 Charles Ier de Bourbon prince Bourbon 1400 1434.0 1456.0\n",
"1 Gort, Étienne secret Bourbon 1425 1440.0 NaN\n",
"2 Erart secret Berry 1404 1405.0 NaN\n",
"3 Jean de Berry prince Berry 1337 1360.0 1416.0\n",
"4 Agnès de Bourgogne prince Bourbon 1407 1434.0 1476.0"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"catalog.load(\"actors\").head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "eedbc7fb",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['actors', 'corpus-agnes-bourgogne', 'corpus-charles-i', 'parameters']"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"catalog.list()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "3168935f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">[06/16/23 14:58:30] </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO </span> Loading data from <span style=\"color: #008000; text-decoration-color: #008000\">'actors'</span> <span style=\"font-weight: bold\">(</span>CSVDataSet<span style=\"font-weight: bold\">)</span><span style=\"color: #808000; text-decoration-color: #808000\">...</span> <a href=\"file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">data_catalog.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py#345\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">345</span></a>\n",
"</pre>\n"
],
"text/plain": [
"\u001b[2;36m[06/16/23 14:58:30]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loading data from \u001b[32m'actors'\u001b[0m \u001b[1m(\u001b[0mCSVDataSet\u001b[1m)\u001b[0m\u001b[33m...\u001b[0m \u001b]8;id=659228;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py\u001b\\\u001b[2mdata_catalog.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=160900;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py#345\u001b\\\u001b[2m345\u001b[0m\u001b]8;;\u001b\\\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"['NAME', 'ROLE', 'HOUSE', 'DATE1', 'DATE2', 'DATE3']"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"actors = catalog.load(\"actors\")\n",
"actors.columns.tolist()"
]
},
{
"cell_type": "markdown",
"id": "902dd387",
"metadata": {},
"source": [
"## Nettoyage des valeurs non renseignées\n",
"\n",
"Ligne d'origine (ligne 9) : \n",
"`\"René d'Anjou\";\"prince\";\"Anjou\";\"XXXX\";\"XXXX\";\"XXXX\"`\n"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "24fc62ce",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"NAME Bernard d'Armagnac\n",
"ROLE prince\n",
"HOUSE Armagnac\n",
"DATE1 NaN\n",
"DATE2 NaN\n",
"DATE3 NaN\n",
"Name: 9, dtype: object"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
"\n",
"# \"XXXX\" is the placeholder used in the source CSV for values that were not filled in;\n",
"# replace it with a real missing value. `np.nan` is used rather than the `np.NaN`\n",
"# alias, which was removed in NumPy 2.0.\n",
"cleaned_actors = actors.replace(\"XXXX\", np.nan)\n",
"\n",
"# Spot-check one row by 0-based position (the header row is not counted).\n",
"# NOTE(review): the markdown above quotes CSV line 9 (René d'Anjou) while the stored\n",
"# output of this cell shows Bernard d'Armagnac - confirm which row is intended.\n",
"cleaned_actors.iloc[9]"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "053ed17c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['actors',\n",
" 'corpus-agnes-bourgogne',\n",
" 'corpus-charles-i',\n",
" 'dataset_test',\n",
" 'preprocessed_dataset_test',\n",
" 'load_xml',\n",
" 'preprocess_html',\n",
" 'preprocessed_actors',\n",
" 'parameters']"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"catalog.list()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "660b898c",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">[06/20/23 16:44:19] </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO </span> Loading data from <span style=\"color: #008000; text-decoration-color: #008000\">'load_xml'</span> <span style=\"font-weight: bold\">(</span>XMLDataSet<span style=\"font-weight: bold\">)</span><span style=\"color: #808000; text-decoration-color: #808000\">...</span> <a href=\"file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">data_catalog.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py#345\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">345</span></a>\n",
"</pre>\n"
],
"text/plain": [
"\u001b[2;36m[06/20/23 16:44:19]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loading data from \u001b[32m'load_xml'\u001b[0m \u001b[1m(\u001b[0mXMLDataSet\u001b[1m)\u001b[0m\u001b[33m...\u001b[0m \u001b]8;id=813727;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py\u001b\\\u001b[2mdata_catalog.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=696103;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py#345\u001b\\\u001b[2m345\u001b[0m\u001b]8;;\u001b\\\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"<lxml.etree._ElementTree at 0x7f3e4c3b99c0>"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"catalog.load(\"load_xml\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Kedro (actes_princiers)",
"language": "python",
"name": "kedro_actes_princiers"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}