essai pipeline

main
gwen 3 years ago
parent 97b3bd4280
commit 1fa54c163a

@ -1,5 +1,17 @@
# Actes princiers -- refactoring datascience # Actes princiers -- refactoring datascience
## Project Name
Human-readable name: `Actes Princiers`
The project name 'Actes Princiers' has been applied to:
- The project title in `datascience/actes-princiers/README.md`
- The folder created for your project in `datascience/actes-princiers`
- The project's python package in `datascience/actes-princiers/src/actes_princiers`
A best-practice setup includes initialising git and creating a virtual environment before running 'pip install -r src/requirements.txt'
## Getting started ## Getting started
- Install the virtual environment : `python3 -m venv .venv` - Install the virtual environment : `python3 -m venv .venv`
@ -7,10 +19,18 @@
- install kedro `pip install kedro` - install kedro `pip install kedro`
- Install the packages and libraries `pip install -r src/requirements.txt` - Install the packages and libraries `pip install -r src/requirements.txt`
Then open a terminal in the `actest-princiers` folder Then open a terminal in the `actes-princiers` folder
and launch jupyter : `kedro jupyter notebook` and launch jupyter : `kedro jupyter notebook`
or start the ipython prompt : `kedro ipython` or start the ipython prompt : `kedro ipython`
## Launching the pipeline
`kedro run`
## Visualizing the pipelines
`kedro viz`
## Developer's rules and guidelines ## Developer's rules and guidelines
Declare any dependencies in `src/requirements.txt` for `pip` installation. Declare any dependencies in `src/requirements.txt` for `pip` installation.
@ -32,18 +52,10 @@ After this, if you'd like to update your project requirements, please update `sr
[Further information about project dependencies](https://kedro.readthedocs.io/en/stable/kedro_project_setup/dependencies.html#project-specific-dependencies) [Further information about project dependencies](https://kedro.readthedocs.io/en/stable/kedro_project_setup/dependencies.html#project-specific-dependencies)
Project Name ## tips
============
human readable name : `Actes Princiers` You need to reload Kedro variables by calling `%reload_kedro` in your notebook and re-run the code snippet
The project name 'Actes Princiers' has been applied to:
- The project title in `datascience/actes-princiers/README.md`
- The folder created for your project in `datascience/actes-princiers`
- The project's python package in `datascience/actes-princiers/src/actes_princiers`
A best-practice setup includes initialising git and creating a virtual environment before running 'pip install -r src/requirements.txt'
Par rapport aux bonnes pratiques kedro Par rapport aux bonnes pratiques kedro
------------------------------------------ ------------------------------------------

@ -6,6 +6,7 @@ actors:
filepath: data/01_raw/csv/actors.csv filepath: data/01_raw/csv/actors.csv
load_args: load_args:
sep: ";" sep: ";"
corpus-agnes-bourgogne: corpus-agnes-bourgogne:
type: pandas.CSVDataSet type: pandas.CSVDataSet
filepath: data/01_raw/csv/corpus-agnes-bourgogne.csv filepath: data/01_raw/csv/corpus-agnes-bourgogne.csv
@ -18,3 +19,9 @@ corpus-charles-i:
load_args: load_args:
sep: ";" sep: ";"
preprocessed_actors:
type: pandas.CSVDataSet
filepath: data/02_intermediate/csv/preprocessed_actors.csv
save_args:
sep: ";"

@ -0,0 +1,50 @@
NAME;ROLE;HOUSE;DATE1;DATE2;DATE3
Charles Ier de Bourbon;prince;Bourbon;1400;1434.0;1456.0
Gort, Étienne;secret;Bourbon;1425;1440.0;
Erart;secret;Berry;1404;1405.0;
Jean de Berry;prince;Berry;1337;1360.0;1416.0
Agnès de Bourgogne;prince;Bourbon;1407;1434.0;1476.0
Marghas, Philippe;secret;Bourbon;1426;1433.0;
Marie de Berry;prince;Bourbon;1480;1410.0;1434.0
René d'Anjou;prince;Anjou;;;
Arthur de Richemont;prince;Bretagne;;;
Bernard d'Armagnac;prince;Armagnac;;;
Philippe le Bon;prince;Bourgogne;;;
Gourriet, Lorrin;secret;Bourbon;;;
De Bar, Étienne;secret;Bourbon;;;
Gon, Jean;secret;Bourbon;;;
Trichon, Jean;secret;Bourbon;;;
Chevalier, E.;secret;Bretagne;;;
Cadier, Guillaume;secret;Bourbon;;;
Decharmeres, J.;secret;Anjou;;;
Dommessent;secret;Bourgogne;Bretagne;;
Andraut, Laurent;secret;Bourbon;;;
Breneal, Jean;secret;Bourgogne;;;
De Castillione;secret;Anjou;;;
Yollande d'Aragon;prince;Anjou;;;
Marie de Blois;prince;Anjou;;;
Grauquellin;secret;Anjou;;;
Michael;secret;Anjou;;;
Matheus;secret;Anjou;;;
Louis Ier d'Anjou;prince;Anjou;;;
Louis II d'Anjou;prince;Anjou;;;
Louis III d'Anjou;prince;Anjou;;;
Caillot, G.;secret;Anjou;;;
Olivier;secret;Anjou;;;
Benepy;secret;Anjou;;;
Gontier, Col;secret;Berry;;;
Franchome;secret;Anjou;;;
Isabelle de Lorraine;prince;Anjou;;;
Bollumbrellus;secret;Anjou;;;
Nicolao Perigaut;secret;Anjou;;;
De Vaulx;secret;Anjou;;;
Alardeau, Jean;secret;Anjou;;;
Charnières;secret;Anjou;;;
Nicolas;secret;Anjou;;;
Rouxelet;secret;Anjou;;;
Boursier;secret;Anjou;;;
Petre;secret;Anjou;;;
Ponce Caihe;secret;Anjou;;;
J. Crete;secret;Anjou;;;
J. de Vernon;secret;Anjou;;;
Tourneville, Guillaume;secret;Anjou;;;
1 NAME ROLE HOUSE DATE1 DATE2 DATE3
2 Charles Ier de Bourbon prince Bourbon 1400 1434.0 1456.0
3 Gort, Étienne secret Bourbon 1425 1440.0
4 Erart secret Berry 1404 1405.0
5 Jean de Berry prince Berry 1337 1360.0 1416.0
6 Agnès de Bourgogne prince Bourbon 1407 1434.0 1476.0
7 Marghas, Philippe secret Bourbon 1426 1433.0
8 Marie de Berry prince Bourbon 1480 1410.0 1434.0
9 René d'Anjou prince Anjou
10 Arthur de Richemont prince Bretagne
11 Bernard d'Armagnac prince Armagnac
12 Philippe le Bon prince Bourgogne
13 Gourriet, Lorrin secret Bourbon
14 De Bar, Étienne secret Bourbon
15 Gon, Jean secret Bourbon
16 Trichon, Jean secret Bourbon
17 Chevalier, E. secret Bretagne
18 Cadier, Guillaume secret Bourbon
19 Decharmeres, J. secret Anjou
20 Dommessent secret Bourgogne Bretagne
21 Andraut, Laurent secret Bourbon
22 Breneal, Jean secret Bourgogne
23 De Castillione secret Anjou
24 Yollande d'Aragon prince Anjou
25 Marie de Blois prince Anjou
26 Grauquellin secret Anjou
27 Michael secret Anjou
28 Matheus secret Anjou
29 Louis Ier d'Anjou prince Anjou
30 Louis II d'Anjou prince Anjou
31 Louis III d'Anjou prince Anjou
32 Caillot, G. secret Anjou
33 Olivier secret Anjou
34 Benepy secret Anjou
35 Gontier, Col secret Berry
36 Franchome secret Anjou
37 Isabelle de Lorraine prince Anjou
38 Bollumbrellus secret Anjou
39 Nicolao Perigaut secret Anjou
40 De Vaulx secret Anjou
41 Alardeau, Jean secret Anjou
42 Charnières secret Anjou
43 Nicolas secret Anjou
44 Rouxelet secret Anjou
45 Boursier secret Anjou
46 Petre secret Anjou
47 Ponce Caihe secret Anjou
48 J. Crete secret Anjou
49 J. de Vernon secret Anjou
50 Tourneville, Guillaume secret Anjou

@ -26,7 +26,7 @@ from actes_princiers import __version__ as release
# -- Project information ----------------------------------------------------- # -- Project information -----------------------------------------------------
project = "actes_princiers" project = "actes_princiers"
author = "Kedro" author = "Jean-Damien"
# The short X.Y version. # The short X.Y version.
version = re.match(r"^([0-9]+\.[0-9]+).*", release).group(1) version = re.match(r"^([0-9]+\.[0-9]+).*", release).group(1)

@ -2,7 +2,7 @@
"cells": [ "cells": [
{ {
"cell_type": "markdown", "cell_type": "markdown",
"id": "951f178d", "id": "aeacd24e",
"metadata": {}, "metadata": {},
"source": [ "source": [
"# Catalogs\n", "# Catalogs\n",
@ -33,18 +33,18 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 15, "execution_count": 38,
"id": "dc290e93", "id": "dc290e93",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"data": { "data": {
"text/html": [ "text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">[06/16/23 14:56:53] </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO </span> Loading data from <span style=\"color: #008000; text-decoration-color: #008000\">'actors'</span> <span style=\"font-weight: bold\">(</span>CSVDataSet<span style=\"font-weight: bold\">)</span><span style=\"color: #808000; text-decoration-color: #808000\">...</span> <a href=\"file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">data_catalog.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py#345\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">345</span></a>\n", "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">[06/16/23 15:56:44] </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO </span> Loading data from <span style=\"color: #008000; text-decoration-color: #008000\">'actors'</span> <span style=\"font-weight: bold\">(</span>CSVDataSet<span style=\"font-weight: bold\">)</span><span style=\"color: #808000; text-decoration-color: #808000\">...</span> <a href=\"file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">data_catalog.py</span></a><span style=\"color: #7f7f7f; 
text-decoration-color: #7f7f7f\">:</span><a href=\"file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py#345\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">345</span></a>\n",
"</pre>\n" "</pre>\n"
], ],
"text/plain": [ "text/plain": [
"\u001b[2;36m[06/16/23 14:56:53]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loading data from \u001b[32m'actors'\u001b[0m \u001b[1m(\u001b[0mCSVDataSet\u001b[1m)\u001b[0m\u001b[33m...\u001b[0m \u001b]8;id=755052;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py\u001b\\\u001b[2mdata_catalog.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=546933;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py#345\u001b\\\u001b[2m345\u001b[0m\u001b]8;;\u001b\\\n" "\u001b[2;36m[06/16/23 15:56:44]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loading data from \u001b[32m'actors'\u001b[0m \u001b[1m(\u001b[0mCSVDataSet\u001b[1m)\u001b[0m\u001b[33m...\u001b[0m \u001b]8;id=858812;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py\u001b\\\u001b[2mdata_catalog.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=44255;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py#345\u001b\\\u001b[2m345\u001b[0m\u001b]8;;\u001b\\\n"
] ]
}, },
"metadata": {}, "metadata": {},
@ -86,8 +86,8 @@
" <td>prince</td>\n", " <td>prince</td>\n",
" <td>Bourbon</td>\n", " <td>Bourbon</td>\n",
" <td>1400</td>\n", " <td>1400</td>\n",
" <td>1434</td>\n", " <td>1434.0</td>\n",
" <td>1456</td>\n", " <td>1456.0</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>1</th>\n", " <th>1</th>\n",
@ -95,7 +95,7 @@
" <td>secret</td>\n", " <td>secret</td>\n",
" <td>Bourbon</td>\n", " <td>Bourbon</td>\n",
" <td>1425</td>\n", " <td>1425</td>\n",
" <td>1440</td>\n", " <td>1440.0</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
@ -104,7 +104,7 @@
" <td>secret</td>\n", " <td>secret</td>\n",
" <td>Berry</td>\n", " <td>Berry</td>\n",
" <td>1404</td>\n", " <td>1404</td>\n",
" <td>1405</td>\n", " <td>1405.0</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
@ -113,8 +113,8 @@
" <td>prince</td>\n", " <td>prince</td>\n",
" <td>Berry</td>\n", " <td>Berry</td>\n",
" <td>1337</td>\n", " <td>1337</td>\n",
" <td>1360</td>\n", " <td>1360.0</td>\n",
" <td>1416</td>\n", " <td>1416.0</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>4</th>\n", " <th>4</th>\n",
@ -122,8 +122,8 @@
" <td>prince</td>\n", " <td>prince</td>\n",
" <td>Bourbon</td>\n", " <td>Bourbon</td>\n",
" <td>1407</td>\n", " <td>1407</td>\n",
" <td>1434</td>\n", " <td>1434.0</td>\n",
" <td>1476</td>\n", " <td>1476.0</td>\n",
" </tr>\n", " </tr>\n",
" </tbody>\n", " </tbody>\n",
"</table>\n", "</table>\n",
@ -131,14 +131,14 @@
], ],
"text/plain": [ "text/plain": [
" NAME ROLE HOUSE DATE1 DATE2 DATE3\n", " NAME ROLE HOUSE DATE1 DATE2 DATE3\n",
"0 Charles Ier de Bourbon prince Bourbon 1400 1434 1456\n", "0 Charles Ier de Bourbon prince Bourbon 1400 1434.0 1456.0\n",
"1 Gort, Étienne secret Bourbon 1425 1440 NaN\n", "1 Gort, Étienne secret Bourbon 1425 1440.0 NaN\n",
"2 Erart secret Berry 1404 1405 NaN\n", "2 Erart secret Berry 1404 1405.0 NaN\n",
"3 Jean de Berry prince Berry 1337 1360 1416\n", "3 Jean de Berry prince Berry 1337 1360.0 1416.0\n",
"4 Agnès de Bourgogne prince Bourbon 1407 1434 1476" "4 Agnès de Bourgogne prince Bourbon 1407 1434.0 1476.0"
] ]
}, },
"execution_count": 15, "execution_count": 38,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -150,7 +150,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": 5,
"id": "fbccaa41", "id": "eedbc7fb",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -171,7 +171,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 20, "execution_count": 20,
"id": "530a8932", "id": "3168935f",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -205,7 +205,7 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"id": "7f10c2c3", "id": "902dd387",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Nettoyage des valeurs non renseignées\n", "## Nettoyage des valeurs non renseignées\n",
@ -217,7 +217,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 37, "execution_count": 37,
"id": "ea0451df", "id": "24fc62ce",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {

@ -0,0 +1,3 @@
"Data Processing pipeline"
from .pipeline import create_pipeline # NOQA

@ -0,0 +1,16 @@
import pandas as pd
import numpy as np
def _is_true(x: pd.Series) -> pd.Series:
    """Return an element-wise mask that is True where ``x`` equals ``"t"``."""
    flags = x == "t"
    return flags
def _parse_percentage(x: pd.Series) -> pd.Series:
    """Convert percentage strings such as ``"50%"`` into fractions.

    The ``%`` character is stripped from every element, the remainder is
    cast to ``float`` and the result is divided by 100.
    """
    return x.str.replace("%", "").astype(float) / 100
def preprocess_actors(actors: pd.DataFrame) -> pd.DataFrame:
    """Replace the ``"XXXX"`` placeholder with NaN in the actors table.

    Args:
        actors: Raw actors data.

    Returns:
        A new DataFrame in which every cell equal to ``"XXXX"`` is NaN.
        The input frame is left untouched.
    """
    # ``DataFrame.replace`` is not in-place by default: its result must be
    # returned, otherwise the cleaning is silently lost. ``np.nan`` is used
    # because the ``np.NaN`` alias no longer exists in NumPy 2.0.
    return actors.replace("XXXX", np.nan)

@ -0,0 +1,16 @@
from kedro.pipeline import Pipeline, node, pipeline
from .nodes import preprocess_actors
def create_pipeline(**kwargs) -> Pipeline:
    """Assemble the data-processing pipeline.

    The pipeline holds a single node, ``preprocess_actors_node``, which runs
    ``preprocess_actors`` on the ``actors`` dataset and writes the result to
    ``preprocessed_actors``. Keyword arguments are accepted but not used.
    """
    actors_node = node(
        func=preprocess_actors,
        inputs="actors",
        outputs="preprocessed_actors",
        name="preprocess_actors_node",
    )
    return pipeline([actors_node])
Loading…
Cancel
Save