diff --git a/README.md b/README.md index 13804ba..9a264d6 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,17 @@ # Actes princiers -- refactoring datascience +## Project Name + +human readable name : `Actes Princiers` + +The project name 'Actes Princiers' has been applied to: + +- The project title in `datascience/actes-princiers/README.md` +- The folder created for your project in `datascience/actes-princiers` +- The project's python package in `datascience/actes-princiers/src/actes_princiers` + +A best-practice setup includes initialising git and creating a virtual environment before running 'pip install -r src/requirements.txt' + ## Getting started - Install the virtual environment : `python3 -m venv .venv` @@ -7,10 +19,18 @@ - install kedro `pip install kedro` - Install the packages and libraries `pip install -r src/requirements.txt` -Then open a terminal in the `actest-princiers` folder +Then open a terminal in the `actes-princiers` folder and launch jupyter : `kedro jupyter notebook` or start the ipython prompt : `kedro ipython` +## Launching the pipeline + +`kedro run` + +## Visualizing the pipelines + +`kedro viz` + ## Developper's rules and guidelines Declare any dependencies in `src/requirements.txt` for `pip` installation. 
@@ -32,18 +52,10 @@ After this, if you'd like to update your project requirements, please update `sr [Further information about project dependencies](https://kedro.readthedocs.io/en/stable/kedro_project_setup/dependencies.html#project-specific-dependencies) -Project Name -============ +## tips -human readable name : `Actes Princiers` - -The project name 'Actes Princiers' has been applied to: +You need to reload Kedro variables by calling `%reload_kedro` in your notebook and re-run the code snippet -- The project title in `datascience/actes-princiers/README.md` -- The folder created for your project in `datascience/actes-princiers` -- The project's python package in `datascience/actes-princiers/src/actes_princiers` - -A best-practice setup includes initialising git and creating a virtual environment before running 'pip install -r src/requirements.txt' Par rapport aux bonnes pratiques kedro ------------------------------------------ diff --git a/actes-princiers/conf/base/catalog.yml b/actes-princiers/conf/base/catalog.yml index 9142e0c..b9912e0 100644 --- a/actes-princiers/conf/base/catalog.yml +++ b/actes-princiers/conf/base/catalog.yml @@ -6,6 +6,7 @@ actors: filepath: data/01_raw/csv/actors.csv load_args: sep: ";" + corpus-agnes-bourgogne: type: pandas.CSVDataSet filepath: data/01_raw/csv/corpus-agnes-bourgogne.csv @@ -18,3 +19,9 @@ corpus-charles-i: load_args: sep: ";" +preprocessed_actors: + type: pandas.CSVDataSet + filepath: data/02_intermediate/csv/preprocessed_actors.csv + save_args: + sep: ";" + diff --git a/actes-princiers/data/02_intermediate/csv/preprocessed_actors.csv b/actes-princiers/data/02_intermediate/csv/preprocessed_actors.csv new file mode 100644 index 0000000..7784d02 --- /dev/null +++ b/actes-princiers/data/02_intermediate/csv/preprocessed_actors.csv @@ -0,0 +1,50 @@ +NAME;ROLE;HOUSE;DATE1;DATE2;DATE3 +Charles Ier de Bourbon;prince;Bourbon;1400;1434.0;1456.0 +Gort, Étienne;secret;Bourbon;1425;1440.0; +Erart;secret;Berry;1404;1405.0; +Jean de 
Berry;prince;Berry;1337;1360.0;1416.0 +Agnès de Bourgogne;prince;Bourbon;1407;1434.0;1476.0 +Marghas, Philippe;secret;Bourbon;1426;1433.0; +Marie de Berry;prince;Bourbon;1480;1410.0;1434.0 +René d'Anjou;prince;Anjou;;; +Arthur de Richemont;prince;Bretagne;;; +Bernard d'Armagnac;prince;Armagnac;;; +Philippe le Bon;prince;Bourgogne;;; +Gourriet, Lorrin;secret;Bourbon;;; +De Bar, Étienne;secret;Bourbon;;; +Gon, Jean;secret;Bourbon;;; +Trichon, Jean;secret;Bourbon;;; +Chevalier, E.;secret;Bretagne;;; +Cadier, Guillaume;secret;Bourbon;;; +Decharmeres, J.;secret;Anjou;;; +Dommessent;secret;Bourgogne;Bretagne;; +Andraut, Laurent;secret;Bourbon;;; +Breneal, Jean;secret;Bourgogne;;; +De Castillione;secret;Anjou;;; +Yollande d'Aragon;prince;Anjou;;; +Marie de Blois;prince;Anjou;;; +Grauquellin;secret;Anjou;;; +Michael;secret;Anjou;;; +Matheus;secret;Anjou;;; +Louis Ier d'Anjou;prince;Anjou;;; +Louis II d'Anjou;prince;Anjou;;; +Louis III d'Anjou;prince;Anjou;;; +Caillot, G.;secret;Anjou;;; +Olivier;secret;Anjou;;; +Benepy;secret;Anjou;;; +Gontier, Col;secret;Berry;;; +Franchome;secret;Anjou;;; +Isabelle de Lorraine;prince;Anjou;;; +Bollumbrellus;secret;Anjou;;; +Nicolao Perigaut;secret;Anjou;;; +De Vaulx;secret;Anjou;;; +Alardeau, Jean;secret;Anjou;;; +Charnières;secret;Anjou;;; +Nicolas;secret;Anjou;;; +Rouxelet;secret;Anjou;;; +Boursier;secret;Anjou;;; +Petre;secret;Anjou;;; +Ponce Caihe;secret;Anjou;;; +J. Crete;secret;Anjou;;; +J. de Vernon;secret;Anjou;;; +Tourneville, Guillaume;secret;Anjou;;; diff --git a/actes-princiers/docs/source/conf.py b/actes-princiers/docs/source/conf.py index 33f1275..c0f0e73 100644 --- a/actes-princiers/docs/source/conf.py +++ b/actes-princiers/docs/source/conf.py @@ -26,7 +26,7 @@ from actes_princiers import __version__ as release # -- Project information ----------------------------------------------------- project = "actes_princiers" -author = "Kedro" +author = "Jean-Damien" # The short X.Y version. 
version = re.match(r"^([0-9]+\.[0-9]+).*", release).group(1) diff --git a/actes-princiers/notebooks/LoadDataCatalog.ipynb b/actes-princiers/notebooks/LoadDataCatalog.ipynb index 534f7cf..1d90cd4 100644 --- a/actes-princiers/notebooks/LoadDataCatalog.ipynb +++ b/actes-princiers/notebooks/LoadDataCatalog.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "951f178d", + "id": "aeacd24e", "metadata": {}, "source": [ "# Catalogs\n", @@ -33,18 +33,18 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 38, "id": "dc290e93", "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
[06/16/23 14:56:53] INFO     Loading data from 'actors' (CSVDataSet)...                         data_catalog.py:345\n",
+       "
[06/16/23 15:56:44] INFO     Loading data from 'actors' (CSVDataSet)...                         data_catalog.py:345\n",
        "
\n" ], "text/plain": [ - "\u001b[2;36m[06/16/23 14:56:53]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loading data from \u001b[32m'actors'\u001b[0m \u001b[1m(\u001b[0mCSVDataSet\u001b[1m)\u001b[0m\u001b[33m...\u001b[0m \u001b]8;id=755052;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py\u001b\\\u001b[2mdata_catalog.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=546933;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py#345\u001b\\\u001b[2m345\u001b[0m\u001b]8;;\u001b\\\n" + "\u001b[2;36m[06/16/23 15:56:44]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loading data from \u001b[32m'actors'\u001b[0m \u001b[1m(\u001b[0mCSVDataSet\u001b[1m)\u001b[0m\u001b[33m...\u001b[0m \u001b]8;id=858812;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py\u001b\\\u001b[2mdata_catalog.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=44255;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py#345\u001b\\\u001b[2m345\u001b[0m\u001b]8;;\u001b\\\n" ] }, "metadata": {}, @@ -86,8 +86,8 @@ " prince\n", " Bourbon\n", " 1400\n", - " 1434\n", - " 1456\n", + " 1434.0\n", + " 1456.0\n", " \n", " \n", " 1\n", @@ -95,7 +95,7 @@ " secret\n", " Bourbon\n", " 1425\n", - " 1440\n", + " 1440.0\n", " NaN\n", " \n", " \n", @@ -104,7 +104,7 @@ " secret\n", " Berry\n", " 1404\n", - " 1405\n", + " 1405.0\n", " NaN\n", " \n", " \n", @@ -113,8 +113,8 @@ " prince\n", " Berry\n", " 1337\n", - " 1360\n", - " 1416\n", + " 1360.0\n", + " 1416.0\n", " \n", " \n", " 4\n", @@ -122,23 +122,23 @@ " prince\n", " Bourbon\n", " 1407\n", - " 1434\n", - " 1476\n", + " 1434.0\n", + " 1476.0\n", " \n", " \n", "\n", "" ], "text/plain": [ - " NAME ROLE HOUSE DATE1 DATE2 DATE3\n", 
- "0 Charles Ier de Bourbon prince Bourbon 1400 1434 1456\n", - "1 Gort, Étienne secret Bourbon 1425 1440 NaN\n", - "2 Erart secret Berry 1404 1405 NaN\n", - "3 Jean de Berry prince Berry 1337 1360 1416\n", - "4 Agnès de Bourgogne prince Bourbon 1407 1434 1476" + " NAME ROLE HOUSE DATE1 DATE2 DATE3\n", + "0 Charles Ier de Bourbon prince Bourbon 1400 1434.0 1456.0\n", + "1 Gort, Étienne secret Bourbon 1425 1440.0 NaN\n", + "2 Erart secret Berry 1404 1405.0 NaN\n", + "3 Jean de Berry prince Berry 1337 1360.0 1416.0\n", + "4 Agnès de Bourgogne prince Bourbon 1407 1434.0 1476.0" ] }, - "execution_count": 15, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } @@ -150,7 +150,7 @@ { "cell_type": "code", "execution_count": 5, - "id": "fbccaa41", + "id": "eedbc7fb", "metadata": {}, "outputs": [ { @@ -171,7 +171,7 @@ { "cell_type": "code", "execution_count": 20, - "id": "530a8932", + "id": "3168935f", "metadata": {}, "outputs": [ { @@ -205,7 +205,7 @@ }, { "cell_type": "markdown", - "id": "7f10c2c3", + "id": "902dd387", "metadata": {}, "source": [ "## Nettoyage des valeurs non renseignées\n", @@ -217,7 +217,7 @@ { "cell_type": "code", "execution_count": 37, - "id": "ea0451df", + "id": "24fc62ce", "metadata": {}, "outputs": [ { diff --git a/actes-princiers/src/actes_princiers/pipelines/data_processing/__init__.py b/actes-princiers/src/actes_princiers/pipelines/data_processing/__init__.py new file mode 100755 index 0000000..e08ea5f --- /dev/null +++ b/actes-princiers/src/actes_princiers/pipelines/data_processing/__init__.py @@ -0,0 +1,3 @@ +"Data Processing pipeline" + +from .pipeline import create_pipeline # NOQA diff --git a/actes-princiers/src/actes_princiers/pipelines/data_processing/nodes.py b/actes-princiers/src/actes_princiers/pipelines/data_processing/nodes.py new file mode 100755 index 0000000..c300a92 --- /dev/null +++ b/actes-princiers/src/actes_princiers/pipelines/data_processing/nodes.py @@ -0,0 +1,16 @@ +import pandas as pd +import numpy 
from typing import TYPE_CHECKING

import numpy as np

if TYPE_CHECKING:
    # Only needed for the return annotation of ``create_pipeline``.
    from kedro.pipeline import Pipeline


# --- pipelines/data_processing/nodes.py -------------------------------------


def _is_true(x: pd.Series) -> pd.Series:
    """Return a boolean Series that is True where the element equals ``"t"``.

    Not referenced by the pipeline defined below.
    """
    return x == "t"


def _parse_percentage(x: pd.Series) -> pd.Series:
    """Convert strings such as ``"45%"`` to fractions such as ``0.45``.

    Args:
        x: Series of strings, each optionally containing ``%`` characters.

    Returns:
        Float Series holding the numeric value divided by 100.

    Not referenced by the pipeline defined below.
    """
    # regex=False: "%" is a literal character, independent of the pandas
    # version's default for ``regex``.
    without_sign = x.str.replace("%", "", regex=False)
    return without_sign.astype(float) / 100


def preprocess_actors(actors: pd.DataFrame) -> pd.DataFrame:
    """Replace the ``"XXXX"`` placeholder with NaN in the actors table.

    Args:
        actors: Raw actors table (the ``actors`` catalog entry).
            ``"XXXX"`` is presumably the marker used in the raw CSV for an
            unknown value -- confirm against the data.

    Returns:
        A new DataFrame in which every cell equal to ``"XXXX"`` is NaN.
        The input frame is left untouched.
    """
    # DataFrame.replace is not in-place: it returns a new frame, so the
    # result has to be returned (the previous code discarded it and handed
    # back the unmodified input). ``np.nan`` is used because the ``np.NaN``
    # alias no longer exists in NumPy 2.0.
    return actors.replace("XXXX", np.nan)


# --- pipelines/data_processing/pipeline.py ----------------------------------


def create_pipeline(**kwargs) -> "Pipeline":
    """Build the data-processing pipeline.

    The pipeline has a single node, ``preprocess_actors_node``, which runs
    ``preprocess_actors`` on the ``actors`` dataset and stores the result as
    ``preprocessed_actors``.

    Args:
        **kwargs: Accepted but not used.

    Returns:
        The pipeline object built by ``kedro.pipeline.pipeline``.
    """
    # Imported here so that the node functions above can be imported (and
    # unit-tested) without kedro being installed.
    from kedro.pipeline import node, pipeline

    return pipeline(
        [
            node(
                func=preprocess_actors,
                inputs="actors",
                outputs="preprocessed_actors",
                name="preprocess_actors_node",
            ),
        ]
    )