main
gwen 2 weeks ago
parent 89a79ba18c
commit a18ef017c6

@ -0,0 +1,119 @@
Concaténation
==================
Voici un exemple simple de pipeline Kedro qui charge plusieurs fichiers CSV, les concatène et écrit le résultat dans un nouveau fichier CSV.
### Structure du projet
Voici une structure de projet Kedro typique pour cet exemple :
```
my_kedro_project/
├── conf/
│ ├── base/
│ │ ├── catalog.yml
│ │ └── parameters.yml
│ └── local/
│ ├── catalog.yml
│ └── parameters.yml
├── src/
│ └── my_kedro_project/
│ ├── __init__.py
│ ├── pipeline_registry.py
│ ├── nodes.py
│ └── pipeline.py
├── data/
│ ├── 01_raw/
│ │ ├── file1.csv
│ │ ├── file2.csv
│ │ └── file3.csv
│ └── 02_intermediate/
│ └── concatenated.csv
└── pyproject.toml
```
### Configuration du catalogue de données
Dans `conf/base/catalog.yml`, configurez les datasets :
```yaml
file1:
  type: pandas.CSVDataset
  filepath: data/01_raw/file1.csv
file2:
  type: pandas.CSVDataset
  filepath: data/01_raw/file2.csv
file3:
  type: pandas.CSVDataset
  filepath: data/01_raw/file3.csv
concatenated:
  type: pandas.CSVDataset
  filepath: data/02_intermediate/concatenated.csv
```
### Définition des nœuds
Dans `src/my_kedro_project/nodes.py`, définissez les nœuds pour concaténer les fichiers CSV :
```python
import pandas as pd
from kedro.pipeline import node
def concatenate_csvs(file1: pd.DataFrame, file2: pd.DataFrame, file3: pd.DataFrame) -> pd.DataFrame:
return pd.concat([file1, file2, file3])
node_concatenate = node(
func=concatenate_csvs,
inputs=["file1", "file2", "file3"],
outputs="concatenated",
name="concatenate_csvs_node"
)
```
### Définition du pipeline
Dans `src/my_kedro_project/pipeline.py`, créez le pipeline :
```python
from kedro.pipeline import Pipeline
from my_kedro_project.nodes import node_concatenate
def create_pipeline(**kwargs) -> Pipeline:
return Pipeline([node_concatenate])
```
### Registre de pipeline
Dans `src/my_kedro_project/pipeline_registry.py`, enregistrez le pipeline :
```python
from kedro.pipeline import Pipeline
from my_kedro_project.pipeline import create_pipeline
def register_pipelines() -> dict[str, Pipeline]:
return {
"__default__": create_pipeline(),
}
```
### Exécution du pipeline
Pour exécuter le pipeline, utilisez la commande suivante dans le répertoire racine de votre projet :
```bash
kedro run
```
Cela chargera les fichiers CSV `file1.csv`, `file2.csv`, et `file3.csv` depuis le répertoire `data/01_raw/`, les concaténera et écrira le résultat dans `data/02_intermediate/concatenated.csv`.
### Conclusion
Cet exemple montre comment configurer un pipeline Kedro pour charger plusieurs fichiers CSV, les concaténer et écrire le résultat dans un nouveau fichier CSV. Vous pouvez adapter cette structure pour des workflows plus complexes en ajoutant d'autres nœuds et pipelines selon vos besoins.

@ -0,0 +1,45 @@
from kedro.framework.session import KedroSession
from kedro.framework.startup import bootstrap_project
# Bootstrap the Kedro project rooted at the current directory so that
# pyproject.toml / settings.py are discovered before a session is created.
project_path = "."
bootstrap_project(project_path)
from kedro.pipeline import Pipeline, node
#from kedro.io import DataCatalog
def process_data(data):
    """Return *data* with the helper columns removed."""
    # Columns that downstream steps must not see.
    unwanted = ['flag', 'k', 'index', 'rate']
    return data.drop(columns=unwanted)
# Wrap the processing function in a Kedro node: it reads the "fake_data"
# dataset from the catalog and writes its result back as "processed_data".
mynode = node(
    func=process_data,
    inputs="fake_data",
    outputs="processed_data",
    name="process_data_node",
)

pipeline = Pipeline([mynode])

from kedro.runner import SequentialRunner

# Open a Kedro session to obtain the project context and its data catalog,
# then execute the pipeline sequentially while the session is still open.
# (The previously commented-out runner experiments were removed as dead code.)
with KedroSession.create(project_path=project_path) as session:
    context = session.load_context()
    catalog = context.catalog
    runner = SequentialRunner()
    runner.run(pipeline, catalog)

@ -0,0 +1,162 @@
##########################
# KEDRO PROJECT
# ignore all local configuration
conf/local/**
!conf/local/.gitkeep
.telemetry
# ignore potentially sensitive credentials files
conf/**/*credentials*
# ignore everything in the following folders
data/**
# except their sub-folders
!data/**/
# also keep all .gitkeep files
!.gitkeep
# ignore kedro-viz metadata
.viz
# ignore file based logs
*.log
##########################
# Common files
# IntelliJ
.idea/
*.iml
out/
.idea_modules/
### macOS
*.DS_Store
.AppleDouble
.LSOverride
.Trashes
# Vim
*~
.*.swo
.*.swp
# emacs (backup files *~ already ignored in the Vim section above)
\#*\#
/.emacs.desktop
/.emacs.desktop.lock
*.elc
# JIRA plugin
atlassian-ide-plugin.xml
# C extensions
*.so
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
# Translations
*.mo
*.pot
# Django stuff (log files *.log already ignored above):
.static_storage/
.media/
local_settings.py
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
.ipython/profile_default/history.sqlite
.ipython/profile_default/startup/README
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.envrc
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# mkdocs documentation
/site
# mypy
.mypy_cache/
# mlflow local runs
mlruns/*

@ -0,0 +1,101 @@
# myproject
[![Powered by Kedro](https://img.shields.io/badge/powered_by-kedro-ffc900?logo=kedro)](https://kedro.org)
## Overview
This is your new Kedro project, which was generated using `kedro 0.19.11`.
Take a look at the [Kedro documentation](https://docs.kedro.org) to get started.
## Rules and guidelines
In order to get the best out of the template:
* Don't remove any lines from the `.gitignore` file we provide
* Make sure your results can be reproduced by following a data engineering convention
* Don't commit data to your repository
* Don't commit any credentials or your local configuration to your repository. Keep all your credentials and local configuration in `conf/local/`
## How to install dependencies
Declare any dependencies in `requirements.txt` for `pip` installation.
To install them, run:
```
pip install -r requirements.txt
```
## How to run your Kedro pipeline
You can run your Kedro project with:
```
kedro run
```
## How to test your Kedro project
Have a look at the file `src/tests/test_run.py` for instructions on how to write your tests. You can run your tests as follows:
```
pytest
```
You can configure the coverage threshold in your project's `pyproject.toml` file under the `[tool.coverage.report]` section.
## Project dependencies
To see and update the dependency requirements for your project use `requirements.txt`. You can install the project requirements with `pip install -r requirements.txt`.
[Further information about project dependencies](https://docs.kedro.org/en/stable/kedro_project_setup/dependencies.html#project-specific-dependencies)
## How to work with Kedro and notebooks
> Note: Using `kedro jupyter` or `kedro ipython` to run your notebook provides these variables in scope: `context`, `session`, `catalog`, and `pipelines`.
>
> Jupyter, JupyterLab, and IPython are already included in the project requirements by default, so once you have run `pip install -r requirements.txt` you will not need to take any extra steps before you use them.
### Jupyter
To use Jupyter notebooks in your Kedro project, Jupyter must be installed (skip this step if you already installed the project requirements):
```
pip install jupyter
```
After installing Jupyter, you can start a local notebook server:
```
kedro jupyter notebook
```
### JupyterLab
To use JupyterLab, you need to install it:
```
pip install jupyterlab
```
You can also start JupyterLab:
```
kedro jupyter lab
```
### IPython
And if you want to run an IPython session:
```
kedro ipython
```
### How to ignore notebook output cells in `git`
To automatically strip out all output cell contents before committing to `git`, you can use tools like [`nbstripout`](https://github.com/kynan/nbstripout). For example, you can add a hook in `.git/config` with `nbstripout --install`. This will run `nbstripout` before anything is committed to `git`.
> *Note:* Your output cells will be retained locally.
## Package your Kedro project
[Further information about building project documentation and packaging your project](https://docs.kedro.org/en/stable/tutorial/package_a_project.html)

@ -0,0 +1,26 @@
# What is this for?
This folder should be used to store configuration files used by Kedro or by separate tools.
This file can be used to provide users with instructions for how to reproduce local configuration with their own credentials. You can edit the file however you like, but you may wish to retain the information below and add your own section in the [Instructions](#Instructions) section.
## Local configuration
The `local` folder should be used for configuration that is either user-specific (e.g. IDE configuration) or protected (e.g. security keys).
> *Note:* Please do not check in any local configuration to version control.
## Base configuration
The `base` folder is for shared configuration, such as non-sensitive and project-related configuration that may be shared across team members.
WARNING: Please do not put access credentials in the base configuration folder.
## Instructions
## Need help?
[Find out more about configuration from the Kedro documentation](https://docs.kedro.org/en/stable/kedro_project_setup/configuration.html).

@ -0,0 +1,17 @@
# Raw input dataset, loaded/saved as CSV via kedro-datasets' pandas.CSVDataset.
fake_data:
  type: pandas.CSVDataset
  filepath: data/01_raw/fake_data.csv
  load_args:
    sep: ','
    header: 0
  save_args:
    index: False

# Output of process_data_node (fake_data with helper columns dropped).
processed_data:
  type: pandas.CSVDataset
  filepath: data/02_intermediate/fake_data2.csv
  load_args:
    sep: ','
    header: 0
  save_args:
    index: False

@ -0,0 +1,4 @@
# Here you can define all your datasets by using simple YAML syntax.
#
# Documentation for this file format can be found in "The Data Catalog"
# Link: https://docs.kedro.org/en/stable/data/data_catalog.html

@ -0,0 +1,5 @@
# This is a boilerplate parameters config generated for pipeline 'mon_pipeline'
# using Kedro 0.19.11.
#
# Documentation for this file format can be found in "Parameters"
# Link: https://docs.kedro.org/en/0.19.11/configuration/parameters.html

@ -0,0 +1 @@
kedro pipeline create mypipeline

@ -0,0 +1,33 @@
[build-system]
requires = [ "setuptools",]
build-backend = "setuptools.build_meta"

[project]
requires-python = ">=3.9"
name = "myproject"
readme = "README.md"
# Version is resolved at build time from myproject.__version__ (see
# [tool.setuptools.dynamic.version] below).
dynamic = [ "version",]
dependencies = [ "ipython>=8.10", "jupyterlab>=3.0", "notebook", "kedro~=0.19.11",]

[project.scripts]
# Console command `myproject` -> myproject/__main__.py:main
myproject = "myproject.__main__:main"

# Kedro project metadata read by the kedro CLI.
[tool.kedro]
package_name = "myproject"
project_name = "myproject"
kedro_init_version = "0.19.11"
tools = "['None']"
example_pipeline = "False"
source_dir = "src"

[project.entry-points."kedro.hooks"]

[tool.setuptools.dynamic.version]
attr = "myproject.__version__"

[tool.setuptools.packages.find]
where = [ "src",]
namespaces = false

[tool.kedro_telemetry]
project_id = "75d7fbdcd62f438baccd916558ab8048"

@ -0,0 +1,7 @@
# Project dependencies — keep in sync with `dependencies` in pyproject.toml.
ipython>=8.10
jupyterlab>=3.0
kedro~=0.19.11
notebook
pandas
# Canonical (PEP 685 normalized) extra name; the dataset class used by the
# catalog is pandas.CSVDataset (kedro-datasets >= 2.0 naming).
kedro-datasets[pandas-csvdataset]

@ -0,0 +1,24 @@
"""myproject file for ensuring the package is executable
as `myproject` and `python -m myproject`
"""
import sys
from pathlib import Path
from typing import Any
from kedro.framework.cli.utils import find_run_command
from kedro.framework.project import configure_project
def main(*args, **kwargs) -> Any:
    """Configure the Kedro project and dispatch to its `run` CLI command.

    Args and kwargs are forwarded unchanged to the discovered run command;
    the return value is whatever that command returns.
    """
    package_name = Path(__file__).parent.name
    configure_project(package_name)
    # In an interactive shell (sys.ps1 exists), disable click's standalone
    # mode so the command returns instead of calling sys.exit().
    interactive = hasattr(sys, 'ps1')
    kwargs["standalone_mode"] = not interactive
    run = find_run_command(package_name)
    return run(*args, **kwargs)


if __name__ == "__main__":
    main()

@ -0,0 +1,21 @@
"""Project pipelines."""
from __future__ import annotations
from kedro.framework.project import find_pipelines
from kedro.pipeline import Pipeline
from myproject.pipelines.process_data.pipeline import pipeline as process_data_pipeline
def register_pipelines() -> dict[str, Pipeline]:
    """Register the project's pipelines.

    Returns:
        A mapping from pipeline names to ``Pipeline`` objects. The
        ``process_data`` pipeline is also exposed as ``__default__`` so a
        bare ``kedro run`` executes it.
    """
    # Pipelines are registered explicitly (rather than via find_pipelines())
    # because this project exposes a single, manually defined pipeline.
    return {
        "process_data": process_data_pipeline,
        "__default__": process_data_pipeline,
    }

@ -0,0 +1,10 @@
"""
This is a boilerplate pipeline 'mon_pipeline'
generated using Kedro 0.19.11
"""
from .pipeline import create_pipeline
__all__ = ["create_pipeline"]
__version__ = "0.1"

@ -0,0 +1,4 @@
"""
This is a boilerplate pipeline 'mon_pipeline'
generated using Kedro 0.19.11
"""

@ -0,0 +1,10 @@
"""
This is a boilerplate pipeline 'mon_pipeline'
generated using Kedro 0.19.11
"""
from kedro.pipeline import node, Pipeline, pipeline # noqa
def create_pipeline(**kwargs) -> Pipeline:
    """Assemble the 'mon_pipeline' pipeline.

    Currently returns an empty boilerplate pipeline; add nodes here.
    """
    return pipeline([])

@ -0,0 +1,14 @@
import pandas as pd
def process_data(
    data: pd.DataFrame,
    columns: tuple[str, ...] = ('flag', 'k', 'index', 'rate'),
) -> pd.DataFrame:
    """Drop unwanted columns from the input data.

    Args:
        data: Input DataFrame; must contain every label in ``columns``.
        columns: Column labels to remove. Defaults to the helper columns
            of the raw fake data, preserving the original behaviour.

    Returns:
        A new DataFrame without the dropped columns (``data`` is not
        modified in place).

    Raises:
        KeyError: If any requested column is missing from ``data``.
    """
    return data.drop(columns=list(columns))

@ -0,0 +1,13 @@
from kedro.pipeline import Pipeline, node
from .node import process_data
# Wrap process_data in a node: it reads the "fake_data" catalog entry and
# writes its result back as "processed_data".
mynode = node(
    func=process_data,
    inputs="fake_data",
    outputs="processed_data",
    name="process_data_node",
)

# Pipeline containing the single processing node; imported by
# pipeline_registry.register_pipelines().
pipeline = Pipeline([mynode])

@ -0,0 +1,46 @@
"""Project settings. There is no need to edit this file unless you want to change values
from the Kedro defaults. For further information, including these default values, see
https://docs.kedro.org/en/stable/kedro_project_setup/settings.html."""
# Instantiated project hooks.
# For example, after creating a hooks.py and defining a ProjectHooks class there, do
# from myproject.hooks import ProjectHooks
# Hooks are executed in a Last-In-First-Out (LIFO) order.
# HOOKS = (ProjectHooks(),)
# Installed plugins for which to disable hook auto-registration.
# DISABLE_HOOKS_FOR_PLUGINS = ("kedro-viz",)
# Class that manages storing KedroSession data.
# from kedro.framework.session.store import BaseSessionStore
# SESSION_STORE_CLASS = BaseSessionStore
# Keyword arguments to pass to the `SESSION_STORE_CLASS` constructor.
# SESSION_STORE_ARGS = {
# "path": "./sessions"
# }
# Directory that holds configuration.
# CONF_SOURCE = "conf"
# Class that manages how configuration is loaded.
# from kedro.config import OmegaConfigLoader
# CONFIG_LOADER_CLASS = OmegaConfigLoader
# Keyword arguments to pass to the `CONFIG_LOADER_CLASS` constructor.
CONFIG_LOADER_ARGS = {
    # Environment holding shared configuration (conf/base).
    "base_env": "base",
    # Environment applied on top of base at run time (conf/local).
    "default_run_env": "local",
    # "config_patterns": {
    #     "spark" : ["spark*/"],
    #     "parameters": ["parameters*", "parameters*/**", "**/parameters*"],
    # }
}
# Class that manages Kedro's library components.
# from kedro.framework.context import KedroContext
# CONTEXT_CLASS = KedroContext
# Class that manages the Data Catalog.
# from kedro.io import DataCatalog
# DATA_CATALOG_CLASS = DataCatalog

@ -0,0 +1,9 @@
"""
This is a boilerplate test file for pipeline 'mon_pipeline'
generated using Kedro 0.19.11.
Please add your pipeline tests here.
Kedro recommends using `pytest` framework, more info about it can be found
in the official documentation:
https://docs.pytest.org/en/latest/getting-started.html
"""
Loading…
Cancel
Save