main
gwen 2 weeks ago
parent 89a79ba18c
commit a18ef017c6

@ -0,0 +1,119 @@
Concaténation
==================
Voici un exemple simple de pipeline Kedro qui charge plusieurs fichiers CSV, les concatène et écrit le résultat dans un nouveau fichier CSV.
### Structure du projet
Voici une structure de projet Kedro typique pour cet exemple :
```
my_kedro_project/
├── conf/
│ ├── base/
│ │ ├── catalog.yml
│ │ └── parameters.yml
│ └── local/
│ ├── catalog.yml
│ └── parameters.yml
├── src/
│ └── my_kedro_project/
│ ├── __init__.py
│ ├── pipeline_registry.py
│ ├── nodes.py
│ └── pipeline.py
├── data/
│ ├── 01_raw/
│ │ ├── file1.csv
│ │ ├── file2.csv
│ │ └── file3.csv
│ └── 02_intermediate/
│ └── concatenated.csv
└── pyproject.toml
```
### Configuration du catalogue de données
Dans `conf/base/catalog.yml`, configurez les datasets :
```yaml
file1:
  type: pandas.CSVDataset
  filepath: data/01_raw/file1.csv
file2:
  type: pandas.CSVDataset
  filepath: data/01_raw/file2.csv
file3:
  type: pandas.CSVDataset
  filepath: data/01_raw/file3.csv
concatenated:
  type: pandas.CSVDataset
  filepath: data/02_intermediate/concatenated.csv
```
### Définition des nœuds
Dans `src/my_kedro_project/nodes.py`, définissez les nœuds pour concaténer les fichiers CSV :
```python
import pandas as pd
from kedro.pipeline import node
def concatenate_csvs(file1: pd.DataFrame, file2: pd.DataFrame, file3: pd.DataFrame) -> pd.DataFrame:
return pd.concat([file1, file2, file3])
node_concatenate = node(
func=concatenate_csvs,
inputs=["file1", "file2", "file3"],
outputs="concatenated",
name="concatenate_csvs_node"
)
```
### Définition du pipeline
Dans `src/my_kedro_project/pipeline.py`, créez le pipeline :
```python
from kedro.pipeline import Pipeline
from my_kedro_project.nodes import node_concatenate
def create_pipeline(**kwargs) -> Pipeline:
return Pipeline([node_concatenate])
```
### Registre de pipeline
Dans `src/my_kedro_project/pipeline_registry.py`, enregistrez le pipeline :
```python
from kedro.pipeline import Pipeline
from my_kedro_project.pipeline import create_pipeline
def register_pipelines() -> dict[str, Pipeline]:
return {
"__default__": create_pipeline(),
}
```
### Exécution du pipeline
Pour exécuter le pipeline, utilisez la commande suivante dans le répertoire racine de votre projet :
```bash
kedro run
```
Cela chargera les fichiers CSV `file1.csv`, `file2.csv`, et `file3.csv` depuis le répertoire `data/01_raw/`, les concaténera et écrira le résultat dans `data/02_intermediate/concatenated.csv`.
### Conclusion
Cet exemple montre comment configurer un pipeline Kedro pour charger plusieurs fichiers CSV, les concaténer et écrire le résultat dans un nouveau fichier CSV. Vous pouvez adapter cette structure pour des workflows plus complexes en ajoutant d'autres nœuds et pipelines selon vos besoins.

@ -0,0 +1,45 @@
from kedro.framework.session import KedroSession
from kedro.framework.startup import bootstrap_project
# Bootstrap the Kedro project rooted at the current directory so that
# pyproject.toml / settings.py are discovered before a session is created.
project_path = "."
bootstrap_project(project_path)
from kedro.pipeline import Pipeline, node
#from kedro.io import DataCatalog
def process_data(data):
    """Return *data* with the helper columns removed."""
    # Columns that downstream steps must not see.
    unwanted = ['flag', 'k', 'index', 'rate']
    return data.drop(columns=unwanted)
# Wrap the processing function in a Kedro node: it reads the "fake_data"
# dataset from the catalog and writes its result back as "processed_data".
mynode = node(
    func=process_data,
    inputs="fake_data",
    outputs="processed_data",
    name="process_data_node",
)

pipeline = Pipeline([mynode])

from kedro.runner import SequentialRunner

# Open a Kedro session to obtain the project context and its data catalog,
# then execute the pipeline sequentially while the session is still open.
# (The previously commented-out runner experiments were removed as dead code.)
with KedroSession.create(project_path=project_path) as session:
    context = session.load_context()
    catalog = context.catalog
    runner = SequentialRunner()
    runner.run(pipeline, catalog)

@ -0,0 +1,162 @@
##########################
# KEDRO PROJECT
# ignore all local configuration
conf/local/**
!conf/local/.gitkeep
.telemetry
# ignore potentially sensitive credentials files
conf/**/*credentials*
# ignore everything in the following folders
data/**
# except their sub-folders
!data/**/
# also keep all .gitkeep files
!.gitkeep
# ignore kedro-viz metadata
.viz
# ignore file based logs
*.log
##########################
# Common files
# IntelliJ
.idea/
*.iml
out/
.idea_modules/
### macOS
*.DS_Store
.AppleDouble
.LSOverride
.Trashes
# Vim
*~
.*.swo
.*.swp
# emacs (backup files *~ already ignored in the Vim section above)
\#*\#
/.emacs.desktop
/.emacs.desktop.lock
*.elc
# JIRA plugin
atlassian-ide-plugin.xml
# C extensions
*.so
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
# Translations
*.mo
*.pot
# Django stuff (log files *.log already ignored above):
.static_storage/
.media/
local_settings.py
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
.ipython/profile_default/history.sqlite
.ipython/profile_default/startup/README
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.envrc
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# mkdocs documentation
/site
# mypy
.mypy_cache/
# mlflow local runs
mlruns/*

@ -0,0 +1,101 @@
# myproject
[![Powered by Kedro](https://img.shields.io/badge/powered_by-kedro-ffc900?logo=kedro)](https://kedro.org)
## Overview
This is your new Kedro project, which was generated using `kedro 0.19.11`.
Take a look at the [Kedro documentation](https://docs.kedro.org) to get started.
## Rules and guidelines
In order to get the best out of the template:
* Don't remove any lines from the `.gitignore` file we provide
* Make sure your results can be reproduced by following a data engineering convention
* Don't commit data to your repository
* Don't commit any credentials or your local configuration to your repository. Keep all your credentials and local configuration in `conf/local/`
## How to install dependencies
Declare any dependencies in `requirements.txt` for `pip` installation.
To install them, run:
```
pip install -r requirements.txt
```
## How to run your Kedro pipeline
You can run your Kedro project with:
```
kedro run
```
## How to test your Kedro project
Have a look at the file `src/tests/test_run.py` for instructions on how to write your tests. You can run your tests as follows:
```
pytest
```
You can configure the coverage threshold in your project's `pyproject.toml` file under the `[tool.coverage.report]` section.
## Project dependencies
To see and update the dependency requirements for your project use `requirements.txt`. You can install the project requirements with `pip install -r requirements.txt`.
[Further information about project dependencies](https://docs.kedro.org/en/stable/kedro_project_setup/dependencies.html#project-specific-dependencies)
## How to work with Kedro and notebooks
> Note: Using `kedro jupyter` or `kedro ipython` to run your notebook provides these variables in scope: `context`, `session`, `catalog`, and `pipelines`.
>
> Jupyter, JupyterLab, and IPython are already included in the project requirements by default, so once you have run `pip install -r requirements.txt` you will not need to take any extra steps before you use them.
### Jupyter
To use Jupyter notebooks in your Kedro project, Jupyter must be installed (skip this step if you already installed the project requirements):
```
pip install jupyter
```
After installing Jupyter, you can start a local notebook server:
```
kedro jupyter notebook
```
### JupyterLab
To use JupyterLab, you need to install it:
```
pip install jupyterlab
```
You can also start JupyterLab:
```
kedro jupyter lab
```
### IPython
And if you want to run an IPython session:
```
kedro ipython
```
### How to ignore notebook output cells in `git`
To automatically strip out all output cell contents before committing to `git`, you can use tools like [`nbstripout`](https://github.com/kynan/nbstripout). For example, you can add a hook in `.git/config` with `nbstripout --install`. This will run `nbstripout` before anything is committed to `git`.
> *Note:* Your output cells will be retained locally.
## Package your Kedro project
[Further information about building project documentation and packaging your project](https://docs.kedro.org/en/stable/tutorial/package_a_project.html)

@ -0,0 +1,26 @@
# What is this for?
This folder should be used to store configuration files used by Kedro or by separate tools.
This file can be used to provide users with instructions for how to reproduce local configuration with their own credentials. You can edit the file however you like, but you may wish to retain the information below and add your own section in the [Instructions](#Instructions) section.
## Local configuration
The `local` folder should be used for configuration that is either user-specific (e.g. IDE configuration) or protected (e.g. security keys).
> *Note:* Please do not check in any local configuration to version control.
## Base configuration
The `base` folder is for shared configuration, such as non-sensitive and project-related configuration that may be shared across team members.
WARNING: Please do not put access credentials in the base configuration folder.
## Instructions
## Need help?
[Find out more about configuration from the Kedro documentation](https://docs.kedro.org/en/stable/kedro_project_setup/configuration.html).

@ -0,0 +1,17 @@
# Raw input dataset, loaded/saved as CSV via kedro-datasets' pandas.CSVDataset.
fake_data:
  type: pandas.CSVDataset
  filepath: data/01_raw/fake_data.csv
  load_args:
    sep: ','
    header: 0
  save_args:
    index: False

# Output of process_data_node (fake_data with helper columns dropped).
processed_data:
  type: pandas.CSVDataset
  filepath: data/02_intermediate/fake_data2.csv
  load_args:
    sep: ','
    header: 0
  save_args:
    index: False

@ -0,0 +1,4 @@
# Here you can define all your datasets by using simple YAML syntax.
#
# Documentation for this file format can be found in "The Data Catalog"
# Link: https://docs.kedro.org/en/stable/data/data_catalog.html

@ -0,0 +1,5 @@
# This is a boilerplate parameters config generated for pipeline 'mon_pipeline'
# using Kedro 0.19.11.
#
# Documentation for this file format can be found in "Parameters"
# Link: https://docs.kedro.org/en/0.19.11/configuration/parameters.html

@ -0,0 +1 @@
kedro pipeline create mypipeline

@ -0,0 +1,33 @@
[build-system]
requires = [ "setuptools",]
build-backend = "setuptools.build_meta"

[project]
requires-python = ">=3.9"
name = "myproject"
readme = "README.md"
# Version is resolved at build time from myproject.__version__ (see
# [tool.setuptools.dynamic.version] below).
dynamic = [ "version",]
dependencies = [ "ipython>=8.10", "jupyterlab>=3.0", "notebook", "kedro~=0.19.11",]

[project.scripts]
# Console command `myproject` -> myproject/__main__.py:main
myproject = "myproject.__main__:main"

# Kedro project metadata read by the kedro CLI.
[tool.kedro]
package_name = "myproject"
project_name = "myproject"
kedro_init_version = "0.19.11"
tools = "['None']"
example_pipeline = "False"
source_dir = "src"

[project.entry-points."kedro.hooks"]

[tool.setuptools.dynamic.version]
attr = "myproject.__version__"

[tool.setuptools.packages.find]
where = [ "src",]
namespaces = false

[tool.kedro_telemetry]
project_id = "75d7fbdcd62f438baccd916558ab8048"

@ -0,0 +1,7 @@
# Project dependencies — keep in sync with `dependencies` in pyproject.toml.
ipython>=8.10
jupyterlab>=3.0
kedro~=0.19.11
notebook
pandas
# Canonical (PEP 685 normalized) extra name; the dataset class used by the
# catalog is pandas.CSVDataset (kedro-datasets >= 2.0 naming).
kedro-datasets[pandas-csvdataset]

@ -0,0 +1,24 @@
"""myproject file for ensuring the package is executable
as `myproject` and `python -m myproject`
"""
import sys
from pathlib import Path
from typing import Any
from kedro.framework.cli.utils import find_run_command
from kedro.framework.project import configure_project
def main(*args, **kwargs) -> Any:
    """Configure the Kedro project and dispatch to its `run` CLI command.

    Args and kwargs are forwarded unchanged to the discovered run command;
    the return value is whatever that command returns.
    """
    package_name = Path(__file__).parent.name
    configure_project(package_name)
    # In an interactive shell (sys.ps1 exists), disable click's standalone
    # mode so the command returns instead of calling sys.exit().
    interactive = hasattr(sys, 'ps1')
    kwargs["standalone_mode"] = not interactive
    run = find_run_command(package_name)
    return run(*args, **kwargs)


if __name__ == "__main__":
    main()

@ -0,0 +1,21 @@
"""Project pipelines."""
from __future__ import annotations
from kedro.framework.project import find_pipelines
from kedro.pipeline import Pipeline
from myproject.pipelines.process_data.pipeline import pipeline as process_data_pipeline
def register_pipelines() -> dict[str, Pipeline]:
    """Register the project's pipelines.

    Returns:
        A mapping from pipeline names to ``Pipeline`` objects. The
        ``process_data`` pipeline is also exposed as ``__default__`` so a
        bare ``kedro run`` executes it.
    """
    # Pipelines are registered explicitly (rather than via find_pipelines())
    # because this project exposes a single, manually defined pipeline.
    return {
        "process_data": process_data_pipeline,
        "__default__": process_data_pipeline,
    }

@ -0,0 +1,10 @@
"""
This is a boilerplate pipeline 'mon_pipeline'
generated using Kedro 0.19.11
"""
from .pipeline import create_pipeline
__all__ = ["create_pipeline"]
__version__ = "0.1"

@ -0,0 +1,4 @@
"""
This is a boilerplate pipeline 'mon_pipeline'
generated using Kedro 0.19.11
"""

@ -0,0 +1,10 @@
"""
This is a boilerplate pipeline 'mon_pipeline'
generated using Kedro 0.19.11
"""
from kedro.pipeline import node, Pipeline, pipeline # noqa
def create_pipeline(**kwargs) -> Pipeline:
    """Assemble the 'mon_pipeline' pipeline.

    Currently returns an empty boilerplate pipeline; add nodes here.
    """
    return pipeline([])

@ -0,0 +1,14 @@
import pandas as pd
def process_data(
    data: pd.DataFrame,
    columns: tuple[str, ...] = ('flag', 'k', 'index', 'rate'),
) -> pd.DataFrame:
    """Drop unwanted columns from the input data.

    Args:
        data: Input DataFrame; must contain every label in ``columns``.
        columns: Column labels to remove. Defaults to the helper columns
            of the raw fake data, preserving the original behaviour.

    Returns:
        A new DataFrame without the dropped columns (``data`` is not
        modified in place).

    Raises:
        KeyError: If any requested column is missing from ``data``.
    """
    return data.drop(columns=list(columns))

@ -0,0 +1,13 @@
from kedro.pipeline import Pipeline, node
from .node import process_data
# Wrap process_data in a node: it reads the "fake_data" catalog entry and
# writes its result back as "processed_data".
mynode = node(
    func=process_data,
    inputs="fake_data",
    outputs="processed_data",
    name="process_data_node",
)

# Pipeline containing the single processing node; imported by
# pipeline_registry.register_pipelines().
pipeline = Pipeline([mynode])

@ -0,0 +1,46 @@
"""Project settings. There is no need to edit this file unless you want to change values
from the Kedro defaults. For further information, including these default values, see
https://docs.kedro.org/en/stable/kedro_project_setup/settings.html."""
# Instantiated project hooks.
# For example, after creating a hooks.py and defining a ProjectHooks class there, do
# from myproject.hooks import ProjectHooks
# Hooks are executed in a Last-In-First-Out (LIFO) order.
# HOOKS = (ProjectHooks(),)
# Installed plugins for which to disable hook auto-registration.
# DISABLE_HOOKS_FOR_PLUGINS = ("kedro-viz",)
# Class that manages storing KedroSession data.
# from kedro.framework.session.store import BaseSessionStore
# SESSION_STORE_CLASS = BaseSessionStore
# Keyword arguments to pass to the `SESSION_STORE_CLASS` constructor.
# SESSION_STORE_ARGS = {
# "path": "./sessions"
# }
# Directory that holds configuration.
# CONF_SOURCE = "conf"
# Class that manages how configuration is loaded.
# from kedro.config import OmegaConfigLoader
# CONFIG_LOADER_CLASS = OmegaConfigLoader
# Keyword arguments to pass to the `CONFIG_LOADER_CLASS` constructor.
CONFIG_LOADER_ARGS = {
    # Environment holding shared configuration (conf/base).
    "base_env": "base",
    # Environment applied on top of base at run time (conf/local).
    "default_run_env": "local",
    # "config_patterns": {
    #     "spark" : ["spark*/"],
    #     "parameters": ["parameters*", "parameters*/**", "**/parameters*"],
    # }
}
# Class that manages Kedro's library components.
# from kedro.framework.context import KedroContext
# CONTEXT_CLASS = KedroContext
# Class that manages the Data Catalog.
# from kedro.io import DataCatalog
# DATA_CATALOG_CLASS = DataCatalog

@ -0,0 +1,9 @@
"""
This is a boilerplate test file for pipeline 'mon_pipeline'
generated using Kedro 0.19.11.
Please add your pipeline tests here.
Kedro recommends using `pytest` framework, more info about it can be found
in the official documentation:
https://docs.pytest.org/en/latest/getting-started.html
"""
Loading…
Cancel
Save