From e6164dc134d4e6e3109c41be8e7252fe2c90a4be Mon Sep 17 00:00:00 2001 From: gwen Date: Thu, 29 Jun 2023 19:22:46 +0200 Subject: [PATCH] cleaning and configure logger --- actes-princiers/conf/README.md | 3 - actes-princiers/conf/base/catalog.yml | 60 ------------------- actes-princiers/conf/base/logging.yml | 2 +- actes-princiers/logs/.gitkeep | 0 .../pipelines/data_processing/__init__.py | 3 - .../pipelines/data_processing/nodes.py | 17 ------ .../pipelines/data_processing/pipeline.py | 22 ------- .../pipelines/test_dataset/__init__.py | 3 - .../pipelines/test_dataset/nodes.py | 16 ----- .../pipelines/test_dataset/pipeline.py | 16 ----- .../pipelines/xml_processing/pipeline.py | 2 +- actes-princiers/src/myowndataset.py | 24 -------- 12 files changed, 2 insertions(+), 166 deletions(-) create mode 100644 actes-princiers/logs/.gitkeep delete mode 100755 actes-princiers/src/actes_princiers/pipelines/data_processing/__init__.py delete mode 100755 actes-princiers/src/actes_princiers/pipelines/data_processing/nodes.py delete mode 100755 actes-princiers/src/actes_princiers/pipelines/data_processing/pipeline.py delete mode 100755 actes-princiers/src/actes_princiers/pipelines/test_dataset/__init__.py delete mode 100755 actes-princiers/src/actes_princiers/pipelines/test_dataset/nodes.py delete mode 100755 actes-princiers/src/actes_princiers/pipelines/test_dataset/pipeline.py delete mode 100644 actes-princiers/src/myowndataset.py diff --git a/actes-princiers/conf/README.md b/actes-princiers/conf/README.md index 4379b1e..cee6f56 100644 --- a/actes-princiers/conf/README.md +++ b/actes-princiers/conf/README.md @@ -18,9 +18,6 @@ WARNING: Please do not put access credentials in the base configuration folder. ## Instructions - - - ## Need help? [Find out more about configuration from the Kedro documentation](https://docs.kedro.org/en/stable/kedro_project_setup/configuration.html). diff --git a/actes-princiers/conf/base/catalog.yml b/actes-princiers/conf/base/catalog.yml index 2ccba3c..9f4cd33 100644 --- a/actes-princiers/conf/base/catalog.yml +++ b/actes-princiers/conf/base/catalog.yml @@ -1,6 +1,3 @@ -# _________________________________________________________________________ -# loading some data catalogs - actors: type: pandas.CSVDataSet filepath: data/01_raw/csv/actors.csv @@ -20,60 +17,3 @@ corpus-charles-i: sep: ";" -# _________________________________________________________________________ -# custom csv dataset test sample - -dataset_test: - type: myowndataset.MyOwnDataSet - filepath: data/01_raw/csv/actors.csv - load_args: - sep: ";" - -preprocessed_dataset_test: - type: myowndataset.MyOwnDataSet - filepath: data/02_intermediate/csv/preprocessed_test_dataset.csv - save_args: - sep: ";" - -# _________________________________________________________________________ -# custom xml dataset sample - -load_xml: - type: actesdataset.XMLDataSet - filepath: data/01_raw/xml/anjou/anj_is_i_1441_08_05a.xml - -preprocess_html: - type: actesdataset.XMLDataSet - filepath: data/02_intermediate/xml/anjou/anj_is_i_1441_08_05a.html - -# _________________________________________________________________________ -# same test with kedro.io.PartitionedDataSet - -# warning : -# this kind of yaml data in generated programmatically -# in the generic data loader - -#load_full_xml_catalog: -# type: PartitionedDataSet -# path: data/01_raw/xml/anjou/ -# dataset: -# type: actesdataset.XMLDataSet -# filename_suffix: '.xml' - -#preprocess_full_catalog_html: -# type: PartitionedDataSet -# path: data/02_intermediate/xml/anjou/ -# dataset: -# type: actesdataset.XMLDataSet -# filename_suffix: '.html' - -# _________________________________________________________________________ - - -preprocessed_actors: - type: pandas.CSVDataSet - filepath: data/02_intermediate/csv/preprocessed_actors.csv - save_args: - sep: ";" - - diff --git a/actes-princiers/conf/base/logging.yml b/actes-princiers/conf/base/logging.yml index ac8a096..67b741b 100644 --- a/actes-princiers/conf/base/logging.yml +++ b/actes-princiers/conf/base/logging.yml @@ -17,7 +17,7 @@ handlers: class: logging.handlers.RotatingFileHandler level: INFO formatter: simple - filename: info.log + filename: logs/info.log maxBytes: 10485760 # 10MB backupCount: 20 encoding: utf8 diff --git a/actes-princiers/logs/.gitkeep b/actes-princiers/logs/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/actes-princiers/src/actes_princiers/pipelines/data_processing/__init__.py b/actes-princiers/src/actes_princiers/pipelines/data_processing/__init__.py deleted file mode 100755 index e08ea5f..0000000 --- a/actes-princiers/src/actes_princiers/pipelines/data_processing/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -"Data Processing pipeline" - -from .pipeline import create_pipeline # NOQA diff --git a/actes-princiers/src/actes_princiers/pipelines/data_processing/nodes.py b/actes-princiers/src/actes_princiers/pipelines/data_processing/nodes.py deleted file mode 100755 index b03a03f..0000000 --- a/actes-princiers/src/actes_princiers/pipelines/data_processing/nodes.py +++ /dev/null @@ -1,17 +0,0 @@ -import pandas as pd -import numpy as np - -#def _is_true(x: pd.Series) -> pd.Series: -# return x == "t" - -#def _parse_percentage(x: pd.Series) -> pd.Series: -# x = x.str.replace("%", "") -# x = x.astype(float) / 100 -# return x - -def preprocess_actors(actors: pd.DataFrame) -> pd.DataFrame: - actors.replace("XXXX", np.NaN) - return actors - -#def parse_xsl( - diff --git a/actes-princiers/src/actes_princiers/pipelines/data_processing/pipeline.py b/actes-princiers/src/actes_princiers/pipelines/data_processing/pipeline.py deleted file mode 100755 index ee51626..0000000 --- a/actes-princiers/src/actes_princiers/pipelines/data_processing/pipeline.py +++ /dev/null @@ -1,22 +0,0 @@ -from kedro.pipeline import Pipeline, node, pipeline - -from .nodes import preprocess_actors - - -def create_pipeline(**kwargs) -> Pipeline: - return pipeline( - [ - node( - func=preprocess_actors, - inputs="actors", - outputs="preprocessed_actors", - name="preprocess_actors_node", - ), -# node( -# func=parse_xsl, -# inputs="actors", -# outputs="preprocessed_actors", -# name="preprocess_actors_node", -# ), - ] - ) diff --git a/actes-princiers/src/actes_princiers/pipelines/test_dataset/__init__.py b/actes-princiers/src/actes_princiers/pipelines/test_dataset/__init__.py deleted file mode 100755 index e08ea5f..0000000 --- a/actes-princiers/src/actes_princiers/pipelines/test_dataset/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -"Data Processing pipeline" - -from .pipeline import create_pipeline # NOQA diff --git a/actes-princiers/src/actes_princiers/pipelines/test_dataset/nodes.py b/actes-princiers/src/actes_princiers/pipelines/test_dataset/nodes.py deleted file mode 100755 index ed13212..0000000 --- a/actes-princiers/src/actes_princiers/pipelines/test_dataset/nodes.py +++ /dev/null @@ -1,16 +0,0 @@ -import pandas as pd -import numpy as np - -#def _is_true(x: pd.Series) -> pd.Series: -# return x == "t" - -#def _parse_percentage(x: pd.Series) -> pd.Series: -# x = x.str.replace("%", "") -# x = x.astype(float) / 100 -# return x - -def test_dataset(actors: pd.DataFrame) -> pd.DataFrame: - actors.replace("XXXX", np.NaN) -# print(actors.head()) - return actors - diff --git a/actes-princiers/src/actes_princiers/pipelines/test_dataset/pipeline.py b/actes-princiers/src/actes_princiers/pipelines/test_dataset/pipeline.py deleted file mode 100755 index ca3b9b6..0000000 --- a/actes-princiers/src/actes_princiers/pipelines/test_dataset/pipeline.py +++ /dev/null @@ -1,16 +0,0 @@ -from kedro.pipeline import Pipeline, node, pipeline - -from .nodes import test_dataset - - -def create_pipeline(**kwargs) -> Pipeline: - return pipeline( - [ - node( - func=test_dataset, - inputs="dataset_test", - outputs="preprocessed_dataset_test", - name="process_test_dataset_node", - ), - ] - ) diff --git a/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py b/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py index 6c2db71..8ca6f28 100755 --- a/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py +++ b/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py @@ -22,7 +22,7 @@ def nodes_factory(nodes_description): nodes = [] for node_description in nodes_description: node_name = node_description['name'] - logger.info(f"building node: {node_name}...") + # logger.info(f"building node: {node_name}...") nodes.append(node( func=parse_xsl, inputs=[node_description['inputs'], "params:xlststylesheet"], diff --git a/actes-princiers/src/myowndataset.py b/actes-princiers/src/myowndataset.py deleted file mode 100644 index 82a9c7f..0000000 --- a/actes-princiers/src/myowndataset.py +++ /dev/null @@ -1,24 +0,0 @@ -from pathlib import Path, PurePosixPath -import pandas as pd -from kedro.io import AbstractDataSet - -class MyOwnDataSet(AbstractDataSet[pd.DataFrame, pd.DataFrame]): - def __init__(self, filepath, load_args=None, save_args=None): -# print("------------------------------------", str(load_args)) -# def __init__(self, filepath, param1, param2=True): - self._filepath = PurePosixPath(filepath) -# self._param1 = param1 -# self._param2 = param2 - - def _load(self) -> pd.DataFrame: - return pd.read_csv(self._filepath) - - def _save(self, df: pd.DataFrame) -> None: - df.to_csv(str(self._filepath)) -# raise NotImplementedError("Attention : dataset en lecture seule !") - - def _exists(self) -> bool: - return Path(self._filepath.as_posix()).exists() - - def _describe(self): - return dict(name="my own dataset")