Clean up and configure logger

develop
gwen 3 years ago
parent 9535597e91
commit e6164dc134

@ -18,9 +18,6 @@ WARNING: Please do not put access credentials in the base configuration folder.
## Instructions
## Need help?
[Find out more about configuration from the Kedro documentation](https://docs.kedro.org/en/stable/kedro_project_setup/configuration.html).

@ -1,6 +1,3 @@
# _________________________________________________________________________
# loading some data catalogs
actors:
type: pandas.CSVDataSet
filepath: data/01_raw/csv/actors.csv
@ -20,60 +17,3 @@ corpus-charles-i:
sep: ";"
# _________________________________________________________________________
# custom csv dataset test sample
dataset_test:
type: myowndataset.MyOwnDataSet
filepath: data/01_raw/csv/actors.csv
load_args:
sep: ";"
preprocessed_dataset_test:
type: myowndataset.MyOwnDataSet
filepath: data/02_intermediate/csv/preprocessed_test_dataset.csv
save_args:
sep: ";"
# _________________________________________________________________________
# custom xml dataset sample
load_xml:
type: actesdataset.XMLDataSet
filepath: data/01_raw/xml/anjou/anj_is_i_1441_08_05a.xml
preprocess_html:
type: actesdataset.XMLDataSet
filepath: data/02_intermediate/xml/anjou/anj_is_i_1441_08_05a.html
# _________________________________________________________________________
# same test with kedro.io.PartitionedDataSet
# warning:
# this kind of yaml data is generated programmatically
# in the generic data loader
#load_full_xml_catalog:
# type: PartitionedDataSet
# path: data/01_raw/xml/anjou/
# dataset:
# type: actesdataset.XMLDataSet
# filename_suffix: '.xml'
#preprocess_full_catalog_html:
# type: PartitionedDataSet
# path: data/02_intermediate/xml/anjou/
# dataset:
# type: actesdataset.XMLDataSet
# filename_suffix: '.html'
# _________________________________________________________________________
preprocessed_actors:
type: pandas.CSVDataSet
filepath: data/02_intermediate/csv/preprocessed_actors.csv
save_args:
sep: ";"

@ -17,7 +17,7 @@ handlers:
class: logging.handlers.RotatingFileHandler
level: INFO
formatter: simple
filename: info.log
filename: logs/info.log
maxBytes: 10485760 # 10MB
backupCount: 20
encoding: utf8

@ -1,3 +0,0 @@
"""Data Processing pipeline."""
from .pipeline import create_pipeline  # NOQA: re-export as the package's public entry point

@ -1,17 +0,0 @@
import pandas as pd
import numpy as np
#def _is_true(x: pd.Series) -> pd.Series:
# return x == "t"
#def _parse_percentage(x: pd.Series) -> pd.Series:
# x = x.str.replace("%", "")
# x = x.astype(float) / 100
# return x
def preprocess_actors(actors: pd.DataFrame) -> pd.DataFrame:
    """Replace the "XXXX" placeholder values in *actors* with NaN.

    Args:
        actors: Raw actors table loaded from the catalog.

    Returns:
        A new DataFrame where every "XXXX" cell has been replaced by NaN.
        The input frame is not modified.
    """
    # Bug fix: DataFrame.replace is NOT in-place — the original discarded
    # its result and returned the input unchanged.
    return actors.replace("XXXX", np.NaN)
#def parse_xsl(

@ -1,22 +0,0 @@
from kedro.pipeline import Pipeline, node, pipeline
from .nodes import preprocess_actors
def create_pipeline(**kwargs) -> Pipeline:
    """Assemble the data-processing pipeline.

    Contains a single node that feeds the raw ``actors`` dataset through
    :func:`preprocess_actors` and publishes ``preprocessed_actors``.
    """
    preprocess_node = node(
        func=preprocess_actors,
        inputs="actors",
        outputs="preprocessed_actors",
        name="preprocess_actors_node",
    )
    return pipeline([preprocess_node])

@ -1,3 +0,0 @@
"""Data Processing pipeline."""
from .pipeline import create_pipeline  # NOQA: re-export as the package's public entry point

@ -1,16 +0,0 @@
import pandas as pd
import numpy as np
#def _is_true(x: pd.Series) -> pd.Series:
# return x == "t"
#def _parse_percentage(x: pd.Series) -> pd.Series:
# x = x.str.replace("%", "")
# x = x.astype(float) / 100
# return x
def test_dataset(actors: pd.DataFrame) -> pd.DataFrame:
actors.replace("XXXX", np.NaN)
# print(actors.head())
return actors

@ -1,16 +0,0 @@
from kedro.pipeline import Pipeline, node, pipeline
from .nodes import test_dataset
def create_pipeline(**kwargs) -> Pipeline:
    """Assemble the dataset-test pipeline.

    A single node runs :func:`test_dataset` on the ``dataset_test`` input
    and publishes ``preprocessed_dataset_test``.
    """
    process_node = node(
        func=test_dataset,
        inputs="dataset_test",
        outputs="preprocessed_dataset_test",
        name="process_test_dataset_node",
    )
    return pipeline([process_node])

@ -22,7 +22,7 @@ def nodes_factory(nodes_description):
nodes = []
for node_description in nodes_description:
node_name = node_description['name']
logger.info(f"building node: {node_name}...")
# logger.info(f"building node: {node_name}...")
nodes.append(node(
func=parse_xsl,
inputs=[node_description['inputs'], "params:xlststylesheet"],

@ -1,24 +0,0 @@
from pathlib import Path, PurePosixPath
import pandas as pd
from kedro.io import AbstractDataSet
class MyOwnDataSet(AbstractDataSet[pd.DataFrame, pd.DataFrame]):
    """Custom Kedro dataset reading/writing a CSV file with pandas.

    The catalog may pass ``load_args`` / ``save_args`` (e.g. ``sep: ";"``)
    which are forwarded to ``pandas.read_csv`` / ``DataFrame.to_csv``.
    """

    def __init__(self, filepath, load_args=None, save_args=None):
        """Create the dataset pointing at *filepath*.

        Args:
            filepath: Path to the CSV file.
            load_args: Optional keyword arguments for ``pandas.read_csv``.
            save_args: Optional keyword arguments for ``DataFrame.to_csv``.
        """
        self._filepath = PurePosixPath(filepath)
        # Bug fix: load_args/save_args were accepted but never stored or
        # used, so catalog options such as `sep: ";"` were silently ignored.
        self._load_args = dict(load_args) if load_args else {}
        self._save_args = dict(save_args) if save_args else {}

    def _load(self) -> pd.DataFrame:
        return pd.read_csv(self._filepath, **self._load_args)

    def _save(self, df: pd.DataFrame) -> None:
        df.to_csv(str(self._filepath), **self._save_args)

    def _exists(self) -> bool:
        return Path(self._filepath.as_posix()).exists()

    def _describe(self):
        # Minimal description; extend with filepath/args if more detail is needed.
        return dict(name="my own dataset")
Loading…
Cancel
Save