first revision
parent
91ec377864
commit
99717bc384
@ -0,0 +1,20 @@
|
||||
# Minimal makefile for Sphinx documentation
|
||||
#
|
||||
|
||||
# You can set these variables from the command line, and also
|
||||
# from the environment for the first two.
|
||||
SPHINXOPTS ?=
|
||||
SPHINXBUILD ?= sphinx-build
|
||||
SOURCEDIR = source
|
||||
BUILDDIR = build
|
||||
|
||||
# Put it first so that "make" without argument is like "make help".
|
||||
help:
|
||||
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
||||
|
||||
.PHONY: help Makefile
|
||||
|
||||
# Catch-all target: route all unknown targets to Sphinx using the new
|
||||
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
|
||||
%: Makefile
|
||||
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
||||
@ -0,0 +1,15 @@
|
||||
Documentation
|
||||
=====================
|
||||
|
||||
Pour lancer le build de la doc, taper::
|
||||
|
||||
make html
|
||||
|
||||
Pour consulter la documentation en HTML, lancer::
|
||||
|
||||
python -m http.server --directory ./build/
|
||||
|
||||
Ouvrir un navigateur et aller à l'URL suivante::
|
||||
|
||||
localhost:8000/
|
||||
|
||||
@ -0,0 +1,35 @@
|
||||
@ECHO OFF
|
||||
|
||||
pushd %~dp0
|
||||
|
||||
REM Command file for Sphinx documentation
|
||||
|
||||
if "%SPHINXBUILD%" == "" (
|
||||
set SPHINXBUILD=sphinx-build
|
||||
)
|
||||
set SOURCEDIR=source
|
||||
set BUILDDIR=build
|
||||
|
||||
%SPHINXBUILD% >NUL 2>NUL
|
||||
if errorlevel 9009 (
|
||||
echo.
|
||||
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
|
||||
echo.installed, then set the SPHINXBUILD environment variable to point
|
||||
echo.to the full path of the 'sphinx-build' executable. Alternatively you
|
||||
echo.may add the Sphinx directory to PATH.
|
||||
echo.
|
||||
echo.If you don't have Sphinx installed, grab it from
|
||||
echo.https://www.sphinx-doc.org/
|
||||
exit /b 1
|
||||
)
|
||||
|
||||
if "%1" == "" goto help
|
||||
|
||||
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
|
||||
goto end
|
||||
|
||||
:help
|
||||
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
|
||||
|
||||
:end
|
||||
popd
|
||||
@ -0,0 +1,24 @@
|
||||
#pip freeze > requirements.txt
|
||||
Sphinx==6.2.1
|
||||
sphinx-rtd-theme==1.2.2
|
||||
alabaster==0.7.13
|
||||
Babel==2.12.1
|
||||
certifi==2023.7.22
|
||||
charset-normalizer==3.2.0
|
||||
docutils==0.18.1
|
||||
idna==3.4
|
||||
imagesize==1.4.1
|
||||
Jinja2==3.1.2
|
||||
MarkupSafe==2.1.3
|
||||
packaging==23.1
|
||||
Pygments==2.16.1
|
||||
requests==2.31.0
|
||||
snowballstemmer==2.2.0
|
||||
sphinxcontrib-applehelp==1.0.6
|
||||
sphinxcontrib-devhelp==1.0.4
|
||||
sphinxcontrib-htmlhelp==2.0.3
|
||||
sphinxcontrib-jquery==4.1
|
||||
sphinxcontrib-jsmath==1.0.1
|
||||
sphinxcontrib-qthelp==1.0.5
|
||||
sphinxcontrib-serializinghtml==1.1.7
|
||||
urllib3==2.0.4
|
||||
Binary file not shown.
|
After Width: | Height: | Size: 70 KiB |
@ -0,0 +1,23 @@
|
||||
Coding Standards
|
||||
====================
|
||||
|
||||
Import ordering
|
||||
-------------------
|
||||
|
||||
1. builtins imports
|
||||
2. pip installed imports
|
||||
3. framework imports
|
||||
4. local project imports
|
||||
|
||||
.. rubric:: Sample
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from typing import Dict
|
||||
from pathlib import Path
|
||||
|
||||
from kedro.framework.context import KedroContext, load_package_context
|
||||
from kedro.pipeline import Pipeline
|
||||
|
||||
from actes_princiers.pipeline_registry import register_pipelines
|
||||
|
||||
@ -0,0 +1,214 @@
|
||||
# actes_princiers documentation build
|
||||
# configuration file, created by sphinx-quickstart.
|
||||
#
|
||||
# All configuration values have a default; values that are commented out
|
||||
# serve to show the default.
|
||||
|
||||
release = "0.1"
|
||||
|
||||
# -- Project information -----------------------------------------------------
|
||||
|
||||
project = "Actes Princiers"
|
||||
author = "Jean-Damien Genero"
|
||||
|
||||
# The short X.Y version.
|
||||
version = "0.1"
|
||||
|
||||
# -- General configuration ---------------------------------------------------
|
||||
|
||||
# Add any Sphinx extension module names here, as strings. They can be
|
||||
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
|
||||
# ones.
|
||||
extensions = [
|
||||
# "sphinx.ext.autodoc",
|
||||
# "sphinx.ext.napoleon",
|
||||
# "sphinx.ext.todo",
|
||||
# "sphinx.ext.coverage",
|
||||
# "sphinx.ext.ifconfig",
|
||||
# "sphinx.ext.viewcode",
|
||||
# "myst_parser",
|
||||
#"nbsphinx",
|
||||
#"sphinx_copybutton",
|
||||
]
|
||||
|
||||
# enable autosummary plugin (table of contents for modules/classes/class
|
||||
# methods)
|
||||
#autosummary_generate = True
|
||||
|
||||
# Add any paths that contain templates here, relative to this directory.
|
||||
templates_path = ["_templates"]
|
||||
|
||||
# The suffix(es) of source filenames.
|
||||
# You can specify multiple suffix as a list of string:
|
||||
#
|
||||
source_suffix = {".rst": "restructuredtext", ".md": "markdown"}
|
||||
|
||||
# The master toctree document.
|
||||
master_doc = "index"
|
||||
|
||||
# The language for content autogenerated by Sphinx. Refer to documentation
|
||||
# for a list of supported languages.
|
||||
#
|
||||
# This is also used if you do content translation via gettext catalogs.
|
||||
# Usually you set "language" from the command line for these cases.
|
||||
language = 'fr'
|
||||
|
||||
# List of patterns, relative to source directory, that match files and
|
||||
# directories to ignore when looking for source files.
|
||||
# This pattern also affects html_static_path and html_extra_path .
|
||||
exclude_patterns = ["_build", "**.ipynb_checkpoints"]
|
||||
|
||||
# The name of the Pygments (syntax highlighting) style to use.
|
||||
pygments_style = "sphinx"
|
||||
default_role = 'code'
|
||||
|
||||
# -- Options for HTML output -------------------------------------------------
|
||||
|
||||
# The theme to use for HTML and HTML Help pages. See the documentation for
|
||||
# a list of builtin themes.
|
||||
#
|
||||
html_theme = "sphinx_rtd_theme"
|
||||
|
||||
html_title = "Actes Princiers"
|
||||
html_short_title = "Actes Princiers"
|
||||
html_show_sourcelink = False
|
||||
html_show_sphinx = False
|
||||
html_show_copyright = True
|
||||
html_logo = "_static/logo.jpg"
|
||||
copyright = '2023, Jean-Damien Genero'
|
||||
|
||||
# Theme options are theme-specific and customize the look and feel of a theme
|
||||
# further. For a list of options available for each theme, see the
|
||||
# documentation.
|
||||
#
|
||||
html_theme_options = {
|
||||
"collapse_navigation": False,
|
||||
"style_external_links": True,
|
||||
'display_version': False,
|
||||
'logo_only': True,
|
||||
# 'style_nav_header_background': 'white'
|
||||
}
|
||||
|
||||
# Add any paths that contain custom static files (such as style sheets) here,
|
||||
# relative to this directory. They are copied after the builtin static files,
|
||||
# so a file named "default.css" will overwrite the builtin "default.css".
|
||||
html_static_path = ["_static"]
|
||||
|
||||
# Custom sidebar templates, must be a dictionary that maps document names
|
||||
# to template names.
|
||||
#
|
||||
# The default sidebars (for documents that don't match any pattern) are
|
||||
# defined by theme itself. Builtin themes are using these templates by
|
||||
# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
|
||||
# 'searchbox.html']``.
|
||||
#
|
||||
# html_sidebars = {}
|
||||
|
||||
html_show_sourcelink = False
|
||||
|
||||
# -- Options for HTMLHelp output ---------------------------------------------
|
||||
|
||||
# Output file base name for HTML help builder.
|
||||
htmlhelp_basename = "actes_princiersdoc"
|
||||
|
||||
# -- Options for LaTeX output ------------------------------------------------
|
||||
|
||||
latex_elements = {
|
||||
# The paper size ('letterpaper' or 'a4paper').
|
||||
#
|
||||
# 'papersize': 'letterpaper',
|
||||
#
|
||||
# The font size ('10pt', '11pt' or '12pt').
|
||||
#
|
||||
# 'pointsize': '10pt',
|
||||
#
|
||||
# Additional stuff for the LaTeX preamble.
|
||||
#
|
||||
# 'preamble': '',
|
||||
#
|
||||
# Latex figure (float) alignment
|
||||
#
|
||||
# 'figure_align': 'htbp',
|
||||
}
|
||||
|
||||
# Grouping the document tree into LaTeX files. List of tuples
|
||||
# (source start file, target name, title,
|
||||
# author, documentclass [howto, manual, or own class]).
|
||||
latex_documents = [
|
||||
(
|
||||
master_doc,
|
||||
"actes_princiers.tex",
|
||||
"actes_princiers Documentation",
|
||||
"Kedro",
|
||||
"manual",
|
||||
)
|
||||
]
|
||||
|
||||
# -- Options for manual page output ------------------------------------------
|
||||
|
||||
# One entry per manual page. List of tuples
|
||||
# (source start file, name, description, authors, manual section).
|
||||
man_pages = [
|
||||
(
|
||||
master_doc,
|
||||
"actes_princiers",
|
||||
"actes_princiers Documentation",
|
||||
[author],
|
||||
1,
|
||||
)
|
||||
]
|
||||
|
||||
# -- Options for Texinfo output ----------------------------------------------
|
||||
|
||||
# Grouping the document tree into Texinfo files. List of tuples
|
||||
# (source start file, target name, title, author,
|
||||
# dir menu entry, description, category)
|
||||
texinfo_documents = [
|
||||
(
|
||||
master_doc,
|
||||
"actes_princiers",
|
||||
"actes_princiers Documentation",
|
||||
author,
|
||||
"actes_princiers",
|
||||
"Project actes_princiers codebase.",
|
||||
"Data-Science",
|
||||
)
|
||||
]
|
||||
|
||||
# -- Options for todo extension ----------------------------------------------
|
||||
|
||||
# If true, `todo` and `todoList` produce output, else they produce nothing.
|
||||
todo_include_todos = False
|
||||
|
||||
# -- Extension configuration -------------------------------------------------
|
||||
|
||||
# nbsphinx_prolog = """
|
||||
# see here for prolog/epilog details:
|
||||
# https://nbsphinx.readthedocs.io/en/0.3.1/prolog-and-epilog.html
|
||||
# """
|
||||
|
||||
# -- NBconvert kernel config -------------------------------------------------
|
||||
nbsphinx_kernel_name = "python3"
|
||||
|
||||
|
||||
#def remove_arrows_in_examples(lines):
|
||||
# for i, line in enumerate(lines):
|
||||
# lines[i] = line.replace(">>>", "")
|
||||
|
||||
|
||||
#def autodoc_process_docstring(app, what, name, obj, options, lines):
|
||||
# remove_arrows_in_examples(lines)
|
||||
|
||||
|
||||
#def skip(app, what, name, obj, skip, options):
|
||||
# if name == "__init__":
|
||||
# return False
|
||||
# return skip
|
||||
|
||||
|
||||
#def setup(app):
|
||||
# app.connect("autodoc-process-docstring", autodoc_process_docstring)
|
||||
# app.connect("autodoc-skip-member", skip)
|
||||
# # add Kedro stylesheets
|
||||
# for stylesheet in find_stylesheets():
|
||||
# app.add_css_file(stylesheet)
|
||||
@ -0,0 +1,214 @@
|
||||
# actes_princiers documentation build
|
||||
# configuration file, created by sphinx-quickstart.
|
||||
#
|
||||
# All configuration values have a default; values that are commented out
|
||||
# serve to show the default.
|
||||
|
||||
release = "0.1"
|
||||
|
||||
# -- Project information -----------------------------------------------------
|
||||
|
||||
project = "actes_princiers"
|
||||
author = "Jean-Damien"
|
||||
|
||||
# The short X.Y version.
|
||||
version = "0.1"
|
||||
|
||||
# -- General configuration ---------------------------------------------------
|
||||
|
||||
# Add any Sphinx extension module names here, as strings. They can be
|
||||
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
|
||||
# ones.
|
||||
extensions = [
|
||||
"sphinx.ext.autodoc",
|
||||
"sphinx.ext.napoleon",
|
||||
"sphinx.ext.todo",
|
||||
"sphinx.ext.coverage",
|
||||
"sphinx.ext.ifconfig",
|
||||
"sphinx.ext.viewcode",
|
||||
# "myst_parser",
|
||||
#"nbsphinx",
|
||||
#"sphinx_copybutton",
|
||||
]
|
||||
|
||||
# enable autosummary plugin (table of contents for modules/classes/class
|
||||
# methods)
|
||||
#autosummary_generate = True
|
||||
|
||||
# Add any paths that contain templates here, relative to this directory.
|
||||
templates_path = ["_templates"]
|
||||
|
||||
# The suffix(es) of source filenames.
|
||||
# You can specify multiple suffix as a list of string:
|
||||
#
|
||||
source_suffix = {".rst": "restructuredtext", ".md": "markdown"}
|
||||
|
||||
# The master toctree document.
|
||||
master_doc = "index"
|
||||
|
||||
# The language for content autogenerated by Sphinx. Refer to documentation
|
||||
# for a list of supported languages.
|
||||
#
|
||||
# This is also used if you do content translation via gettext catalogs.
|
||||
# Usually you set "language" from the command line for these cases.
|
||||
language = 'fr'
|
||||
|
||||
# List of patterns, relative to source directory, that match files and
|
||||
# directories to ignore when looking for source files.
|
||||
# This pattern also affects html_static_path and html_extra_path .
|
||||
exclude_patterns = ["_build", "**.ipynb_checkpoints"]
|
||||
|
||||
# The name of the Pygments (syntax highlighting) style to use.
|
||||
pygments_style = "sphinx"
|
||||
default_role = 'code'
|
||||
|
||||
# -- Options for HTML output -------------------------------------------------
|
||||
|
||||
# The theme to use for HTML and HTML Help pages. See the documentation for
|
||||
# a list of builtin themes.
|
||||
#
|
||||
html_theme = "sphinx_rtd_theme"
|
||||
|
||||
html_title = "Actes Princiers"
|
||||
html_short_title = "Actes Princiers"
|
||||
html_show_sourcelink = False
|
||||
html_show_sphinx = False
|
||||
html_show_copyright = True
|
||||
html_logo = "_static/logo.jpg"
|
||||
copyright = '2020, Jean-Damien Genero'
|
||||
|
||||
# Theme options are theme-specific and customize the look and feel of a theme
|
||||
# further. For a list of options available for each theme, see the
|
||||
# documentation.
|
||||
#
|
||||
html_theme_options = {
|
||||
"collapse_navigation": False,
|
||||
"style_external_links": True,
|
||||
'display_version': False,
|
||||
'logo_only': True,
|
||||
# 'style_nav_header_background': 'white'
|
||||
}
|
||||
|
||||
# Add any paths that contain custom static files (such as style sheets) here,
|
||||
# relative to this directory. They are copied after the builtin static files,
|
||||
# so a file named "default.css" will overwrite the builtin "default.css".
|
||||
html_static_path = ["_static"]
|
||||
|
||||
# Custom sidebar templates, must be a dictionary that maps document names
|
||||
# to template names.
|
||||
#
|
||||
# The default sidebars (for documents that don't match any pattern) are
|
||||
# defined by theme itself. Builtin themes are using these templates by
|
||||
# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
|
||||
# 'searchbox.html']``.
|
||||
#
|
||||
# html_sidebars = {}
|
||||
|
||||
html_show_sourcelink = False
|
||||
|
||||
# -- Options for HTMLHelp output ---------------------------------------------
|
||||
|
||||
# Output file base name for HTML help builder.
|
||||
htmlhelp_basename = "actes_princiersdoc"
|
||||
|
||||
# -- Options for LaTeX output ------------------------------------------------
|
||||
|
||||
latex_elements = {
|
||||
# The paper size ('letterpaper' or 'a4paper').
|
||||
#
|
||||
# 'papersize': 'letterpaper',
|
||||
#
|
||||
# The font size ('10pt', '11pt' or '12pt').
|
||||
#
|
||||
# 'pointsize': '10pt',
|
||||
#
|
||||
# Additional stuff for the LaTeX preamble.
|
||||
#
|
||||
# 'preamble': '',
|
||||
#
|
||||
# Latex figure (float) alignment
|
||||
#
|
||||
# 'figure_align': 'htbp',
|
||||
}
|
||||
|
||||
# Grouping the document tree into LaTeX files. List of tuples
|
||||
# (source start file, target name, title,
|
||||
# author, documentclass [howto, manual, or own class]).
|
||||
latex_documents = [
|
||||
(
|
||||
master_doc,
|
||||
"actes_princiers.tex",
|
||||
"actes_princiers Documentation",
|
||||
"Kedro",
|
||||
"manual",
|
||||
)
|
||||
]
|
||||
|
||||
# -- Options for manual page output ------------------------------------------
|
||||
|
||||
# One entry per manual page. List of tuples
|
||||
# (source start file, name, description, authors, manual section).
|
||||
man_pages = [
|
||||
(
|
||||
master_doc,
|
||||
"actes_princiers",
|
||||
"actes_princiers Documentation",
|
||||
[author],
|
||||
1,
|
||||
)
|
||||
]
|
||||
|
||||
# -- Options for Texinfo output ----------------------------------------------
|
||||
|
||||
# Grouping the document tree into Texinfo files. List of tuples
|
||||
# (source start file, target name, title, author,
|
||||
# dir menu entry, description, category)
|
||||
texinfo_documents = [
|
||||
(
|
||||
master_doc,
|
||||
"actes_princiers",
|
||||
"actes_princiers Documentation",
|
||||
author,
|
||||
"actes_princiers",
|
||||
"Project actes_princiers codebase.",
|
||||
"Data-Science",
|
||||
)
|
||||
]
|
||||
|
||||
# -- Options for todo extension ----------------------------------------------
|
||||
|
||||
# If true, `todo` and `todoList` produce output, else they produce nothing.
|
||||
todo_include_todos = False
|
||||
|
||||
# -- Extension configuration -------------------------------------------------
|
||||
|
||||
# nbsphinx_prolog = """
|
||||
# see here for prolog/epilog details:
|
||||
# https://nbsphinx.readthedocs.io/en/0.3.1/prolog-and-epilog.html
|
||||
# """
|
||||
|
||||
# -- NBconvert kernel config -------------------------------------------------
|
||||
nbsphinx_kernel_name = "python3"
|
||||
|
||||
|
||||
#def remove_arrows_in_examples(lines):
|
||||
# for i, line in enumerate(lines):
|
||||
# lines[i] = line.replace(">>>", "")
|
||||
|
||||
|
||||
#def autodoc_process_docstring(app, what, name, obj, options, lines):
|
||||
# remove_arrows_in_examples(lines)
|
||||
|
||||
|
||||
#def skip(app, what, name, obj, skip, options):
|
||||
# if name == "__init__":
|
||||
# return False
|
||||
# return skip
|
||||
|
||||
|
||||
#def setup(app):
|
||||
# app.connect("autodoc-process-docstring", autodoc_process_docstring)
|
||||
# app.connect("autodoc-skip-member", skip)
|
||||
# # add Kedro stylesheets
|
||||
# for stylesheet in find_stylesheets():
|
||||
# app.add_css_file(stylesheet)
|
||||
Binary file not shown.
|
After Width: | Height: | Size: 72 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 1.6 MiB |
Binary file not shown.
|
After Width: | Height: | Size: 77 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 32 KiB |
@ -0,0 +1,16 @@
|
||||
Projet Actes Princiers
|
||||
=========================
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
|
||||
kedro
|
||||
pipeline
|
||||
data
|
||||
xml
|
||||
coding_standards
|
||||
pipeline_actes_princiers
|
||||
|
||||
.. rubric:: Index
|
||||
|
||||
* :ref:`genindex`
|
||||
@ -0,0 +1,117 @@
|
||||
.. meta::
|
||||
:description: data engineering
|
||||
:keywords: reproducible, maintainable, modular data science code
|
||||
|
||||
Orientation Data du projet
|
||||
============================
|
||||
|
||||
A quoi ça sert ?
|
||||
--------------------
|
||||
|
||||
La question peut se poser : à quoi ça sert de passer à du code datascience ?
|
||||
Pourquoi s'acharner à produire du code maintenable, modulaire, de manière
|
||||
à pouvoir reproduire une interprétation de data ?
|
||||
|
||||
.. admonition:: Les questions que pose l'orientation data d'un projet
|
||||
|
||||
- Qu'est-ce qu'une orientation data ?
|
||||
- Quel est l'intérêt de produire du "datascience code" ?
|
||||
- Ne vaut-il pas mieux revenir au code `database -> html renderer` ?
|
||||
|
||||
|
||||
Le code d'une web app
|
||||
-------------------------
|
||||
|
||||
Pour partir d'un exemple voici un bout du code de la maquette :
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
source_doc = etree.parse(
|
||||
os.path.join(APPPATH, "static", "xml", house, acte_id + '.xml'))
|
||||
# remove namespace :
|
||||
query = "descendant-or-self::*[namespace-uri()!='']"
|
||||
for element in source_doc.xpath(query):
|
||||
#replace element name with its local name
|
||||
element.tag = etree.QName(element).localname
|
||||
etree.cleanup_namespaces(source_doc)
|
||||
|
||||
xslt_doc = etree.parse(os.path.join(APPPATH, "static", "xsl", "actes_princiers.xsl"))
|
||||
xslt_transformer = etree.XSLT(xslt_doc)
|
||||
output_doc = xslt_transformer(source_doc)
|
||||
return render_template("acte.html", house=house, prince=prince,
|
||||
infos=q_acte, place=place[0], doc=doc[0][0], arch=inst[0],
|
||||
diplo=diplo_t[0].replace("_", " "), state=state[0],
|
||||
output_doc=output_doc, name_prince=prince_name[0],
|
||||
transcribers=transcribers)
|
||||
|
||||
Ce code est :
|
||||
|
||||
- difficilement compréhensible,
|
||||
- difficilement maintenable par quelqu'un d'autre que
|
||||
celui qui a produit ce code,
|
||||
- fortement lié à l'organisation d'une base de données
|
||||
et d'un document xml, il n'y a donc pas d'unité des sources
|
||||
de données,
|
||||
- etc...
|
||||
|
||||
|
||||
Un data science pipeline ?
|
||||
--------------------------------
|
||||
|
||||
Un datascience framework, plutôt :
|
||||
|
||||
.. image:: img/KedroRunTimeline.png
|
||||
|
||||
- on charge d'abord un catalogue de données sources
|
||||
- on fait le traitement dans des étapes bien distinctes appelées pipeline
|
||||
|
||||
.. glossary::
|
||||
|
||||
pipeline
|
||||
|
||||
Un pipeline est un processus ordonné.
|
||||
Plusieurs actions sont lancées de manière successive ou
|
||||
bien en parallèle, ces actions sont dépendantes les unes des autres
|
||||
et sont encapsulées dans des nodes.
|
||||
|
||||
`Voici la définition d'un pipeline d'après kedro <https://docs.kedro.org/en/stable/get_started/kedro_concepts.html#pipeline>`_ :
|
||||
|
||||
A pipeline organises the dependencies and execution order of a collection of nodes and connects inputs and outputs while keeping your code modular. The pipeline determines the node execution order by resolving dependencies and does not necessarily run the nodes in the order in which they are passed in.
|
||||
|
||||
Here is a pipeline comprised of the nodes shown above::
|
||||
|
||||
from kedro.pipeline import pipeline
|
||||
|
||||
# Assemble nodes into a pipeline
|
||||
greeting_pipeline = pipeline([return_greeting_node, join_statements_node])
|
||||
|
||||
|
||||
node
|
||||
|
||||
Un node encapsule (enveloppe) une action.
|
||||
Cette action est une fonction (un traitement) python.
|
||||
|
||||
`Voici la définition d'un node d'après kedro <https://docs.kedro.org/en/stable/get_started/kedro_concepts.html#node>`_ :
|
||||
|
||||
In Kedro, a node is a wrapper for a pure Python function that names the inputs and outputs of that function. Nodes are the building block of a pipeline, and the output of one node can be the input of another.
|
||||
|
||||
Here are two simple nodes as an example:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from kedro.pipeline import node
|
||||
|
||||
# First node
|
||||
def return_greeting():
|
||||
return "Hello"
|
||||
|
||||
|
||||
return_greeting_node = node(func=return_greeting, inputs=None, outputs="my_salutation")
|
||||
|
||||
# Second node
|
||||
def join_statements(greeting):
|
||||
return f"{greeting} Kedro!"
|
||||
|
||||
|
||||
join_statements_node = node(
|
||||
join_statements, inputs="my_salutation", outputs="my_message"
|
||||
@ -0,0 +1,103 @@
|
||||
La pipeline kedro des actes princiers
|
||||
=====================================
|
||||
|
||||
L'orientation data science du projet a pour but principal de séparer en plusieurs étapes les différentes problématiques du traitement des données.
|
||||
|
||||
La commande `kedro run` permet de lancer la pipeline, c'est-à-dire le script `pipeline.py` :
|
||||
|
||||
#. `pipeline.py` charge les `catalog.yml` et va chercher `nodes.py`
|
||||
|
||||
#. `nodes.py` va chercher `actesdataset.py`
|
||||
|
||||
#. `actesdataset.py` réalise les traitements de données.
|
||||
|
||||
La pipeline du projet *Actes princiers* possède deux points d'entrée :
|
||||
- Le premier charge les XML sources avec `xml.etree.ElementTree` de la librairie `lxml` et permet de générer un dictionnaire.
|
||||
- Le deuxième charge les XML sources avec la librairie `BeautifulSoup` et permet de générer un JSON.
|
||||
|
||||
`catalog.yml`
|
||||
-------------
|
||||
|
||||
- Dans `/actes-princiers/conf/base/`
|
||||
|
||||
- Ce fichier définit des catalogues de données (*data catalog*) :
|
||||
- de type brut (*raw*) : ce sont les données d'origine qui sont immuables et en lecture seule (les XML et les CSV).
|
||||
- elles se trouvent dans `/actes-princiers/data/01_raw/`.
|
||||
- de type intermédiaire (*intermediate*) = ce sont les données créées après le traitement des données brutes.
|
||||
- elles se trouvent dans `actes-princiers/data/02_intermediate/`.
|
||||
|
||||
`pipeline.py`
|
||||
-------------
|
||||
|
||||
- Dans `actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py`.
|
||||
|
||||
- Ce code commence par importer les fonctions du fichier `nodes.py`. Les `nodes` kedro sont des blocs de la pipeline qui prennent en paramètre des dataset (les XML), font des traitements et renvoient des outputs (un dictionnaire ou un JSON).
|
||||
|
||||
- Il y a deux nodes, qui correspondent à chaque point d'entrée :
|
||||
- `parse_xml_collection`, qui correspond au traitement sur le datacatalog `bourbon` vers le datacatalog `bourbon_xmlcontent`.
|
||||
- La fonction appelée depuis `nodes.py` est `parse_xml_collection()`.
|
||||
- `parse_json_collection`, qui correspond au traitement sur le datacatalog `bourbon_json` vers le datacatalog `bourbon_jsonoutput`.
|
||||
- La fonction appelée depuis `nodes.py` est `parse_json_collection()`.
|
||||
|
||||
.. code-block:: py
|
||||
|
||||
def create_pipeline(**kwargs) -> Pipeline:
|
||||
return pipeline(
|
||||
[
|
||||
node(
|
||||
func=parse_xml_collection,
|
||||
inputs="bourbon",
|
||||
outputs="bourbon_xmlcontent",
|
||||
name="bourbon_ds_collection",
|
||||
),
|
||||
node(
|
||||
func=parse_json_collection,
|
||||
inputs="bourbon_json",
|
||||
outputs="bourbon_jsonoutput",
|
||||
name="bourbon_json_ds_collection",
|
||||
),
|
||||
|
||||
`nodes.py`
|
||||
----------
|
||||
|
||||
- Dans `actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py`.
|
||||
|
||||
- Ce fichier contient les fonctions qui permettent de traiter les données. Il est appelé dans `pipeline.py`.
|
||||
|
||||
- `parse_xml_collection` lit les dataset (= les XML) avec la librairie lxml.
|
||||
- elle commence par lister les fichiers XML d'un dossier donné (correspondant à une maison princière).
|
||||
- les xml sont parsés avec lxml.
|
||||
- la `<div>` du `<body>` est récupérée grâce à la fonction `_xslt` de `actes-princiers/src/actesdataset.py`.
|
||||
- le résultat est stocké dans un fichier avec l'extension `.pseudoxml` car il ne s'agit pas d'un fichier XML bien formé. Le but est d'envoyer ces "bouts" de XML dans un JSON.
|
||||
- la fonction retourne un dictionnaire où le nom du fichier est la clef et le pseudoxml la valeur.
|
||||
|
||||
- `make_json_collection` =
|
||||
- elle commence par lister les fichiers XML d'un dossier donné (correspondant à une maison princière)
|
||||
- ces fichiers sont parsés avec BeautifulSoup.
|
||||
- BeautifulSoup recherche les metadonnées dans le XML
|
||||
- Un dictionnaire est créé avec en clef le nom du fichier et en valeur les métadonnées récupérées par BeautifulSoup.
|
||||
- quand c'est sérialisé (enregistré sur le disque dur), c'est en JSON
|
||||
|
||||
|
||||
`actesdataset.py`
|
||||
-----------------
|
||||
|
||||
- Dans `actes-princiers/src/actesdataset.py`
|
||||
|
||||
- Il s'agit du fichier maître de la pipeline, où sont définies des classes qui sont ensuite instanciées dans `nodes.py`.
|
||||
|
||||
les représentations des dataset, qui va chercher les fonctions précédentes
|
||||
|
||||
- `_xslt(xsltstylesheet)` : va chercher la partie du XML qui nous intéresse (la `<div>` du `<body>`)
|
||||
|
||||
- `class XMLDataSet` : classe abstraite qui n'est jamais instanciée = classe mère en héritage dans la classe suivante.
|
||||
|
||||
- `class EtreeXMLDataSet` : va chercher les `<div>` contenant les actes et fait un dictionnaire, avec la fonction `parse_xml_collection` dans `nodes.py`
|
||||
|
||||
- `class BsXMLDataSet` : va chercher les métadonnées des actes et les met dans un dictionnaire qui est ensuite transformé en json.
|
||||
|
||||
- `class DataSetCollection` : collection abstraite de dataset.
|
||||
|
||||
- `class XMLDataSetCollection` : construit un dictionnaire (= data set container) dans un attribut dataset qui liste les fichiers xml.
|
||||
|
||||
- `class JSONDataSetCollection` : construit un dictionnaire (= data set container) dans un attribut dataset qui liste les fichiers xml.
|
||||
@ -0,0 +1,16 @@
|
||||
Les documents XML
|
||||
======================
|
||||
|
||||
dans `data/01_raw/houses`
|
||||
|
||||
le XML est **bien formé**
|
||||
|
||||
`xmllint --noout *.xml`
|
||||
|
||||
`find ./ -name "*.xml" -exec xmllint --noout {} \;`
|
||||
|
||||
Il reste à montrer qu'il est **valide** au regard des structures_ DTD de la TEI_.
|
||||
|
||||
.. _TEI: https://tei-c.org/Vault/P4/doc/html/DT.html
|
||||
.. _structures: https://tei-c.org/Vault/P4/doc/html/ST.html
|
||||
|
||||
Loading…
Reference in New Issue