From ed01c1c5a472196e6f9e391eda62e8d5ea8fecd6 Mon Sep 17 00:00:00 2001 From: gwen Date: Tue, 20 Jun 2023 14:42:22 +0200 Subject: [PATCH] add xml pipeline --- README.md | 36 +- actes-princiers/README.md | 122 ----- actes-princiers/conf/base/catalog.yml | 8 + .../xml/Anjou/anj_is_i_1441_08_05a.html | 13 + actes-princiers/data/05_model_input/.gitkeep | 0 actes-princiers/data/06_models/.gitkeep | 0 actes-princiers/data/07_model_output/.gitkeep | 0 .../pipelines/xml_processing/__init__.py | 3 + .../xml_processing/actes_princiers.xsl | 483 ++++++++++++++++++ .../pipelines/xml_processing/nodes.py | 26 + .../pipelines/xml_processing/pipeline.py | 17 + actes-princiers/src/requirements.txt | 2 + 12 files changed, 573 insertions(+), 137 deletions(-) delete mode 100644 actes-princiers/README.md create mode 100644 actes-princiers/data/02_intermediate/xml/Anjou/anj_is_i_1441_08_05a.html delete mode 100644 actes-princiers/data/05_model_input/.gitkeep delete mode 100644 actes-princiers/data/06_models/.gitkeep delete mode 100644 actes-princiers/data/07_model_output/.gitkeep create mode 100755 actes-princiers/src/actes_princiers/pipelines/xml_processing/__init__.py create mode 100644 actes-princiers/src/actes_princiers/pipelines/xml_processing/actes_princiers.xsl create mode 100755 actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py create mode 100755 actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py diff --git a/README.md b/README.md index 32ef2fa..5fb7c35 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,14 @@ Open a terminal in the `actes-princiers`'s folder and launch kedro `kedro run` +or launch a specific node in the pipeline with: + +`kedro run --nodes=preprocess_html` + +or a search by tags with: + +`kedro run --tags=xsl` + ## Visualizing the pipelines `kedro viz` @@ -39,21 +47,6 @@ Declare any dependencies in `src/requirements.txt` for `pip` installation. 
To install them, run: `pip install -r src/requirements.txt` -## Project dependencies - -To generate or update the dependency requirements for your project: - -``` -kedro build-reqs -``` - -This will `pip-compile` the contents of `src/requirements.txt` into a new file `src/requirements.lock`. You can see the output of the resolution by opening `src/requirements.lock`. - -After this, if you'd like to update your project requirements, please update `src/requirements.txt` and re-run `kedro build-reqs`. - -[Further information about project dependencies](https://kedro.readthedocs.io/en/stable/kedro_project_setup/dependencies.html#project-specific-dependencies) - - ## tips You need to reload Kedro variables by calling `%reload_kedro` in your notebook and re-run the code snippet @@ -73,3 +66,16 @@ Dans `actes-princiers/.gitignore`, # ignore everything in the following folders # data/** +## make a package for deployment + +[package based deployment](https://docs.kedro.org/en/stable/deployment/single_machine.html#package-based) + +If you prefer not to use containerisation, you can instead package your Kedro project using kedro package. + +Run the following in your project’s root directory: + +kedro package + +Kedro builds the package into the dist/ folder of your project, and creates a .whl file, which is a Python packaging format for binary distribution. + + diff --git a/actes-princiers/README.md b/actes-princiers/README.md deleted file mode 100644 index 35f6697..0000000 --- a/actes-princiers/README.md +++ /dev/null @@ -1,122 +0,0 @@ -# Actes Princiers - -## Overview - -This is your new Kedro project, which was generated using `Kedro 0.18.10`. - -Take a look at the [Kedro documentation](https://docs.kedro.org) to get started. 
- -## Rules and guidelines - -In order to get the best out of the template: - -* Don't remove any lines from the `.gitignore` file we provide -* Make sure your results can be reproduced by following a data engineering convention -* Don't commit data to your repository -* Don't commit any credentials or your local configuration to your repository. Keep all your credentials and local configuration in `conf/local/` - -## How to install dependencies - -Declare any dependencies in `src/requirements.txt` for `pip` installation and `src/environment.yml` for `conda` installation. - -To install them, run: - -``` -pip install -r src/requirements.txt -``` - -## How to run your Kedro pipeline - -You can run your Kedro project with: - -``` -kedro run -``` - -## How to test your Kedro project - -Have a look at the file `src/tests/test_run.py` for instructions on how to write your tests. You can run your tests as follows: - -``` -kedro test -``` - -To configure the coverage threshold, go to the `.coveragerc` file. - -## Project dependencies - -To generate or update the dependency requirements for your project: - -``` -kedro build-reqs -``` - -This will `pip-compile` the contents of `src/requirements.txt` into a new file `src/requirements.lock`. You can see the output of the resolution by opening `src/requirements.lock`. - -After this, if you'd like to update your project requirements, please update `src/requirements.txt` and re-run `kedro build-reqs`. - -[Further information about project dependencies](https://docs.kedro.org/en/stable/kedro_project_setup/dependencies.html#project-specific-dependencies) - -## How to work with Kedro and notebooks - -> Note: Using `kedro jupyter` or `kedro ipython` to run your notebook provides these variables in scope: `context`, `catalog`, and `startup_error`. 
-> -> Jupyter, JupyterLab, and IPython are already included in the project requirements by default, so once you have run `pip install -r src/requirements.txt` you will not need to take any extra steps before you use them. - -### Jupyter -To use Jupyter notebooks in your Kedro project, you need to install Jupyter: - -``` -pip install jupyter -``` - -After installing Jupyter, you can start a local notebook server: - -``` -kedro jupyter notebook -``` - -### JupyterLab -To use JupyterLab, you need to install it: - -``` -pip install jupyterlab -``` - -You can also start JupyterLab: - -``` -kedro jupyter lab -``` - -### IPython -And if you want to run an IPython session: - -``` -kedro ipython -``` - -### How to convert notebook cells to nodes in a Kedro project -You can move notebook code over into a Kedro project structure using a mixture of [cell tagging](https://jupyter-notebook.readthedocs.io/en/stable/changelog.html#release-5-0-0) and Kedro CLI commands. - -By adding the `node` tag to a cell and running the command below, the cell's source code will be copied over to a Python file within `src//nodes/`: - -``` -kedro jupyter convert -``` -> *Note:* The name of the Python file matches the name of the original notebook. - -Alternatively, you may want to transform all your notebooks in one go. Run the following command to convert all notebook files found in the project root directory and under any of its sub-folders: - -``` -kedro jupyter convert --all -``` - -### How to ignore notebook output cells in `git` -To automatically strip out all output cell contents before committing to `git`, you can run `kedro activate-nbstripout`. This will add a hook in `.git/config` which will run `nbstripout` before anything is committed to `git`. - -> *Note:* Your output cells will be retained locally. 
- -## Package your Kedro project - -[Further information about building project documentation and packaging your project](https://docs.kedro.org/en/stable/tutorial/package_a_project.html) diff --git a/actes-princiers/conf/base/catalog.yml b/actes-princiers/conf/base/catalog.yml index b9912e0..b195964 100644 --- a/actes-princiers/conf/base/catalog.yml +++ b/actes-princiers/conf/base/catalog.yml @@ -25,3 +25,11 @@ preprocessed_actors: save_args: sep: ";" +parse_xsl: + type: pandas.XMLDataSet + filepath: data/01_raw/xml/Anjou/anj_is_i_1441_08_05a.xml + +preprocess_html: + type: pandas.XMLDataSet + filepath: data/02_intermediate/xml/Anjou/anj_is_i_1441_08_05a.html + diff --git a/actes-princiers/data/02_intermediate/xml/Anjou/anj_is_i_1441_08_05a.html b/actes-princiers/data/02_intermediate/xml/Anjou/anj_is_i_1441_08_05a.html new file mode 100644 index 0000000..68ebad0 --- /dev/null +++ b/actes-princiers/data/02_intermediate/xml/Anjou/anj_is_i_1441_08_05a.html @@ -0,0 +1,13 @@ + + + + + + + + + + + + + diff --git a/actes-princiers/data/05_model_input/.gitkeep b/actes-princiers/data/05_model_input/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/actes-princiers/data/06_models/.gitkeep b/actes-princiers/data/06_models/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/actes-princiers/data/07_model_output/.gitkeep b/actes-princiers/data/07_model_output/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/actes-princiers/src/actes_princiers/pipelines/xml_processing/__init__.py b/actes-princiers/src/actes_princiers/pipelines/xml_processing/__init__.py new file mode 100755 index 0000000..e08ea5f --- /dev/null +++ b/actes-princiers/src/actes_princiers/pipelines/xml_processing/__init__.py @@ -0,0 +1,3 @@ +"Data Processing pipeline" + +from .pipeline import create_pipeline # NOQA diff --git a/actes-princiers/src/actes_princiers/pipelines/xml_processing/actes_princiers.xsl 
b/actes-princiers/src/actes_princiers/pipelines/xml_processing/actes_princiers.xsl new file mode 100644 index 0000000..98015a6 --- /dev/null +++ b/actes-princiers/src/actes_princiers/pipelines/xml_processing/actes_princiers.xsl @@ -0,0 +1,483 @@ + + + + + + + + + + +
+ +
+ +
+
+ + +
+
    + +
+
+
+ + + + + + + + + + + + + + + + + + font-variant: small-caps; background-color: inherit; + + + + + + + + + + + + _blank + + + + + + + + + text_etabli + + + + + + + + + + . + + + + + . + + + + + + + + + + + + + , + + + + + + « + + + », dans + + + + + + + , + + + + + , n° + + + + + + + + + - + + + + + + + + + + + + + , + + + , + + + + + + + + , + + ( + + ), + + + + + , + + + + + , + + + + , + + + + + + p. + + + + + pp. + + - + + + + + + + , n° + + + + + + + + + + + + + . + + + + + + + + + + + , + + + + + + + + , p. + + + + + , pp. + + - + + + + + + , n° + + + + . + + + + + + + text-center + + + + + + . — + + . + + + + + . + + + + + + + + + + + + + + + + analyse + + + + + + + + + + + + + + + + + + tradition + + + + + + + + + + Cliquer pour afficher une image de l'acte. + + + + + + 100% + auto + + + + + + + + + + + + + + + + + + + + + + font-variant: small-caps; background-color: inherit; + + Analyse : + + + + + + + + + + font-variant: small-caps; background-color: inherit; + + Mention : + + + + + + + + + + font-variant: small-caps; background-color: inherit; + + Indiqué : + + + + + + + + + + + + + + + + + + + + # + + + + + + + + + + + + fnref: + + + + + #fn: + + + + footnote + + + + + + + + + + + act + + + + + + + + + + + + + + mht + + + + + (À gauche :) + + + (À droite :) + + + (Sur le repli, à droite :) + + + (Sur le repli, à gauche :) + + + + + + + + + signature + + + + (Signé :) + + + + + + + + + + + + + + . + + + + + + + + + fn: + + + + footnote + + + + + . + + + + + +
\ No newline at end of file
diff --git a/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py b/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py
new file mode 100755
index 0000000..269e216
--- /dev/null
+++ b/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py
@@ -0,0 +1,26 @@
+import pandas as pd
+from lxml import etree
+
+from pathlib import Path
+
+# The XSLT stylesheet is resolved relative to this module, not the working dir.
+_here = Path(__file__).resolve().parent
+xsl_stylesheet = _here / "actes_princiers.xsl"
+
+def parse_xsl(xmldoc: pd.DataFrame) -> pd.DataFrame:
+    """Return ``xmldoc`` unchanged; the XSLT step drafted below is disabled."""
+    # TODO: enable the transformation. Draft kept from the original commit:
+    #   source_doc = etree.fromstring(xmldoc.to_xml())
+    #   # removing namespace:
+    #   query = "descendant-or-self::*[namespace-uri()!='']"
+    #   for element in source_doc.xpath(query):
+    #       # replace element name with its local name
+    #       element.tag = etree.QName(element).localname
+    #   etree.cleanup_namespaces(source_doc)
+    #
+    #   xslt_doc = etree.parse(str(xsl_stylesheet))
+    #   xslt_transformer = etree.XSLT(xslt_doc)
+    #   output_doc = xslt_transformer(source_doc)
+    #   return pd.read_html(output_doc)
+    # NOTE(review): pd.read_html yields a list of DataFrames - confirm return type.
+    return xmldoc
diff --git a/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py b/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py
new file mode 100755
index 0000000..e5ec404
--- /dev/null
+++ b/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py
@@ -0,0 +1,17 @@
+from kedro.pipeline import Pipeline, node, pipeline
+
+from .nodes import parse_xsl
+
+
+def create_pipeline(**kwargs) -> Pipeline:  # kwargs are accepted but not used here
+    return pipeline(
+        [
+            node(
+                func=parse_xsl,
+                inputs="parse_xsl",  # catalog entry: raw XML (conf/base/catalog.yml)
+                outputs="preprocess_html",  # catalog entry: intermediate HTML file
+                name="preprocess_html",  # enables `kedro run --nodes=preprocess_html`
+                tags="xsl",  # enables `kedro run --tags=xsl`
+            ),
+        ]
+    )
diff --git a/actes-princiers/src/requirements.txt b/actes-princiers/src/requirements.txt
index 78cce8f..e491140 100644
--- a/actes-princiers/src/requirements.txt
+++ b/actes-princiers/src/requirements.txt
@@ -1,3 +1,5 @@ +lxml>=4.6.3 +python-slugify>=8.0.1 black~=22.0 flake8>=3.7.9, <5.0 ipython>=7.31.1, <8.0; python_version < '3.8'