From 3693d1650c5d8f8cababc3095bc274ec2e42db46 Mon Sep 17 00:00:00 2001 From: gwen Date: Wed, 21 Jun 2023 08:21:44 +0200 Subject: [PATCH] traitement xslt sur un fichier --- actes-princiers/conf/base/catalog.yml | 38 ++++++++--- actes-princiers/conf/base/parameters.yml | 1 + .../csv/preprocessed_test_dataset.csv | 50 +++++++++++++++ .../xml/Anjou/anj_is_i_1441_08_05a.html | 37 +++++++---- .../notebooks/LoadDataCatalog.ipynb | 63 +++++++++++++++++++ .../pipelines/test_dataset/__init__.py | 3 + .../pipelines/test_dataset/nodes.py | 16 +++++ .../pipelines/test_dataset/pipeline.py | 16 +++++ .../pipelines/xml_processing/nodes.py | 33 ++++------ .../pipelines/xml_processing/pipeline.py | 2 +- actes-princiers/src/actesdataset.py | 46 ++++++++++++++ actes-princiers/src/myowndataset.py | 24 +++++++ .../xsl}/actes_princiers.xsl | 0 13 files changed, 285 insertions(+), 44 deletions(-) create mode 100644 actes-princiers/data/02_intermediate/csv/preprocessed_test_dataset.csv create mode 100755 actes-princiers/src/actes_princiers/pipelines/test_dataset/__init__.py create mode 100755 actes-princiers/src/actes_princiers/pipelines/test_dataset/nodes.py create mode 100755 actes-princiers/src/actes_princiers/pipelines/test_dataset/pipeline.py create mode 100644 actes-princiers/src/actesdataset.py create mode 100644 actes-princiers/src/myowndataset.py rename actes-princiers/{src/actes_princiers/pipelines/xml_processing => static/xsl}/actes_princiers.xsl (100%) diff --git a/actes-princiers/conf/base/catalog.yml b/actes-princiers/conf/base/catalog.yml index b195964..d3517d7 100644 --- a/actes-princiers/conf/base/catalog.yml +++ b/actes-princiers/conf/base/catalog.yml @@ -1,5 +1,5 @@ -# Documentation for this file format can be found in "The Data Catalog" -# Link: https://docs.kedro.org/en/stable/data/data_catalog.html +# _________________________________________________________________________ +# loading some data catalogs actors: type: pandas.CSVDataSet @@ -19,17 +19,39 @@ corpus-charles-i: load_args: sep: ";" -preprocessed_actors: - type: pandas.CSVDataSet - filepath: data/02_intermediate/csv/preprocessed_actors.csv + +# _________________________________________________________________________ +# custom csv dataset test sample + +dataset_test: + type: myowndataset.MyOwnDataSet + filepath: data/01_raw/csv/actors.csv + load_args: + sep: ";" + +preprocessed_dataset_test: + type: myowndataset.MyOwnDataSet + filepath: data/02_intermediate/csv/preprocessed_test_dataset.csv save_args: sep: ";" -parse_xsl: - type: pandas.XMLDataSet +# _________________________________________________________________________ +# custom xml dataset sample + +load_xml: + type: actesdataset.XMLDataSet filepath: data/01_raw/xml/Anjou/anj_is_i_1441_08_05a.xml preprocess_html: - type: pandas.XMLDataSet + type: actesdataset.XMLDataSet filepath: data/02_intermediate/xml/Anjou/anj_is_i_1441_08_05a.html +# _________________________________________________________________________ + +preprocessed_actors: + type: pandas.CSVDataSet + filepath: data/02_intermediate/csv/preprocessed_actors.csv + save_args: + sep: ";" + + diff --git a/actes-princiers/conf/base/parameters.yml b/actes-princiers/conf/base/parameters.yml index e69de29..b1176c5 100644 --- a/actes-princiers/conf/base/parameters.yml +++ b/actes-princiers/conf/base/parameters.yml @@ -0,0 +1 @@ +xlststylesheet: static/xsl/actes_princiers.xsl diff --git a/actes-princiers/data/02_intermediate/csv/preprocessed_test_dataset.csv b/actes-princiers/data/02_intermediate/csv/preprocessed_test_dataset.csv new file mode 100644 index 0000000..3bad8df --- /dev/null +++ b/actes-princiers/data/02_intermediate/csv/preprocessed_test_dataset.csv @@ -0,0 +1,50 @@ +,"NAME;""ROLE"";""HOUSE"";""DATE1"";""DATE2"";""DATE3""" +0,"Charles Ier de Bourbon;""prince"";""Bourbon"";""1400"";""1434"";""1456""" +1,"Gort, Étienne;""secret"";""Bourbon"";""1425"";""1440"";" +2,"Erart;""secret"";""Berry"";""1404"";""1405"";" +3,"Jean de Berry;""prince"";""Berry"";""1337"";""1360"";""1416""" +4,"Agnès de Bourgogne;""prince"";""Bourbon"";""1407"";""1434"";""1476""" +5,"Marghas, Philippe;""secret"";""Bourbon"";""1426"";""1433"";" +6,"Marie de Berry;""prince"";""Bourbon"";""1480"";""1410"";""1434""" +7,"René d'Anjou;""prince"";""Anjou"";"""";"""";""""" +8,"Arthur de Richemont;""prince"";""Bretagne"";"""";"""";""""" +9,"Bernard d'Armagnac;""prince"";""Armagnac"";"""";"""";""""" +10,"Philippe le Bon;""prince"";""Bourgogne"";"""";"""";""""" +11,"Gourriet, Lorrin;""secret"";""Bourbon"";"""";"""";""""" +12,"De Bar, Étienne;""secret"";""Bourbon"";"""";"""";""""" +13,"Gon, Jean;""secret"";""Bourbon"";"""";"""";""""" +14,"Trichon, Jean;""secret"";""Bourbon"";"""";"""";""""" +15,"Chevalier, E.;""secret"";""Bretagne"";"""";"""";""""" +16,"Cadier, Guillaume;""secret"";""Bourbon"";"""";"""";""""" +17,"Decharmeres, J.;""secret"";""Anjou"";"""";"""";""""" +18,"Dommessent;""secret"";""Bourgogne"";""Bretagne"";"""";""""" +19,"Andraut, Laurent;""secret"";""Bourbon"";"""";"""";""""" +20,"Breneal, Jean;""secret"";""Bourgogne"";"""";"""";""""" +21,"De Castillione;""secret"";""Anjou"";"""";"""";""""" +22,"Yollande d'Aragon;""prince"";""Anjou"";"""";"""";""""" +23,"Marie de Blois;""prince"";""Anjou"";"""";"""";""""" +24,"Grauquellin;""secret"";""Anjou"";"""";"""";""""" +25,"Michael;""secret"";""Anjou"";"""";"""";""""" +26,"Matheus;""secret"";""Anjou"";"""";"""";""""" +27,"Louis Ier d'Anjou;""prince"";""Anjou"";"""";"""";""""" +28,"Louis II d'Anjou;""prince"";""Anjou"";"""";"""";""""" +29,"Louis III d'Anjou;""prince"";""Anjou"";"""";"""";""""" +30,"Caillot, G.;""secret"";""Anjou"";"""";"""";""""" +31,"Olivier;""secret"";""Anjou"";"""";"""";""""" +32,"Benepy;""secret"";""Anjou"";"""";"""";""""" +33,"Gontier, Col;""secret"";""Berry"";"""";"""";""""" +34,"Franchome;""secret"";""Anjou"";"""";"""";""""" +35,"Isabelle de Lorraine;""prince"";""Anjou"";"""";"""";""""" +36,"Bollumbrellus;""secret"";""Anjou"";"""";"""";""""" +37,"Nicolao Perigaut;""secret"";""Anjou"";"""";"""";""""" +38,"De Vaulx;""secret"";""Anjou"";"""";"""";""""" +39,"Alardeau, Jean;""secret"";""Anjou"";"""";"""";""""" +40,"Charnières;""secret"";""Anjou"";"""";"""";""""" +41,"Nicolas;""secret"";""Anjou"";"""";"""";""""" +42,"Rouxelet;""secret"";""Anjou"";"""";"""";""""" +43,"Boursier;""secret"";""Anjou"";"""";"""";""""" +44,"Petre;""secret"";""Anjou"";"""";"""";""""" +45,"Ponce Caihe;""secret"";""Anjou"";"""";"""";""""" +46,"J. Crete;""secret"";""Anjou"";"""";"""";""""" +47,"J. de Vernon;""secret"";""Anjou"";"""";"""";""""" +48,"Tourneville, Guillaume;""secret"";""Anjou"";"""";"""";""""" diff --git a/actes-princiers/data/02_intermediate/xml/Anjou/anj_is_i_1441_08_05a.html b/actes-princiers/data/02_intermediate/xml/Anjou/anj_is_i_1441_08_05a.html index 68ebad0..21783a2 100644 --- a/actes-princiers/data/02_intermediate/xml/Anjou/anj_is_i_1441_08_05a.html +++ b/actes-princiers/data/02_intermediate/xml/Anjou/anj_is_i_1441_08_05a.html @@ -1,13 +1,24 @@ - - - - - - - - - - - - - + + +

1441, 5 août. — Château de Tarascon.

+
+

Mandement d'Isabelle de Lorraine, reine de Jérusalem et de Sicile, duchesse d'Anjou, etc., pour le paiement des mille florins de dot assignés par le roi René à Lionne de la Sellana de Brusa, suivant une convention passée avec son mari, Antoine de la Salle

+
+ +
+

A. Original perdu.

+

B. Copie de A. dans le registre Rosa de la Chambre des comptes de Provence1. Archives départementales des Bouches-du-Rhône, B12, f. CLIIIIv [copie numérisée].

+

+ a. Léon-Honoré Labande, « Antoine de La Salle. Nouveaux documents sur sa vie et ses relations avec la maison d'Anjou (Appendices) », dans Bibliothèque de l'École des chartes, n°124, 1904, pp. 352-354 , n°XII [article numérisé]. +

+
+
+

Ysabel, Dei gratia Jherusalem et Sicilie regina, Andegavie, Barri et Lothoringue ducissa, comitatiuum Provincie et Forcalquerii, Cenomanie ac Pedemontis comitissa et in dictis Provincie et Forcalquerii terrisque adjacentibus regia vicaria et locuntenentes generalis, thesaurariis generalibus Provincie, necnon clavariis curiarum regiarum de Forcalquerio, Sistarici... presentibus et futuris et eorum cuilibet, gratiam et bonam voluntatem.

+

Cum pro solucione et satisfacione mille florenorum monete Provincie in dotem constitutorum et assignatorum per metuendissimum dominum meum regem nobili domicelle Lione, uxori magnifici Anthonii de Sala, consiliarii regii atque nostri dilecti, habendorum quidem et percipiendorum super juribus laudimiorum et trezenorum ac super juribus retencionum nostre curie competentibus in dictis vicaria Forcalquerii ac bajuliis Sistarici, Mosteriarum, Digne, Gastellane, Collismarcii, Guillelmi et Sancti Pauli Vencesii, appunctatum fuerit inter nos et dictum Anthonetum de la Sala, presentibus et consencientibus magistris racionalibus magne regie curie, ut actentis oneribus occurentibus curie regie propinque, opus est ut de parte dictorum proventuum, tam pro gagiis magistrorum racionalium exsolvendis, quam pro aliis negociis regiis succurratur et subveniatur, dictus Anthonius seu ipsa nobilis Liona dicta jura uno anno integraliter recipiat per manus vestras, incipiendo die prima mensis novembris proxime futuri, anno finito et completo jura ipsa pro ipsis gagiis magistrorum racionalium et aliis oneribus more cedant per alium annum inde sequentem, et ipso anno revoluto dictus Anthonius seu dicta uxor ejus dicta jura iterum habeat alterius anni, donec eidem Lione de dicta summa mille florenorum fuerit integraliter persolutum et satisfactum.

+

Igitur in executionem dicti appunctamenti, volumus et vobis tenore presencium, cum deliberacione regii nobis assistentis consilii, precipimus et mandamus quatinus prefatis Anthonio et Lione, seu ejus (sic) legitimo procuratori, peccunias dictorum jurium que proveniunt a die prima maii proxime preteriti nunc usque, et que provenient exinde usque primam diem maii proxime futuri, sive infra unum annum finiendum prima maii venientis quo computabitur millesimo quadringentesimo quadragesimo secundo, proveniendorum, alternis annis, quousque de dicta summa mille florenorum in dotem pro dote (sic) constituta et assignata eisdem fuerit integraliter satisfactum et exsolutum, tradatis et exsolvatis, vestrumque quilibet, prout ad eum spectabit, tradat et exsolvat integre et sine diminucione et contradicione quibuscumque...

+

Datum in Castro nostro Tharasconis, per magnificum virum Jeronimum de Mirabollis, de Neapoli, juris utriusque professorem, curie camere summarie regni Sicilie presidentem, majoremque et secundarum appellacionum et nullitatum comitatuum Provincie et Forcalquerii predictorum judicem, consiliarium regium atque nostrum dilectum, die quinta mensis augusti, anno Domini millesimo quadringentesimo quadragesimo primo.

+

Per reginam, domino senescallo Provincie, domino Valliscluse...

+

Gratis Registrata.

+

Matheus

+
+

1. En-tête de folio : Pro nobili Antonio de la Sala et ejus consorti provisio originalis super modum solucionis dotis ipsorum. Anno Domini Mo IIIIc XLI, die tercia octobris, ad requisicionem supplicem magistri Michaelis Matharoni, procuratorio nomine nobilis Antonii de la Sala et ejus consortis, quedam patentes litere originales... in presenti regestro unacum annexa thesaurarii archivate extiterunt, velut ecce. Tenor ipsarum litterarum reginalium (d'après a.).

    diff --git a/actes-princiers/notebooks/LoadDataCatalog.ipynb b/actes-princiers/notebooks/LoadDataCatalog.ipynb index 1d90cd4..3240455 100644 --- a/actes-princiers/notebooks/LoadDataCatalog.ipynb +++ b/actes-princiers/notebooks/LoadDataCatalog.ipynb @@ -245,6 +245,69 @@ "#actors.values\n", "cleaned_actors.iloc[9]" ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "053ed17c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['actors',\n", + " 'corpus-agnes-bourgogne',\n", + " 'corpus-charles-i',\n", + " 'dataset_test',\n", + " 'preprocessed_dataset_test',\n", + " 'load_xml',\n", + " 'preprocess_html',\n", + " 'preprocessed_actors',\n", + " 'parameters']" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "catalog.list()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "660b898c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
    [06/20/23 16:44:19] INFO     Loading data from 'load_xml' (XMLDataSet)...                       data_catalog.py:345\n",
    +       "
    \n" + ], + "text/plain": [ + "\u001b[2;36m[06/20/23 16:44:19]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loading data from \u001b[32m'load_xml'\u001b[0m \u001b[1m(\u001b[0mXMLDataSet\u001b[1m)\u001b[0m\u001b[33m...\u001b[0m \u001b]8;id=813727;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py\u001b\\\u001b[2mdata_catalog.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=696103;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py#345\u001b\\\u001b[2m345\u001b[0m\u001b]8;;\u001b\\\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "catalog.load(\"load_xml\")" + ] } ], "metadata": { diff --git a/actes-princiers/src/actes_princiers/pipelines/test_dataset/__init__.py b/actes-princiers/src/actes_princiers/pipelines/test_dataset/__init__.py new file mode 100755 index 0000000..e08ea5f --- /dev/null +++ b/actes-princiers/src/actes_princiers/pipelines/test_dataset/__init__.py @@ -0,0 +1,3 @@ +"Data Processing pipeline" + +from .pipeline import create_pipeline # NOQA diff --git a/actes-princiers/src/actes_princiers/pipelines/test_dataset/nodes.py b/actes-princiers/src/actes_princiers/pipelines/test_dataset/nodes.py new file mode 100755 index 0000000..ed13212 --- /dev/null +++ b/actes-princiers/src/actes_princiers/pipelines/test_dataset/nodes.py @@ -0,0 +1,16 @@ +import pandas as pd +import numpy as np + +#def _is_true(x: pd.Series) -> pd.Series: +# return x == "t" + +#def _parse_percentage(x: pd.Series) -> pd.Series: +# x = x.str.replace("%", "") +# x = x.astype(float) / 100 +# return x + +def test_dataset(actors: pd.DataFrame) -> pd.DataFrame: + actors.replace("XXXX", np.NaN) +# print(actors.head()) + return actors + diff --git a/actes-princiers/src/actes_princiers/pipelines/test_dataset/pipeline.py b/actes-princiers/src/actes_princiers/pipelines/test_dataset/pipeline.py new file mode 100755 index 0000000..ca3b9b6 --- /dev/null +++ b/actes-princiers/src/actes_princiers/pipelines/test_dataset/pipeline.py @@ -0,0 +1,16 @@ +from kedro.pipeline import Pipeline, node, pipeline + +from .nodes import test_dataset + + +def create_pipeline(**kwargs) -> Pipeline: + return pipeline( + [ + node( + func=test_dataset, + inputs="dataset_test", + outputs="preprocessed_dataset_test", + name="process_test_dataset_node", + ), + ] + ) diff --git a/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py b/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py index 269e216..d6b63e0 100755 --- a/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py +++ b/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py @@ -1,26 +1,15 @@ -import pandas as pd from lxml import etree -from pathlib import Path +#from pathlib import Path +## path and file configuration +#_here = Path(__file__).resolve().parent +#xsl_stylesheet = _here / "actes_princiers.xsl" -# path and file configuration -_here = Path(__file__).resolve().parent -xsl_stylesheet = _here / "actes_princiers.xsl" +def parse_xsl(source_doc, parameters): + # + #'write', ou 'write_output + # FIXME recuperer la feuille de style xsl + xslt_doc = etree.parse(parameters['xlststylesheet']) + xslt_transformer = etree.XSLT(xslt_doc) + return str(xslt_transformer(source_doc)) -def parse_xsl(xmldoc: pd.DataFrame) -> pd.DataFrame: -# source_doc = etree.fromstring(xmldoc.to_xml()) -## xmlstring = xmldoc.to_xml() -## source_doc = ET.fromstring(xmlstring) -## source_doc = etree.parse(to_xml) -# # removing namespace : -# query = "descendant-or-self::*[namespace-uri()!='']" -# for element in source_doc.xpath(query): -# #replace element name with its local name -# element.tag = etree.QName(element).localname -# etree.cleanup_namespaces(source_doc) - -# xslt_doc = etree.parse(str(xsl_stylesheet)) -# xslt_transformer = etree.XSLT(xslt_doc) -# output_doc = xslt_transformer(source_doc) -# return pd.read_html(output_doc) - return xmldoc diff --git a/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py b/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py index e5ec404..c0ce8b1 100755 --- a/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py +++ b/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py @@ -8,7 +8,7 @@ def create_pipeline(**kwargs) -> Pipeline: [ node( func=parse_xsl, - inputs="parse_xsl", + inputs=["load_xml", "parameters"], outputs="preprocess_html", name="preprocess_html", tags="xsl", diff --git a/actes-princiers/src/actesdataset.py b/actes-princiers/src/actesdataset.py new file mode 100644 index 0000000..6b0eb16 --- /dev/null +++ b/actes-princiers/src/actesdataset.py @@ -0,0 +1,46 @@ +import json +from typing import Dict, Any + +from lxml import etree + +from kedro.io import AbstractDataSet, DataSetError + +class XMLDataSet(AbstractDataSet): + "lxml.etree._ElementTree loader" + # FIXME set the typing signature !!!! + + def __init__(self, filepath: str): + self._filepath = filepath + + def _load(self): + source_doc = etree.parse(self._filepath) + # remove namespace : + query = "descendant-or-self::*[namespace-uri()!='']" + for element in source_doc.xpath(query): + #replace element name with its local name + element.tag = etree.QName(element).localname + etree.cleanup_namespaces(source_doc) + return source_doc + + def _save(self, data:str) -> None: +# raise NotImplementedError("pas encore implemente !!!!") + with open(self._filepath, 'w') as fhandle: + fhandle.write(data) + + def _describe(self) -> Dict[str, Any]: + return dict(filepath=self._filepath) + +class JSONDataSet(AbstractDataSet): + def __init__(self, filepath: str): + self._filepath = filepath + + def _load(self) -> Dict: + with open(self._filepath, 'r') as f: + return json.load(f) + + def _save(self, data: Dict) -> None: + with open(self._filepath, 'w') as f: + json.dump(data, f) + + def _describe(self) -> Dict[str, Any]: + return dict(filepath=self._filepath) diff --git a/actes-princiers/src/myowndataset.py b/actes-princiers/src/myowndataset.py new file mode 100644 index 0000000..82a9c7f --- /dev/null +++ b/actes-princiers/src/myowndataset.py @@ -0,0 +1,24 @@ +from pathlib import Path, PurePosixPath +import pandas as pd +from kedro.io import AbstractDataSet + +class MyOwnDataSet(AbstractDataSet[pd.DataFrame, pd.DataFrame]): + def __init__(self, filepath, load_args=None, save_args=None): +# print("------------------------------------", str(load_args)) +# def __init__(self, filepath, param1, param2=True): + self._filepath = PurePosixPath(filepath) +# self._param1 = param1 +# self._param2 = param2 + + def _load(self) -> pd.DataFrame: + return pd.read_csv(self._filepath) + + def _save(self, df: pd.DataFrame) -> None: + df.to_csv(str(self._filepath)) +# raise NotImplementedError("Attention : dataset en lecture seule !") + + def _exists(self) -> bool: + return Path(self._filepath.as_posix()).exists() + + def _describe(self): + return dict(name="my own dataset") diff --git a/actes-princiers/src/actes_princiers/pipelines/xml_processing/actes_princiers.xsl b/actes-princiers/static/xsl/actes_princiers.xsl similarity index 100% rename from actes-princiers/src/actes_princiers/pipelines/xml_processing/actes_princiers.xsl rename to actes-princiers/static/xsl/actes_princiers.xsl