traitement xslt sur un fichier

develop
gwen 3 years ago
parent ed01c1c5a4
commit 3693d1650c

@ -1,5 +1,5 @@
# Documentation for this file format can be found in "The Data Catalog"
# Link: https://docs.kedro.org/en/stable/data/data_catalog.html
# _________________________________________________________________________
# loading some data catalogs
actors:
type: pandas.CSVDataSet
@ -19,17 +19,39 @@ corpus-charles-i:
load_args:
sep: ";"
preprocessed_actors:
type: pandas.CSVDataSet
filepath: data/02_intermediate/csv/preprocessed_actors.csv
# _________________________________________________________________________
# custom csv dataset test sample
dataset_test:
type: myowndataset.MyOwnDataSet
filepath: data/01_raw/csv/actors.csv
load_args:
sep: ";"
preprocessed_dataset_test:
type: myowndataset.MyOwnDataSet
filepath: data/02_intermediate/csv/preprocessed_test_dataset.csv
save_args:
sep: ";"
parse_xsl:
type: pandas.XMLDataSet
# _________________________________________________________________________
# custom xml dataset sample
load_xml:
type: actesdataset.XMLDataSet
filepath: data/01_raw/xml/Anjou/anj_is_i_1441_08_05a.xml
preprocess_html:
type: pandas.XMLDataSet
type: actesdataset.XMLDataSet
filepath: data/02_intermediate/xml/Anjou/anj_is_i_1441_08_05a.html
# _________________________________________________________________________
preprocessed_actors:
type: pandas.CSVDataSet
filepath: data/02_intermediate/csv/preprocessed_actors.csv
save_args:
sep: ";"

@ -0,0 +1 @@
xlststylesheet: static/xsl/actes_princiers.xsl

@ -0,0 +1,50 @@
,"NAME;""ROLE"";""HOUSE"";""DATE1"";""DATE2"";""DATE3"""
0,"Charles Ier de Bourbon;""prince"";""Bourbon"";""1400"";""1434"";""1456"""
1,"Gort, Étienne;""secret"";""Bourbon"";""1425"";""1440"";"
2,"Erart;""secret"";""Berry"";""1404"";""1405"";"
3,"Jean de Berry;""prince"";""Berry"";""1337"";""1360"";""1416"""
4,"Agnès de Bourgogne;""prince"";""Bourbon"";""1407"";""1434"";""1476"""
5,"Marghas, Philippe;""secret"";""Bourbon"";""1426"";""1433"";"
6,"Marie de Berry;""prince"";""Bourbon"";""1480"";""1410"";""1434"""
7,"René d'Anjou;""prince"";""Anjou"";"""";"""";"""""
8,"Arthur de Richemont;""prince"";""Bretagne"";"""";"""";"""""
9,"Bernard d'Armagnac;""prince"";""Armagnac"";"""";"""";"""""
10,"Philippe le Bon;""prince"";""Bourgogne"";"""";"""";"""""
11,"Gourriet, Lorrin;""secret"";""Bourbon"";"""";"""";"""""
12,"De Bar, Étienne;""secret"";""Bourbon"";"""";"""";"""""
13,"Gon, Jean;""secret"";""Bourbon"";"""";"""";"""""
14,"Trichon, Jean;""secret"";""Bourbon"";"""";"""";"""""
15,"Chevalier, E.;""secret"";""Bretagne"";"""";"""";"""""
16,"Cadier, Guillaume;""secret"";""Bourbon"";"""";"""";"""""
17,"Decharmeres, J.;""secret"";""Anjou"";"""";"""";"""""
18,"Dommessent;""secret"";""Bourgogne"";""Bretagne"";"""";"""""
19,"Andraut, Laurent;""secret"";""Bourbon"";"""";"""";"""""
20,"Breneal, Jean;""secret"";""Bourgogne"";"""";"""";"""""
21,"De Castillione;""secret"";""Anjou"";"""";"""";"""""
22,"Yollande d'Aragon;""prince"";""Anjou"";"""";"""";"""""
23,"Marie de Blois;""prince"";""Anjou"";"""";"""";"""""
24,"Grauquellin;""secret"";""Anjou"";"""";"""";"""""
25,"Michael;""secret"";""Anjou"";"""";"""";"""""
26,"Matheus;""secret"";""Anjou"";"""";"""";"""""
27,"Louis Ier d'Anjou;""prince"";""Anjou"";"""";"""";"""""
28,"Louis II d'Anjou;""prince"";""Anjou"";"""";"""";"""""
29,"Louis III d'Anjou;""prince"";""Anjou"";"""";"""";"""""
30,"Caillot, G.;""secret"";""Anjou"";"""";"""";"""""
31,"Olivier;""secret"";""Anjou"";"""";"""";"""""
32,"Benepy;""secret"";""Anjou"";"""";"""";"""""
33,"Gontier, Col;""secret"";""Berry"";"""";"""";"""""
34,"Franchome;""secret"";""Anjou"";"""";"""";"""""
35,"Isabelle de Lorraine;""prince"";""Anjou"";"""";"""";"""""
36,"Bollumbrellus;""secret"";""Anjou"";"""";"""";"""""
37,"Nicolao Perigaut;""secret"";""Anjou"";"""";"""";"""""
38,"De Vaulx;""secret"";""Anjou"";"""";"""";"""""
39,"Alardeau, Jean;""secret"";""Anjou"";"""";"""";"""""
40,"Charnières;""secret"";""Anjou"";"""";"""";"""""
41,"Nicolas;""secret"";""Anjou"";"""";"""";"""""
42,"Rouxelet;""secret"";""Anjou"";"""";"""";"""""
43,"Boursier;""secret"";""Anjou"";"""";"""";"""""
44,"Petre;""secret"";""Anjou"";"""";"""";"""""
45,"Ponce Caihe;""secret"";""Anjou"";"""";"""";"""""
46,"J. Crete;""secret"";""Anjou"";"""";"""";"""""
47,"J. de Vernon;""secret"";""Anjou"";"""";"""";"""""
48,"Tourneville, Guillaume;""secret"";""Anjou"";"""";"""";"""""
1 NAME;"ROLE";"HOUSE";"DATE1";"DATE2";"DATE3"
2 0 Charles Ier de Bourbon;"prince";"Bourbon";"1400";"1434";"1456"
3 1 Gort, Étienne;"secret";"Bourbon";"1425";"1440";
4 2 Erart;"secret";"Berry";"1404";"1405";
5 3 Jean de Berry;"prince";"Berry";"1337";"1360";"1416"
6 4 Agnès de Bourgogne;"prince";"Bourbon";"1407";"1434";"1476"
7 5 Marghas, Philippe;"secret";"Bourbon";"1426";"1433";
8 6 Marie de Berry;"prince";"Bourbon";"1480";"1410";"1434"
9 7 René d'Anjou;"prince";"Anjou";"";"";""
10 8 Arthur de Richemont;"prince";"Bretagne";"";"";""
11 9 Bernard d'Armagnac;"prince";"Armagnac";"";"";""
12 10 Philippe le Bon;"prince";"Bourgogne";"";"";""
13 11 Gourriet, Lorrin;"secret";"Bourbon";"";"";""
14 12 De Bar, Étienne;"secret";"Bourbon";"";"";""
15 13 Gon, Jean;"secret";"Bourbon";"";"";""
16 14 Trichon, Jean;"secret";"Bourbon";"";"";""
17 15 Chevalier, E.;"secret";"Bretagne";"";"";""
18 16 Cadier, Guillaume;"secret";"Bourbon";"";"";""
19 17 Decharmeres, J.;"secret";"Anjou";"";"";""
20 18 Dommessent;"secret";"Bourgogne";"Bretagne";"";""
21 19 Andraut, Laurent;"secret";"Bourbon";"";"";""
22 20 Breneal, Jean;"secret";"Bourgogne";"";"";""
23 21 De Castillione;"secret";"Anjou";"";"";""
24 22 Yollande d'Aragon;"prince";"Anjou";"";"";""
25 23 Marie de Blois;"prince";"Anjou";"";"";""
26 24 Grauquellin;"secret";"Anjou";"";"";""
27 25 Michael;"secret";"Anjou";"";"";""
28 26 Matheus;"secret";"Anjou";"";"";""
29 27 Louis Ier d'Anjou;"prince";"Anjou";"";"";""
30 28 Louis II d'Anjou;"prince";"Anjou";"";"";""
31 29 Louis III d'Anjou;"prince";"Anjou";"";"";""
32 30 Caillot, G.;"secret";"Anjou";"";"";""
33 31 Olivier;"secret";"Anjou";"";"";""
34 32 Benepy;"secret";"Anjou";"";"";""
35 33 Gontier, Col;"secret";"Berry";"";"";""
36 34 Franchome;"secret";"Anjou";"";"";""
37 35 Isabelle de Lorraine;"prince";"Anjou";"";"";""
38 36 Bollumbrellus;"secret";"Anjou";"";"";""
39 37 Nicolao Perigaut;"secret";"Anjou";"";"";""
40 38 De Vaulx;"secret";"Anjou";"";"";""
41 39 Alardeau, Jean;"secret";"Anjou";"";"";""
42 40 Charnières;"secret";"Anjou";"";"";""
43 41 Nicolas;"secret";"Anjou";"";"";""
44 42 Rouxelet;"secret";"Anjou";"";"";""
45 43 Boursier;"secret";"Anjou";"";"";""
46 44 Petre;"secret";"Anjou";"";"";""
47 45 Ponce Caihe;"secret";"Anjou";"";"";""
48 46 J. Crete;"secret";"Anjou";"";"";""
49 47 J. de Vernon;"secret";"Anjou";"";"";""
50 48 Tourneville, Guillaume;"secret";"Anjou";"";"";""

@ -1,13 +1,24 @@
<?xml version='1.0' encoding='utf-8'?>
<data>
<row>
<fileDesc/>
<profileDesc/>
<body/>
</row>
<row>
<fileDesc/>
<profileDesc/>
<body/>
</row>
</data>
<h1 class="text-center">1441, 5 août. — Château de Tarascon.</h1>
<div class="analyse">
<p>Mandement d'Isabelle de Lorraine, reine de Jérusalem et de Sicile, duchesse d'Anjou, etc., pour le paiement des mille florins de dot assignés par le roi René à Lionne de la Sellana de Brusa, suivant une convention passée avec son mari, Antoine de la Salle</p>
</div>
<div class="tradition">
<p>A. Original perdu.</p>
<p>B. Copie de A. dans le registre <em>Rosa</em> de la Chambre des comptes de Provence<sup><a href="#1">1</a></sup>. Archives départementales des Bouches-du-Rhône, B12, f. CLIIIIv <a href="https://www.archives13.fr/ark:/40700/151236.2619996/dao/0/328" target="_blank">[copie numérisée]</a>.</p>
<p>
a. Léon-Honoré Labande, « Antoine de La Salle. Nouveaux documents sur sa vie et ses relations avec la maison d'Anjou (Appendices) », dans <em>Bibliothèque de l'École des chartes</em>, n°124, 1904, pp. 352-354 , n°XII <a href="https://doi.org/10.3406/bec.1904.448202" target="_blank">[article numérisé]</a>.
</p>
</div>
<div class="act">
<p> Ysabel, Dei gratia Jherusalem et Sicilie regina, Andegavie, Barri et Lothoringue ducissa, comitatiuum Provincie et Forcalquerii, Cenomanie ac Pedemontis comitissa et in dictis Provincie et Forcalquerii terrisque adjacentibus regia vicaria et locuntenentes generalis, thesaurariis generalibus Provincie, necnon clavariis curiarum regiarum de Forcalquerio, Sistarici... presentibus et futuris et eorum cuilibet, gratiam et bonam voluntatem.</p>
<p>Cum pro solucione et satisfacione mille florenorum monete Provincie in dotem constitutorum et assignatorum per metuendissimum dominum meum regem nobili domicelle Lione, uxori magnifici Anthonii de Sala, consiliarii regii atque nostri dilecti, habendorum quidem et percipiendorum super juribus laudimiorum et trezenorum ac super juribus retencionum nostre curie competentibus in dictis vicaria Forcalquerii ac bajuliis Sistarici, Mosteriarum, Digne, Gastellane, Collismarcii, Guillelmi et Sancti Pauli Vencesii, appunctatum fuerit inter nos et dictum Anthonetum de la Sala, presentibus et consencientibus magistris racionalibus magne regie curie, ut actentis oneribus occurentibus curie regie propinque, opus est ut de parte dictorum proventuum, tam pro gagiis magistrorum racionalium exsolvendis, quam pro aliis negociis regiis succurratur et subveniatur, dictus Anthonius seu ipsa nobilis Liona dicta jura uno anno integraliter recipiat per manus vestras, incipiendo die prima mensis novembris proxime futuri, anno finito et completo jura ipsa pro ipsis gagiis magistrorum racionalium et aliis oneribus more cedant per alium annum inde sequentem, et ipso anno revoluto dictus Anthonius seu dicta uxor ejus dicta jura iterum habeat alterius anni, donec eidem Lione de dicta summa mille florenorum fuerit integraliter persolutum et satisfactum.</p>
<p>Igitur in executionem dicti appunctamenti, volumus et vobis tenore presencium, cum deliberacione regii nobis assistentis consilii, precipimus et mandamus quatinus prefatis Anthonio et Lione, seu ejus (sic) legitimo procuratori, peccunias dictorum jurium que proveniunt a die prima maii proxime preteriti nunc usque, et que provenient exinde usque primam diem maii proxime futuri, sive infra unum annum finiendum prima maii venientis quo computabitur millesimo quadringentesimo quadragesimo secundo, proveniendorum, alternis annis, quousque de dicta summa mille florenorum in dotem pro dote (sic) constituta et assignata eisdem fuerit integraliter satisfactum et exsolutum, tradatis et exsolvatis, vestrumque quilibet, prout ad eum spectabit, tradat et exsolvat integre et sine diminucione et contradicione quibuscumque...</p>
<p>Datum in Castro nostro Tharasconis, per magnificum virum Jeronimum de Mirabollis, de Neapoli, juris utriusque professorem, curie camere summarie regni Sicilie presidentem, majoremque et secundarum appellacionum et nullitatum comitatuum Provincie et Forcalquerii predictorum judicem, consiliarium regium atque nostrum dilectum, die quinta mensis augusti, anno Domini millesimo quadringentesimo quadragesimo primo.</p>
<p>Per reginam, domino senescallo Provincie, domino Valliscluse...</p>
<p>Gratis Registrata.</p>
<p>Matheus</p>
</div>
<div><div class="note-global"><p id="1">1. En-tête de folio : <em>Pro nobili Antonio de la Sala et ejus consorti provisio originalis super modum solucionis dotis ipsorum. Anno Domini M<sup>o</sup> IIII<sup>c</sup> XLI, die tercia octobris, ad requisicionem supplicem magistri Michaelis Matharoni, procuratorio nomine nobilis Antonii de la Sala et ejus consortis, quedam patentes litere originales... in presenti regestro unacum annexa thesaurarii archivate extiterunt, velut ecce. Tenor ipsarum litterarum reginalium</em> (d'après <em>a.</em>).</p></div></div><div class="footnote"><ol></ol></div>

@ -245,6 +245,69 @@
"#actors.values\n",
"cleaned_actors.iloc[9]"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "053ed17c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['actors',\n",
" 'corpus-agnes-bourgogne',\n",
" 'corpus-charles-i',\n",
" 'dataset_test',\n",
" 'preprocessed_dataset_test',\n",
" 'load_xml',\n",
" 'preprocess_html',\n",
" 'preprocessed_actors',\n",
" 'parameters']"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"catalog.list()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "660b898c",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">[06/20/23 16:44:19] </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO </span> Loading data from <span style=\"color: #008000; text-decoration-color: #008000\">'load_xml'</span> <span style=\"font-weight: bold\">(</span>XMLDataSet<span style=\"font-weight: bold\">)</span><span style=\"color: #808000; text-decoration-color: #808000\">...</span> <a href=\"file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">data_catalog.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py#345\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">345</span></a>\n",
"</pre>\n"
],
"text/plain": [
"\u001b[2;36m[06/20/23 16:44:19]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loading data from \u001b[32m'load_xml'\u001b[0m \u001b[1m(\u001b[0mXMLDataSet\u001b[1m)\u001b[0m\u001b[33m...\u001b[0m \u001b]8;id=813727;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py\u001b\\\u001b[2mdata_catalog.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=696103;file:///media/gwen/maxtor/gwen/entrepot/cnrs/nicolas/depot/datascience/.venv/lib/python3.9/site-packages/kedro/io/data_catalog.py#345\u001b\\\u001b[2m345\u001b[0m\u001b]8;;\u001b\\\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"<lxml.etree._ElementTree at 0x7f3e4c3b99c0>"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"catalog.load(\"load_xml\")"
]
}
],
"metadata": {

@ -0,0 +1,3 @@
"Data Processing pipeline"
from .pipeline import create_pipeline # NOQA

@ -0,0 +1,16 @@
import pandas as pd
import numpy as np
#def _is_true(x: pd.Series) -> pd.Series:
# return x == "t"
#def _parse_percentage(x: pd.Series) -> pd.Series:
# x = x.str.replace("%", "")
# x = x.astype(float) / 100
# return x
def test_dataset(actors: pd.DataFrame) -> pd.DataFrame:
    """Replace the ``"XXXX"`` placeholder with NaN in the actors table.

    Args:
        actors: Table in which cells equal to ``"XXXX"`` stand for a
            missing value.

    Returns:
        A new DataFrame with every ``"XXXX"`` cell replaced by NaN. The
        input frame is left untouched.
    """
    # DataFrame.replace is not in-place by default: its result must be
    # returned, otherwise the substitution is silently lost.
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    return actors.replace("XXXX", np.nan)


# The name starts with "test_", so pytest would try to collect this pipeline
# node as a test whenever it is imported into a test module; opt out.
test_dataset.__test__ = False

@ -0,0 +1,16 @@
from kedro.pipeline import Pipeline, node, pipeline
from .nodes import test_dataset
def create_pipeline(**kwargs) -> Pipeline:
    """Build the pipeline that runs ``test_dataset`` on the test CSV dataset.

    The single node reads the ``dataset_test`` catalog entry and writes its
    result to ``preprocessed_dataset_test``. Keyword arguments are accepted
    but not used.
    """
    process_test_dataset = node(
        func=test_dataset,
        inputs="dataset_test",
        outputs="preprocessed_dataset_test",
        name="process_test_dataset_node",
    )
    return pipeline([process_test_dataset])

@ -1,26 +1,15 @@
import pandas as pd
from lxml import etree
from pathlib import Path
#from pathlib import Path
## path and file configuration
#_here = Path(__file__).resolve().parent
#xsl_stylesheet = _here / "actes_princiers.xsl"
# path and file configuration
_here = Path(__file__).resolve().parent
xsl_stylesheet = _here / "actes_princiers.xsl"
def parse_xsl(source_doc, parameters):
    """Apply the configured XSLT stylesheet to ``source_doc``.

    Args:
        source_doc: Parsed XML document handed to the XSLT transformer.
        parameters: Mapping whose ``'xlststylesheet'`` entry is the path of
            the XSL stylesheet to parse.

    Returns:
        The transformation result converted to ``str``.
    """
    # FIXME retrieve the XSL stylesheet
    stylesheet_path = parameters['xlststylesheet']
    transform = etree.XSLT(etree.parse(stylesheet_path))
    # The transformer yields a <class 'lxml.etree._XSLTResultTree'>
    # (original note: 'write', or 'write_output').
    result_tree = transform(source_doc)
    return str(result_tree)
def parse_xsl(xmldoc: pd.DataFrame) -> pd.DataFrame:
    """Return ``xmldoc`` unchanged.

    Every processing step below is commented out, so this function is
    currently a pass-through.
    """
    # source_doc = etree.fromstring(xmldoc.to_xml())
    ## xmlstring = xmldoc.to_xml()
    ## source_doc = ET.fromstring(xmlstring)
    ## source_doc = etree.parse(to_xml)
    # # removing namespace :
    # query = "descendant-or-self::*[namespace-uri()!='']"
    # for element in source_doc.xpath(query):
    # #replace element name with its local name
    # element.tag = etree.QName(element).localname
    # etree.cleanup_namespaces(source_doc)
    # xslt_doc = etree.parse(str(xsl_stylesheet))
    # xslt_transformer = etree.XSLT(xslt_doc)
    # output_doc = xslt_transformer(source_doc)
    # return pd.read_html(output_doc)
    return xmldoc

@ -8,7 +8,7 @@ def create_pipeline(**kwargs) -> Pipeline:
[
node(
func=parse_xsl,
inputs="parse_xsl",
inputs=["load_xml", "parameters"],
outputs="preprocess_html",
name="preprocess_html",
tags="xsl",

@ -0,0 +1,46 @@
import json
from typing import Dict, Any
from lxml import etree
from kedro.io import AbstractDataSet, DataSetError
class XMLDataSet(AbstractDataSet):
    """Dataset that loads an XML file as an ``lxml.etree._ElementTree``.

    On load, namespaces are stripped from every element name. On save, the
    given string is written to the same ``filepath`` as UTF-8 text.
    """
    # FIXME set the typing signature !!!!

    def __init__(self, filepath: str):
        """Remember the path of the file to read from / write to."""
        self._filepath = filepath

    def _load(self):
        """Parse the file and return the tree with namespaces removed."""
        source_doc = etree.parse(self._filepath)
        # remove namespace :
        query = "descendant-or-self::*[namespace-uri()!='']"
        for element in source_doc.xpath(query):
            # replace element name with its local name
            element.tag = etree.QName(element).localname
        etree.cleanup_namespaces(source_doc)
        return source_doc

    def _save(self, data: str) -> None:
        """Write ``data`` to the file, replacing any previous content."""
        # Explicit UTF-8: the text may contain non-ASCII characters and the
        # platform default encoding is not guaranteed to handle them.
        with open(self._filepath, 'w', encoding="utf-8") as fhandle:
            fhandle.write(data)

    def _describe(self) -> Dict[str, Any]:
        """Return the attributes describing this dataset."""
        return dict(filepath=self._filepath)
class JSONDataSet(AbstractDataSet):
    """Dataset that reads and writes a JSON file at ``filepath``."""

    def __init__(self, filepath: str):
        """Remember the path of the JSON file."""
        self._filepath = filepath

    def _load(self) -> Dict:
        """Return the decoded content of the JSON file."""
        # JSON files are UTF-8 by convention; do not rely on the platform
        # default encoding when decoding them.
        with open(self._filepath, 'r', encoding="utf-8") as f:
            return json.load(f)

    def _save(self, data: Dict) -> None:
        """Serialize ``data`` as JSON, replacing any previous content."""
        with open(self._filepath, 'w', encoding="utf-8") as f:
            json.dump(data, f)

    def _describe(self) -> Dict[str, Any]:
        """Return the attributes describing this dataset."""
        return dict(filepath=self._filepath)

@ -0,0 +1,24 @@
from pathlib import Path, PurePosixPath
import pandas as pd
from kedro.io import AbstractDataSet
class MyOwnDataSet(AbstractDataSet[pd.DataFrame, pd.DataFrame]):
    """CSV-backed dataset that loads and saves a ``pandas.DataFrame``.

    ``load_args`` and ``save_args`` are forwarded as keyword arguments to
    ``pandas.read_csv`` and ``DataFrame.to_csv`` respectively.
    """

    def __init__(self, filepath, load_args=None, save_args=None):
        """Store the file path and the optional pandas keyword arguments.

        Args:
            filepath: Location of the CSV file.
            load_args: Optional keyword arguments for ``pandas.read_csv``
                (for example ``{"sep": ";"}``).
            save_args: Optional keyword arguments for ``DataFrame.to_csv``.
        """
        self._filepath = PurePosixPath(filepath)
        # Copy the mappings so that later changes by the caller do not leak
        # into this dataset; None means "no extra arguments".
        self._load_args = dict(load_args) if load_args else {}
        self._save_args = dict(save_args) if save_args else {}

    def _load(self) -> pd.DataFrame:
        """Read the CSV file, honouring the configured ``load_args``."""
        return pd.read_csv(self._filepath, **self._load_args)

    def _save(self, df: pd.DataFrame) -> None:
        """Write ``df`` to the CSV file, honouring the configured ``save_args``."""
        df.to_csv(str(self._filepath), **self._save_args)

    def _exists(self) -> bool:
        """Tell whether the CSV file is present on disk."""
        return Path(self._filepath.as_posix()).exists()

    def _describe(self):
        """Return the attributes describing this dataset."""
        return dict(name="my own dataset")
Loading…
Cancel
Save