Prise en compte de Berry et Anjou dans le pipeline etl

develop
gwen 2 years ago
parent 1d4eedc1a0
commit 23af0f580a

@ -1,9 +1,5 @@
# Actes princiers -- data transformations
1. kedro run --tags="etl_transform"
2. kedro run --tags="populate_database"
## Project Name
human readable name : `Actes Princiers`
@ -31,6 +27,8 @@ or start the ipython prompt : `kedro ipython`
## Launching the pipelines
**Go to the `actes-princiers` folder**
Open a terminal in the `actes-princiers` folder and launch kedro:
`kedro run`
@ -43,8 +41,16 @@ or a search by tags with:
`kedro run --tags=<tag_name>`
The current tags are:
- `kedro run --tags="etl_transform"`: launches the XML to JSON transformations
- `kedro run --tags="populate_database"`: populates the remote MongoDB database
on the target server
## Visualizing the pipelines
**You must install kedro-viz first**
Install kedro-viz with:
`pip install kedro-viz`

@ -1,5 +1,5 @@
# ________________________________________________________________________
# BOURBON
# input (read only) dataset
bourbon:
type: actesdataset.XMLDataSetCollection
@ -14,14 +14,14 @@ bourbon_json:
folderpath: data/01_raw/xml/Bourbon
# _________________________________________________________________________
# output (write) **pseudo xml** dataset
bourbon_xmlcontent:
type: actesdataset.XMLDataSetCollection
housename: bourbon
folderpath: data/02_intermediate/Bourbon/pseudoxml
xsltstylesheet: templates/xsl/actes_princiers.xsl
# input (read) **pseudo xml** dataset
# as it is **not** regular xml, an xml loader cannot be used
bourbon_pseudoxmlcontent:
@ -41,28 +41,89 @@ bourbon_fulljsonoutput:
housename: bourbon
folderpath: data/02_intermediate/Bourbon/fulljson
# ________________________________________________________________________
# BERRY
# input (read only) dataset
berry:
type: actesdataset.XMLDataSetCollection
housename: berry
folderpath: data/01_raw/xml/Berry
xsltstylesheet: templates/xsl/actes_princiers.xsl
# input (read only) dataset
berry_json:
type: actesdataset.BsXMLDataSetCollection
housename: berry
folderpath: data/01_raw/xml/Berry
# _________________________________________________________________________
# output (write) **pseudo xml** dataset
berry_xmlcontent:
type: actesdataset.XMLDataSetCollection
housename: berry
folderpath: data/02_intermediate/Berry/pseudoxml
xsltstylesheet: templates/xsl/actes_princiers.xsl
# input (read) **pseudo xml** dataset
# as it is **not** regular xml, an xml loader cannot be used
berry_pseudoxmlcontent:
type: actesdataset.TextDataSetCollection
housename: berry
folderpath: data/02_intermediate/Berry/pseudoxml
# input (read) and output (write) dataset
berry_jsonoutput:
type: actesdataset.JSONDataSetCollection
housename: berry
folderpath: data/02_intermediate/Berry/json
# output (write) and input (read) dataset
berry_fulljsonoutput:
type: actesdataset.JSONDataSetCollection
housename: berry
folderpath: data/02_intermediate/Berry/fulljson
## ________________________________________________________________________
# ________________________________________________________________________
# ANJOU
# input (read only) dataset
anjou:
type: actesdataset.XMLDataSetCollection
housename: anjou
folderpath: data/01_raw/xml/Anjou
xsltstylesheet: templates/xsl/actes_princiers.xsl
# input (read only) dataset
anjou_json:
type: actesdataset.BsXMLDataSetCollection
housename: anjou
folderpath: data/01_raw/xml/Anjou
#berry:
# type: actesdataset.XMLDataSetCollection
# housename: berry
# folderpath: data/01_raw/xml/berry
# _________________________________________________________________________
#berry_xmlcontent:
# type: actesdataset.XMLDataSetCollection
# housename: berry
# folderpath: data/02_intermediate/xml/berry/xml
# output (write) **pseudo xml** dataset
anjou_xmlcontent:
type: actesdataset.XMLDataSetCollection
housename: anjou
folderpath: data/02_intermediate/Anjou/pseudoxml
xsltstylesheet: templates/xsl/actes_princiers.xsl
## ________________________________________________________________________
# input (read) **pseudo xml** dataset
# as it is **not** regular xml, an xml loader cannot be used
anjou_pseudoxmlcontent:
type: actesdataset.TextDataSetCollection
housename: anjou
folderpath: data/02_intermediate/Anjou/pseudoxml
#anjou:
# type: actesdataset.XMLDataSetCollection
# housename: berry
# folderpath: data/01_raw/xml/anjou
# input (read) and output (write) dataset
anjou_jsonoutput:
type: actesdataset.JSONDataSetCollection
housename: anjou
folderpath: data/02_intermediate/Anjou/json
#anjou_xmlcontent:
# type: actesdataset.XMLDataSetCollection
# housename: berry
# folderpath: data/02_intermediate/xml/anjou/xml
# output (write) and input (read) dataset
anjou_fulljsonoutput:
type: actesdataset.JSONDataSetCollection
housename: anjou
folderpath: data/02_intermediate/Anjou/fulljson

@ -1,6 +1,7 @@
# XXX cette conf est descriptive
# XXX: cette conf est descriptive, elle n'est pas (plus) utilisée par l'appli
# dans son état de généricité actuel.
# TODO: utiliser cette conf pour augmenter la généricité
# du traitement des datas
# du traitement des datas dans une itération ultérieure
# 
raw_datapath: data/01_raw
houses:

@ -8,6 +8,7 @@ from .nodes import (parse_xml_collection, make_json_collection,
def create_pipeline(**kwargs) -> Pipeline:
return pipeline(
[
# bourbon
node(
func=parse_xml_collection,
inputs=["bourbon"],
@ -29,19 +30,50 @@ def create_pipeline(**kwargs) -> Pipeline:
name="bourbon_fulljson_ds_collection",
tags="etl_transform"
),
# node(
# func=parse_xml_collection,
# inputs="berry",
# outputs="berry_xmlcontent",
# name="berry_ds_collection",
# ),
# node(
# func=parse_xml_collection,
# inputs="anjou",
# outputs=None, # "anjou_xmlcontent",
# name="anjou_ds_collection",
# ),
# berry
node(
func=parse_xml_collection,
inputs=["berry"],
outputs="berry_xmlcontent",
name="berry_ds_collection",
tags="etl_transform"
),
node(
func=make_json_collection,
inputs="berry_json",
outputs="berry_jsonoutput",
name="berry_json_ds_collection",
tags="etl_transform"
),
node(
func=add_xmlcontent_tojson,
inputs=["berry_jsonoutput", "berry_pseudoxmlcontent"],
outputs="berry_fulljsonoutput",
name="berry_fulljson_ds_collection",
tags="etl_transform"
),
# anjou
node(
func=parse_xml_collection,
inputs=["anjou"],
outputs="anjou_xmlcontent",
name="anjou_ds_collection",
tags="etl_transform"
),
node(
func=make_json_collection,
inputs="anjou_json",
outputs="anjou_jsonoutput",
name="anjou_json_ds_collection",
tags="etl_transform"
),
node(
func=add_xmlcontent_tojson,
inputs=["anjou_jsonoutput", "anjou_pseudoxmlcontent"],
outputs="anjou_fulljsonoutput",
name="anjou_fulljson_ds_collection",
tags="etl_transform"
),
]
)

@ -0,0 +1 @@
git remote add data git@gitlab.huma-num.fr:medieval-acts/princely-acts/data.git
Loading…
Cancel
Save