Prise en compte de Berry et Anjou dans le pipeline etl

develop
gwen 2 years ago
parent 1d4eedc1a0
commit 23af0f580a

@ -1,9 +1,5 @@
# Actes princiers -- data transformations # Actes princiers -- data transformations
1. kedro run --tags="etl_transform"
2. kedro run --tags="populate_database"
## Project Name ## Project Name
human readable name : `Actes Princiers` human readable name : `Actes Princiers`
@ -31,6 +27,8 @@ or start the ipython prompt : `kedro ipython`
## Launching the pipelines ## Launching the pipelines
**go to `actes-princiers`'s folder**
Open a terminal in the `actes-princiers`'s folder and launch kedro Open a terminal in the `actes-princiers`'s folder and launch kedro
`kedro run` `kedro run`
@ -43,8 +41,16 @@ or a search by tags with:
`kedro run --tags=<tag_name>` `kedro run --tags=<tag_name>`
The current tags are:
- `kedro run --tags="etl_transform"`: launches the XML to JSON transformations
- `kedro run --tags="populate_database"`: populates the remote MongoDB database
on the target server
## Visualizing the pipelines ## Visualizing the pipelines
**you must install kedro-viz first**
install kedro viz with install kedro viz with
`pip install kedro-viz` `pip install kedro-viz`

@ -1,5 +1,5 @@
# ________________________________________________________________________ # ________________________________________________________________________
# BOURBON
# input (read only) dataset # input (read only) dataset
bourbon: bourbon:
type: actesdataset.XMLDataSetCollection type: actesdataset.XMLDataSetCollection
@ -14,14 +14,14 @@ bourbon_json:
folderpath: data/01_raw/xml/Bourbon folderpath: data/01_raw/xml/Bourbon
# _________________________________________________________________________ # _________________________________________________________________________
# output (write) **pseudo xml** dataset # output (write) **pseudo xml** dataset
bourbon_xmlcontent: bourbon_xmlcontent:
type: actesdataset.XMLDataSetCollection type: actesdataset.XMLDataSetCollection
housename: bourbon housename: bourbon
folderpath: data/02_intermediate/Bourbon/pseudoxml folderpath: data/02_intermediate/Bourbon/pseudoxml
xsltstylesheet: templates/xsl/actes_princiers.xsl xsltstylesheet: templates/xsl/actes_princiers.xsl
# input (read) **pseudo xml** dataset # input (read) **pseudo xml** dataset
# as it is **not** regular xml, an xml loader cannot be used # as it is **not** regular xml, an xml loader cannot be used
bourbon_pseudoxmlcontent: bourbon_pseudoxmlcontent:
@ -41,28 +41,89 @@ bourbon_fulljsonoutput:
housename: bourbon housename: bourbon
folderpath: data/02_intermediate/Bourbon/fulljson folderpath: data/02_intermediate/Bourbon/fulljson
# ________________________________________________________________________
# BERRY
# input (read only) dataset
berry:
type: actesdataset.XMLDataSetCollection
housename: berry
folderpath: data/01_raw/xml/Berry
xsltstylesheet: templates/xsl/actes_princiers.xsl
# input (read only) dataset
berry_json:
type: actesdataset.BsXMLDataSetCollection
housename: berry
folderpath: data/01_raw/xml/Berry
# _________________________________________________________________________
# output (write) **pseudo xml** dataset
berry_xmlcontent:
type: actesdataset.XMLDataSetCollection
housename: berry
folderpath: data/02_intermediate/Berry/pseudoxml
xsltstylesheet: templates/xsl/actes_princiers.xsl
# input (read) **pseudo xml** dataset
# as it is **not** regular xml, an xml loader cannot be used
berry_pseudoxmlcontent:
type: actesdataset.TextDataSetCollection
housename: berry
folderpath: data/02_intermediate/Berry/pseudoxml
# input (read) and output (write) dataset
berry_jsonoutput:
type: actesdataset.JSONDataSetCollection
housename: berry
folderpath: data/02_intermediate/Berry/json
# output (write) and input (read) dataset
berry_fulljsonoutput:
type: actesdataset.JSONDataSetCollection
housename: berry
folderpath: data/02_intermediate/Berry/fulljson
## ________________________________________________________________________ # ________________________________________________________________________
# ANJOU
# input (read only) dataset
anjou:
type: actesdataset.XMLDataSetCollection
housename: anjou
folderpath: data/01_raw/xml/Anjou
xsltstylesheet: templates/xsl/actes_princiers.xsl
# input (read only) dataset
anjou_json:
type: actesdataset.BsXMLDataSetCollection
housename: anjou
folderpath: data/01_raw/xml/Anjou
#berry: # _________________________________________________________________________
# type: actesdataset.XMLDataSetCollection
# housename: berry
# folderpath: data/01_raw/xml/berry
#berry_xmlcontent: # output (write) **pseudo xml** dataset
# type: actesdataset.XMLDataSetCollection anjou_xmlcontent:
# housename: berry type: actesdataset.XMLDataSetCollection
# folderpath: data/02_intermediate/xml/berry/xml housename: anjou
folderpath: data/02_intermediate/Anjou/pseudoxml
xsltstylesheet: templates/xsl/actes_princiers.xsl
## ________________________________________________________________________ # input (read) **pseudo xml** dataset
# as it is **not** regular xml, an xml loader cannot be used
anjou_pseudoxmlcontent:
type: actesdataset.TextDataSetCollection
housename: anjou
folderpath: data/02_intermediate/Anjou/pseudoxml
#anjou: # input (read) and output (write) dataset
# type: actesdataset.XMLDataSetCollection anjou_jsonoutput:
# housename: berry type: actesdataset.JSONDataSetCollection
# folderpath: data/01_raw/xml/anjou housename: anjou
folderpath: data/02_intermediate/Anjou/json
#anjou_xmlcontent: # output (write) and input (read) dataset
# type: actesdataset.XMLDataSetCollection anjou_fulljsonoutput:
# housename: berry type: actesdataset.JSONDataSetCollection
# folderpath: data/02_intermediate/xml/anjou/xml housename: anjou
folderpath: data/02_intermediate/Anjou/fulljson

@ -1,6 +1,7 @@
# XXX cette conf est descriptive # XXX: cette conf est descriptive, elle n'est pas (plus) utilisée par l'application
# dans son état de généricité actuel.
# TODO: utiliser cette conf pour augmenter la généricité # TODO: utiliser cette conf pour augmenter la généricité
# du traitement des datas # du traitement des datas dans une itération ultérieure
#  # 
raw_datapath: data/01_raw raw_datapath: data/01_raw
houses: houses:

@ -8,6 +8,7 @@ from .nodes import (parse_xml_collection, make_json_collection,
def create_pipeline(**kwargs) -> Pipeline: def create_pipeline(**kwargs) -> Pipeline:
return pipeline( return pipeline(
[ [
# bourbon
node( node(
func=parse_xml_collection, func=parse_xml_collection,
inputs=["bourbon"], inputs=["bourbon"],
@ -29,19 +30,50 @@ def create_pipeline(**kwargs) -> Pipeline:
name="bourbon_fulljson_ds_collection", name="bourbon_fulljson_ds_collection",
tags="etl_transform" tags="etl_transform"
), ),
# berry
# node( node(
# func=parse_xml_collection, func=parse_xml_collection,
# inputs="berry", inputs=["berry"],
# outputs="berry_xmlcontent", outputs="berry_xmlcontent",
# name="berry_ds_collection", name="berry_ds_collection",
# ), tags="etl_transform"
# node( ),
# func=parse_xml_collection, node(
# inputs="anjou", func=make_json_collection,
# outputs=None, # "anjou_xmlcontent", inputs="berry_json",
# name="anjou_ds_collection", outputs="berry_jsonoutput",
# ), name="berry_json_ds_collection",
tags="etl_transform"
),
node(
func=add_xmlcontent_tojson,
inputs=["berry_jsonoutput", "berry_pseudoxmlcontent"],
outputs="berry_fulljsonoutput",
name="berry_fulljson_ds_collection",
tags="etl_transform"
),
# anjou
node(
func=parse_xml_collection,
inputs=["anjou"],
outputs="anjou_xmlcontent",
name="anjou_ds_collection",
tags="etl_transform"
),
node(
func=make_json_collection,
inputs="anjou_json",
outputs="anjou_jsonoutput",
name="anjou_json_ds_collection",
tags="etl_transform"
),
node(
func=add_xmlcontent_tojson,
inputs=["anjou_jsonoutput", "anjou_pseudoxmlcontent"],
outputs="anjou_fulljsonoutput",
name="anjou_fulljson_ds_collection",
tags="etl_transform"
),
] ]
) )

@ -0,0 +1 @@
git remote add data git@gitlab.huma-num.fr:medieval-acts/princely-acts/data.git
Loading…
Cancel
Save