diff --git a/README.md b/README.md index 57c1ba0..d54fd6e 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,5 @@ # Actes princiers -- data transformations -1. kedro run --tags="etl_transform" -2. run --tags="populate_database" - - ## Project Name human readable name : `Actes Princiers` @@ -31,6 +27,8 @@ or start the ipython prompt : `kedro ipython` ## Launching the pipelines +**go to the `actes-princiers` folder** + Open a terminal in the `actes-princiers`'s folder and launch kedro `kedro run` @@ -43,8 +41,16 @@ or a search by tags with: `kedro run --tags=` +The current tags are: + +- `kedro run --tags="etl_transform"`: launches the XML to JSON transformations +- `kedro run --tags="populate_database"`: populates the remote MongoDB database + on the target server + ## Visualizing the pipelines +**you must install kedro-viz first** + install kedro viz with `pip install kedro-viz` diff --git a/actes-princiers/conf/base/catalog.yml b/actes-princiers/conf/base/catalog.yml index 23793b3..c5c2c76 100644 --- a/actes-princiers/conf/base/catalog.yml +++ b/actes-princiers/conf/base/catalog.yml @@ -1,5 +1,5 @@ # ________________________________________________________________________ - +# BOURBON # input (read only) dataset bourbon: type: actesdataset.XMLDataSetCollection @@ -14,14 +14,14 @@ bourbon_json: folderpath: data/01_raw/xml/Bourbon # _________________________________________________________________________ - + # output (write) **pseudo xml** dataset bourbon_xmlcontent: type: actesdataset.XMLDataSetCollection housename: bourbon folderpath: data/02_intermediate/Bourbon/pseudoxml xsltstylesheet: templates/xsl/actes_princiers.xsl - + # input (read) **pseudo xml** dataset # as it is **not** regular xml, an xml loader cannot be used bourbon_pseudoxmlcontent: @@ -41,28 +41,89 @@ bourbon_fulljsonoutput: housename: bourbon folderpath: data/02_intermediate/Bourbon/fulljson +# ________________________________________________________________________ +# BERRY +# input (read only) 
dataset +berry: + type: actesdataset.XMLDataSetCollection + housename: berry + folderpath: data/01_raw/xml/Berry + xsltstylesheet: templates/xsl/actes_princiers.xsl + +# input (read only) dataset +berry_json: + type: actesdataset.BsXMLDataSetCollection + housename: berry + folderpath: data/01_raw/xml/Berry + +# _________________________________________________________________________ + +# output (write) **pseudo xml** dataset +berry_xmlcontent: + type: actesdataset.XMLDataSetCollection + housename: berry + folderpath: data/02_intermediate/Berry/pseudoxml + xsltstylesheet: templates/xsl/actes_princiers.xsl + +# input (read) **pseudo xml** dataset +# as it is **not** regular xml, an xml loader cannot be used +berry_pseudoxmlcontent: + type: actesdataset.TextDataSetCollection + housename: berry + folderpath: data/02_intermediate/Berry/pseudoxml + +# input (read) and output (write) dataset +berry_jsonoutput: + type: actesdataset.JSONDataSetCollection + housename: berry + folderpath: data/02_intermediate/Berry/json + +# output (write) and input (read) dataset +berry_fulljsonoutput: + type: actesdataset.JSONDataSetCollection + housename: berry + folderpath: data/02_intermediate/Berry/fulljson -## ________________________________________________________________________ +# ________________________________________________________________________ +# ANJOU +# input (read only) dataset +anjou: + type: actesdataset.XMLDataSetCollection + housename: anjou + folderpath: data/01_raw/xml/Anjou + xsltstylesheet: templates/xsl/actes_princiers.xsl + +# input (read only) dataset +anjou_json: + type: actesdataset.BsXMLDataSetCollection + housename: anjou + folderpath: data/01_raw/xml/Anjou -#berry: -# type: actesdataset.XMLDataSetCollection -# housename: berry -# folderpath: data/01_raw/xml/berry +# _________________________________________________________________________ -#berry_xmlcontent: -# type: actesdataset.XMLDataSetCollection -# housename: berry -# folderpath: 
data/02_intermediate/xml/berry/xml +# output (write) **pseudo xml** dataset +anjou_xmlcontent: + type: actesdataset.XMLDataSetCollection + housename: anjou + folderpath: data/02_intermediate/Anjou/pseudoxml + xsltstylesheet: templates/xsl/actes_princiers.xsl -## ________________________________________________________________________ +# input (read) **pseudo xml** dataset +# as it is **not** regular xml, an xml loader cannot be used +anjou_pseudoxmlcontent: + type: actesdataset.TextDataSetCollection + housename: anjou + folderpath: data/02_intermediate/Anjou/pseudoxml -#anjou: -# type: actesdataset.XMLDataSetCollection -# housename: berry -# folderpath: data/01_raw/xml/anjou +# input (read) and output (write) dataset +anjou_jsonoutput: + type: actesdataset.JSONDataSetCollection + housename: anjou + folderpath: data/02_intermediate/Anjou/json -#anjou_xmlcontent: -# type: actesdataset.XMLDataSetCollection -# housename: berry -# folderpath: data/02_intermediate/xml/anjou/xml +# output (write) and input (read) dataset +anjou_fulljsonoutput: + type: actesdataset.JSONDataSetCollection + housename: anjou + folderpath: data/02_intermediate/Anjou/fulljson diff --git a/actes-princiers/conf/base/houses.yml b/actes-princiers/conf/base/houses.yml index 733d446..759fc55 100644 --- a/actes-princiers/conf/base/houses.yml +++ b/actes-princiers/conf/base/houses.yml @@ -1,6 +1,7 @@ -# XXX cette conf est descriptive +# XXX: cette conf est descriptive, elle n'est pas (plus) utilisée par l'appli +# dans son état de généricité actuel. 
# TODO: utiliser cette conf pour augmenter la généricité -# du traitement des datas +# du traitement des datas dans une iteration ulterieure #  raw_datapath: data/01_raw houses: diff --git a/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py b/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py index a43fe65..aa49953 100755 --- a/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py +++ b/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py @@ -8,6 +8,7 @@ from .nodes import (parse_xml_collection, make_json_collection, def create_pipeline(**kwargs) -> Pipeline: return pipeline( [ + # bourbon node( func=parse_xml_collection, inputs=["bourbon"], @@ -29,19 +30,50 @@ def create_pipeline(**kwargs) -> Pipeline: name="bourbon_fulljson_ds_collection", tags="etl_transform" ), - -# node( -# func=parse_xml_collection, -# inputs="berry", -# outputs="berry_xmlcontent", -# name="berry_ds_collection", -# ), -# node( -# func=parse_xml_collection, -# inputs="anjou", -# outputs=None, # "anjou_xmlcontent", -# name="anjou_ds_collection", -# ), + # berry + node( + func=parse_xml_collection, + inputs=["berry"], + outputs="berry_xmlcontent", + name="berry_ds_collection", + tags="etl_transform" + ), + node( + func=make_json_collection, + inputs="berry_json", + outputs="berry_jsonoutput", + name="berry_json_ds_collection", + tags="etl_transform" + ), + node( + func=add_xmlcontent_tojson, + inputs=["berry_jsonoutput", "berry_pseudoxmlcontent"], + outputs="berry_fulljsonoutput", + name="berry_fulljson_ds_collection", + tags="etl_transform" + ), + # anjou + node( + func=parse_xml_collection, + inputs=["anjou"], + outputs="anjou_xmlcontent", + name="anjou_ds_collection", + tags="etl_transform" + ), + node( + func=make_json_collection, + inputs="anjou_json", + outputs="anjou_jsonoutput", + name="anjou_json_ds_collection", + tags="etl_transform" + ), + node( + func=add_xmlcontent_tojson, + 
inputs=["anjou_jsonoutput", "anjou_pseudoxmlcontent"], + outputs="anjou_fulljsonoutput", + name="anjou_fulljson_ds_collection", + tags="etl_transform" + ), ] ) diff --git a/git_remote_add_data_registry.sh b/git_remote_add_data_registry.sh new file mode 100755 index 0000000..810206f --- /dev/null +++ b/git_remote_add_data_registry.sh @@ -0,0 +1 @@ +git remote add data git@gitlab.huma-num.fr:medieval-acts/princely-acts/data.git