add houses collection

2 years ago · 4b6a15416d
parent 23af0f580a
commit 4b6a15416d
6 changed files with 80 additions and 44 deletions
--- a/actes-princiers/conf/base/catalog.yml
+++ b/actes-princiers/conf/base/catalog.yml
@ -1,3 +1,9 @@
+# houses
+# input (read only) dataset
+houses:
+  type: yaml.YAMLDataSet
+  filepath: data/01_raw/yaml/houses.yaml
+
 # ________________________________________________________________________
 # BOURBON
 # input (read only) dataset
--- a/actes-princiers/conf/base/houses.yml
+++ b/actes-princiers/conf/base/houses.yml
@ -1,22 +0,0 @@
-# XXX: cette conf est descriptive, elle n'est pas (plus) utilisée par l'apli
-# dans son état de généricité actuel.
-# TODO: utiliser cette conf pour augmenter la généricité 
-# du traitement des datas dans une iteration ulterieure
-# 
-raw_datapath: data/01_raw
-houses:
-  bourbon:
-    name: Bourbon
-    path: houses/bourbon
-  berry:
-    name: Berry
-    path: houses/berry
-  anjou:
-    name: Anjou
-    path: houses/anjou
-
-# TODO 
-#  - Bretagne
-#  - Bourgogne
-#  - Orléans
-#  - Armagnac
--- a/actes-princiers/src/actes_princiers/pipelines/populate_mongo/nodes.py
+++ b/actes-princiers/src/actes_princiers/pipelines/populate_mongo/nodes.py
@ -4,10 +4,12 @@ import urllib.parse
 from pathlib import Path
 from typing import Dict

+import pymongo
+
 from kedro.framework.session import KedroSession
-from actesdataset import JSONDataSetCollection
+from kedro.extras.datasets.yaml import YAMLDataSet

-import pymongo
+from actesdataset import JSONDataSetCollection


 logger = logging.getLogger(__name__)
@ -21,7 +23,6 @@ def populate_mongo(jsondoc: JSONDataSetCollection, storage_ip: str, db_name: str
    jsondatasets = jsondoc.datasets
    housename = jsondoc._housename
    #mongodb://%s:%s@149.202.41.75:27017' % (username, password)
-    # FIXME passer en parametres
    username = urllib.parse.quote_plus(mongodb_admin)
    password = urllib.parse.quote_plus(mongodb_password)
    mongodb_url = f"mongodb://{username}:{password}@{storage_ip}:27017/"
@ -34,14 +35,41 @@ def populate_mongo(jsondoc: JSONDataSetCollection, storage_ip: str, db_name: str
    actesdb = myclient[db_name]
    actes_collection = actesdb[db_collection_name]

-    # TODO faire un insert_many directement ? 
+    # TODO faire un insert_many directement ?
    for dataset_filenamestem, dataset in jsondatasets.items():
        # a manual load is required here, because
        # the dataset **is not** registered in kedro's catalog
        document = dataset._load()
-        # FIXME que mettre comme id ? le filename ?  
+        # FIXME que mettre comme id ? le filename ?
        document["_id"] = document["filename"]
        #logger.info(str(document))
        res = actes_collection.insert_one(document)
        logger.info(res.inserted_id)
-    return         
+    # explicitly close the db connection (skipped if an insert above raises)
+    # FIXME: use `with pymongo.MongoClient(mongodb_url) as myclient:` instead
+    myclient.close()
+    return
+    
+
+def load_houses(yamldoc: YAMLDataSet, storage_ip: str, db_name: str, mongodb_admin: str, mongodb_password: str) -> None:
+
+    username = urllib.parse.quote_plus(mongodb_admin)
+    password = urllib.parse.quote_plus(mongodb_password)
+    mongodb_url = f"mongodb://{username}:{password}@{storage_ip}:27017/"
+    logger.info("connection to the mongodb server: " + mongodb_url)
+
+    # pymongo settings
+    myclient = pymongo.MongoClient(mongodb_url)
+
+    actesdb = myclient[db_name]
+    houses_col = actesdb['houses']
+    
+    for house in yamldoc['houses'].values():
+        logger.info(str(house))
+        houses_col.insert_one(house)
+
+    # explicitly close the db connection (skipped if an insert above raises)
+    # FIXME: use `with pymongo.MongoClient(mongodb_url) as myclient:` instead
+    myclient.close()
+    
+    return
--- a/actes-princiers/src/actes_princiers/pipelines/populate_mongo/pipeline.py
+++ b/actes-princiers/src/actes_princiers/pipelines/populate_mongo/pipeline.py
@ -1,6 +1,6 @@
 from kedro.pipeline import Pipeline, node, pipeline

-from .nodes import populate_mongo
+from .nodes import populate_mongo, load_houses


 def create_pipeline(**kwargs) -> Pipeline:
@ -14,7 +14,16 @@ def create_pipeline(**kwargs) -> Pipeline:
                outputs=None,
                name="populate_mongo",
                tags="populate_database",
+            ),
+            node(
+                func=load_houses,
+                inputs=["houses", "params:storage_ip", "params:db_name", 
+                        "params:mongodb_admin", "params:mongodb_password"],
+                outputs=None,
+                name="load_houses",
+                tags="load_houses",
            )
+            
        ]
    )

--- a/actes-princiers/src/actesdataset.py
+++ b/actes-princiers/src/actesdataset.py
@ -7,6 +7,7 @@ from abc import ABC, abstractmethod

 from lxml import etree
 from bs4 import BeautifulSoup
+#from folium import Map

 from kedro.io import AbstractDataSet, DataSetError
 from kedro.framework.session import KedroSession
@ -15,12 +16,12 @@ logger = logging.getLogger(__name__)

 class XMLDataSet(ABC):
    "Abstract base class for an XML dataset loader"
-        
+
    def __init__(self, filepath: str) -> None:
        self._filepath = filepath

    @property
-    def filepath(self) -> str: 
+    def filepath(self) -> str:
        "xml file's filename getters"
        return self._filepath

@ -41,7 +42,7 @@ class EtreeXMLDataSet(XMLDataSet):
    def __init__(self, filepath, params):
        self._filepath = filepath
        self.xsltstylesheet = params
-        
+
    def _load(self):
        "from the xml file loads a internal xml repr (with element tree)"
        # self.source_doc is an etree internal xml repr document
@ -100,7 +101,7 @@ class BsXMLDataSet(XMLDataSet):
        else:  # there is no analysis
            ref_acte = "NS"
 #        prod_place = self.soup.find_all("placeName", {"type": "production_place"})[0].text
-        # //sourceDesc//msIdentifier/idno[@n='1'] is always the 
+        # //sourceDesc//msIdentifier/idno[@n='1'] is always the
        # archive box or manuscript collection id
 #        #doc = self.soup.msIdentifier.find_all("idno", {"n": "1"})[0]
 #        #type_diplo = self.soup.body.div["subtype"]
@ -123,26 +124,26 @@ class DataSetCollection(AbstractDataSet):
    """Stores instances of ``DataSetCollection``
    implementations to provide ``_load`` and ``_save`` capabilities.
    """
-    def __init__(self, 
+    def __init__(self,
        housename: str,
        folderpath: str) -> None:
        self._housename = housename
        self._folderpath = Path(folderpath)
        # the collections key: file name, value: dataset object
        self.datasets = dict()
-                    
+
    def _save(self, data) -> None:
        """kedro's API saver method
-        
-         There is **nothing to save**, because  
+
+         There is **nothing to save**, because
         this dataset collections is a *container* dataset.
         this method is here only because kedro requires it.
-         """ 
+         """
        pass
-        
+
    def _describe(self) -> dict[str, Any]:
        "kedro's API repr()"
-        return dict(name=self._housename, 
+        return dict(name=self._housename,
                    folderpath=str(self._folderpath))


@ -206,18 +207,18 @@ class TextDataSet:
    """
    def __init__(self, filepath: str):
        self._filepath = filepath
-        
+
    def _load(self) -> str:
        with open(self._filepath, 'r') as fhandle:
            return fhandle.read()
-        
+
    def _save(self, data: str) -> None:
        with open(self._filepath, 'w') as fhandle:
            fhandle.write(data)

    def _describe(self) -> Dict[str, Any]:
        return dict(filepath=self._filepath)
-        
+
 class TextDataSetCollection(DataSetCollection):
    def _load(self) -> dict[str, JSONDataSet]:
        "kedro's API loader method"
@ -226,4 +227,17 @@ class TextDataSetCollection(DataSetCollection):
            self.datasets[filepath.stem] = TextDataSet(
                filepath=str(filepath))
        return self
-        
+
+#class FoliumHTMLDataSet(AbstractDataSet):
+#    def __init__(self, filepath: str):
+#        self._filepath = filepath
+#
+#    def _load(self) -> None:
+#        raise DataSetError('This dataset is WriteOnly')
+#
+#    def _describe(self) -> Dict[str, Any]:
+#        return dict(filepath=self._filepath)
+#
+#    def _save(self, data: Map) -> None:
+#        data.save(self._filepath)
+#
--- a/actes-princiers/src/requirements.txt
+++ b/actes-princiers/src/requirements.txt
@ -12,6 +12,7 @@ jupyter~=1.0
 jupyterlab_server>=2.11.1, <2.16.0
 jupyterlab~=3.0, <3.6.0
 kedro~=0.18.12
+kedro-datasets~=1.7.0
 kedro-telemetry~=0.2.5
 lxml~=4.9.3
 nbstripout~=0.4