add houses collection

3 years ago · 4b6a15416d
parent 23af0f580a
commit 4b6a15416d
6 changed files with 80 additions and 44 deletions
--- a/actes-princiers/conf/base/catalog.yml
+++ b/actes-princiers/conf/base/catalog.yml
@ -1,3 +1,9 @@
 # houses
 # input (read only) dataset
 houses:
  type: yaml.YAMLDataSet
  filepath: data/01_raw/yaml/houses.yaml
 # ________________________________________________________________________
 # BOURBON
 # input (read only) dataset
--- a/actes-princiers/conf/base/houses.yml
+++ b/actes-princiers/conf/base/houses.yml
@ -1,22 +0,0 @@
 # XXX: cette conf est descriptive, elle n'est pas (plus) utilisée par l'application
 # dans son état de généricité actuel.
 # TODO: utiliser cette conf pour augmenter la généricité
 # du traitement des données dans une itération ultérieure
 # 
 raw_datapath: data/01_raw
 houses:
  bourbon:
    name: Bourbon
    path: houses/bourbon
  berry:
    name: Berry
    path: houses/berry
  anjou:
    name: Anjou
    path: houses/anjou
 # TODO 
 #  - Bretagne
 #  - Bourgogne
 #  - Orléans
 #  - Armagnac
--- a/actes-princiers/src/actes_princiers/pipelines/populate_mongo/nodes.py
+++ b/actes-princiers/src/actes_princiers/pipelines/populate_mongo/nodes.py
@ -4,10 +4,12 @@ import urllib.parse
 from pathlib import Path
 from typing import Dict
 import pymongo
 from kedro.framework.session import KedroSession
-from actesdataset import JSONDataSetCollection
+from kedro.extras.datasets.yaml import YAMLDataSet
-import pymongo
+from actesdataset import JSONDataSetCollection
 logger = logging.getLogger(__name__)
@ -21,7 +23,6 @@ def populate_mongo(jsondoc: JSONDataSetCollection, storage_ip: str, db_name: str
    jsondatasets = jsondoc.datasets
    housename = jsondoc._housename
    #mongodb://%s:%s@149.202.41.75:27017' % (username, password)
    # FIXME passer en parametres
    username = urllib.parse.quote_plus(mongodb_admin)
    password = urllib.parse.quote_plus(mongodb_password)
    mongodb_url = f"mongodb://{username}:{password}@{storage_ip}:27017/"
@ -34,14 +35,41 @@ def populate_mongo(jsondoc: JSONDataSetCollection, storage_ip: str, db_name: str
    actesdb = myclient[db_name]
    actes_collection = actesdb[db_collection_name]
-    # TODO faire un insert_many directement ? 
+    # TODO faire un insert_many directement ?
    for dataset_filenamestem, dataset in jsondatasets.items():
        # a manual load is required here, because
        # the dataset **is not** registered in kedro's catalog
        document = dataset._load()
-        # FIXME que mettre comme id ? le filename ?  
+        # FIXME que mettre comme id ? le filename ?
        document["_id"] = document["filename"]
        #logger.info(str(document))
        res = actes_collection.insert_one(document)
        logger.info(res.inserted_id)
-    return         
+    # properly closes the db connection
    # FIXME with MongoClient() as client
    myclient.close()
    return
 def load_houses(yamldoc: YAMLDataSet, storage_ip: str, db_name: str, mongodb_admin: str, mongodb_password: str) -> None:
    """Insert every house of the houses YAML document into the ``houses`` collection.

    Args:
        yamldoc: the loaded houses document; it is accessed as a mapping
            holding a ``houses`` key whose value is itself a mapping
            (only the values of that inner mapping are inserted).
        storage_ip: host of the MongoDB server (port 27017 is used).
        db_name: name of the MongoDB database to write into.
        mongodb_admin: MongoDB user name (percent-encoded before use).
        mongodb_password: MongoDB password (percent-encoded before use).

    Raises:
        KeyError: if ``yamldoc`` has no ``houses`` key.
        Any pymongo error raised while connecting or inserting is propagated;
        the client is closed in every case.
    """
    # credentials may contain reserved URL characters, so they are escaped
    username = urllib.parse.quote_plus(mongodb_admin)
    password = urllib.parse.quote_plus(mongodb_password)
    mongodb_url = f"mongodb://{username}:{password}@{storage_ip}:27017/"
    # the password must never reach the logs: log a masked form of the URL
    masked_url = f"mongodb://{username}:***@{storage_ip}:27017/"
    logger.info("connection to the mongodb server: " + masked_url)
    # the context manager closes the db connection even if an insert fails
    with pymongo.MongoClient(mongodb_url) as myclient:
        houses_col = myclient[db_name]['houses']
        # NOTE(review): nothing here prevents duplicates when the node is
        # run twice, unless the YAML entries carry their own ``_id`` -- confirm
        for house in yamldoc['houses'].values():
            logger.info(str(house))
            houses_col.insert_one(house)
    return
--- a/actes-princiers/src/actes_princiers/pipelines/populate_mongo/pipeline.py
+++ b/actes-princiers/src/actes_princiers/pipelines/populate_mongo/pipeline.py
@ -1,6 +1,6 @@
 from kedro.pipeline import Pipeline, node, pipeline
-from .nodes import populate_mongo
+from .nodes import populate_mongo, load_houses
 def create_pipeline(**kwargs) -> Pipeline:
@ -14,7 +14,16 @@ def create_pipeline(**kwargs) -> Pipeline:
                outputs=None,
                name="populate_mongo",
                tags="populate_database",
            ),
            node(
                func=load_houses,
                inputs=["houses", "params:storage_ip", "params:db_name", 
                        "params:mongodb_admin", "params:mongodb_password"],
                outputs=None,
                name="load_houses",
                tags="load_houses",
            )
        ]
    )
--- a/actes-princiers/src/actesdataset.py
+++ b/actes-princiers/src/actesdataset.py
@ -7,6 +7,7 @@ from abc import ABC, abstractmethod
 from lxml import etree
 from bs4 import BeautifulSoup
 #from folium import Map
 from kedro.io import AbstractDataSet, DataSetError
 from kedro.framework.session import KedroSession
@ -15,12 +16,12 @@ logger = logging.getLogger(__name__)
 class XMLDataSet(ABC):
    "Abstract base class for an XML dataset loader"
-        
+
    def __init__(self, filepath: str) -> None:
        self._filepath = filepath
    @property
-    def filepath(self) -> str: 
+    def filepath(self) -> str:
        "xml file's filename getters"
        return self._filepath
@ -41,7 +42,7 @@ class EtreeXMLDataSet(XMLDataSet):
    def __init__(self, filepath, params):
        self._filepath = filepath
        self.xsltstylesheet = params
-        
+
    def _load(self):
        "from the xml file loads a internal xml repr (with element tree)"
        # self.source_doc is an etree internal xml repr document
@ -100,7 +101,7 @@ class BsXMLDataSet(XMLDataSet):
        else:  # there is no analysis
            ref_acte = "NS"
 #        prod_place = self.soup.find_all("placeName", {"type": "production_place"})[0].text
-        # //sourceDesc//msIdentifier/idno[@n='1'] is always the 
+        # //sourceDesc//msIdentifier/idno[@n='1'] is always the
        # archive box or manuscript collection id
 #        #doc = self.soup.msIdentifier.find_all("idno", {"n": "1"})[0]
 #        #type_diplo = self.soup.body.div["subtype"]
@ -123,26 +124,26 @@ class DataSetCollection(AbstractDataSet):
    """Stores instances of ``DataSetCollection``
    implementations to provide ``_load`` and ``_save`` capabilities.
    """
-    def __init__(self, 
+    def __init__(self,
        housename: str,
        folderpath: str) -> None:
        self._housename = housename
        self._folderpath = Path(folderpath)
        # the collections key: file name, value: dataset object
        self.datasets = dict()
-                    
+
    def _save(self, data) -> None:
        """kedro's API saver method
-        
+
-         There is **nothing to save**, because  
+         There is **nothing to save**, because
         this dataset collections is a *container* dataset.
         this method is here only because kedro requires it.
-         """ 
+         """
        pass
-        
+
    def _describe(self) -> dict[str, Any]:
        "kedro's API repr()"
-        return dict(name=self._housename, 
+        return dict(name=self._housename,
                    folderpath=str(self._folderpath))
@ -206,18 +207,18 @@ class TextDataSet:
    """
    def __init__(self, filepath: str) -> None:
        "stores the path of the text file used by ``_load`` and ``_save``"
        self._filepath = filepath
-        
+
    def _load(self) -> str:
        "reads the file at ``self._filepath`` in text mode and returns its whole content"
        with open(self._filepath, 'r') as text_file:
            content = text_file.read()
        return content
-        
+
    def _save(self, data: str) -> None:
        "writes ``data`` to the file at ``self._filepath``, replacing any previous content"
        with open(self._filepath, 'w') as text_file:
            text_file.write(data)
    def _describe(self) -> Dict[str, Any]:
        "returns a one-entry description of the dataset: its file path"
        description = {"filepath": self._filepath}
        return description
-        
+
 class TextDataSetCollection(DataSetCollection):
    def _load(self) -> dict[str, JSONDataSet]:
        "kedro's API loader method"
@ -226,4 +227,17 @@ class TextDataSetCollection(DataSetCollection):
            self.datasets[filepath.stem] = TextDataSet(
                filepath=str(filepath))
        return self
-        
+
 #class FoliumHTMLDataSet(AbstractDataSet):
 #    def __init__(self, filepath: str):
 #        self._filepath = filepath
 #
 #    def _load(self) -> None:
 #        raise DataSetError('This dataset is WriteOnly')
 #
 #    def _describe(self) -> Dict[str, Any]:
 #        return dict(filepath=self._filepath)
 #
 #    def _save(self, data: Map) -> None:
 #        data.save(self._filepath)
 #
--- a/actes-princiers/src/requirements.txt
+++ b/actes-princiers/src/requirements.txt
@ -12,6 +12,7 @@ jupyter~=1.0
 jupyterlab_server>=2.11.1, <2.16.0
 jupyterlab~=3.0, <3.6.0
 kedro~=0.18.12
 kedro-datasets~=1.7.0
 kedro-telemetry~=0.2.5
 lxml~=4.9.3
 nbstripout~=0.4