From 4b6a15416dfa03e5195dabd31506f98459594126 Mon Sep 17 00:00:00 2001 From: gwen Date: Wed, 13 Sep 2023 23:01:21 +0200 Subject: [PATCH] add houses collection --- actes-princiers/conf/base/catalog.yml | 6 +++ actes-princiers/conf/base/houses.yml | 22 ---------- .../pipelines/populate_mongo/nodes.py | 40 ++++++++++++++--- .../pipelines/populate_mongo/pipeline.py | 11 ++++- actes-princiers/src/actesdataset.py | 44 ++++++++++++------- actes-princiers/src/requirements.txt | 1 + 6 files changed, 80 insertions(+), 44 deletions(-) delete mode 100644 actes-princiers/conf/base/houses.yml diff --git a/actes-princiers/conf/base/catalog.yml b/actes-princiers/conf/base/catalog.yml index c5c2c76..50fbde8 100644 --- a/actes-princiers/conf/base/catalog.yml +++ b/actes-princiers/conf/base/catalog.yml @@ -1,3 +1,9 @@ +# houses +# input (read only) dataset +houses: + type: yaml.YAMLDataSet + filepath: data/01_raw/yaml/houses.yaml + # ________________________________________________________________________ # BOURBON # input (read only) dataset diff --git a/actes-princiers/conf/base/houses.yml b/actes-princiers/conf/base/houses.yml deleted file mode 100644 index 759fc55..0000000 --- a/actes-princiers/conf/base/houses.yml +++ /dev/null @@ -1,22 +0,0 @@ -# XXX: cette conf est descriptive, elle n'est pas (plus) utilisée par l'apli -# dans son état de généricité actuel. -# TODO: utiliser cette conf pour augmenter la généricité -# du traitement des datas dans une iteration ulterieure -#  -raw_datapath: data/01_raw -houses: - bourbon: - name: Bourbon - path: houses/bourbon - berry: - name: Berry - path: houses/berry - anjou: - name: Anjou - path: houses/anjou - -# TODO -# - Bretagne -# - Bourgogne -# - Orléans -# - Armagnac diff --git a/actes-princiers/src/actes_princiers/pipelines/populate_mongo/nodes.py b/actes-princiers/src/actes_princiers/pipelines/populate_mongo/nodes.py index c0b8c53..01f484e 100755 --- a/actes-princiers/src/actes_princiers/pipelines/populate_mongo/nodes.py +++ b/actes-princiers/src/actes_princiers/pipelines/populate_mongo/nodes.py @@ -4,10 +4,12 @@ import urllib.parse from pathlib import Path from typing import Dict +import pymongo + from kedro.framework.session import KedroSession -from actesdataset import JSONDataSetCollection +from kedro.extras.datasets.yaml import YAMLDataSet -import pymongo +from actesdataset import JSONDataSetCollection logger = logging.getLogger(__name__) @@ -21,7 +23,6 @@ def populate_mongo(jsondoc: JSONDataSetCollection, storage_ip: str, db_name: str jsondatasets = jsondoc.datasets housename = jsondoc._housename #mongodb://%s:%s@149.202.41.75:27017' % (username, password) - # FIXME passer en parametres username = urllib.parse.quote_plus(mongodb_admin) password = urllib.parse.quote_plus(mongodb_password) mongodb_url = f"mongodb://{username}:{password}@{storage_ip}:27017/" @@ -34,14 +35,41 @@ def populate_mongo(jsondoc: JSONDataSetCollection, storage_ip: str, db_name: str actesdb = myclient[db_name] actes_collection = actesdb[db_collection_name] - # TODO faire un insert_many directement ? + # TODO faire un insert_many directement ? for dataset_filenamestem, dataset in jsondatasets.items(): # a manual load is required here, because # the dataset **is not** registered in kedro's catalog document = dataset._load() - # FIXME que mettre comme id ? le filename ? + # FIXME que mettre comme id ? le filename ? document["_id"] = document["filename"] #logger.info(str(document)) res = actes_collection.insert_one(document) logger.info(res.inserted_id) - return + # properly closes the db connection + # FIXME with MongoClient() as client + myclient.close() + return + + +def load_houses(yamldoc: YAMLDataSet, storage_ip: str, db_name: str, mongodb_admin: str, mongodb_password: str) -> None: + + username = urllib.parse.quote_plus(mongodb_admin) + password = urllib.parse.quote_plus(mongodb_password) + mongodb_url = f"mongodb://{username}:{password}@{storage_ip}:27017/" + logger.info("connection to the mongodb server: " + mongodb_url) + + # pymongo settings + myclient = pymongo.MongoClient(mongodb_url) + + actesdb = myclient[db_name] + houses_col = actesdb['houses'] + + for house in yamldoc['houses'].values(): + logger.info(str(house)) + houses_col.insert_one(house) + + # properly closes the db connection + # FIXME with MongoClient() as client + myclient.close() + + return diff --git a/actes-princiers/src/actes_princiers/pipelines/populate_mongo/pipeline.py b/actes-princiers/src/actes_princiers/pipelines/populate_mongo/pipeline.py index fc983b8..80d0323 100755 --- a/actes-princiers/src/actes_princiers/pipelines/populate_mongo/pipeline.py +++ b/actes-princiers/src/actes_princiers/pipelines/populate_mongo/pipeline.py @@ -1,6 +1,6 @@ from kedro.pipeline import Pipeline, node, pipeline -from .nodes import populate_mongo +from .nodes import populate_mongo, load_houses def create_pipeline(**kwargs) -> Pipeline: @@ -14,7 +14,16 @@ def create_pipeline(**kwargs) -> Pipeline: outputs=None, name="populate_mongo", tags="populate_database", + ), + node( + func=load_houses, + inputs=["houses", "params:storage_ip", "params:db_name", + "params:mongodb_admin", "params:mongodb_password"], + outputs=None, + name="load_houses", + tags="load_houses", ) + ] ) diff --git a/actes-princiers/src/actesdataset.py b/actes-princiers/src/actesdataset.py index 6ff8357..dfd339f 100644 --- a/actes-princiers/src/actesdataset.py +++ b/actes-princiers/src/actesdataset.py @@ -7,6 +7,7 @@ from abc import ABC, abstractmethod from lxml import etree from bs4 import BeautifulSoup +#from folium import Map from kedro.io import AbstractDataSet, DataSetError from kedro.framework.session import KedroSession @@ -15,12 +16,12 @@ logger = logging.getLogger(__name__) class XMLDataSet(ABC): "Abstract base class for an XML dataset loader" - + def __init__(self, filepath: str) -> None: self._filepath = filepath @property - def filepath(self) -> str: + def filepath(self) -> str: "xml file's filename getters" return self._filepath @@ -41,7 +42,7 @@ class EtreeXMLDataSet(XMLDataSet): def __init__(self, filepath, params): self._filepath = filepath self.xsltstylesheet = params - + def _load(self): "from the xml file loads a internal xml repr (with element tree)" # self.source_doc is an etree internal xml repr document @@ -100,7 +101,7 @@ class BsXMLDataSet(XMLDataSet): else: # there is no analysis ref_acte = "NS" # prod_place = self.soup.find_all("placeName", {"type": "production_place"})[0].text - # //sourceDesc//msIdentifier/idno[@n='1'] is always the + # //sourceDesc//msIdentifier/idno[@n='1'] is always the # archive box or manuscript collection id # #doc = self.soup.msIdentifier.find_all("idno", {"n": "1"})[0] # #type_diplo = self.soup.body.div["subtype"] @@ -123,26 +124,26 @@ class DataSetCollection(AbstractDataSet): """Stores instances of ``DataSetCollection`` implementations to provide ``_load`` and ``_save`` capabilities. """ - def __init__(self, + def __init__(self, housename: str, folderpath: str) -> None: self._housename = housename self._folderpath = Path(folderpath) # the collections key: file name, value: dataset object self.datasets = dict() - + def _save(self, data) -> None: """kedro's API saver method - -  There is **nothing to save**, because + +  There is **nothing to save**, because  this dataset collections is a *container* dataset. this method is here only because kedro requires it. -  """ +  """ pass - + def _describe(self) -> dict[str, Any]: "kedro's API repr()" - return dict(name=self._housename, + return dict(name=self._housename, folderpath=str(self._folderpath)) @@ -206,18 +207,18 @@ class TextDataSet: """ def __init__(self, filepath: str): self._filepath = filepath - + def _load(self) -> str: with open(self._filepath, 'r') as fhandle: return fhandle.read() - + def _save(self, data: str) -> None: with open(self._filepath, 'w') as fhandle: fhandle.write(data) def _describe(self) -> Dict[str, Any]: return dict(filepath=self._filepath) - + class TextDataSetCollection(DataSetCollection): def _load(self) -> dict[str, JSONDataSet]: "kedro's API loader method" @@ -226,4 +227,17 @@ class TextDataSetCollection(DataSetCollection): self.datasets[filepath.stem] = TextDataSet( filepath=str(filepath)) return self - + +#class FoliumHTMLDataSet(AbstractDataSet): +# def __init__(self, filepath: str): +# self._filepath = filepath +# +# def _load(self) -> None: +# raise DataSetError('This dataset is WriteOnly') +# +# def _describe(self) -> Dict[str, Any]: +# return dict(filepath=self._filepath) +# +# def _save(self, data: Map) -> None: +# data.save(self._filepath) +# diff --git a/actes-princiers/src/requirements.txt b/actes-princiers/src/requirements.txt index 7f0a16d..1590992 100644 --- a/actes-princiers/src/requirements.txt +++ b/actes-princiers/src/requirements.txt @@ -12,6 +12,7 @@ jupyter~=1.0 jupyterlab_server>=2.11.1, <2.16.0 jupyterlab~=3.0, <3.6.0 kedro~=0.18.12 +kedro-datasets~=1.7.0 kedro-telemetry~=0.2.5 lxml~=4.9.3 nbstripout~=0.4