add houses collection

develop
gwen 2 years ago
parent 23af0f580a
commit 4b6a15416d

@ -1,3 +1,9 @@
# houses
# input (read only) dataset
houses:
type: yaml.YAMLDataSet
filepath: data/01_raw/yaml/houses.yaml
# ________________________________________________________________________
# BOURBON
# input (read only) dataset

@ -1,22 +0,0 @@
# XXX: cette conf est descriptive, elle n'est pas (plus) utilisée par l'appli
# dans son état de généricité actuel.
# TODO: utiliser cette conf pour augmenter la généricité
# du traitement des données dans une itération ultérieure
# 
raw_datapath: data/01_raw
houses:
bourbon:
name: Bourbon
path: houses/bourbon
berry:
name: Berry
path: houses/berry
anjou:
name: Anjou
path: houses/anjou
# TODO
# - Bretagne
# - Bourgogne
# - Orléans
# - Armagnac

@ -4,10 +4,12 @@ import urllib.parse
from pathlib import Path
from typing import Dict
import pymongo
from kedro.framework.session import KedroSession
from actesdataset import JSONDataSetCollection
from kedro.extras.datasets.yaml import YAMLDataSet
import pymongo
from actesdataset import JSONDataSetCollection
logger = logging.getLogger(__name__)
@ -21,7 +23,6 @@ def populate_mongo(jsondoc: JSONDataSetCollection, storage_ip: str, db_name: str
jsondatasets = jsondoc.datasets
housename = jsondoc._housename
#mongodb://%s:%s@149.202.41.75:27017' % (username, password)
# FIXME passer en parametres
username = urllib.parse.quote_plus(mongodb_admin)
password = urllib.parse.quote_plus(mongodb_password)
mongodb_url = f"mongodb://{username}:{password}@{storage_ip}:27017/"
@ -34,14 +35,41 @@ def populate_mongo(jsondoc: JSONDataSetCollection, storage_ip: str, db_name: str
actesdb = myclient[db_name]
actes_collection = actesdb[db_collection_name]
# TODO faire un insert_many directement ?
# TODO faire un insert_many directement ?
for dataset_filenamestem, dataset in jsondatasets.items():
# a manual load is required here, because
# the dataset **is not** registered in kedro's catalog
document = dataset._load()
# FIXME que mettre comme id ? le filename ?
# FIXME que mettre comme id ? le filename ?
document["_id"] = document["filename"]
#logger.info(str(document))
res = actes_collection.insert_one(document)
logger.info(res.inserted_id)
return
# properly closes the db connection
# FIXME with MongoClient() as client
myclient.close()
return
def load_houses(yamldoc: YAMLDataSet, storage_ip: str, db_name: str, mongodb_admin: str, mongodb_password: str) -> None:
    """Insert every house described in the houses document into MongoDB.

    Each value found under the ``houses`` key of ``yamldoc`` is inserted as
    one document into the ``houses`` collection of the ``db_name`` database.

    Args:
        yamldoc: content of the ``houses`` catalog entry. It is indexed like
            a mapping (``yamldoc['houses']``), so this is presumably the
            loaded YAML content rather than the dataset object -- to confirm.
        storage_ip: host of the MongoDB server (port 27017 is used).
        db_name: name of the target database.
        mongodb_admin: user name used to authenticate.
        mongodb_password: password used to authenticate.
    """
    # user name and password are percent-escaped before being embedded
    # in the connection URL
    username = urllib.parse.quote_plus(mongodb_admin)
    password = urllib.parse.quote_plus(mongodb_password)
    mongodb_url = f"mongodb://{username}:{password}@{storage_ip}:27017/"
    # never log mongodb_url itself: it embeds the password
    logger.info("connection to the mongodb server: %s:27017", storage_ip)
    # the context manager closes the connection even when an insert fails
    with pymongo.MongoClient(mongodb_url) as myclient:
        houses_col = myclient[db_name]['houses']
        for house in yamldoc['houses'].values():
            logger.info(str(house))
            houses_col.insert_one(house)

@ -1,6 +1,6 @@
from kedro.pipeline import Pipeline, node, pipeline
from .nodes import populate_mongo
from .nodes import populate_mongo, load_houses
def create_pipeline(**kwargs) -> Pipeline:
@ -14,7 +14,16 @@ def create_pipeline(**kwargs) -> Pipeline:
outputs=None,
name="populate_mongo",
tags="populate_database",
),
node(
func=load_houses,
inputs=["houses", "params:storage_ip", "params:db_name",
"params:mongodb_admin", "params:mongodb_password"],
outputs=None,
name="load_houses",
tags="load_houses",
)
]
)

@ -7,6 +7,7 @@ from abc import ABC, abstractmethod
from lxml import etree
from bs4 import BeautifulSoup
#from folium import Map
from kedro.io import AbstractDataSet, DataSetError
from kedro.framework.session import KedroSession
@ -15,12 +16,12 @@ logger = logging.getLogger(__name__)
class XMLDataSet(ABC):
"Abstract base class for an XML dataset loader"
def __init__(self, filepath: str) -> None:
self._filepath = filepath
@property
def filepath(self) -> str:
def filepath(self) -> str:
"xml file's filename getters"
return self._filepath
@ -41,7 +42,7 @@ class EtreeXMLDataSet(XMLDataSet):
def __init__(self, filepath, params):
    """Keep the XML file path and the XSLT stylesheet setting.

    ``params`` is stored unchanged as ``self.xsltstylesheet``.
    """
    self.xsltstylesheet = params
    self._filepath = filepath
def _load(self):
"from the xml file loads a internal xml repr (with element tree)"
# self.source_doc is an etree internal xml repr document
@ -100,7 +101,7 @@ class BsXMLDataSet(XMLDataSet):
else: # there is no analysis
ref_acte = "NS"
# prod_place = self.soup.find_all("placeName", {"type": "production_place"})[0].text
# //sourceDesc//msIdentifier/idno[@n='1'] is always the
# //sourceDesc//msIdentifier/idno[@n='1'] is always the
# archive box or manuscript collection id
# #doc = self.soup.msIdentifier.find_all("idno", {"n": "1"})[0]
# #type_diplo = self.soup.body.div["subtype"]
@ -123,26 +124,26 @@ class DataSetCollection(AbstractDataSet):
"""Stores instances of ``DataSetCollection``
implementations to provide ``_load`` and ``_save`` capabilities.
"""
def __init__(self,
             housename: str,
             folderpath: str) -> None:
    """Set up an empty collection for one house.

    Args:
        housename: name of the house the collection belongs to.
        folderpath: folder of the collection, stored as a ``Path``.
    """
    # key: file name, value: dataset object -- starts empty
    self.datasets = {}
    self._folderpath = Path(folderpath)
    self._housename = housename
def _save(self, data) -> None:
    """kedro's API saver method: intentionally does nothing.

    A dataset collection is only a *container* dataset, so there is
    **nothing to save**; the method exists because kedro requires it.
    """
    return None
def _describe(self) -> dict[str, Any]:
    """kedro's API repr(): house name and folder path of the collection."""
    return {
        "name": self._housename,
        "folderpath": str(self._folderpath),
    }
@ -206,18 +207,18 @@ class TextDataSet:
"""
def __init__(self, filepath: str):
self._filepath = filepath
def _load(self) -> str:
with open(self._filepath, 'r') as fhandle:
return fhandle.read()
def _save(self, data: str) -> None:
with open(self._filepath, 'w') as fhandle:
fhandle.write(data)
def _describe(self) -> Dict[str, Any]:
return dict(filepath=self._filepath)
class TextDataSetCollection(DataSetCollection):
def _load(self) -> dict[str, JSONDataSet]:
"kedro's API loader method"
@ -226,4 +227,17 @@ class TextDataSetCollection(DataSetCollection):
self.datasets[filepath.stem] = TextDataSet(
filepath=str(filepath))
return self
#class FoliumHTMLDataSet(AbstractDataSet):
# def __init__(self, filepath: str):
# self._filepath = filepath
#
# def _load(self) -> None:
# raise DataSetError('This dataset is WriteOnly')
#
# def _describe(self) -> Dict[str, Any]:
# return dict(filepath=self._filepath)
#
# def _save(self, data: Map) -> None:
# data.save(self._filepath)
#

@ -12,6 +12,7 @@ jupyter~=1.0
jupyterlab_server>=2.11.1, <2.16.0
jupyterlab~=3.0, <3.6.0
kedro~=0.18.12
kedro-datasets~=1.7.0
kedro-telemetry~=0.2.5
lxml~=4.9.3
nbstripout~=0.4

Loading…
Cancel
Save