add houses collection

develop
gwen 3 years ago
parent 23af0f580a
commit 4b6a15416d

@ -1,3 +1,9 @@
# houses
# input (read only) dataset
houses:
type: yaml.YAMLDataSet
filepath: data/01_raw/yaml/houses.yaml
# ________________________________________________________________________ # ________________________________________________________________________
# BOURBON # BOURBON
# input (read only) dataset # input (read only) dataset

@ -1,22 +0,0 @@
# XXX: cette conf est descriptive, elle n'est pas (plus) utilisée par l'appli
# dans son état de généricité actuel.
# TODO: utiliser cette conf pour augmenter la généricité
# du traitement des données dans une itération ultérieure
# 
raw_datapath: data/01_raw
houses:
bourbon:
name: Bourbon
path: houses/bourbon
berry:
name: Berry
path: houses/berry
anjou:
name: Anjou
path: houses/anjou
# TODO
# - Bretagne
# - Bourgogne
# - Orléans
# - Armagnac

@ -4,10 +4,12 @@ import urllib.parse
from pathlib import Path from pathlib import Path
from typing import Dict from typing import Dict
import pymongo
from kedro.framework.session import KedroSession from kedro.framework.session import KedroSession
from actesdataset import JSONDataSetCollection from kedro.extras.datasets.yaml import YAMLDataSet
import pymongo from actesdataset import JSONDataSetCollection
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -21,7 +23,6 @@ def populate_mongo(jsondoc: JSONDataSetCollection, storage_ip: str, db_name: str
jsondatasets = jsondoc.datasets jsondatasets = jsondoc.datasets
housename = jsondoc._housename housename = jsondoc._housename
#mongodb://%s:%s@149.202.41.75:27017' % (username, password) #mongodb://%s:%s@149.202.41.75:27017' % (username, password)
# FIXME passer en parametres
username = urllib.parse.quote_plus(mongodb_admin) username = urllib.parse.quote_plus(mongodb_admin)
password = urllib.parse.quote_plus(mongodb_password) password = urllib.parse.quote_plus(mongodb_password)
mongodb_url = f"mongodb://{username}:{password}@{storage_ip}:27017/" mongodb_url = f"mongodb://{username}:{password}@{storage_ip}:27017/"
@ -34,14 +35,41 @@ def populate_mongo(jsondoc: JSONDataSetCollection, storage_ip: str, db_name: str
actesdb = myclient[db_name] actesdb = myclient[db_name]
actes_collection = actesdb[db_collection_name] actes_collection = actesdb[db_collection_name]
# TODO faire un insert_many directement ? # TODO faire un insert_many directement ?
for dataset_filenamestem, dataset in jsondatasets.items(): for dataset_filenamestem, dataset in jsondatasets.items():
# a manual load is required here, because # a manual load is required here, because
# the dataset **is not** registered in kedro's catalog # the dataset **is not** registered in kedro's catalog
document = dataset._load() document = dataset._load()
# FIXME que mettre comme id ? le filename ? # FIXME que mettre comme id ? le filename ?
document["_id"] = document["filename"] document["_id"] = document["filename"]
#logger.info(str(document)) #logger.info(str(document))
res = actes_collection.insert_one(document) res = actes_collection.insert_one(document)
logger.info(res.inserted_id) logger.info(res.inserted_id)
return # properly closes the db connection
# FIXME with MongoClient() as client
myclient.close()
return
def load_houses(yamldoc: YAMLDataSet, storage_ip: str, db_name: str, mongodb_admin: str, mongodb_password: str) -> None:
    """Insert every house described in the YAML document into MongoDB.

    Each value found under the ``houses`` key of ``yamldoc`` is inserted as
    one document into the ``houses`` collection of the ``db_name`` database.

    Args:
        yamldoc: object indexed as ``yamldoc['houses']``; that entry must
            expose ``.values()``. Presumably this is the mapping loaded
            from the ``houses`` catalog entry -- TODO confirm, the
            annotation names the dataset class rather than loaded data.
        storage_ip: host of the MongoDB server (port 27017 is used).
        db_name: name of the target database.
        mongodb_admin: user name used to authenticate.
        mongodb_password: password used to authenticate.
    """
    # credentials must be percent-escaped before being embedded in the URL
    username = urllib.parse.quote_plus(mongodb_admin)
    password = urllib.parse.quote_plus(mongodb_password)
    mongodb_url = f"mongodb://{username}:{password}@{storage_ip}:27017/"
    # log the target server, but never write the password to the logs
    logger.info(
        "connection to the mongodb server: mongodb://%s:***@%s:27017/",
        username,
        storage_ip,
    )
    # the context manager closes the connection even when an insert fails
    with pymongo.MongoClient(mongodb_url) as myclient:
        houses_col = myclient[db_name]['houses']
        for house in yamldoc['houses'].values():
            logger.info("%s", house)
            houses_col.insert_one(house)

@ -1,6 +1,6 @@
from kedro.pipeline import Pipeline, node, pipeline from kedro.pipeline import Pipeline, node, pipeline
from .nodes import populate_mongo from .nodes import populate_mongo, load_houses
def create_pipeline(**kwargs) -> Pipeline: def create_pipeline(**kwargs) -> Pipeline:
@ -14,7 +14,16 @@ def create_pipeline(**kwargs) -> Pipeline:
outputs=None, outputs=None,
name="populate_mongo", name="populate_mongo",
tags="populate_database", tags="populate_database",
),
node(
func=load_houses,
inputs=["houses", "params:storage_ip", "params:db_name",
"params:mongodb_admin", "params:mongodb_password"],
outputs=None,
name="load_houses",
tags="load_houses",
) )
] ]
) )

@ -7,6 +7,7 @@ from abc import ABC, abstractmethod
from lxml import etree from lxml import etree
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
#from folium import Map
from kedro.io import AbstractDataSet, DataSetError from kedro.io import AbstractDataSet, DataSetError
from kedro.framework.session import KedroSession from kedro.framework.session import KedroSession
@ -15,12 +16,12 @@ logger = logging.getLogger(__name__)
class XMLDataSet(ABC): class XMLDataSet(ABC):
"Abstract base class for an XML dataset loader" "Abstract base class for an XML dataset loader"
def __init__(self, filepath: str) -> None: def __init__(self, filepath: str) -> None:
self._filepath = filepath self._filepath = filepath
@property @property
def filepath(self) -> str: def filepath(self) -> str:
"xml file's filename getters" "xml file's filename getters"
return self._filepath return self._filepath
@ -41,7 +42,7 @@ class EtreeXMLDataSet(XMLDataSet):
def __init__(self, filepath, params): def __init__(self, filepath, params):
self._filepath = filepath self._filepath = filepath
self.xsltstylesheet = params self.xsltstylesheet = params
def _load(self): def _load(self):
"from the xml file loads a internal xml repr (with element tree)" "from the xml file loads a internal xml repr (with element tree)"
# self.source_doc is an etree internal xml repr document # self.source_doc is an etree internal xml repr document
@ -100,7 +101,7 @@ class BsXMLDataSet(XMLDataSet):
else: # there is no analysis else: # there is no analysis
ref_acte = "NS" ref_acte = "NS"
# prod_place = self.soup.find_all("placeName", {"type": "production_place"})[0].text # prod_place = self.soup.find_all("placeName", {"type": "production_place"})[0].text
# //sourceDesc//msIdentifier/idno[@n='1'] is always the # //sourceDesc//msIdentifier/idno[@n='1'] is always the
# archive box or manuscript collection id # archive box or manuscript collection id
# #doc = self.soup.msIdentifier.find_all("idno", {"n": "1"})[0] # #doc = self.soup.msIdentifier.find_all("idno", {"n": "1"})[0]
# #type_diplo = self.soup.body.div["subtype"] # #type_diplo = self.soup.body.div["subtype"]
@ -123,26 +124,26 @@ class DataSetCollection(AbstractDataSet):
"""Stores instances of ``DataSetCollection`` """Stores instances of ``DataSetCollection``
implementations to provide ``_load`` and ``_save`` capabilities. implementations to provide ``_load`` and ``_save`` capabilities.
""" """
def __init__(self, def __init__(self,
housename: str, housename: str,
folderpath: str) -> None: folderpath: str) -> None:
self._housename = housename self._housename = housename
self._folderpath = Path(folderpath) self._folderpath = Path(folderpath)
# the collections key: file name, value: dataset object # the collections key: file name, value: dataset object
self.datasets = dict() self.datasets = dict()
def _save(self, data) -> None: def _save(self, data) -> None:
"""kedro's API saver method """kedro's API saver method
 There is **nothing to save**, because  There is **nothing to save**, because
 this dataset collections is a *container* dataset.  this dataset collections is a *container* dataset.
this method is here only because kedro requires it. this method is here only because kedro requires it.
 """  """
pass pass
def _describe(self) -> dict[str, Any]: def _describe(self) -> dict[str, Any]:
"kedro's API repr()" "kedro's API repr()"
return dict(name=self._housename, return dict(name=self._housename,
folderpath=str(self._folderpath)) folderpath=str(self._folderpath))
@ -206,18 +207,18 @@ class TextDataSet:
""" """
def __init__(self, filepath: str): def __init__(self, filepath: str):
self._filepath = filepath self._filepath = filepath
def _load(self) -> str: def _load(self) -> str:
with open(self._filepath, 'r') as fhandle: with open(self._filepath, 'r') as fhandle:
return fhandle.read() return fhandle.read()
def _save(self, data: str) -> None: def _save(self, data: str) -> None:
with open(self._filepath, 'w') as fhandle: with open(self._filepath, 'w') as fhandle:
fhandle.write(data) fhandle.write(data)
def _describe(self) -> Dict[str, Any]: def _describe(self) -> Dict[str, Any]:
return dict(filepath=self._filepath) return dict(filepath=self._filepath)
class TextDataSetCollection(DataSetCollection): class TextDataSetCollection(DataSetCollection):
def _load(self) -> dict[str, JSONDataSet]: def _load(self) -> dict[str, JSONDataSet]:
"kedro's API loader method" "kedro's API loader method"
@ -226,4 +227,17 @@ class TextDataSetCollection(DataSetCollection):
self.datasets[filepath.stem] = TextDataSet( self.datasets[filepath.stem] = TextDataSet(
filepath=str(filepath)) filepath=str(filepath))
return self return self
#class FoliumHTMLDataSet(AbstractDataSet):
# def __init__(self, filepath: str):
# self._filepath = filepath
#
# def _load(self) -> None:
# raise DataSetError('This dataset is WriteOnly')
#
# def _describe(self) -> Dict[str, Any]:
# return dict(filepath=self._filepath)
#
# def _save(self, data: Map) -> None:
# data.save(self._filepath)
#

@ -12,6 +12,7 @@ jupyter~=1.0
jupyterlab_server>=2.11.1, <2.16.0 jupyterlab_server>=2.11.1, <2.16.0
jupyterlab~=3.0, <3.6.0 jupyterlab~=3.0, <3.6.0
kedro~=0.18.12 kedro~=0.18.12
kedro-datasets~=1.7.0
kedro-telemetry~=0.2.5 kedro-telemetry~=0.2.5
lxml~=4.9.3 lxml~=4.9.3
nbstripout~=0.4 nbstripout~=0.4

Loading…
Cancel
Save