houses trigram

develop
gwen 2 years ago
parent 4c0f113bd7
commit 9bd5f29198

@ -15,11 +15,27 @@ from actesdataset import JSONDataSetCollection
logger = logging.getLogger(__name__)
# Database schemas
class House(Document):
    """Database schema for a house record; ``_id`` is the house name.

    All four fields are required strings. ``Document`` and ``StringField``
    are imported elsewhere in the file (presumably from MongoEngine --
    TODO confirm against the import block).
    """
    # Document identifier: the house name, at most 100 characters.
    _id = StringField(required=True, max_length=100)
    # Name of the house, at most 100 characters.
    name = StringField(required=True, max_length=100)
    # Short code for the house, at most 3 characters.
    trigram = StringField(required=True, max_length=3)
    # At most 150 characters. NOTE(review): the meaning of "particle" is not
    # visible from this file section -- confirm against the houses YAML data.
    particle = StringField(required=True, max_length=150)
class Acte(Document):
    """Database schema for an acte record; ``_id`` is the filename.

    Every field is a required string. All but ``xmlcontent`` carry a
    maximum length.
    """
    # Document identifier: the filename, at most 150 characters.
    _id = StringField(required=True, max_length=150)
    # At most 3000 characters.
    analysis = StringField(required=True, max_length=3000)
    # Date stored as text, at most 250 characters.
    date = StringField(required=True, max_length=250)
    # FIXME: make this a real date object? Or not.
    date_time = StringField(required=True, max_length=15)
    # At most 100 characters. NOTE(review): ``_id`` allows 150 characters but
    # is documented as the filename -- confirm which limit is intended.
    filename = StringField(required=True, max_length=100)
    # Reference of the acte, at most 100 characters.
    ref_acte = StringField(required=True, max_length=100)
    xmlcontent = StringField(required=True) # no max_length
# pipeline functions
def populate_mongo(jsondoc: JSONDataSetCollection, storage_ip: str, db_name: str, db_collection_name: str, mongodb_admin: str, mongodb_password: str) -> None:
@ -33,27 +49,27 @@ def populate_mongo(jsondoc: JSONDataSetCollection, storage_ip: str, db_name: str
password = urllib.parse.quote_plus(mongodb_password)
mongodb_url = f"mongodb://{username}:{password}@{storage_ip}:27017/"
#mongodb_url = "mongodb://{}:27017/".format(storage_ip)
logger.info("connection to the mongodb server: " + mongodb_url)
logger.info("connection to the mongodb server")
# pymongo settings
myclient = pymongo.MongoClient(mongodb_url)
# myclient = pymongo.MongoClient(mongodb_url)
myclient = connect(db=db_name, host=mongodb_url, authentication_source='admin', alias="default")
actesdb = myclient[db_name]
actes_collection = actesdb[db_collection_name]
#actesdb = myclient[db_name]
#actes_collection = actesdb[db_collection_name]
# TODO: do a single insert_many directly?
for dataset_filenamestem, dataset in jsondatasets.items():
# a manual load is required here, because
# the dataset **is not** registered in kedro's catalog
document = dataset._load()
# FIXME: what should be used as the id? The filename?
document["_id"] = document["filename"]
#logger.info(str(document))
res = actes_collection.insert_one(document)
logger.info(res.inserted_id)
json_document = dataset._load()
json_document["_id"] = json_document["filename"]
acte_entry = Acte(**json_document)
logger.info("adding entry: " + json_document["filename"])
acte_entry.save()
#res = actes_collection.insert_one(document)
#logger.info(res.inserted_id)
# properly closes the db connection
# FIXME with MongoClient() as client
myclient.close()
# myclient.close()
return
@ -67,6 +83,7 @@ def load_houses(yamldoc: YAMLDataSet, storage_ip: str, db_name: str, mongodb_adm
myclient = connect(db=db_name, host=mongodb_url, authentication_source='admin', alias="default")
for house_dict in yamldoc['houses'].values():
house_dict['_id'] = house_dict['name']
logger.info("inserting: " + str(house_dict))
#houses_col.insert_one(house_dict)
house_entry = House(**house_dict)

@ -12,7 +12,7 @@ def create_pipeline(**kwargs) -> Pipeline:
"params:db_collection_name", "params:mongodb_admin",
"params:mongodb_password"],
outputs=None,
name="populate_mongo",
name="populate_actes",
tags="populate_database",
),
node(
@ -20,8 +20,8 @@ def create_pipeline(**kwargs) -> Pipeline:
inputs=["houses", "params:storage_ip", "params:db_name",
"params:mongodb_admin", "params:mongodb_password"],
outputs=None,
name="load_houses",
tags="load_houses",
name="polulate_houses",
tags="populate_database",
)
]

Loading…
Cancel
Save