From 8b5b3d83c8918272b4fe975a410638df60c1bf11 Mon Sep 17 00:00:00 2001 From: gwen Date: Fri, 22 Sep 2023 16:11:35 +0200 Subject: [PATCH] add datetime type --- .../pipelines/populate_mongo/nodes.py | 5 ++- actes-princiers/src/actesdataset.py | 33 +++++++++++++------ 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/actes-princiers/src/actes_princiers/pipelines/populate_mongo/nodes.py b/actes-princiers/src/actes_princiers/pipelines/populate_mongo/nodes.py index cef527d..1ef67f2 100755 --- a/actes-princiers/src/actes_princiers/pipelines/populate_mongo/nodes.py +++ b/actes-princiers/src/actes_princiers/pipelines/populate_mongo/nodes.py @@ -5,7 +5,7 @@ from pathlib import Path from typing import Dict from mongoengine import connect -from mongoengine import Document, StringField, DictField, ListField +from mongoengine import Document, StringField, DictField, ListField, DateTimeField #import folium from kedro.framework.session import KedroSession @@ -45,8 +45,7 @@ class Acte(Document): analysis = StringField(required=True, max_length=3000) date = StringField(required=True, max_length=250) transcribers = ListField(required=True) - # FIXME date_teim type shal it be a **real** date object ? - date_time = StringField(required=True, max_length=15) + date_time = DateTimeField(required=True) filename = StringField(required=True, max_length=100) ref_acte = StringField(required=True, max_length=100) xmlcontent = StringField(required=True) # no max_length diff --git a/actes-princiers/src/actesdataset.py b/actes-princiers/src/actesdataset.py index 472cfe6..e399347 100644 --- a/actes-princiers/src/actesdataset.py +++ b/actes-princiers/src/actesdataset.py @@ -122,16 +122,29 @@ class BsXMLDataSet(XMLDataSet): #soup = make_soup(os.path.join(folder, acte)) # 1.1/ Get all data from XML (9). counter is the id (= numb_acte) numb = self.soup.TEI["xml:id"] # /TEI[@xml:id] is always the acte's ID - date_time = self.soup.msItem.docDate["when"] # YYYY-MM-DD or YYYY-MM date - #from datetime import datetime - #if not len(date_time.split('-')) == 2: - # # time format '%Y-%m-%d' - # isotime = datetime.strptime(date_time,'%Y-%m-%d') - # date_time = isotime.isoformat() - #else: - # # time format '%Y-%m' - # isotime = datetime.strptime(date_time,'%Y-%m') - # date_time = isotime.isoformat() + # date formats : YYYY-MM-DD, YYYY-MM or just YYYY + date_time = self.soup.msItem.docDate["when"] + + # datetime parsing + from datetime import datetime + if len(date_time.split('-')) == 1: + # time format 'YYYY' + isotime = datetime.strptime(date_time,'%Y') + date_time = isotime.isoformat() + + elif len(date_time.split('-')) == 2: + # time format '%Y-%m' + isotime = datetime.strptime(date_time,'%Y-%m') + date_time = isotime.isoformat() + + elif len(date_time.split('-')) == 3: + # time format '%Y-%m-%d' + isotime = datetime.strptime(date_time,'%Y-%m-%d') + date_time = isotime.isoformat() + else: + # FIXME raise exception + pass + date = self.soup.msItem.docDate.text # verbose date analyse = self.soup.abstract.p.text # acte's short analysis ref = self.soup.msIdentifier.find_all("idno", {"n": "2"})