diff --git a/actes-princiers/src/actes_princiers/pipelines/populate_mongo/nodes.py b/actes-princiers/src/actes_princiers/pipelines/populate_mongo/nodes.py index 4e15d41..d0f8c19 100755 --- a/actes-princiers/src/actes_princiers/pipelines/populate_mongo/nodes.py +++ b/actes-princiers/src/actes_princiers/pipelines/populate_mongo/nodes.py @@ -31,12 +31,13 @@ class House(Document): class Acte(Document): - "_id is the filename" + """_id is the filename""" _id = StringField(required=True, max_length=150) house = StringField(required=True, max_length=100) + prince_name = StringField(required=True, max_length=150) analysis = StringField(required=True, max_length=3000) date = StringField(required=True, max_length=250) - # FIMXE make a real date object ? or not. + # FIXME type it as a **real** date object ? date_time = StringField(required=True, max_length=15) filename = StringField(required=True, max_length=100) ref_acte = StringField(required=True, max_length=100) diff --git a/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py b/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py index 51adbf3..b421b1f 100755 --- a/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py +++ b/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py @@ -45,7 +45,7 @@ def make_json_collection(datasetcol: BsXMLDataSetCollection) -> JSONDataSetColle output_datasets = context.catalog.load(housename + '_jsonoutput') outputfolderpath = output_datasets._folderpath for dataset_filenamestem, dataset in datasets.items(): - logger.info("filestem:" + dataset_filenamestem) + #logger.info("filestem:" + dataset_filenamestem) # a manual load is required here, because # the dataset **is not** registered in kedro's catalog dataset._load() diff --git a/actes-princiers/src/actesdataset.py b/actes-princiers/src/actesdataset.py index dfd339f..e72876e 100644 --- a/actes-princiers/src/actesdataset.py +++ b/actes-princiers/src/actesdataset.py @@ -92,6 
+92,15 @@ class BsXMLDataSet(XMLDataSet): date = self.soup.msItem.docDate.text # verbose date analyse = self.soup.abstract.p.text # acte's short analysis ref = self.soup.msIdentifier.find_all("idno", {"n": "2"}) + + #prince_name = tree.xpath('//listPerson[@type="prince"]/person/name/text()') + # XXX ugly : I HATE BEAUTIFUL SOUP + persons = self.soup.find_all("listPerson") + for pers in persons: + if pers.attrs.get('type') == "prince": + ps = pers.find_next() + ps_name = pers.find_next() + prince_name = ps_name.get_text() # //sourceDesc//msIdentifier/idno[@n='2'] is the doc id inside the # archive box or the page number inside a manuscript (see _create_doc) # warning: the analysis may not have been written yet, @@ -100,6 +109,7 @@ class BsXMLDataSet(XMLDataSet): ref_acte = ref[0].text else: # there is no analysis ref_acte = "NS" + # FIXME: use this location -> geolocation + # prod_place = self.soup.find_all("placeName", {"type": "production_place"})[0].text # //sourceDesc//msIdentifier/idno[@n='1'] is always the # archive box or manuscript collection id @@ -109,6 +119,7 @@ class BsXMLDataSet(XMLDataSet): return { # "num_acte": counter, + "prince_name": prince_name, "filename": numb, "date_time": date_time, "date": date,