add prince_name in acte schema

develop
gwen 2 years ago
parent 318213034e
commit e83d0097a6

@ -31,12 +31,13 @@ class House(Document):
class Acte(Document):
"_id is the filename"
"""_id is the filename"""
_id = StringField(required=True, max_length=150)
house = StringField(required=True, max_length=100)
prince_name = StringField(required=True, max_length=150)
analysis = StringField(required=True, max_length=3000)
date = StringField(required=True, max_length=250)
# FIMXE make a real date object ? or not.
# FIXME type it as a **real** date object ?
date_time = StringField(required=True, max_length=15)
filename = StringField(required=True, max_length=100)
ref_acte = StringField(required=True, max_length=100)

@ -45,7 +45,7 @@ def make_json_collection(datasetcol: BsXMLDataSetCollection) -> JSONDataSetColle
output_datasets = context.catalog.load(housename + '_jsonoutput')
outputfolderpath = output_datasets._folderpath
for dataset_filenamestem, dataset in datasets.items():
logger.info("filestem:" + dataset_filenamestem)
#logger.info("filestem:" + dataset_filenamestem)
# a manual load is required here, because
# the dataset **is not** registered in kedro's catalog
dataset._load()

@ -92,6 +92,15 @@ class BsXMLDataSet(XMLDataSet):
date = self.soup.msItem.docDate.text # verbose date
analyse = self.soup.abstract.p.text # acte's short analysis
ref = self.soup.msIdentifier.find_all("idno", {"n": "2"})
#prince_name = tree.xpath('//listPerson[@type="prince"]/person/name/text()')
# XXX ugly : I HATE BEAUTIFUL SOUP
persons = self.soup.find_all("listPerson")
for pers in persons:
if pers.attrs.get('type') == "prince":
ps = pers.find_next()
ps_name = pers.find_next()
prince_name = ps_name.get_text()
# //sourceDesc//msIdentifier/idno[@n='2'] is the doc id inside the
# archive box or the page number inside a manuscript (see _create_doc)
# warning: the analysis may not have been written yet,
@ -100,6 +109,7 @@ class BsXMLDataSet(XMLDataSet):
ref_acte = ref[0].text
else: # there is no analysis
ref_acte = "NS"
# FIXME: use this location -> geolocation
# prod_place = self.soup.find_all("placeName", {"type": "production_place"})[0].text
# //sourceDesc//msIdentifier/idno[@n='1'] is always the
# archive box or manuscript collection id
@ -109,6 +119,7 @@ class BsXMLDataSet(XMLDataSet):
return {
# "num_acte": counter,
"prince_name": prince_name,
"filename": numb,
"date_time": date_time,
"date": date,

Loading…
Cancel
Save