add prince_name in acte schema

3 years ago · e83d0097a6
parent 318213034e
commit e83d0097a6
3 changed files with 15 additions and 3 deletions
--- a/actes-princiers/src/actes_princiers/pipelines/populate_mongo/nodes.py
+++ b/actes-princiers/src/actes_princiers/pipelines/populate_mongo/nodes.py
@ -31,12 +31,13 @@ class House(Document):


 class Acte(Document):
-    "_id is the filename"
+    """_id is the filename"""
    _id = StringField(required=True, max_length=150)
    house = StringField(required=True, max_length=100)
+    prince_name = StringField(required=True, max_length=150)
    analysis = StringField(required=True, max_length=3000)
    date = StringField(required=True, max_length=250)
-    # FIMXE make a real date object ? or not.
+    # FIXME type it as a **real** date object ?
    date_time = StringField(required=True, max_length=15)
    filename = StringField(required=True, max_length=100)
    ref_acte = StringField(required=True, max_length=100)
--- a/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py
+++ b/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py
@ -45,7 +45,7 @@ def make_json_collection(datasetcol: BsXMLDataSetCollection) -> JSONDataSetColle
    output_datasets = context.catalog.load(housename + '_jsonoutput')
    outputfolderpath = output_datasets._folderpath
    for dataset_filenamestem, dataset in datasets.items():
-        logger.info("filestem:" + dataset_filenamestem)
+        #logger.info("filestem:" + dataset_filenamestem)
        # a manual load is required here, because
        # the dataset **is not** registered in kedro's catalog
        dataset._load()
--- a/actes-princiers/src/actesdataset.py
+++ b/actes-princiers/src/actesdataset.py
@ -92,6 +92,15 @@ class BsXMLDataSet(XMLDataSet):
        date = self.soup.msItem.docDate.text  # verbose date
        analyse = self.soup.abstract.p.text  # acte's short analysis
        ref = self.soup.msIdentifier.find_all("idno", {"n": "2"})
+
+        #prince_name = tree.xpath('//listPerson[@type="prince"]/person/name/text()')
+        # XXX ugly : I HATE BEAUTIFUL SOUP
+        persons = self.soup.find_all("listPerson")
+        for pers in persons:
+            if pers.attrs.get('type') == "prince":
+                ps = pers.find_next()
+                ps_name = pers.find_next()
+                prince_name = ps_name.get_text()
        # //sourceDesc//msIdentifier/idno[@n='2'] is the doc id inside the
        # archive box or the page number inside a manuscript (see _create_doc)
        # warning: the analysis may not have been written yet,
@ -100,6 +109,7 @@ class BsXMLDataSet(XMLDataSet):
            ref_acte = ref[0].text
        else:  # there is no analysis
            ref_acte = "NS"
+        # FIXME: use this location -> geolocation
 #        prod_place = self.soup.find_all("placeName", {"type": "production_place"})[0].text
        # //sourceDesc//msIdentifier/idno[@n='1'] is always the
        # archive box or manuscript collection id
@ -109,6 +119,7 @@ class BsXMLDataSet(XMLDataSet):

        return {
 #            "num_acte": counter,
+            "prince_name": prince_name,
            "filename": numb,
            "date_time": date_time,
            "date": date,