ga transcribers

3 years ago · 4abfc2ba01
parent e83d0097a6
commit 4abfc2ba01
2 changed files with 29 additions and 11 deletions
--- a/actes-princiers/src/actes_princiers/pipelines/populate_mongo/nodes.py
+++ b/actes-princiers/src/actes_princiers/pipelines/populate_mongo/nodes.py
@ -5,7 +5,7 @@ from pathlib import Path
 from typing import Dict

 from mongoengine import connect
-from mongoengine import Document, StringField, DictField
+from mongoengine import Document, StringField, DictField, ListField

 from kedro.framework.session import KedroSession
 from kedro.extras.datasets.yaml import YAMLDataSet
@ -37,6 +37,7 @@ class Acte(Document):
    prince_name = StringField(required=True, max_length=150)
    analysis = StringField(required=True, max_length=3000)
    date = StringField(required=True, max_length=250)
+    transcribers = ListField(required=True)
    # FIXME type it as a **real** date object ?
    date_time = StringField(required=True, max_length=15)
    filename = StringField(required=True, max_length=100)
--- a/actes-princiers/src/actesdataset.py
+++ b/actes-princiers/src/actesdataset.py
@ -14,6 +14,7 @@ from kedro.framework.session import KedroSession

 logger = logging.getLogger(__name__)

+
 class XMLDataSet(ABC):
    "Abstract base class for an XML dataset loader"

@ -84,6 +85,29 @@ class BsXMLDataSet(XMLDataSet):
        with open(self._filepath, 'w') as fp:
            json.dump(data, fp, sort_keys=True, indent=4)

+    def find_transcribers(self):
+        """Collect the transcriber names declared in the document.
+
+        Looks at every ``respStmt`` element of ``self.soup`` and keeps the
+        text of its first ``name`` descendant; ``respStmt`` elements
+        without a ``name`` are ignored.
+
+        Returns:
+            A list of name strings, in document order (possibly empty).
+        """
+        # Lazily pair each <respStmt> with its first <name> (or None).
+        name_tags = (
+            stmt.find('name') for stmt in self.soup.find_all('respStmt')
+        )
+        return [tag.get_text() for tag in name_tags if tag]
+
+    def find_prince_name(self):
+        """Return the prince's name declared in the document, if any.
+
+        BeautifulSoup counterpart of the XPath
+        ``//listPerson[@type="prince"]/person/name/text()``.
+
+        Returns:
+            The text of the first ``name`` element found inside a
+            ``listPerson`` whose ``type`` attribute is ``"prince"``. If
+            several such lists exist, the last one carrying a name wins.
+            ``None`` when the document has no such list or the list has
+            no ``name`` element.
+        """
+        # Start from None so a document without a prince list no longer
+        # fails with UnboundLocalError on the return statement.
+        prince_name = None
+        for pers in self.soup.find_all("listPerson"):
+            if pers.attrs.get('type') != "prince":
+                continue
+            ps_name = pers.find('name')
+            # A prince list may lack a <name>; skip it rather than call
+            # get_text() on None.
+            if ps_name is not None:
+                prince_name = ps_name.get_text()
+        return prince_name
+        
    def transform(self):
        #soup = make_soup(os.path.join(folder, acte))
        # 1.1/ Get all data from XML (9). counter is the id (= numb_acte)
@ -92,15 +116,7 @@ class BsXMLDataSet(XMLDataSet):
        date = self.soup.msItem.docDate.text  # verbose date
        analyse = self.soup.abstract.p.text  # acte's short analysis
        ref = self.soup.msIdentifier.find_all("idno", {"n": "2"})
-
-        #prince_name = tree.xpath('//listPerson[@type="prince"]/person/name/text()')
-        # XXX ugly : I HATE BEAUTIFULL SOUP
-        persons = self.soup.find_all("listPerson")
-        for pers in persons:
-            if pers.attrs.get('type') == "prince":
-                ps = pers.find_next()
-                ps_name = pers.find_next()
-                prince_name = ps_name.get_text()
+        
        # //sourceDesc//msIdentifier/idno[@n='2'] is the doc id inside the
        # archive box or the page number inside a manuscript (see _create_doc)
        # warning: the analysis may not have been written yet,
@ -119,7 +135,7 @@ class BsXMLDataSet(XMLDataSet):

        return {
 #            "num_acte": counter,
-            "prince_name": prince_name,
+            "prince_name": self.find_prince_name(),
            "filename": numb,
            "date_time": date_time,
            "date": date,
@ -127,6 +143,7 @@ class BsXMLDataSet(XMLDataSet):
            "analysis": analyse,
 #            "doc_acte": doc_query[0],
            "ref_acte": ref_acte,
+            "transcribers": self.find_transcribers()
 #            "state_doc": state_query[0],
 #            "diplo_type_acte": diplo_query[0]
            }