ga transcribers

develop
gwen 2 years ago
parent e83d0097a6
commit 4abfc2ba01

@ -5,7 +5,7 @@ from pathlib import Path
from typing import Dict
from mongoengine import connect
from mongoengine import Document, StringField, DictField
from mongoengine import Document, StringField, DictField, ListField
from kedro.framework.session import KedroSession
from kedro.extras.datasets.yaml import YAMLDataSet
@ -37,6 +37,7 @@ class Acte(Document):
prince_name = StringField(required=True, max_length=150)
analysis = StringField(required=True, max_length=3000)
date = StringField(required=True, max_length=250)
transcribers = ListField(required=True)
# FIXME type it as a **real** date object ?
date_time = StringField(required=True, max_length=15)
filename = StringField(required=True, max_length=100)

@ -14,6 +14,7 @@ from kedro.framework.session import KedroSession
logger = logging.getLogger(__name__)
class XMLDataSet(ABC):
"Abstract base class for an XML dataset loader"
@ -84,6 +85,29 @@ class BsXMLDataSet(XMLDataSet):
with open(self._filepath, 'w') as fp:
json.dump(data, fp, sort_keys=True, indent=4)
def find_transcribers(self):
    """Return the transcriber names found in the parsed XML document.

    Every ``respStmt`` element of ``self.soup`` is queried for a ``name``
    element; statements for which the lookup yields nothing are skipped.

    Returns:
        A list with the text of each ``name`` element that was found, in
        document order (empty when there is no usable ``respStmt``).
    """
    name_tags = (stmt.find('name') for stmt in self.soup.find_all('respStmt'))
    # Same truthiness filter as a plain ``if tag:`` check.
    return [tag.get_text() for tag in name_tags if tag]
def find_prince_name(self):
    """Return the prince's name from the parsed XML document, if any.

    Counterpart of the XPath
    ``//listPerson[@type="prince"]/person/name/text()``: every
    ``listPerson`` element of ``self.soup`` whose ``type`` attribute is
    ``"prince"`` is searched for a ``name`` element.

    Returns:
        The text of the ``name`` element, or ``None`` when the document has
        no ``listPerson`` of type ``"prince"`` or that list carries no
        ``name`` (previously these cases raised UnboundLocalError /
        AttributeError). If several lists match, the last one that has a
        name wins.
    """
    prince_name = None
    for person_list in self.soup.find_all("listPerson"):
        if person_list.attrs.get('type') != "prince":
            continue
        # NOTE(review): the lookup starts from the listPerson element itself,
        # not from its <person> child as the XPath above would - confirm that
        # a name can never appear outside <person> in these documents.
        name_tag = person_list.find('name')
        if name_tag is not None:
            prince_name = name_tag.get_text()
    return prince_name
def transform(self):
#soup = make_soup(os.path.join(folder, acte))
# 1.1/ Get all data from XML (9). counter is the id (= numb_acte)
@ -92,15 +116,7 @@ class BsXMLDataSet(XMLDataSet):
date = self.soup.msItem.docDate.text # verbose date
analyse = self.soup.abstract.p.text # acte's short analysis
ref = self.soup.msIdentifier.find_all("idno", {"n": "2"})
#prince_name = tree.xpath('//listPerson[@type="prince"]/person/name/text()')
# XXX ugly : I HATE BEAUTIFULL SOUP
persons = self.soup.find_all("listPerson")
for pers in persons:
if pers.attrs.get('type') == "prince":
ps = pers.find_next()
ps_name = pers.find_next()
prince_name = ps_name.get_text()
# //sourceDesc//msIdentifier/idno[@n='2'] is the doc id inside the
# archive box or the page number inside a manuscript (see _create_doc)
# warning: the analysis may not have been written yet,
@ -119,7 +135,7 @@ class BsXMLDataSet(XMLDataSet):
return {
# "num_acte": counter,
"prince_name": prince_name,
"prince_name": self.find_prince_name(),
"filename": numb,
"date_time": date_time,
"date": date,
@ -127,6 +143,7 @@ class BsXMLDataSet(XMLDataSet):
"analysis": analyse,
# "doc_acte": doc_query[0],
"ref_acte": ref_acte,
"transcribers": self.find_transcribers()
# "state_doc": state_query[0],
# "diplo_type_acte": diplo_query[0]
}

Loading…
Cancel
Save