ga transcribers

develop
gwen 2 years ago
parent e83d0097a6
commit 4abfc2ba01

@ -5,7 +5,7 @@ from pathlib import Path
from typing import Dict from typing import Dict
from mongoengine import connect from mongoengine import connect
from mongoengine import Document, StringField, DictField from mongoengine import Document, StringField, DictField, ListField
from kedro.framework.session import KedroSession from kedro.framework.session import KedroSession
from kedro.extras.datasets.yaml import YAMLDataSet from kedro.extras.datasets.yaml import YAMLDataSet
@ -37,6 +37,7 @@ class Acte(Document):
prince_name = StringField(required=True, max_length=150) prince_name = StringField(required=True, max_length=150)
analysis = StringField(required=True, max_length=3000) analysis = StringField(required=True, max_length=3000)
date = StringField(required=True, max_length=250) date = StringField(required=True, max_length=250)
transcribers = ListField(required=True)
# FIXME type it as a **real** date object ? # FIXME type it as a **real** date object ?
date_time = StringField(required=True, max_length=15) date_time = StringField(required=True, max_length=15)
filename = StringField(required=True, max_length=100) filename = StringField(required=True, max_length=100)

@ -14,6 +14,7 @@ from kedro.framework.session import KedroSession
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
class XMLDataSet(ABC): class XMLDataSet(ABC):
"Abstract base class for an XML dataset loader" "Abstract base class for an XML dataset loader"
@ -84,6 +85,29 @@ class BsXMLDataSet(XMLDataSet):
with open(self._filepath, 'w') as fp: with open(self._filepath, 'w') as fp:
json.dump(data, fp, sort_keys=True, indent=4) json.dump(data, fp, sort_keys=True, indent=4)
def find_transcribers(self):
    """Collect the transcriber names declared in the parsed XML document.

    Looks at every ``respStmt`` element of ``self.soup`` and takes the text
    of the first ``name`` element found inside it. ``respStmt`` elements
    without a ``name`` are skipped.

    Returns:
        A list with one name string per ``respStmt`` that has a ``name``,
        in document order (empty when there is none).
    """
    name_tags = (
        statement.find('name')
        for statement in self.soup.find_all('respStmt')
    )
    return [tag.get_text() for tag in name_tags if tag]
def find_prince_name(self):
    """Return the prince's name from the parsed XML document.

    XPath this helper stands in for (kept for reference):
    prince_name = tree.xpath('//listPerson[@type="prince"]/person/name/text()')

    Scans the ``listPerson`` elements of ``self.soup`` and returns the text
    of the first ``name`` element found under one whose ``type`` attribute
    equals ``"prince"``.

    Returns:
        The name text, or ``None`` when the document has no prince
        ``listPerson`` or that list holds no ``name`` element.
    """
    for person_list in self.soup.find_all("listPerson"):
        if person_list.attrs.get('type') != "prince":
            continue
        name_tag = person_list.find('name')
        # ``find`` returns None when nothing matches; test for that
        # explicitly instead of calling ``get_text`` on None.
        if name_tag is not None:
            return name_tag.get_text()
    # No prince name in this document: report it as None rather than
    # failing on a local variable that was never assigned.
    return None
def transform(self): def transform(self):
#soup = make_soup(os.path.join(folder, acte)) #soup = make_soup(os.path.join(folder, acte))
# 1.1/ Get all data from XML (9). counter is the id (= numb_acte) # 1.1/ Get all data from XML (9). counter is the id (= numb_acte)
@ -93,14 +117,6 @@ class BsXMLDataSet(XMLDataSet):
analyse = self.soup.abstract.p.text # acte's short analysis analyse = self.soup.abstract.p.text # acte's short analysis
ref = self.soup.msIdentifier.find_all("idno", {"n": "2"}) ref = self.soup.msIdentifier.find_all("idno", {"n": "2"})
#prince_name = tree.xpath('//listPerson[@type="prince"]/person/name/text()')
# XXX ugly: I HATE BEAUTIFUL SOUP
persons = self.soup.find_all("listPerson")
for pers in persons:
if pers.attrs.get('type') == "prince":
ps = pers.find_next()
ps_name = pers.find_next()
prince_name = ps_name.get_text()
# //sourceDesc//msIdentifier/idno[@n='2'] is the doc id inside the # //sourceDesc//msIdentifier/idno[@n='2'] is the doc id inside the
# archive box or the page number inside a manuscript (see _create_doc) # archive box or the page number inside a manuscript (see _create_doc)
# warning: the analysis may not have been written yet, # warning: the analysis may not have been written yet,
@ -119,7 +135,7 @@ class BsXMLDataSet(XMLDataSet):
return { return {
# "num_acte": counter, # "num_acte": counter,
"prince_name": prince_name, "prince_name": self.find_prince_name(),
"filename": numb, "filename": numb,
"date_time": date_time, "date_time": date_time,
"date": date, "date": date,
@ -127,6 +143,7 @@ class BsXMLDataSet(XMLDataSet):
"analysis": analyse, "analysis": analyse,
# "doc_acte": doc_query[0], # "doc_acte": doc_query[0],
"ref_acte": ref_acte, "ref_acte": ref_acte,
"transcribers": self.find_transcribers()
# "state_doc": state_query[0], # "state_doc": state_query[0],
# "diplo_type_acte": diplo_query[0] # "diplo_type_acte": diplo_query[0]
} }

Loading…
Cancel
Save