|
|
|
|
@ -14,6 +14,7 @@ from kedro.framework.session import KedroSession
|
|
|
|
|
|
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class XMLDataSet(ABC):
|
|
|
|
|
"Abstract base class for an XML dataset loader"
|
|
|
|
|
|
|
|
|
|
@ -84,6 +85,29 @@ class BsXMLDataSet(XMLDataSet):
|
|
|
|
|
with open(self._filepath, 'w') as fp:
|
|
|
|
|
json.dump(data, fp, sort_keys=True, indent=4)
|
|
|
|
|
|
|
|
|
|
def find_transcribers(self):
    """Collect the transcriber names declared in the document.

    Looks at every ``<respStmt>`` element of ``self.soup`` and takes the
    first ``<name>`` found inside each one.

    Returns:
        list: the text of each ``<name>`` found, in document order;
        ``<respStmt>`` elements without a ``<name>`` are skipped.
    """
    # One lookup per <respStmt>; yields None when no <name> is present.
    name_tags = (stmt.find('name') for stmt in self.soup.find_all('respStmt'))
    return [tag.get_text() for tag in name_tags if tag]
|
|
|
|
|
|
|
|
|
|
def find_prince_name(self):
    """Return the prince's name declared in the document, or ``None``.

    BeautifulSoup counterpart of the XPath::

        //listPerson[@type="prince"]/person/name/text()

    Returns:
        The text of the ``<name>`` found inside the ``<person>`` of a
        ``<listPerson type="prince">`` element. If several such lists
        carry a name, the last one in document order wins. ``None`` when
        no prince list, person or name is present.
    """
    # Default so a document without any prince does not raise
    # UnboundLocalError at the return statement.
    prince_name = None
    for pers in self.soup.find_all("listPerson"):
        if pers.attrs.get('type') != "prince":
            continue
        ps = pers.find('person')
        if ps is None:
            continue
        # Search inside <person>, not the whole <listPerson>, so the lookup
        # follows the listPerson/person/name path.
        ps_name = ps.find('name')
        if ps_name is not None:
            prince_name = ps_name.get_text()
    return prince_name
|
|
|
|
|
|
|
|
|
|
def transform(self):
|
|
|
|
|
#soup = make_soup(os.path.join(folder, acte))
|
|
|
|
|
# 1.1/ Get all data from XML (9). counter is the id (= numb_acte)
|
|
|
|
|
@ -93,14 +117,6 @@ class BsXMLDataSet(XMLDataSet):
|
|
|
|
|
analyse = self.soup.abstract.p.text # acte's short analysis
|
|
|
|
|
ref = self.soup.msIdentifier.find_all("idno", {"n": "2"})
|
|
|
|
|
|
|
|
|
|
#prince_name = tree.xpath('//listPerson[@type="prince"]/person/name/text()')
|
|
|
|
|
# XXX ugly: I HATE BEAUTIFUL SOUP
|
|
|
|
|
persons = self.soup.find_all("listPerson")
|
|
|
|
|
for pers in persons:
|
|
|
|
|
if pers.attrs.get('type') == "prince":
|
|
|
|
|
ps = pers.find_next()
|
|
|
|
|
ps_name = pers.find_next()
|
|
|
|
|
prince_name = ps_name.get_text()
|
|
|
|
|
# //sourceDesc//msIdentifier/idno[@n='2'] is the doc id inside the
|
|
|
|
|
# archive box or the page number inside a manuscript (see _create_doc)
|
|
|
|
|
# warning: the analysis may not have been written yet,
|
|
|
|
|
@ -119,7 +135,7 @@ class BsXMLDataSet(XMLDataSet):
|
|
|
|
|
|
|
|
|
|
return {
|
|
|
|
|
# "num_acte": counter,
|
|
|
|
|
"prince_name": prince_name,
|
|
|
|
|
"prince_name": self.find_prince_name(),
|
|
|
|
|
"filename": numb,
|
|
|
|
|
"date_time": date_time,
|
|
|
|
|
"date": date,
|
|
|
|
|
@ -127,6 +143,7 @@ class BsXMLDataSet(XMLDataSet):
|
|
|
|
|
"analysis": analyse,
|
|
|
|
|
# "doc_acte": doc_query[0],
|
|
|
|
|
"ref_acte": ref_acte,
|
|
|
|
|
"transcribers": self.find_transcribers()
|
|
|
|
|
# "state_doc": state_query[0],
|
|
|
|
|
# "diplo_type_acte": diplo_query[0]
|
|
|
|
|
}
|
|
|
|
|
|