|
|
|
@ -100,12 +100,9 @@ class BsXMLDataSet(XMLDataSet):
|
|
|
|
|
|
|
|
|
|
|
|
prince_name = tree.xpath('//listPerson[@type="prince"]/person/name/text()')
|
|
|
|
prince_name = tree.xpath('//listPerson[@type="prince"]/person/name/text()')
|
|
|
|
"""
|
|
|
|
"""
|
|
|
|
persons = self.soup.find_all("listPerson")
|
|
|
|
person = self.soup.find("listPerson", {'type': "prince"} )
|
|
|
|
for pers in persons:
|
|
|
|
ps = person.find('name')
|
|
|
|
if pers.attrs.get('type') == "prince":
|
|
|
|
prince_name = ps.get_text()
|
|
|
|
ps = pers.find('person')
|
|
|
|
|
|
|
|
ps_name = pers.find('name')
|
|
|
|
|
|
|
|
prince_name = ps_name.get_text()
|
|
|
|
|
|
|
|
return prince_name
|
|
|
|
return prince_name
|
|
|
|
|
|
|
|
|
|
|
|
def transform(self):
|
|
|
|
def transform(self):
|
|
|
|
@ -116,23 +113,24 @@ class BsXMLDataSet(XMLDataSet):
|
|
|
|
date = self.soup.msItem.docDate.text # verbose date
|
|
|
|
date = self.soup.msItem.docDate.text # verbose date
|
|
|
|
analyse = self.soup.abstract.p.text # acte's short analysis
|
|
|
|
analyse = self.soup.abstract.p.text # acte's short analysis
|
|
|
|
ref = self.soup.msIdentifier.find_all("idno", {"n": "2"})
|
|
|
|
ref = self.soup.msIdentifier.find_all("idno", {"n": "2"})
|
|
|
|
|
|
|
|
|
|
|
|
# //sourceDesc//msIdentifier/idno[@n='2'] is the doc id inside the
|
|
|
|
|
|
|
|
# archive box or the page number inside a manuscript (see _create_doc)
|
|
|
|
|
|
|
|
# warning: the analysis may not have been written yet,
|
|
|
|
|
|
|
|
# which would result in List Index Out of Range Error. Hence:
|
|
|
|
|
|
|
|
if len(ref) > 0: # there is an analysis
|
|
|
|
if len(ref) > 0: # there is an analysis
|
|
|
|
ref_acte = ref[0].text
|
|
|
|
ref_acte = ref[0].text
|
|
|
|
else: # there is no analysis
|
|
|
|
else: # there is no analysis
|
|
|
|
ref_acte = "NS"
|
|
|
|
ref_acte = "NS"
|
|
|
|
# FIXME: use this location -> geolocation
|
|
|
|
# //sourceDesc//msIdentifier/idno[@n='2'] is the doc id inside the
|
|
|
|
# prod_place = self.soup.find_all("placeName", {"type": "production_place"})[0].text
|
|
|
|
# archive box or the page number inside a manuscript (see _create_doc)
|
|
|
|
|
|
|
|
# warning: the analysis may not have been written yet,
|
|
|
|
|
|
|
|
# which would result in List Index Out of Range Error. Hence:
|
|
|
|
# //sourceDesc//msIdentifier/idno[@n='1'] is always the
|
|
|
|
# //sourceDesc//msIdentifier/idno[@n='1'] is always the
|
|
|
|
# archive box or manuscript collection id
|
|
|
|
# archive box or manuscript collection id
|
|
|
|
# #doc = self.soup.msIdentifier.find_all("idno", {"n": "1"})[0]
|
|
|
|
#doc = self.soup.msIdentifier.find_all("idno", {"n": "1"})[0]
|
|
|
|
# #type_diplo = self.soup.body.div["subtype"]
|
|
|
|
#type_diplo = self.soup.body.div["subtype"]
|
|
|
|
# #diplo_state = self.soup.body.div["type"]
|
|
|
|
#diplo_state = self.soup.body.div["type"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# FIXME: location -> geolocation
|
|
|
|
|
|
|
|
#place = self.soup.find_all("placeName", {"type": "production_place"})[0].text
|
|
|
|
|
|
|
|
#if len(place != "NS":
|
|
|
|
|
|
|
|
# place = place[0].text
|
|
|
|
return {
|
|
|
|
return {
|
|
|
|
# "num_acte": counter,
|
|
|
|
# "num_acte": counter,
|
|
|
|
"prince_name": self.find_prince_name(),
|
|
|
|
"prince_name": self.find_prince_name(),
|
|
|
|
@ -256,16 +254,18 @@ class TextDataSetCollection(DataSetCollection):
|
|
|
|
filepath=str(filepath))
|
|
|
|
filepath=str(filepath))
|
|
|
|
return self
|
|
|
|
return self
|
|
|
|
|
|
|
|
|
|
|
|
class FoliumHTMLDataSet(AbstractDataSet):
    """Write-only dataset that saves an object to a file path.

    Loading is not supported: ``_load`` always raises ``DataSetError``.
    Saving delegates to the ``save`` method of the object being stored.
    """

    def __init__(self, filepath: str):
        """Remember the destination path.

        Args:
            filepath: Path handed to ``data.save`` when the dataset is saved.
        """
        self._filepath = filepath

    def _load(self) -> None:
        """Reject any attempt to load.

        Raises:
            DataSetError: Always; this dataset can only be written.
        """
        raise DataSetError('This dataset is WriteOnly')

    def _describe(self) -> Dict[str, Any]:
        """Return the dataset's parameters as a dictionary."""
        return {"filepath": self._filepath}

    def _save(self, data) -> None:
        """Persist ``data`` by calling ``data.save`` with the stored path.

        Args:
            data: Any object exposing ``save(filepath)``. An earlier version
                of this signature annotated it as ``Map`` -- presumably a
                folium map; confirm before restoring the annotation.
        """
        # FIXME: carried over from the original without explanation;
        # presumably about the dropped ``Map`` annotation -- TODO confirm.
        data.save(self._filepath)
|
|
|
|
|
|
|
|
|
|
|
|
|