Simplify the prince lookup, tidy transform() comments, enable FoliumHTMLDataSet

* BsXMLDataSet.find_prince_name: replace the loop over every <listPerson>
  with a single soup.find() filtered on type="prince". Note that find()
  returns the first matching element, whereas the loop kept the last one.
* BsXMLDataSet.transform: move the idno[@n='2'] explanation below the
  ref_acte fallback, re-indent the commented-out idno[@n='1'] / subtype /
  type lookups, and keep the production-place lookup commented out
  behind a FIXME.
* FoliumHTMLDataSet: uncomment the class. _save() takes an un-annotated
  `data` argument; the former `Map` annotation is kept as a comment.

diff --git a/actes-princiers/src/actesdataset.py b/actes-princiers/src/actesdataset.py
index a61a01d..c1587ec 100644
--- a/actes-princiers/src/actesdataset.py
+++ b/actes-princiers/src/actesdataset.py
@@ -97,17 +97,14 @@ class BsXMLDataSet(XMLDataSet):
 
     def find_prince_name(self):
         """find prince_name xml bs4 helper
-        
+
         prince_name = tree.xpath('//listPerson[@type="prince"]/person/name/text()')
         """
-        persons = self.soup.find_all("listPerson")
-        for pers in persons:
-            if pers.attrs.get('type') == "prince":
-                ps = pers.find('person')
-                ps_name = pers.find('name')
-                prince_name = ps_name.get_text()
+        person = self.soup.find("listPerson", {'type': "prince"} )
+        ps = person.find('name')
+        prince_name = ps.get_text()
         return prince_name
-    
+
     def transform(self):
         #soup = make_soup(os.path.join(folder, acte))
         # 1.1/ Get all data from XML (9). counter is the id (= numb_acte)
@@ -116,23 +113,24 @@ class BsXMLDataSet(XMLDataSet):
         date = self.soup.msItem.docDate.text # verbose date
         analyse = self.soup.abstract.p.text # acte's short analysis
         ref = self.soup.msIdentifier.find_all("idno", {"n": "2"})
-        
-        # //sourceDesc//msIdentifier/idno[@n='2'] is the doc id inside the
-        # archive box or the page number inside a manuscript (see _create_doc)
-        # warning: the analysis may not have been written yet,
-        # which would result in List Index Out of Range Error. Hence :
         if len(ref) > 0: # there is an analysis
             ref_acte = ref[0].text
         else: # there is no analysis
             ref_acte = "NS"
-        # FIXME: use this location -> geolocallisation
-#        prod_place = self.soup.find_all("placeName", {"type": "production_place"})[0].text
+        # //sourceDesc//msIdentifier/idno[@n='2'] is the doc id inside the
+        # archive box or the page number inside a manuscript (see _create_doc)
+        # warning: the analysis may not have been written yet, which would
+        # raise an IndexError on ref[0]; hence the "NS" fallback above.
         # //sourceDesc//msIdentifier/idno[@n='1'] is always the
         # archive box or manuscript collection id
-#        #doc = self.soup.msIdentifier.find_all("idno", {"n": "1"})[0]
-#        #type_diplo = self.soup.body.div["subtype"]
-#        #diplo_state = self.soup.body.div["type"]
-        
+        #doc = self.soup.msIdentifier.find_all("idno", {"n": "1"})[0]
+        #type_diplo = self.soup.body.div["subtype"]
+        #diplo_state = self.soup.body.div["type"]
+
+        # FIXME: location -> geolocation
+        #place = self.soup.find_all("placeName", {"type": "production_place"})
+        #if len(place) > 0:
+        #    place = place[0].text
         return {
             # "num_acte": counter,
             "prince_name": self.find_prince_name(),
@@ -256,16 +254,18 @@ class TextDataSetCollection(DataSetCollection):
                 filepath=str(filepath))
         return self
 
-#class FoliumHTMLDataSet(AbstractDataSet):
-#    def __init__(self, filepath: str):
-#        self._filepath = filepath
-#
-#    def _load(self) -> None:
-#        raise DataSetError('This dataset is WriteOnly')
-#
-#    def _describe(self) -> Dict[str, Any]:
-#        return dict(filepath=self._filepath)
-#
-#    def _save(self, data: Map) -> None:
-#        data.save(self._filepath)
-#
+class FoliumHTMLDataSet(AbstractDataSet):
+    def __init__(self, filepath: str):
+        self._filepath = filepath
+
+    def _load(self) -> None:
+        raise DataSetError('This dataset is WriteOnly')
+
+    def _describe(self) -> Dict[str, Any]:
+        return dict(filepath=self._filepath)
+
+    #def _save(self, data: Map) -> None:
+    def _save(self, data) -> None:
+        # FIXME
+        data.save(self._filepath)
+