diff --git a/actes-princiers/conf/base/catalog.yml b/actes-princiers/conf/base/catalog.yml index 709b692..07874f5 100644 --- a/actes-princiers/conf/base/catalog.yml +++ b/actes-princiers/conf/base/catalog.yml @@ -1,3 +1,7 @@ +#essai: +# type: actesdataset.TextDataSet +# filepath: data/01_raw/csv/actors.csv + # ________________________________________________________________________ bourbon: diff --git a/actes-princiers/src/actesdataset.py b/actes-princiers/src/actesdataset.py index ad03151..bd425f4 100644 --- a/actes-princiers/src/actesdataset.py +++ b/actes-princiers/src/actesdataset.py @@ -17,28 +17,14 @@ class XMLDataSet: def __init__(self, filepath: str) -> None: self._filepath = filepath - # xml etree internal representation - self._dom = None - # xml as an str output - self._str = None - def _load(self): - "kedro's API-like loader" - pass - - def _save(self, data:str) -> None: - "kedro's API-like saver" - pass - - @property - def tostring(self) -> str: - "XML source_doc (xml as a string) getter" - # FIXME : charger le _dom d'abord, puis génerer le str ici - if getattr(self, '_str') is not None: - return self._str - else: - attr_error_msg = str(self._describe()) - raise AttributeError(f"XMLDataSet dom object {attr_error_msg} has not been loaded yet") +# def _load(self): +# "kedro's API-like loader" +# pass +# +# def _save(self, data:str) -> None: +# "kedro's API-like saver" +# pass @property def filepath(self) -> str: @@ -67,9 +53,6 @@ class XMLDataSet: class EtreeXMLDataSet(XMLDataSet): "XMLDataSet loader with lxml.etree (lxml.etree._ElementTree)" - def __str__(self): - return self.str - def _transform_source_doc(self) -> etree._ElementTree: "xml transformer (with element tree)" self.source_doc = etree.parse(self._filepath) @@ -88,74 +71,6 @@ class EtreeXMLDataSet(XMLDataSet): "kedro's API-like saver" with open(self._filepath, 'w') as fhandle: fhandle.write(data) - - -class BsXMLDataSet(XMLDataSet): - "XMLDataSet loaded with BeautifulSoup" - def _load(self) -> str: - "kedro's API-like loader" - self.source_doc = self._load_soup() - return self.source_doc - - def _load_soup(self): - """open a xml file and return a BeautifulSoup object""" - with open(self._filepath, 'r', encoding="utf-8") as opening: - xml = BeautifulSoup(opening, 'xml') - self.internal_xml = xml - ## xml.prettify() -> str (source_doc) - return xml.prettify() - - def get_internal_xml(self): - "beautiful soup internal DOM" - if hasattr(self, 'internal_xml'): - return self.internal_xml - else: - attr_error_msg = str(self._describe()) - raise AttributeError(f"XMLDataSet bject {attr_error_msg} has no attribute named : 'internal_xml'") - return self.internal_xml - - def _save(self, data:str) -> None: - "kedro's API-like saver" - raise NotImplementedError("This DataSet shall not be saved...") - - def _extract_data(self): - # FIXME -> traitement à déplacer dans le nodes.py - # make_soup -> _load_soup -> soup est déjà chargé - #soup = make_soup(os.path.join(folder, acte)) - # 1.1/ Get all data from XML (9). counter is the id (= numb_acte) - numb = soup.TEI["xml:id"] # /TEI[@xml:id] is always the acte's ID - date_time = soup.msItem.docDate["when"] # YYYY-MM-DD or YYYY-MM date - date = soup.msItem.docDate.text # verbose date - analyse = soup.abstract.p.text # acte's short analysis - ref = soup.msIdentifier.find_all("idno", {"n": "2"}) - # //sourceDesc//msIdentifier/idno[@n='2'] is the doc id inside the - # archive box or the page number inside a manuscript (see _create_doc) - # warning: the analysis may not have been written yet, - # which would result in List Index Out of Range Error. Hence : - if len(ref) > 0: # there is an analysis - ref_acte = ref[0].text - else: # there is no analysis - ref_acte = "NS" - prod_place = soup.find_all("placeName", {"type": "production_place"})[0].text - # //sourceDesc//msIdentifier/idno[@n='1'] is always the - # archive box or manuscript collection id - doc = soup.msIdentifier.find_all("idno", {"n": "1"})[0] - type_diplo = soup.body.div["subtype"] - diplo_state = soup.body.div["type"] - - # 2/ Make the data list - actes.append({ - "num_acte": counter, - "filename": numb, - "date_time": date_time, - "date": date, - "prod_place_acte": place_query[0], - "analysis": analyse, - "doc_acte": doc_query[0], - "ref_acte": ref_acte, - "state_doc": state_query[0], - "diplo_type_acte": diplo_query[0] - }) class XMLDataSetCollection(AbstractDataSet): @@ -184,7 +99,7 @@ class XMLDataSetCollection(AbstractDataSet): filepath=str(filepath)) return self.datasets - def _save(self, datasets: dict[str, EtreeXMLDataSet]) -> None: + def _save(self, datasets: dict[str, Any]) -> None: "kedro's API saver method" for stemfilename, dataset in datasets.items(): dataset._save(dataset.get_source_doc()) @@ -194,6 +109,100 @@ class XMLDataSetCollection(AbstractDataSet): return dict(name=self._housename, folderpath=self._folderpath) +#class TextDataSet: +# """loads/saves data from/to a text file using an underlying filesystem + +# example usage + +# >>> string_to_write = "This will go in a file." +# >>> +# >>> data_set = TextDataSet(filepath="test.md") +# >>> data_set.save(string_to_write) +# >>> reloaded = data_set.load() +# >>> assert string_to_write == reloaded +# """ +# def __init__(self, filepath: str): +# self._filepath = filepath +# +# def _load(self) -> str: +# with open(self._filepath, 'r') as fhandle: +# return fhandle.read() + +# def _save(self, data: str) -> None: +# with open(self._filepath, 'w') as fhandle: +# fhandle.write(data) + +# def _describe(self) -> Dict[str, Any]: +# return dict(filepath=self._filepath) + + +#class BsXMLDataSet(XMLDataSet): +# "XMLDataSet loaded with BeautifulSoup" +# def _load(self) -> str: +# "kedro's API-like loader" +# self.source_doc = self._load_soup() +# return self.source_doc + +# def _load_soup(self): +# """open a xml file and return a BeautifulSoup object""" +# with open(self._filepath, 'r', encoding="utf-8") as opening: +# xml = BeautifulSoup(opening, 'xml') +# self.internal_xml = xml +# ## xml.prettify() -> str (source_doc) +# return xml.prettify() + +# def get_internal_xml(self): +# "beautiful soup internal DOM" +# if hasattr(self, 'internal_xml'): +# return self.internal_xml +# else: +# attr_error_msg = str(self._describe()) +# raise AttributeError(f"XMLDataSet bject {attr_error_msg} has no attribute named : 'internal_xml'") +# return self.internal_xml + +# def _save(self, data:str) -> None: +# "kedro's API-like saver" +# raise NotImplementedError("This DataSet shall not be saved...") + +# def _extract_data(self): +# # FIXME -> traitement à déplacer dans le nodes.py +# # make_soup -> _load_soup -> soup est déjà chargé +# #soup = make_soup(os.path.join(folder, acte)) +# # 1.1/ Get all data from XML (9). counter is the id (= numb_acte) +# numb = soup.TEI["xml:id"] # /TEI[@xml:id] is always the acte's ID +# date_time = soup.msItem.docDate["when"] # YYYY-MM-DD or YYYY-MM date +# date = soup.msItem.docDate.text # verbose date +# analyse = soup.abstract.p.text # acte's short analysis +# ref = soup.msIdentifier.find_all("idno", {"n": "2"}) +# # //sourceDesc//msIdentifier/idno[@n='2'] is the doc id inside the +# # archive box or the page number inside a manuscript (see _create_doc) +# # warning: the analysis may not have been written yet, +# # which would result in List Index Out of Range Error. Hence : +# if len(ref) > 0: # there is an analysis +# ref_acte = ref[0].text +# else: # there is no analysis +# ref_acte = "NS" +# prod_place = soup.find_all("placeName", {"type": "production_place"})[0].text +# # //sourceDesc//msIdentifier/idno[@n='1'] is always the +# # archive box or manuscript collection id +# doc = soup.msIdentifier.find_all("idno", {"n": "1"})[0] +# type_diplo = soup.body.div["subtype"] +# diplo_state = soup.body.div["type"] + +# # 2/ Make the data list +# actes.append({ +# "num_acte": counter, +# "filename": numb, +# "date_time": date_time, +# "date": date, +# "prod_place_acte": place_query[0], +# "analysis": analyse, +# "doc_acte": doc_query[0], +# "ref_acte": ref_acte, +# "state_doc": state_query[0], +# "diplo_type_acte": diplo_query[0] +# }) + #class JSONDataSet(AbstractDataSet): # def __init__(self, filepath: str): # self._filepath = filepath @@ -208,3 +217,4 @@ class XMLDataSetCollection(AbstractDataSet): # def _describe(self) -> Dict[str, Any]: # return dict(filepath=self._filepath) +