From 29fab8bc02f2bffc9817fa0fbef75c70fc59363e Mon Sep 17 00:00:00 2001 From: gwen Date: Thu, 6 Jul 2023 14:13:20 +0200 Subject: [PATCH] add bsoup --- actes-princiers/conf/base/catalog.yml | 11 +- .../pipelines/xml_processing/nodes.py | 5 +- actes-princiers/src/actesdataset.py | 123 ++++++++---------- 3 files changed, 62 insertions(+), 77 deletions(-) diff --git a/actes-princiers/conf/base/catalog.yml b/actes-princiers/conf/base/catalog.yml index 07874f5..c95f620 100644 --- a/actes-princiers/conf/base/catalog.yml +++ b/actes-princiers/conf/base/catalog.yml @@ -1,7 +1,3 @@ -#essai: -# type: actesdataset.TextDataSet -# filepath: data/01_raw/csv/actors.csv - # ________________________________________________________________________ bourbon: @@ -9,11 +5,18 @@ bourbon: housename: bourbon folderpath: data/01_raw/houses/bourbon +# FIXME change the path to data/02_intermediate/houses/bourbon/xml bourbon_content: type: actesdataset.XMLDataSetCollection housename: bourbon folderpath: data/02_intermediate/houses/bourbon +#bourbon_json: +# type: actesdataset.XMLDataSetCollection +# housename: bourbon +# folderpath: data/02_intermediate/houses/bourbon/json + + # ________________________________________________________________________ berry: diff --git a/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py b/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py index 80e6a9e..ee9eb9a 100755 --- a/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py +++ b/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py @@ -15,17 +15,16 @@ def parse_xml_collection(datasets: Dict[str, EtreeXMLDataSet], param: str) -> Di # the dataset **is not** registered in kedro's catalog dataset._load() descr = dataset._describe() -# logger.info(f"dataset {descr} loaded") + logger.info(f"dataset {descr} loaded") output_source_doc = dataset.transform() # set dataset's output filepath output_filepath = dataset.filepath.replace("01_raw", "02_intermediate") output_xmldataset = EtreeXMLDataSet(output_filepath) - # let's create subfolders now, if they don't exist output_filepath = Path(output_filepath) output_xmldataset_dir = output_filepath.parent output_xmldataset_dir.mkdir(parents=True, exist_ok=True) - + # save on file output_xmldataset._save(output_source_doc) output_datasets[dataset_filenamestem] = output_xmldataset return output_datasets diff --git a/actes-princiers/src/actesdataset.py b/actes-princiers/src/actesdataset.py index 38eb786..7680eec 100644 --- a/actes-princiers/src/actesdataset.py +++ b/actes-princiers/src/actesdataset.py @@ -16,7 +16,7 @@ with KedroSession.create() as session: xlststylesheet = context.params['xsltstylesheet'] #xlststylesheet = "templates/xsl/actes_princiers.xsl" -# FIXME make this function a classmethod ? +# XXX is it usefull to make this bunch of code a classmethod ? def _xslt(xsltstylesheet): "performs XML transformation on each dataset" xslt_doc = etree.parse(xlststylesheet) @@ -63,6 +63,58 @@ class EtreeXMLDataSet(XMLDataSet): def transform(self): return str(xslt_transformer(self.source_doc)) +class BsXMLDataSet(XMLDataSet): + "XMLDataSet loader with BeautifulSoup" + + def _load(self): + "from the xml file, loads a internal xml repr (with bsoup)" + with open(self._filepath, 'r', encoding="utf-8") as fhandle: + self.soup = BeautifulSoup(fhandle, 'xml') + ## xml.prettify() is the bsoup str(source_doc) + +# FIXME +# def _save(self, data: Dict) -> None: +# "kedro's API-like saver" +# with open(self._filepath, 'w') as fp: +# json.dump(data, fp, sort_keys=True, indent=4) + + def transform(self): + #soup = make_soup(os.path.join(folder, acte)) + # 1.1/ Get all data from XML (9). counter is the id (= numb_acte) + numb = self.soup.TEI["xml:id"] # /TEI[@xml:id] is always the acte's ID + date_time = self.soup.msItem.docDate["when"] # YYYY-MM-DD or YYYY-MM date + date = self.soup.msItem.docDate.text # verbose date + analyse = self.soup.abstract.p.text # acte's short analysis + ref = self.soup.msIdentifier.find_all("idno", {"n": "2"}) + # //sourceDesc//msIdentifier/idno[@n='2'] is the doc id inside the + # archive box or the page number inside a manuscript (see _create_doc) + # warning: the analysis may not have been written yet, + # which would result in List Index Out of Range Error. Hence : + if len(ref) > 0: # there is an analysis + ref_acte = ref[0].text + else: # there is no analysis + ref_acte = "NS" +# prod_place = self.soup.find_all("placeName", {"type": "production_place"})[0].text + # //sourceDesc//msIdentifier/idno[@n='1'] is always the + # archive box or manuscript collection id +# #doc = self.soup.msIdentifier.find_all("idno", {"n": "1"})[0] +# #type_diplo = self.soup.body.div["subtype"] +# #diplo_state = self.soup.body.div["type"] + + return { +# "num_acte": counter, + "filename": numb, + "date_time": date_time, + "date": date, +# "prod_place_acte": place_query[0], + "analysis": analyse, +# "doc_acte": doc_query[0], + "ref_acte": ref_acte, +# "state_doc": state_query[0], +# "diplo_type_acte": diplo_query[0] + } + + class XMLDataSetCollection(AbstractDataSet): """Stores instances of ``XMLDataSet`` implementations to provide ``_load`` and ``_save`` capabilities. @@ -94,7 +146,6 @@ class XMLDataSetCollection(AbstractDataSet): "kedro's API repr()" return dict(name=self._housename, folderpath=self._folderpath) - #class TextDataSet: # """loads/saves data from/to a text file using an underlying filesystem @@ -121,74 +172,6 @@ class XMLDataSetCollection(AbstractDataSet): # def _describe(self) -> Dict[str, Any]: # return dict(filepath=self._filepath) - -#class BsXMLDataSet(XMLDataSet): -# "XMLDataSet loaded with BeautifulSoup" -# def _load(self) -> str: -# "kedro's API-like loader" -# self.source_doc = self._load_soup() -# return self.source_doc - -# def _load_soup(self): -# """open a xml file and return a BeautifulSoup object""" -# with open(self._filepath, 'r', encoding="utf-8") as opening: -# xml = BeautifulSoup(opening, 'xml') -# self.internal_xml = xml -# ## xml.prettify() -> str (source_doc) -# return xml.prettify() - -# def get_internal_xml(self): -# "beautiful soup internal DOM" -# if hasattr(self, 'internal_xml'): -# return self.internal_xml -# else: -# attr_error_msg = str(self._describe()) -# raise AttributeError(f"XMLDataSet bject {attr_error_msg} has no attribute named : 'internal_xml'") -# return self.internal_xml - -# def _save(self, data:str) -> None: -# "kedro's API-like saver" -# raise NotImplementedError("This DataSet shall not be saved...") - -# def _extract_data(self): -# # FIXME -> traitement à déplacer dans le nodes.py -# # make_soup -> _load_soup -> soup est déjà chargé -# #soup = make_soup(os.path.join(folder, acte)) -# # 1.1/ Get all data from XML (9). counter is the id (= numb_acte) -# numb = soup.TEI["xml:id"] # /TEI[@xml:id] is always the acte's ID -# date_time = soup.msItem.docDate["when"] # YYYY-MM-DD or YYYY-MM date -# date = soup.msItem.docDate.text # verbose date -# analyse = soup.abstract.p.text # acte's short analysis -# ref = soup.msIdentifier.find_all("idno", {"n": "2"}) -# # //sourceDesc//msIdentifier/idno[@n='2'] is the doc id inside the -# # archive box or the page number inside a manuscript (see _create_doc) -# # warning: the analysis may not have been written yet, -# # which would result in List Index Out of Range Error. Hence : -# if len(ref) > 0: # there is an analysis -# ref_acte = ref[0].text -# else: # there is no analysis -# ref_acte = "NS" -# prod_place = soup.find_all("placeName", {"type": "production_place"})[0].text -# # //sourceDesc//msIdentifier/idno[@n='1'] is always the -# # archive box or manuscript collection id -# doc = soup.msIdentifier.find_all("idno", {"n": "1"})[0] -# type_diplo = soup.body.div["subtype"] -# diplo_state = soup.body.div["type"] - -# # 2/ Make the data list -# actes.append({ -# "num_acte": counter, -# "filename": numb, -# "date_time": date_time, -# "date": date, -# "prod_place_acte": place_query[0], -# "analysis": analyse, -# "doc_acte": doc_query[0], -# "ref_acte": ref_acte, -# "state_doc": state_query[0], -# "diplo_type_acte": diplo_query[0] -# }) - #class JSONDataSet(AbstractDataSet): # def __init__(self, filepath: str): # self._filepath = filepath