From 29fab8bc02f2bffc9817fa0fbef75c70fc59363e Mon Sep 17 00:00:00 2001
From: gwen <gwenaelremond@free.fr>
Date: Thu, 6 Jul 2023 14:13:20 +0200
Subject: [PATCH] add bsoup

---
 actes-princiers/conf/base/catalog.yml         |  11 +-
 .../pipelines/xml_processing/nodes.py         |   5 +-
 actes-princiers/src/actesdataset.py           | 123 ++++++++----------
 3 files changed, 62 insertions(+), 77 deletions(-)

diff --git a/actes-princiers/conf/base/catalog.yml b/actes-princiers/conf/base/catalog.yml
index 07874f5..c95f620 100644
--- a/actes-princiers/conf/base/catalog.yml
+++ b/actes-princiers/conf/base/catalog.yml
@@ -1,7 +1,3 @@
-#essai:
-#  type: actesdataset.TextDataSet
-#  filepath: data/01_raw/csv/actors.csv
-
 # ________________________________________________________________________
 
 bourbon:
@@ -9,11 +5,18 @@ bourbon:
   housename: bourbon
   folderpath: data/01_raw/houses/bourbon
 
+# FIXME: change folderpath to data/02_intermediate/houses/bourbon/xml
 bourbon_content:
   type: actesdataset.XMLDataSetCollection
   housename: bourbon
   folderpath: data/02_intermediate/houses/bourbon
 
+#bourbon_json:
+#  type: actesdataset.XMLDataSetCollection
+#  housename: bourbon
+#  folderpath: data/02_intermediate/houses/bourbon/json
+
+
 # ________________________________________________________________________
 
 berry:
diff --git a/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py b/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py
index 80e6a9e..ee9eb9a 100755
--- a/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py
+++ b/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py
@@ -15,17 +15,16 @@ def parse_xml_collection(datasets: Dict[str, EtreeXMLDataSet], param: str) -> Di
         # the dataset **is not** registered in kedro's catalog
         dataset._load()
         descr = dataset._describe()
-#        logger.info(f"dataset {descr} loaded")
+        logger.info(f"dataset {descr} loaded")
         output_source_doc = dataset.transform() 
         # set dataset's output filepath
         output_filepath = dataset.filepath.replace("01_raw", "02_intermediate")
         output_xmldataset = EtreeXMLDataSet(output_filepath)
-
         # let's create subfolders now, if they don't exist
         output_filepath = Path(output_filepath)
         output_xmldataset_dir = output_filepath.parent
         output_xmldataset_dir.mkdir(parents=True, exist_ok=True)
-
+        # write the transformed document to the output file
         output_xmldataset._save(output_source_doc)
         output_datasets[dataset_filenamestem] = output_xmldataset
     return output_datasets
diff --git a/actes-princiers/src/actesdataset.py b/actes-princiers/src/actesdataset.py
index 38eb786..7680eec 100644
--- a/actes-princiers/src/actesdataset.py
+++ b/actes-princiers/src/actesdataset.py
@@ -16,7 +16,7 @@ with KedroSession.create() as session:
     xlststylesheet = context.params['xsltstylesheet']
         
 #xlststylesheet = "templates/xsl/actes_princiers.xsl"
-# FIXME make this function a classmethod ?
+# XXX is it useful to make this function a classmethod?
 def _xslt(xsltstylesheet):
     "performs XML transformation on each dataset"
     xslt_doc = etree.parse(xlststylesheet)
@@ -63,6 +63,58 @@ class EtreeXMLDataSet(XMLDataSet):
     def transform(self):
         return str(xslt_transformer(self.source_doc))
 
+class BsXMLDataSet(XMLDataSet):
+    "XMLDataSet loader with BeautifulSoup"
+
+    def _load(self):
+        "from the xml file, loads an internal xml repr (with bsoup) into self.soup"
+        with open(self._filepath, 'r', encoding="utf-8") as fhandle:
+            self.soup = BeautifulSoup(fhandle, 'xml')
+        # NOTE: self.soup.prettify() is the bsoup equivalent of str(source_doc)
+
+# FIXME: _save is not implemented yet (commented-out JSON draft below)
+#    def _save(self, data: Dict) -> None:
+#        "kedro's API-like saver"
+#        with open(self._filepath, 'w') as fp:
+#            json.dump(data, fp, sort_keys=True, indent=4)
+
+    def transform(self):
+        #soup = make_soup(os.path.join(folder, acte))
+        # 1.1/ Get data from XML (needs self.soup from _load()); counter (= num_acte) is unused here
+        numb = self.soup.TEI["xml:id"]  # /TEI[@xml:id] is always the acte's ID
+        date_time = self.soup.msItem.docDate["when"]  # YYYY-MM-DD or YYYY-MM date
+        date = self.soup.msItem.docDate.text  # verbose date
+        analyse = self.soup.abstract.p.text  # acte's short analysis
+        ref = self.soup.msIdentifier.find_all("idno", {"n": "2"})
+        # //sourceDesc//msIdentifier/idno[@n='2'] is the doc id inside the
+        # archive box or the page number inside a manuscript (see _create_doc)
+        # warning: idno[@n='2'] may be absent (e.g. analysis not written yet),
+        # in which case ref[0] would raise an IndexError. Hence :
+        if len(ref) > 0:  # idno[@n='2'] found
+            ref_acte = ref[0].text
+        else:  # no idno[@n='2']: use the "NS" placeholder
+            ref_acte = "NS"
+#        prod_place = self.soup.find_all("placeName", {"type": "production_place"})[0].text
+        # //sourceDesc//msIdentifier/idno[@n='1'] is always the 
+        # archive box or manuscript collection id
+#        #doc = self.soup.msIdentifier.find_all("idno", {"n": "1"})[0]
+#        #type_diplo = self.soup.body.div["subtype"]
+#        #diplo_state = self.soup.body.div["type"]
+
+        return {
+#            "num_acte": counter,
+            "filename": numb,
+            "date_time": date_time,
+            "date": date,
+#            "prod_place_acte": place_query[0],
+            "analysis": analyse,
+#            "doc_acte": doc_query[0],
+            "ref_acte": ref_acte,
+#            "state_doc": state_query[0],
+#            "diplo_type_acte": diplo_query[0]
+            }
+
+
 class XMLDataSetCollection(AbstractDataSet):
     """Stores instances of ``XMLDataSet``
     implementations to provide ``_load`` and ``_save`` capabilities.
@@ -94,7 +146,6 @@ class XMLDataSetCollection(AbstractDataSet):
         "kedro's API repr()"
         return dict(name=self._housename, folderpath=self._folderpath)
 
-
 #class TextDataSet:
 #    """loads/saves data from/to a text file using an underlying filesystem
 
@@ -121,74 +172,6 @@ class XMLDataSetCollection(AbstractDataSet):
 #    def _describe(self) -> Dict[str, Any]:
 #        return dict(filepath=self._filepath)
 
-
-#class BsXMLDataSet(XMLDataSet):
-#    "XMLDataSet loaded with BeautifulSoup"
-#    def _load(self) -> str:
-#        "kedro's API-like loader"
-#        self.source_doc = self._load_soup()
-#        return self.source_doc
-
-#    def _load_soup(self):
-#        """open a xml file and return a BeautifulSoup object"""
-#        with open(self._filepath, 'r', encoding="utf-8") as opening:
-#            xml = BeautifulSoup(opening, 'xml')
-#            self.internal_xml = xml
-#        ## xml.prettify() -> str (source_doc)
-#        return xml.prettify()
-
-#    def get_internal_xml(self):
-#        "beautiful soup internal DOM"
-#        if hasattr(self, 'internal_xml'):
-#            return self.internal_xml
-#        else:
-#            attr_error_msg = str(self._describe())
-#            raise AttributeError(f"XMLDataSet bject {attr_error_msg} has no attribute named : 'internal_xml'")
-#        return self.internal_xml
-
-#    def _save(self, data:str) -> None:
-#        "kedro's API-like saver"
-#        raise NotImplementedError("This DataSet shall not be saved...")        
-
-#    def _extract_data(self):
-#        # FIXME -> traitement à déplacer dans le nodes.py
-#        # make_soup -> _load_soup -> soup est déjà chargé
-#        #soup = make_soup(os.path.join(folder, acte))
-#        # 1.1/ Get all data from XML (9). counter is the id (= numb_acte)
-#        numb = soup.TEI["xml:id"]  # /TEI[@xml:id] is always the acte's ID
-#        date_time = soup.msItem.docDate["when"]  # YYYY-MM-DD or YYYY-MM date
-#        date = soup.msItem.docDate.text  # verbose date
-#        analyse = soup.abstract.p.text  # acte's short analysis
-#        ref = soup.msIdentifier.find_all("idno", {"n": "2"})
-#        # //sourceDesc//msIdentifier/idno[@n='2'] is the doc id inside the
-#        # archive box or the page number inside a manuscript (see _create_doc)
-#        # warning: the analysis may not have been written yet,
-#        # which would result in List Index Out of Range Error. Hence :
-#        if len(ref) > 0:  # there is an analysis
-#            ref_acte = ref[0].text
-#        else:  # there is no analysis
-#            ref_acte = "NS"
-#        prod_place = soup.find_all("placeName", {"type": "production_place"})[0].text
-#        # //sourceDesc//msIdentifier/idno[@n='1'] is always the 
-#        # archive box or manuscript collection id
-#        doc = soup.msIdentifier.find_all("idno", {"n": "1"})[0]
-#        type_diplo = soup.body.div["subtype"]
-#        diplo_state = soup.body.div["type"]
-
-#        # 2/ Make the data list
-#        actes.append({
-#            "num_acte": counter,
-#            "filename": numb,
-#            "date_time": date_time,
-#            "date": date,
-#            "prod_place_acte": place_query[0],
-#            "analysis": analyse,
-#            "doc_acte": doc_query[0],
-#            "ref_acte": ref_acte,
-#            "state_doc": state_query[0],
-#            "diplo_type_acte": diplo_query[0]
-#            })
-
 #class JSONDataSet(AbstractDataSet):
 #    def __init__(self, filepath: str):
 #        self._filepath = filepath