add bsoup

develop
gwen 3 years ago
parent 47f19eb93c
commit 29fab8bc02

@@ -1,7 +1,3 @@
-#essai:
-#  type: actesdataset.TextDataSet
-#  filepath: data/01_raw/csv/actors.csv
 # ________________________________________________________________________
 bourbon:
@@ -9,11 +5,18 @@ bourbon:
   housename: bourbon
   folderpath: data/01_raw/houses/bourbon
+# FIXME change the path to data/02_intermediate/houses/bourbon/xml
 bourbon_content:
   type: actesdataset.XMLDataSetCollection
   housename: bourbon
   folderpath: data/02_intermediate/houses/bourbon
+#bourbon_json:
+#  type: actesdataset.XMLDataSetCollection
+#  housename: bourbon
+#  folderpath: data/02_intermediate/houses/bourbon/json
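+# (assumption: bourbon_json is a placeholder for a future JSON output of the
+# bsoup-based loader, left commented out until that dataset exists)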
 # ________________________________________________________________________
 berry:

@@ -15,17 +15,16 @@ def parse_xml_collection(datasets: Dict[str, EtreeXMLDataSet], param: str) -> Di
         # the dataset **is not** registered in kedro's catalog
         dataset._load()
         descr = dataset._describe()
-        # logger.info(f"dataset {descr} loaded")
+        logger.info(f"dataset {descr} loaded")
         output_source_doc = dataset.transform()
         # set dataset's output filepath
         output_filepath = dataset.filepath.replace("01_raw", "02_intermediate")
         output_xmldataset = EtreeXMLDataSet(output_filepath)
         # let's create subfolders now, if they don't exist
         output_filepath = Path(output_filepath)
         output_xmldataset_dir = output_filepath.parent
         output_xmldataset_dir.mkdir(parents=True, exist_ok=True)
-        # save on file
         output_xmldataset._save(output_source_doc)
         output_datasets[dataset_filenamestem] = output_xmldataset
     return output_datasets

@ -16,7 +16,7 @@ with KedroSession.create() as session:
xlststylesheet = context.params['xsltstylesheet'] xlststylesheet = context.params['xsltstylesheet']
#xlststylesheet = "templates/xsl/actes_princiers.xsl" #xlststylesheet = "templates/xsl/actes_princiers.xsl"
# FIXME make this function a classmethod ? # XXX is it usefull to make this bunch of code a classmethod ?
def _xslt(xsltstylesheet): def _xslt(xsltstylesheet):
"performs XML transformation on each dataset" "performs XML transformation on each dataset"
xslt_doc = etree.parse(xlststylesheet) xslt_doc = etree.parse(xlststylesheet)
@@ -63,6 +63,58 @@ class EtreeXMLDataSet(XMLDataSet):
     def transform(self):
         return str(xslt_transformer(self.source_doc))
+
+
+class BsXMLDataSet(XMLDataSet):
+    "XMLDataSet loader with BeautifulSoup"
+    def _load(self):
+        "from the xml file, loads an internal xml repr (with bsoup)"
+        with open(self._filepath, 'r', encoding="utf-8") as fhandle:
+            self.soup = BeautifulSoup(fhandle, 'xml')
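+            # note: bs4's 'xml' parser is backed by lxml, so lxml must be
+            # installed and BeautifulSoup imported (from bs4 import BeautifulSoup)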
+            ## xml.prettify() is the bsoup str(source_doc)
+    # FIXME
+    # def _save(self, data: Dict) -> None:
+    #     "kedro's API-like saver"
+    #     with open(self._filepath, 'w') as fp:
+    #         json.dump(data, fp, sort_keys=True, indent=4)
+    def transform(self):
+        #soup = make_soup(os.path.join(folder, acte))
+        # 1.1/ Get all data from XML (9). counter is the id (= numb_acte)
+        numb = self.soup.TEI["xml:id"]  # /TEI[@xml:id] is always the acte's ID
+        date_time = self.soup.msItem.docDate["when"]  # YYYY-MM-DD or YYYY-MM date
+        date = self.soup.msItem.docDate.text  # verbose date
+        analyse = self.soup.abstract.p.text  # acte's short analysis
+        ref = self.soup.msIdentifier.find_all("idno", {"n": "2"})
+        # //sourceDesc//msIdentifier/idno[@n='2'] is the doc id inside the
+        # archive box or the page number inside a manuscript (see _create_doc)
+        # warning: the analysis may not have been written yet,
+        # which would result in an IndexError (list index out of range). Hence:
+        if len(ref) > 0:  # there is an analysis
+            ref_acte = ref[0].text
+        else:  # there is no analysis
+            ref_acte = "NS"
+        # prod_place = self.soup.find_all("placeName", {"type": "production_place"})[0].text
+        # //sourceDesc//msIdentifier/idno[@n='1'] is always the
+        # archive box or manuscript collection id
+        # #doc = self.soup.msIdentifier.find_all("idno", {"n": "1"})[0]
+        # #type_diplo = self.soup.body.div["subtype"]
+        # #diplo_state = self.soup.body.div["type"]
+        return {
+            # "num_acte": counter,
+            "filename": numb,
+            "date_time": date_time,
+            "date": date,
+            # "prod_place_acte": place_query[0],
+            "analysis": analyse,
+            # "doc_acte": doc_query[0],
+            "ref_acte": ref_acte,
+            # "state_doc": state_query[0],
+            # "diplo_type_acte": diplo_query[0]
+        }
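+
+# A minimal usage sketch (hypothetical filepath; assumes a TEI acte on disk):
+#     ds = BsXMLDataSet("data/01_raw/houses/bourbon/acte_0001.xml")
+#     ds._load()
+#     record = ds.transform()  # -> dict with filename, date_time, date, analysis, ref_acte
+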
 class XMLDataSetCollection(AbstractDataSet):
     """Stores instances of ``XMLDataSet``
     implementations to provide ``_load`` and ``_save`` capabilities.
@@ -94,7 +146,6 @@ class XMLDataSetCollection(AbstractDataSet):
         "kedro's API repr()"
         return dict(name=self._housename, folderpath=self._folderpath)
 #class TextDataSet:
-#    """loads/saves data from/to a text file using an underlying filesystem
@@ -121,74 +172,6 @@ class XMLDataSetCollection(AbstractDataSet):
 #    def _describe(self) -> Dict[str, Any]:
 #        return dict(filepath=self._filepath)
-#class BsXMLDataSet(XMLDataSet):
-#    "XMLDataSet loaded with BeautifulSoup"
-#    def _load(self) -> str:
-#        "kedro's API-like loader"
-#        self.source_doc = self._load_soup()
-#        return self.source_doc
-#    def _load_soup(self):
-#        """open a xml file and return a BeautifulSoup object"""
-#        with open(self._filepath, 'r', encoding="utf-8") as opening:
-#            xml = BeautifulSoup(opening, 'xml')
-#            self.internal_xml = xml
-#            ## xml.prettify() -> str (source_doc)
-#            return xml.prettify()
-#    def get_internal_xml(self):
-#        "beautiful soup internal DOM"
-#        if hasattr(self, 'internal_xml'):
-#            return self.internal_xml
-#        else:
-#            attr_error_msg = str(self._describe())
-#            raise AttributeError(f"XMLDataSet bject {attr_error_msg} has no attribute named : 'internal_xml'")
-#        return self.internal_xml
-#    def _save(self, data:str) -> None:
-#        "kedro's API-like saver"
-#        raise NotImplementedError("This DataSet shall not be saved...")
-#    def _extract_data(self):
-#        # FIXME -> processing to be moved into nodes.py
-#        # make_soup -> _load_soup -> soup is already loaded
-#        #soup = make_soup(os.path.join(folder, acte))
-#        # 1.1/ Get all data from XML (9). counter is the id (= numb_acte)
-#        numb = soup.TEI["xml:id"] # /TEI[@xml:id] is always the acte's ID
-#        date_time = soup.msItem.docDate["when"] # YYYY-MM-DD or YYYY-MM date
-#        date = soup.msItem.docDate.text # verbose date
-#        analyse = soup.abstract.p.text # acte's short analysis
-#        ref = soup.msIdentifier.find_all("idno", {"n": "2"})
-#        # //sourceDesc//msIdentifier/idno[@n='2'] is the doc id inside the
-#        # archive box or the page number inside a manuscript (see _create_doc)
-#        # warning: the analysis may not have been written yet,
-#        # which would result in List Index Out of Range Error. Hence :
-#        if len(ref) > 0: # there is an analysis
-#            ref_acte = ref[0].text
-#        else: # there is no analysis
-#            ref_acte = "NS"
-#        prod_place = soup.find_all("placeName", {"type": "production_place"})[0].text
-#        # //sourceDesc//msIdentifier/idno[@n='1'] is always the
-#        # archive box or manuscript collection id
-#        doc = soup.msIdentifier.find_all("idno", {"n": "1"})[0]
-#        type_diplo = soup.body.div["subtype"]
-#        diplo_state = soup.body.div["type"]
-#        # 2/ Make the data list
-#        actes.append({
-#            "num_acte": counter,
-#            "filename": numb,
-#            "date_time": date_time,
-#            "date": date,
-#            "prod_place_acte": place_query[0],
-#            "analysis": analyse,
-#            "doc_acte": doc_query[0],
-#            "ref_acte": ref_acte,
-#            "state_doc": state_query[0],
-#            "diplo_type_acte": diplo_query[0]
-#        })
 #class JSONDataSet(AbstractDataSet):
 #    def __init__(self, filepath: str):
 #        self._filepath = filepath
