You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

209 lines
7.1 KiB
Python

This file contains invisible Unicode characters!

This file contains invisible Unicode characters that may be processed differently from what appears below. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to reveal hidden characters.

import logging
import json
from typing import Dict, Any
from pathlib import Path
from lxml import etree
from bs4 import BeautifulSoup
from kedro.io import AbstractDataSet, DataSetError
from kedro.framework.session import KedroSession
logger = logging.getLogger(__name__)
class XMLDataSet:
    """Abstract base class for an XML dataset loader.

    Mimics kedro's dataset API shape (``_load``/``_save``/``_describe``)
    without inheriting from it; concrete subclasses (e.g. ``EtreeXMLDataSet``)
    implement the actual loading strategy and set ``self.source_doc``.
    """

    def __init__(self, filepath: str) -> None:
        # Path of the XML file wrapped by this dataset.
        self._filepath = filepath

    @property
    def filepath(self) -> str:
        "xml file's filename getter"
        return self._filepath

    # FIXME: to be removed, BUT then transform() has to be relocated...
    def get_source_doc(self) -> str:
        """XML source_doc (xml as a string) getter.

        ``source_doc`` only exists after a subclass ``_load()`` has run;
        otherwise raise AttributeError carrying the dataset description
        so the failing file is identifiable in the error message.
        """
        if hasattr(self, 'source_doc'):
            return self.source_doc
        attr_error_msg = str(self._describe())
        # Typo fixed: "bject" -> "object".
        raise AttributeError(
            f"XMLDataSet object {attr_error_msg} has no attribute named : 'source_doc'")

    def _describe(self) -> Dict[str, Any]:
        "kedro's API-like repr()"
        return dict(filepath=self._filepath)
class EtreeXMLDataSet(XMLDataSet):
    "XMLDataSet loader with lxml.etree (lxml.etree._ElementTree)"

    def _load(self):
        """Parse the XML file into an element tree, stripping namespaces.

        Side effect: sets ``self.source_doc`` to the parsed
        ``lxml.etree._ElementTree`` (no return value).
        """
        # self.source_doc is an etree internal xml repr document
        self.source_doc = etree.parse(self._filepath)
        # Removing namespaces: select every element bound to a namespace...
        query = "descendant-or-self::*[namespace-uri()!='']"
        for element in self.source_doc.xpath(query):
            # ...and replace its tag with its local (unprefixed) name.
            element.tag = etree.QName(element).localname
        # Drop the now-unused namespace declarations from the tree.
        etree.cleanup_namespaces(self.source_doc)

    def _save(self, data: str) -> None:
        """kedro's API-like saver: write ``data`` to the dataset's filepath.

        Fix: write with an explicit UTF-8 encoding instead of the
        platform default, since XML documents default to UTF-8.
        """
        with open(self._filepath, 'w', encoding="utf-8") as fhandle:
            fhandle.write(data)
class XMLDataSetCollection(AbstractDataSet):
    """Stores instances of ``XMLDataSet``
    implementations to provide ``_load`` and ``_save`` capabilities.
    """

    def __init__(self, housename: str, folderpath: str) -> None:
        self._housename = housename
        self._folderpath = Path(folderpath)

    def _load(self) -> dict[str, EtreeXMLDataSet]:
        "kedro's API loader method"
        # One EtreeXMLDataSet per *.xml file in the folder, keyed by the
        # file stem, inserted in sorted-filename order.
        self.datasets = {
            xml_path.stem: EtreeXMLDataSet(filepath=str(xml_path))
            for xml_path in sorted(self._folderpath.glob("*.xml"))
        }
        return self.datasets

    def _save(self, data) -> None:
        """kedro's API saver method

        There is **nothing to save**, because this dataset collection is a
        *container* dataset; the method exists only because kedro requires it.
        """

    def _describe(self) -> dict[str, Any]:
        "kedro's API repr()"
        return dict(name=self._housename, folderpath=self._folderpath)
#class TextDataSet:
# """loads/saves data from/to a text file using an underlying filesystem
# example usage
# >>> string_to_write = "This will go in a file."
# >>>
# >>> data_set = TextDataSet(filepath="test.md")
# >>> data_set.save(string_to_write)
# >>> reloaded = data_set.load()
# >>> assert string_to_write == reloaded
# """
# def __init__(self, filepath: str):
# self._filepath = filepath
#
# def _load(self) -> str:
# with open(self._filepath, 'r') as fhandle:
# return fhandle.read()
# def _save(self, data: str) -> None:
# with open(self._filepath, 'w') as fhandle:
# fhandle.write(data)
# def _describe(self) -> Dict[str, Any]:
# return dict(filepath=self._filepath)
#class BsXMLDataSet(XMLDataSet):
# "XMLDataSet loaded with BeautifulSoup"
# def _load(self) -> str:
# "kedro's API-like loader"
# self.source_doc = self._load_soup()
# return self.source_doc
# def _load_soup(self):
# """open a xml file and return a BeautifulSoup object"""
# with open(self._filepath, 'r', encoding="utf-8") as opening:
# xml = BeautifulSoup(opening, 'xml')
# self.internal_xml = xml
# ## xml.prettify() -> str (source_doc)
# return xml.prettify()
# def get_internal_xml(self):
# "beautiful soup internal DOM"
# if hasattr(self, 'internal_xml'):
# return self.internal_xml
# else:
# attr_error_msg = str(self._describe())
# raise AttributeError(f"XMLDataSet object {attr_error_msg} has no attribute named : 'internal_xml'")
# return self.internal_xml
# def _save(self, data:str) -> None:
# "kedro's API-like saver"
# raise NotImplementedError("This DataSet shall not be saved...")
# def _extract_data(self):
# # FIXME -> traitement à déplacer dans le nodes.py
# # make_soup -> _load_soup -> soup est déjà chargé
# #soup = make_soup(os.path.join(folder, acte))
# # 1.1/ Get all data from XML (9). counter is the id (= numb_acte)
# numb = soup.TEI["xml:id"] # /TEI[@xml:id] is always the acte's ID
# date_time = soup.msItem.docDate["when"] # YYYY-MM-DD or YYYY-MM date
# date = soup.msItem.docDate.text # verbose date
# analyse = soup.abstract.p.text # acte's short analysis
# ref = soup.msIdentifier.find_all("idno", {"n": "2"})
# # //sourceDesc//msIdentifier/idno[@n='2'] is the doc id inside the
# # archive box or the page number inside a manuscript (see _create_doc)
# # warning: the analysis may not have been written yet,
# # which would result in List Index Out of Range Error. Hence :
# if len(ref) > 0: # there is an analysis
# ref_acte = ref[0].text
# else: # there is no analysis
# ref_acte = "NS"
# prod_place = soup.find_all("placeName", {"type": "production_place"})[0].text
# # //sourceDesc//msIdentifier/idno[@n='1'] is always the
# # archive box or manuscript collection id
# doc = soup.msIdentifier.find_all("idno", {"n": "1"})[0]
# type_diplo = soup.body.div["subtype"]
# diplo_state = soup.body.div["type"]
# # 2/ Make the data list
# actes.append({
# "num_acte": counter,
# "filename": numb,
# "date_time": date_time,
# "date": date,
# "prod_place_acte": place_query[0],
# "analysis": analyse,
# "doc_acte": doc_query[0],
# "ref_acte": ref_acte,
# "state_doc": state_query[0],
# "diplo_type_acte": diplo_query[0]
# })
#class JSONDataSet(AbstractDataSet):
# def __init__(self, filepath: str):
# self._filepath = filepath
# def _load(self) -> Dict:
# with open(self._filepath, 'r') as fp:
# return json.load(fp)
# def _save(self, data: Dict) -> None:
# with open(self._filepath, 'w') as fp:
# json.dump(data, fp, sort_keys=True, indent=4)
# def _describe(self) -> Dict[str, Any]:
# return dict(filepath=self._filepath)