|
|
import logging
|
|
|
import json
|
|
|
from typing import Dict, Any
|
|
|
from pathlib import Path
|
|
|
|
|
|
from lxml import etree
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
from kedro.io import AbstractDataSet, DataSetError
|
|
|
from kedro.framework.session import KedroSession
|
|
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
# FIXME: inherit from abc (see how kedro's own code does it)
|
|
|
class XMLDataSet:
    """Base class for XML dataset loaders.

    Holds the path of the XML file and, once it has been set, the loaded
    document in the ``source_doc`` attribute.
    """

    def __init__(self, filepath: str) -> None:
        """Remember the path of the XML file.

        Args:
            filepath: path of the XML file this dataset refers to.
        """
        self._filepath = filepath

    def get_filepath(self) -> str:
        """Return the path of the XML file given at construction time."""
        return self._filepath

    def get_source_doc(self) -> str:
        """Return the document stored in ``source_doc``.

        Raises:
            AttributeError: if ``source_doc`` has not been set yet.
        """
        # ``source_doc`` only exists after a setter / loader has run, so its
        # absence is reported with a message that identifies the dataset.
        if not hasattr(self, 'source_doc'):
            attr_error_msg = str(self._describe())
            raise AttributeError(
                f"XMLDataSet object {attr_error_msg} has no attribute named : 'source_doc'"
            )
        return self.source_doc

    def set_source_doc(self, source_doc: str) -> None:
        """Store ``source_doc`` on the instance.

        Args:
            source_doc: the document to keep on this dataset.
        """
        self.source_doc = source_doc

    def _describe(self) -> Dict[str, Any]:
        """Return a kedro-API-like description: a dict with the file path."""
        return dict(filepath=self._filepath)
|
|
|
|
|
|
|
|
|
class EtreeXMLDataSet(XMLDataSet):
    """XMLDataSet loaded with lxml.etree (``lxml.etree._ElementTree``)."""

    def _transform_source_doc(self) -> etree._ElementTree:
        """Parse the XML file and strip namespaces from element names.

        The parsed tree is stored in ``self.source_doc`` and returned.

        Returns:
            The parsed tree, with every namespaced element renamed to its
            local name.
        """
        self.source_doc = etree.parse(self._filepath)

        # Removing namespaces: select every element whose name is namespaced.
        query = "descendant-or-self::*[namespace-uri()!='']"
        for element in self.source_doc.xpath(query):
            # Replace the element name with its local name.
            element.tag = etree.QName(element).localname
        etree.cleanup_namespaces(self.source_doc)

        return self.source_doc

    def _load(self) -> etree._ElementTree:
        """kedro-API-like loader: parse, transform and return the tree."""
        self._transform_source_doc()
        return self.source_doc

    def _save(self, data: "str | etree._ElementTree") -> None:
        """kedro-API-like saver: write ``data`` to the dataset's file path.

        Args:
            data: either the XML as a string, or an lxml tree/element such
                as the one ``_load`` stores in ``source_doc``.
        """
        if isinstance(data, str):
            # Explicit UTF-8 so the result does not depend on the platform's
            # default encoding.
            with open(self._filepath, 'w', encoding="utf-8") as fhandle:
                fhandle.write(data)
            return

        # After ``_load`` the stored document is an lxml tree, not a string;
        # it must be serialized before being written.
        serialized = etree.tostring(data, encoding="utf-8", xml_declaration=True)
        with open(self._filepath, 'wb') as fhandle:
            fhandle.write(serialized)
|
|
|
|
|
|
|
|
|
class BsXMLDataSet(XMLDataSet):
    """XMLDataSet loaded with BeautifulSoup."""

    def _load(self) -> BeautifulSoup:
        """kedro-API-like loader: parse the file and keep the soup.

        The soup is stored in ``self.source_doc`` and returned.
        """
        # This class has no ``_transform_source_doc``; the soup is built by
        # ``_load_soup``.
        self.source_doc = self._load_soup()
        return self.source_doc

    def _load_soup(self) -> BeautifulSoup:
        """Open the XML file and return a BeautifulSoup object."""
        with open(self._filepath, 'r', encoding="utf-8") as opening:
            xml = BeautifulSoup(opening, 'xml')
        return xml

    def _save(self, data: str) -> None:
        """kedro-API-like saver; always raises ``NotImplementedError``."""
        raise NotImplementedError("This DataSet shall not be saved...")

    def _extract_data(self, counter=None) -> Dict[str, Any]:
        """Extract the acte's fields from the soup and return them as a dict.

        Args:
            counter: value stored under ``"num_acte"``; the original note
                says the counter is the acte id. Defaults to ``None``.

        Returns:
            A dict with the keys ``num_acte``, ``filename``, ``date_time``,
            ``date``, ``prod_place_acte``, ``analysis``, ``doc_acte``,
            ``ref_acte``, ``state_doc`` and ``diplo_type_acte``.
        """
        # FIXME: this processing should move to nodes.py.
        # Reuse the soup if it was already loaded, otherwise load it now.
        soup = getattr(self, 'source_doc', None)
        if soup is None:
            soup = self._load()

        # 1/ Get all data from the XML.
        numb = soup.TEI["xml:id"]  # /TEI[@xml:id] is always the acte's ID
        date_time = soup.msItem.docDate["when"]  # YYYY-MM-DD or YYYY-MM date
        date = soup.msItem.docDate.text  # verbose date
        analyse = soup.abstract.p.text  # acte's short analysis

        # //sourceDesc//msIdentifier/idno[@n='2'] is the doc id inside the
        # archive box or the page number inside a manuscript.
        # It may be missing (the original note ties this to the analysis not
        # having been written yet), which would make ``ref[0]`` fail. Hence:
        ref = soup.msIdentifier.find_all("idno", {"n": "2"})
        if ref:
            ref_acte = ref[0].text
        else:
            ref_acte = "NS"

        prod_place = soup.find_all("placeName", {"type": "production_place"})[0].text

        # //sourceDesc//msIdentifier/idno[@n='1'] is always the
        # archive box or manuscript collection id.
        doc = soup.msIdentifier.find_all("idno", {"n": "1"})[0]

        type_diplo = soup.body.div["subtype"]
        diplo_state = soup.body.div["type"]

        # 2/ Build the record.
        # NOTE(review): the original code filled "prod_place_acte",
        # "doc_acte", "state_doc" and "diplo_type_acte" from names that are
        # not defined anywhere in this file (place_query, doc_query,
        # state_query, diplo_query) and appended to an undefined ``actes``
        # list. The raw values extracted above are returned instead; confirm
        # with the caller whether they must be resolved to other ids.
        return {
            "num_acte": counter,
            "filename": numb,
            "date_time": date_time,
            "date": date,
            "prod_place_acte": prod_place,
            "analysis": analyse,
            "doc_acte": doc.text,
            "ref_acte": ref_acte,
            "state_doc": diplo_state,
            "diplo_type_acte": type_diplo,
        }
|
|
|
|
|
|
|
|
|
class XMLDataSetCollection(AbstractDataSet):
    """Collection of ``XMLDataSet`` implementations.

    Maps each XML file of a folder to a dataset instance and exposes kedro's
    ``_load`` / ``_save`` / ``_describe`` methods over the whole collection.
    """

    def __init__(self, housename: str, folderpath: str) -> None:
        """Store the collection name and the folder holding the XML files.

        Args:
            housename: name reported by ``_describe``.
            folderpath: folder searched for ``*.xml`` files by ``_load``.
        """
        self._folderpath = Path(folderpath)
        self._housename = housename

    def get_datasets(self) -> Dict[str, Any]:
        """Return the datasets mapping built by ``_load``.

        Raises:
            AttributeError: if the mapping has not been built yet.
        """
        if not hasattr(self, 'datasets'):
            description = str(self._describe())
            raise AttributeError(f"Object {description} has no attribute named : 'datasets'")
        return self.datasets

    def _load(self) -> dict[str, EtreeXMLDataSet]:
        """kedro API loader: build one dataset per ``*.xml`` file.

        Files are visited in sorted order and keyed by their stem.
        """
        xml_paths = sorted(self._folderpath.glob("*.xml"))
        self.datasets = {
            xml_path.stem: EtreeXMLDataSet(filepath=str(xml_path))
            for xml_path in xml_paths
        }
        return self.datasets

    def _save(self, datasets: dict[str, EtreeXMLDataSet]) -> None:
        """kedro API saver: save every dataset with its own ``source_doc``."""
        for member in datasets.values():
            document = member.get_source_doc()
            member._save(document)

    def _describe(self) -> dict[str, Any]:
        """kedro API repr: the collection name and its folder path."""
        return {"name": self._housename, "folderpath": self._folderpath}
|
|
|
|
|
|
|
|
|
#class JSONDataSet(AbstractDataSet):
|
|
|
# def __init__(self, filepath: str):
|
|
|
# self._filepath = filepath
|
|
|
|
|
|
# def _load(self) -> Dict:
|
|
|
# with open(self._filepath, 'r') as fp:
|
|
|
# return json.load(fp)
|
|
|
|
|
|
# def _save(self, data: Dict) -> None:
|
|
|
# with open(self._filepath, 'w') as fp:
|
|
|
# json.dump(data, fp, sort_keys=True, indent=4)
|
|
|
|
|
|
# def _describe(self) -> Dict[str, Any]:
|
|
|
# return dict(filepath=self._filepath)
|