You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

176 lines
6.2 KiB
Python

This file contains invisible Unicode characters!

This file contains invisible Unicode characters that may be processed differently from what appears below. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to reveal hidden characters.

import logging
import json
from typing import Dict, Any
from pathlib import Path
from lxml import etree
from bs4 import BeautifulSoup
from kedro.io import AbstractDataSet, DataSetError
from kedro.framework.session import KedroSession
logger = logging.getLogger(__name__)
# FIXME: inherit from abc (see how kedro's own code does it)
class XMLDataSet:
    """Abstract base class for an XML dataset loader.

    Holds the path of an XML file and, once a subclass loader (or
    ``set_source_doc``) has populated it, the XML document itself in
    the ``source_doc`` attribute.
    """

    def __init__(self, filepath: str) -> None:
        """Store the path of the XML file backing this dataset."""
        self._filepath = filepath

    def get_filepath(self) -> str:
        """xml file's long filename getter"""
        return self._filepath

    def get_source_doc(self) -> str:
        """XML source_doc (xml as a string) getter.

        Raises:
            AttributeError: if no source document has been set yet
                (``set_source_doc`` or a loader must run first).
        """
        if hasattr(self, 'source_doc'):
            return self.source_doc
        attr_error_msg = str(self._describe())
        # Bug fix: original message read "XMLDataSet bject" (typo for "object").
        raise AttributeError(
            f"XMLDataSet object {attr_error_msg} has no attribute named : 'source_doc'"
        )

    def set_source_doc(self, source_doc: str) -> None:
        """XML source_doc (xml as a string) setter"""
        self.source_doc = source_doc

    def _describe(self) -> Dict[str, Any]:
        """kedro's API-like repr(): identifying attributes as a dict."""
        return dict(filepath=self._filepath)
class EtreeXMLDataSet(XMLDataSet):
    """XMLDataSet loader with lxml.etree (lxml.etree._ElementTree)."""

    def _transform_source_doc(self) -> etree._ElementTree:
        """Parse the XML file and strip all namespaces from the tree.

        Returns:
            The parsed, namespace-free ``lxml.etree._ElementTree``.
        """
        self.source_doc = etree.parse(self._filepath)
        # Select every element that carries a namespace.
        query = "descendant-or-self::*[namespace-uri()!='']"
        for element in self.source_doc.xpath(query):
            # Replace the qualified tag with its local name only.
            element.tag = etree.QName(element).localname
        # Remove the now-unused namespace declarations from the tree.
        etree.cleanup_namespaces(self.source_doc)
        return self.source_doc

    def _load(self) -> etree._ElementTree:
        """kedro's API-like loader: parse, de-namespace, return the tree."""
        self._transform_source_doc()
        return self.source_doc

    def _save(self, data: str) -> None:
        """kedro's API-like saver: write ``data`` to the dataset's filepath.

        Bug fix: write with an explicit UTF-8 encoding instead of the
        platform default, consistent with how this file reads XML
        (see ``BsXMLDataSet._load_soup``).
        """
        with open(self._filepath, 'w', encoding="utf-8") as fhandle:
            fhandle.write(data)
class BsXMLDataSet(XMLDataSet):
    "XMLDataSet loaded with BeautifulSoup"

    def _load(self) -> etree._ElementTree:
        "kedro's API-like loader"
        # NOTE(review): _transform_source_doc is defined on the sibling
        # class EtreeXMLDataSet, not on this class nor on XMLDataSet --
        # calling _load here raises AttributeError. Confirm whether this
        # should call _load_soup instead.
        self._transform_source_doc()
        return self.source_doc

    def _load_soup(self):
        """open a xml file and return a BeautifulSoup object"""
        with open(self._filepath, 'r', encoding="utf-8") as opening:
            xml = BeautifulSoup(opening, 'xml')
        return xml

    def _save(self, data: str) -> None:
        "kedro's API-like saver"
        # Deliberately unsupported: this dataset is read-only.
        raise NotImplementedError("This DataSet shall not be saved...")

    def _extract_data(self):
        """Extract the acte's metadata fields from the parsed TEI soup.

        NOTE(review): this method references several names that are not
        defined anywhere in this class (soup, counter, actes, place_query,
        doc_query, state_query, diplo_query) and will raise NameError if
        called -- it appears to be node code pasted here pending the move
        announced in the FIXME below.
        """
        # FIXME -> processing to be moved into nodes.py
        # make_soup -> _load_soup -> soup is already loaded
        #soup = make_soup(os.path.join(folder, acte))
        # 1.1/ Get all data from XML (9). counter is the id (= numb_acte)
        numb = soup.TEI["xml:id"]  # /TEI[@xml:id] is always the acte's ID
        date_time = soup.msItem.docDate["when"]  # YYYY-MM-DD or YYYY-MM date
        date = soup.msItem.docDate.text  # verbose date
        analyse = soup.abstract.p.text  # acte's short analysis
        ref = soup.msIdentifier.find_all("idno", {"n": "2"})
        # //sourceDesc//msIdentifier/idno[@n='2'] is the doc id inside the
        # archive box or the page number inside a manuscript (see _create_doc)
        # warning: the analysis may not have been written yet,
        # which would result in List Index Out of Range Error. Hence :
        if len(ref) > 0:  # there is an analysis
            ref_acte = ref[0].text
        else:  # there is no analysis
            ref_acte = "NS"
        prod_place = soup.find_all("placeName", {"type": "production_place"})[0].text
        # //sourceDesc//msIdentifier/idno[@n='1'] is always the
        # archive box or manuscript collection id
        doc = soup.msIdentifier.find_all("idno", {"n": "1"})[0]
        type_diplo = soup.body.div["subtype"]
        diplo_state = soup.body.div["type"]
        # 2/ Make the data list
        # NOTE(review): the dict below uses *_query[0] values that are
        # never computed above, while the locals prod_place, doc,
        # type_diplo and diplo_state go unused -- presumably they were
        # meant to feed these entries; verify against the nodes.py code.
        actes.append({
            "num_acte": counter,
            "filename": numb,
            "date_time": date_time,
            "date": date,
            "prod_place_acte": place_query[0],
            "analysis": analyse,
            "doc_acte": doc_query[0],
            "ref_acte": ref_acte,
            "state_doc": state_query[0],
            "diplo_type_acte": diplo_query[0]
        })
class XMLDataSetCollection(AbstractDataSet):
    """Stores instances of ``XMLDataSet``
    implementations to provide ``_load`` and ``_save`` capabilities.
    """

    def __init__(self,
                 housename: str,
                 folderpath: str) -> None:
        """Create a collection over every ``*.xml`` file in a folder.

        Args:
            housename: logical name of the collection.
            folderpath: directory containing the XML files.
        """
        self._housename = housename
        self._folderpath = Path(folderpath)

    def get_datasets(self) -> Dict[str, Any]:
        """datasets mapper getter.

        Raises:
            AttributeError: if ``_load`` has not been called yet.
        """
        if hasattr(self, 'datasets'):
            return self.datasets
        attr_error_msg = str(self._describe())
        raise AttributeError(f"Object {attr_error_msg} has no attribute named : 'datasets'")

    def _load(self) -> dict[str, EtreeXMLDataSet]:
        """kedro's API loader: map each file stem to an EtreeXMLDataSet."""
        # sorted() makes the mapping order deterministic across runs.
        self.datasets = {
            filepath.stem: EtreeXMLDataSet(filepath=str(filepath))
            for filepath in sorted(self._folderpath.glob("*.xml"))
        }
        return self.datasets

    def _save(self, datasets: dict[str, EtreeXMLDataSet]) -> None:
        """kedro's API saver: persist each dataset's in-memory document."""
        # The stem-filename keys are not needed here: iterate values only.
        for dataset in datasets.values():
            dataset._save(dataset.get_source_doc())

    def _describe(self) -> dict[str, Any]:
        """kedro's API repr(): identifying attributes as a dict."""
        return dict(name=self._housename, folderpath=self._folderpath)
#class JSONDataSet(AbstractDataSet):
# def __init__(self, filepath: str):
# self._filepath = filepath
# def _load(self) -> Dict:
# with open(self._filepath, 'r') as fp:
# return json.load(fp)
# def _save(self, data: Dict) -> None:
# with open(self._filepath, 'w') as fp:
# json.dump(data, fp, sort_keys=True, indent=4)
# def _describe(self) -> Dict[str, Any]:
# return dict(filepath=self._filepath)