|
|
import logging
|
|
|
import json
|
|
|
from typing import Dict, Any
|
|
|
from pathlib import Path
|
|
|
from abc import ABC, abstractmethod
|
|
|
from unidecode import unidecode # to remove accents in the urls
|
|
|
|
|
|
from lxml import etree
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
from kedro.io import AbstractDataSet, DataSetError
|
|
|
from kedro.framework.session import KedroSession
|
|
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
class XMLDataSet(ABC):
    """Abstract base class for an XML dataset loader."""

    def __init__(self, filepath: str) -> None:
        self._filepath = filepath

    @property
    def filepath(self) -> str:
        """xml file's filename getter."""
        return self._filepath

    def _describe(self) -> Dict[str, Any]:
        """kedro's API-like repr()."""
        return {"filepath": self._filepath}

    @abstractmethod
    def _load(self):
        """Load the XML file into an internal representation (subclass-specific)."""

    def _save(self, data: str) -> None:
        """kedro's API-like saver; no-op by default, subclasses may override."""
|
|
|
|
|
|
class EtreeXMLDataSet(XMLDataSet):
    "XMLDataSet loader with lxml.etree (lxml.etree._ElementTree)"

    def __init__(self, filepath, params):
        """
        :param filepath: path to the source XML file
        :param params: path to the XSLT stylesheet used by ``transform``
        """
        # fix: delegate filepath storage to the base class initializer
        # instead of duplicating the assignment here
        super().__init__(filepath)
        self.xsltstylesheet = params

    def _load(self):
        "from the xml file loads a internal xml repr (with element tree)"
        # self.source_doc is an etree internal xml repr document
        self.source_doc = etree.parse(self._filepath)
        # removing namespace so later XPath queries can use local names only
        query = "descendant-or-self::*[namespace-uri()!='']"
        for element in self.source_doc.xpath(query):
            # replacing element name with its local name
            element.tag = etree.QName(element).localname
        etree.cleanup_namespaces(self.source_doc)

    def _save(self, data: str) -> None:
        "kedro's API-like saver"
        # fix: explicit encoding so output does not depend on the
        # platform's locale default
        with open(self._filepath, 'w', encoding="utf-8") as fhandle:
            fhandle.write(data)

    @staticmethod
    def _xslt(xsltstylesheet):
        "performs XML transformation on each dataset"
        xslt_doc = etree.parse(xsltstylesheet)
        xslt_transformer = etree.XSLT(xslt_doc)
        return xslt_transformer

    def transform(self):
        """Applies the configured XSLT stylesheet to the loaded document.

        NOTE(review): requires ``_load()`` to have been called first,
        otherwise ``self.source_doc`` does not exist.
        """
        xslt_transformer = self._xslt(self.xsltstylesheet)
        return str(xslt_transformer(self.source_doc))
|
|
|
|
|
|
class BsXMLDataSet(XMLDataSet):
    "XMLDataSet loader with BeautifulSoup"

    def _load(self):
        "from the xml file, loads a internal xml repr (with bsoup)"
        with open(self._filepath, 'r', encoding="utf-8") as fhandle:
            self.soup = BeautifulSoup(fhandle, 'xml')
        ## xml.prettify() is the bsoup str(source_doc)

    def _save(self, data: Dict) -> None:
        "kedro's API-like saver"
        # fix: explicit encoding so output does not depend on the
        # platform's locale default
        with open(self._filepath, 'w', encoding="utf-8") as fp:
            json.dump(data, fp, sort_keys=True, indent=4)

    def find_transcribers(self):
        """find transcriber xml bs4 helper

        :return: list of transcriber names found under each teiHeader's
            fileDesc/titleStmt/respStmt/name (may be empty)
        """
        transcribers = self.soup.find_all('teiHeader')
        trs = []
        for header in transcribers:
            respStmt = header.find('fileDesc').find('titleStmt').find('respStmt')
            if respStmt:
                trs_name = respStmt.find('name')
                if trs_name:
                    trs.append(trs_name.get_text())
        return trs

    def find_prince_name(self):
        """find prince_name xml bs4 helper

        equivalent xpath:
        prince_name = tree.xpath('//listPerson[@type="prince"]/person/name/text()')

        NOTE(review): raises AttributeError if the document has no
        listPerson[@type="prince"] element — confirm all sources have one.
        """
        person = self.soup.find("listPerson", {'type': "prince"} )
        ps = person.find('name')
        prince_name = ps.get_text()
        return prince_name

    def extract_prince_code_from_filestem(self, filestem):
        """
        builds prince code

        :param: filestem
        sample: "anj_isa_i_1441_08_05a"
        :return: prince code, sample: "isa_i"
        """
        # cut with the underscores
        cut = filestem.split('_')
        # remove house and date
        prince_code = "_".join(cut[1:3])
        return prince_code

    def make_acte_url_from_filestem(self, house, prince_name, filestem):
        """
        builds the acte's public URL

        url sample : /acte/Bourbon/Louis_ii/1367_04_26a
        """
        # bourbon -> Bourbon
        house = house.capitalize()
        # "Louis II de Bourbon" -> "Louis"
        prince_name = prince_name.split(" ")[0]
        # "anj_isa_i" -> "i"
        prince_number = filestem.split("_")[2]
        # remove accents for a url-safe name
        prince_name = unidecode(prince_name)
        # Isabelle + i -> Isabelle_i
        prince_name = prince_name + "_" + prince_number
        timeitem = self.make_timeitem_from_filestem(filestem)
        # final url
        return "/" + "/".join(['acte', house, prince_name, timeitem])

    def make_timeitem_from_filestem(self, filestem):
        """
        "anj_isa_i_1441_08_05a" -> "1441_08_05a"
        """
        trs_fname = filestem.split('_')
        return "_".join(trs_fname[3:])

    def transform(self):
        """Extracts the acte's metadata from the loaded soup into a dict.

        Requires ``_load()`` to have been called first (reads ``self.soup``).

        :return: dict with prince, date, place, diplomatic and url fields
        :raises ValueError: if the docDate/@when value has an
            unsupported format
        """
        from datetime import datetime

        # 1.1/ Get all data from XML (9). counter is the id (= numb_acte)
        numb = self.soup.TEI["xml:id"]  # /TEI[@xml:id] is always the acte's ID

        # date formats : YYYY-MM-DD, YYYY-MM or just YYYY
        date_time = self.soup.msItem.docDate["when"]
        # fix: collapse the original if/elif chain into a format map
        formats = {1: '%Y', 2: '%Y-%m', 3: '%Y-%m-%d'}
        n_fields = len(date_time.split('-'))
        if n_fields not in formats:
            # fix: the original silently skipped malformed dates
            # (its own "FIXME raise exception"); fail loudly instead
            raise ValueError("unsupported date format: " + repr(date_time))
        date_time = datetime.strptime(date_time, formats[n_fields]).isoformat()

        date = self.soup.msItem.docDate.text  # verbose date
        analyse = self.soup.abstract.p.text  # acte's short analysis

        # //sourceDesc//msIdentifier/idno[@n='2'] is the doc id inside the
        # archive box or the page number inside a manuscript (see _create_doc)
        # warning: the analysis may not have been written yet,
        # which would result in List Index Out of Range Error. Hence:
        ref = self.soup.msIdentifier.find_all("idno", {"n": "2"})
        if len(ref) > 0:  # there is an analysis
            ref_acte = ref[0].text
        else:  # there is no analysis
            ref_acte = "NS"
        # //sourceDesc//msIdentifier/idno[@n='1'] is always the
        # archive box or manuscript collection id
        #doc = self.soup.msIdentifier.find_all("idno", {"n": "1"})[0]

        type_diplo = self.soup.body.div["subtype"]
        diplo_state = self.soup.body.div["type"]

        # geolocalisation
        place = self.soup.find("place")
        place_name = place.find("placeName")
        if place_name.get_text() != "NS":
            pl_name = place_name.get_text()
        else:
            pl_name = "Non spécifié"

        region_balise = place.find("region")
        if region_balise is not None:
            region = region_balise.get_text()
        else:
            region = "Non spécifié"

        settlement = place.find("settlement")
        if settlement is not None:
            settlement = settlement.get_text()
        else:
            settlement = "Non spécifié"

        geolocalisation = place.find("geo")
        if geolocalisation is not None:
            geolocalisation = geolocalisation.get_text()
            latitude, longitude = geolocalisation.split(" ")
        else:
            latitude = None
            longitude = None

        place = dict(name=pl_name,
                     region=region,
                     settlement=settlement,
                     latitude=latitude,
                     longitude=longitude
                     )
        # nakala
        image = self.soup.find("graphic")
        if image is not None:
            image = image.get('url')
        prince_name = self.find_prince_name()
        # e.g. data/01_raw/xml/Berry/bry_je_i_1405_05_04a.xml -> "Berry"
        # NOTE(review): assumes a '/'-separated path with the house name at
        # index 3 — breaks on Windows separators or a different layout
        house = self.filepath.split("/")[3]
        return {
            "prince_name": prince_name,
            "prince_code": self.extract_prince_code_from_filestem(numb),
            "filename": numb,
            "date_time": date_time,
            "date": date,
            "analysis": analyse,
            "ref_acte": ref_acte,
            "transcribers": self.find_transcribers(),
            "place": place,
            "diplo_type": type_diplo,
            "diplo_state": diplo_state,
            "image": image,
            "url": self.make_acte_url_from_filestem(house, prince_name, numb)
        }
|
|
|
|
|
|
class DataSetCollection(AbstractDataSet):
    """Stores instances of ``DataSetCollection``
    implementations to provide ``_load`` and ``_save`` capabilities.
    """

    def __init__(self, housename: str, folderpath: str) -> None:
        self._housename = housename
        self._folderpath = Path(folderpath)
        # the collection; key: file name, value: dataset object
        self.datasets = {}

    def _save(self, data) -> None:
        """kedro's API saver method

        There is **nothing to save**, because
        this dataset collection is a *container* dataset.
        This method is here only because kedro requires it.
        """

    def _describe(self) -> dict[str, Any]:
        "kedro's API repr()"
        return {"name": self._housename,
                "folderpath": str(self._folderpath)}
|
|
|
|
|
|
|
|
|
class XMLDataSetCollection(DataSetCollection):
    """Collection of ``EtreeXMLDataSet`` loaders, one per ``*.xml`` file."""

    def __init__(self, housename: str,
                 folderpath: str, xsltstylesheet: str) -> None:
        super().__init__(housename, folderpath)
        # stylesheet path forwarded to each EtreeXMLDataSet
        self.xsltstylesheet = xsltstylesheet

    def _load(self) -> "XMLDataSetCollection":
        "kedro's API loader method"
        # fix: reset the mapping so repeated loads do not accumulate stale
        # entries (consistent with the other *DataSetCollection loaders)
        self.datasets = dict()
        for filepath in sorted(self._folderpath.glob("*.xml")):
            self.datasets[filepath.stem] = EtreeXMLDataSet(str(filepath), self.xsltstylesheet)
        # fix: the annotation claimed a dict, but the method returns the
        # collection itself (callers then read .datasets)
        return self
|
|
|
|
|
|
|
|
|
class BsXMLDataSetCollection(DataSetCollection):
    """Collection of ``BsXMLDataSet`` loaders, one per ``*.xml`` file."""

    def _load(self) -> "BsXMLDataSetCollection":
        "kedro's API loader method"
        self.datasets = dict()
        for filepath in sorted(self._folderpath.glob("*.xml")):
            self.datasets[filepath.stem] = BsXMLDataSet(
                filepath=str(filepath))
        # fix: the annotation claimed a dict, but the method returns the
        # collection itself (callers then read .datasets)
        return self
|
|
|
|
|
|
|
|
|
class JSONDataSet:
    """kedro-like dataset that loads/saves a dict from/to a JSON file."""

    def __init__(self, filepath: str):
        self._filepath = filepath

    def _load(self) -> Dict:
        "kedro's API-like loader"
        # fix: explicit encoding, so reads do not depend on the locale
        with open(self._filepath, 'r', encoding="utf-8") as fp:
            return json.load(fp)

    def _save(self, data: Dict) -> None:
        "kedro's API-like saver"
        # fix: explicit encoding, so writes do not depend on the locale
        with open(self._filepath, 'w', encoding="utf-8") as fp:
            json.dump(data, fp, sort_keys=True, indent=4)

    def _describe(self) -> Dict[str, Any]:
        "kedro's API-like repr()"
        return dict(filepath=self._filepath)
|
|
|
|
|
|
|
|
|
class JSONDataSetCollection(DataSetCollection):
    """Collection of ``JSONDataSet`` loaders, one per ``*.json`` file."""

    def _load(self) -> "JSONDataSetCollection":
        "kedro's API loader method"
        self.datasets = dict()
        for filepath in sorted(self._folderpath.glob("*.json")):
            self.datasets[filepath.stem] = JSONDataSet(
                filepath=str(filepath))
        # fix: the annotation claimed a dict, but the method returns the
        # collection itself (callers then read .datasets)
        return self
|
|
|
|
|
|
class TextDataSet:
    """loads/saves data from/to a text file using an underlying filesystem

    example usage

    >>> string_to_write = "This will go in a file."
    >>>
    >>> data_set = TextDataSet(filepath="test.md")
    >>> data_set._save(string_to_write)
    >>> reloaded = data_set._load()
    >>> assert string_to_write == reloaded
    """

    def __init__(self, filepath: str):
        self._filepath = filepath

    def _load(self) -> str:
        "kedro's API-like loader"
        # fix: explicit encoding, so reads do not depend on the locale
        with open(self._filepath, 'r', encoding="utf-8") as fhandle:
            return fhandle.read()

    def _save(self, data: str) -> None:
        "kedro's API-like saver"
        # fix: explicit encoding, so writes do not depend on the locale
        with open(self._filepath, 'w', encoding="utf-8") as fhandle:
            fhandle.write(data)

    def _describe(self) -> Dict[str, Any]:
        "kedro's API-like repr()"
        return dict(filepath=self._filepath)
|
|
|
|
|
|
class TextDataSetCollection(DataSetCollection):
    """Collection of ``TextDataSet`` loaders, one per ``*.pseudoxml`` file."""

    def _load(self) -> "TextDataSetCollection":
        "kedro's API loader method"
        self.datasets = dict()
        for filepath in sorted(self._folderpath.glob("*.pseudoxml")):
            self.datasets[filepath.stem] = TextDataSet(
                filepath=str(filepath))
        # fix: the annotation claimed dict[str, JSONDataSet] — wrong class
        # (these are TextDataSets) and wrong kind (the method returns self)
        return self
|
|
|
|
|
|
#class FoliumHTMLDataSet(AbstractDataSet):
|
|
|
# def __init__(self, filepath: str):
|
|
|
# self._filepath = filepath
|
|
|
#
|
|
|
# def _load(self) -> None:
|
|
|
# raise DataSetError('This dataset is WriteOnly')
|
|
|
#
|
|
|
# def _describe(self) -> Dict[str, Any]:
|
|
|
# return dict(filepath=self._filepath)
|
|
|
#
|
|
|
# def _save(self, data: Map) -> None:
|
|
|
# data.save(self._filepath)
|
|
|
|