You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

371 lines
12 KiB
Python

This file contains invisible Unicode characters!

This file contains invisible Unicode characters that may be processed differently from what appears below. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to reveal hidden characters.

import logging
import json
from typing import Dict, Any
from pathlib import Path
from abc import ABC, abstractmethod
from unidecode import unidecode # to remove accents in the urls
from lxml import etree
from bs4 import BeautifulSoup
from kedro.io import AbstractDataSet, DataSetError
from kedro.framework.session import KedroSession
logger = logging.getLogger(__name__)
class XMLDataSet(ABC):
"Abstract base class for an XML dataset loader"
def __init__(self, filepath: str) -> None:
self._filepath = filepath
@property
def filepath(self) -> str:
"xml file's filename getters"
return self._filepath
def _describe(self) -> Dict[str, Any]:
"kedro's API-like repr()"
return dict(filepath=self._filepath)
@abstractmethod
def _load(self):
pass
def _save(self, data:str) -> None:
pass
class EtreeXMLDataSet(XMLDataSet):
    """XMLDataSet loader with lxml.etree (lxml.etree._ElementTree)."""

    def __init__(self, filepath, params):
        """
        :param filepath: path to the XML source file
        :param params: path to the XSLT stylesheet used by ``transform``
        """
        # delegate filepath storage to the base class instead of bypassing it
        super().__init__(filepath)
        self.xsltstylesheet = params

    def _load(self):
        """From the xml file, load an internal xml repr (with element tree),
        stripping every namespace so later XPath queries stay simple."""
        # self.source_doc is an etree internal xml repr document
        self.source_doc = etree.parse(self._filepath)
        # select every element that carries a namespace
        query = "descendant-or-self::*[namespace-uri()!='']"
        for element in self.source_doc.xpath(query):
            # replacing element name with its local name
            element.tag = etree.QName(element).localname
        etree.cleanup_namespaces(self.source_doc)

    def _save(self, data: str) -> None:
        """kedro's API-like saver: write ``data`` to the dataset's file path."""
        # utf-8 explicitly: the default text encoding is platform-dependent,
        # and _load/BsXMLDataSet read these files as utf-8
        with open(self._filepath, 'w', encoding="utf-8") as fhandle:
            fhandle.write(data)

    @staticmethod
    def _xslt(xsltstylesheet):
        """Build an XSLT transformer callable from the given stylesheet path."""
        xslt_doc = etree.parse(xsltstylesheet)
        xslt_transformer = etree.XSLT(xslt_doc)
        return xslt_transformer

    def transform(self):
        """Apply the configured XSLT stylesheet to the loaded document.

        :return: the transformed document serialized as ``str``
        """
        xslt_transformer = self._xslt(self.xsltstylesheet)
        return str(xslt_transformer(self.source_doc))
class BsXMLDataSet(XMLDataSet):
    "XMLDataSet loader with BeautifulSoup"

    def _load(self):
        "from the xml file, loads an internal xml repr (with bsoup)"
        with open(self._filepath, 'r', encoding="utf-8") as fhandle:
            self.soup = BeautifulSoup(fhandle, 'xml')
        ## xml.prettify() is the bsoup str(source_doc)

    def _save(self, data: Dict) -> None:
        "kedro's API-like saver: dumps the transform() dict as pretty-printed JSON"
        with open(self._filepath, 'w') as fp:
            json.dump(data, fp, sort_keys=True, indent=4)

    def find_transcribers(self):
        """find transcriber xml bs4 helper:
        collects every <name> found under teiHeader/fileDesc/titleStmt/respStmt"""
        transcribers = self.soup.find_all('teiHeader')
        trs = []
        for header in transcribers:
            respStmt = header.find('fileDesc').find('titleStmt').find('respStmt')
            if respStmt:
                trs_name = respStmt.find('name')
                if trs_name:
                    trs.append(trs_name.get_text())
        return trs

    def find_prince_name(self):
        """find prince_name xml bs4 helper, equivalent xpath:
        prince_name = tree.xpath('//listPerson[@type="prince"]/person/name/text()')
        """
        person = self.soup.find("listPerson", {'type': "prince"} )
        ps = person.find('name')
        prince_name = ps.get_text()
        return prince_name

    def extract_prince_code_from_filestem(self, filestem):
        """
        builds prince code
        :param: filestem
        sample: "anj_isa_i_1441_08_05a"
        :return: prince code, sample: "isa_i"
        """
        # cut with the underscores
        cut = filestem.split('_')
        # remove house and date
        prince_code = "_".join(cut[1:3])
        return prince_code

    def make_acte_url_from_filestem(self, house, prince_name, filestem):
        """
        builds the acte's public URL
        url sample : /acte/Bourbon/Louis_ii/1367_04_26a
        """
        # bourbon -> Bourbon
        house = house.capitalize()
        # "Louis II de Bourbon" -> "Louis"
        prince_name = prince_name.split(" ")[0]
        # "anj_isa_i" -> "i"
        prince_number = filestem.split("_")[2]
        # strip accents so the url stays plain ascii
        prince_name = unidecode(prince_name)
        # Isabelle + i -> Isabelle_i
        prince_name = prince_name + "_" + prince_number
        timeitem = self.make_timeitem_from_filestem(filestem)
        # final url
        return "/" + "/".join(['acte', house, prince_name, timeitem])

    def make_timeitem_from_filestem(self, filestem):
        """
        extracts the date suffix from a filestem, e.g.
        "anj_isa_i_1441_08_05a" -> "1441_08_05a"
        """
        trs_fname = filestem.split('_')
        return "_".join(trs_fname[3:])

    def transform(self):
        """Extract the acte's metadata from the parsed soup into a plain dict
        (keys: prince_name, prince_code, filename, date_time, date, analysis,
        ref_acte, transcribers, place, diplo_type, diplo_state, image, url)."""
        #soup = make_soup(os.path.join(folder, acte))
        # 1.1/ Get all data from XML (9). counter is the id (= numb_acte)
        numb = self.soup.TEI["xml:id"] # /TEI[@xml:id] is always the acte's ID
        # date formats : YYYY-MM-DD, YYYY-MM or just YYYY
        date_time = self.soup.msItem.docDate["when"]
        # datetime parsing: normalise whichever of the three formats we got
        # into a full ISO-8601 string
        from datetime import datetime
        if len(date_time.split('-')) == 1:
            # time format 'YYYY'
            isotime = datetime.strptime(date_time,'%Y')
            date_time = isotime.isoformat()
        elif len(date_time.split('-')) == 2:
            # time format '%Y-%m'
            isotime = datetime.strptime(date_time,'%Y-%m')
            date_time = isotime.isoformat()
        elif len(date_time.split('-')) == 3:
            # time format '%Y-%m-%d'
            isotime = datetime.strptime(date_time,'%Y-%m-%d')
            date_time = isotime.isoformat()
        else:
            # FIXME raise exception
            # NOTE(review): a @when with more than 3 dash-separated parts
            # currently passes through unparsed
            pass
        date = self.soup.msItem.docDate.text # verbose date
        analyse = self.soup.abstract.p.text # acte's short analysis
        ref = self.soup.msIdentifier.find_all("idno", {"n": "2"})
        if len(ref) > 0: # there is an analysis
            ref_acte = ref[0].text
        else: # there is no analysis
            ref_acte = "NS"
        # //sourceDesc//msIdentifier/idno[@n='2'] is the doc id inside the
        # archive box or the page number inside a manuscript (see _create_doc)
        # warning: the analysis may not have been written yet,
        # which would result in List Index Out of Range Error. Hence:
        # //sourceDesc//msIdentifier/idno[@n='1'] is always the
        # archive box or manuscript collection id
        #doc = self.soup.msIdentifier.find_all("idno", {"n": "1"})[0]
        type_diplo = self.soup.body.div["subtype"]
        diplo_state = self.soup.body.div["type"]
        # geolocalisation: each optional tag falls back to "Non spécifié"
        place = self.soup.find("place")
        place_name = place.find("placeName")
        if place_name.get_text() != "NS":
            pl_name = place_name.get_text()
        else:
            pl_name = "Non spécifié"
        region_balise = place.find("region")
        if region_balise is not None:
            region = region_balise.get_text()
        else:
            region = "Non spécifié"
        settlement = place.find("settlement")
        if settlement is not None:
            settlement = settlement.get_text()
        else:
            settlement = "Non spécifié"
        geolocalisation = place.find("geo")
        if geolocalisation is not None:
            # <geo> holds "lat lon" separated by a single space
            geolocalisation = geolocalisation.get_text()
            latitude, longitude = geolocalisation.split(" ")
        else:
            latitude = None
            longitude = None
        place = dict(name=pl_name,
                     region=region,
                     settlement=settlement,
                     latitude = latitude,
                     longitude = longitude
                     )
        # nakala (image repository): keep only the url attribute, if any
        image = self.soup.find("graphic")
        if image is not None:
            image = image.get('url')
        prince_name = self.find_prince_name()
        #self._filepath
        # data/01_raw/xml/Berry/bry_je_i_1405_05_04a.xml
        # NOTE(review): assumes a posix-style path shaped like the sample
        # above, where index 3 is the house folder — verify on other layouts
        house = self.filepath.split("/")[3]
        return {
            "prince_name": prince_name,
            "prince_code": self.extract_prince_code_from_filestem(numb),
            "filename": numb,
            "date_time": date_time,
            "date": date,
            "analysis": analyse,
            "ref_acte": ref_acte,
            "transcribers": self.find_transcribers(),
            "place": place,
            "diplo_type": type_diplo,
            "diplo_state": diplo_state,
            "image": image,
            "url": self.make_acte_url_from_filestem(house, prince_name, numb)
        }
class DataSetCollection(AbstractDataSet):
    """Container dataset: stores instances of ``DataSetCollection``
    implementations to provide ``_load`` and ``_save`` capabilities.
    """

    def __init__(self,
                 housename: str,
                 folderpath: str) -> None:
        self._housename = housename
        self._folderpath = Path(folderpath)
        # the collection: key = file stem, value = dataset object
        self.datasets = {}

    def _save(self, data) -> None:
        """kedro's API saver method.

        There is **nothing to save**: this dataset collection is a
        *container* dataset, and the method exists only because kedro
        requires it.
        """
        return None

    def _describe(self) -> dict[str, Any]:
        """kedro's API repr()."""
        return {"name": self._housename,
                "folderpath": str(self._folderpath)}
class XMLDataSetCollection(DataSetCollection):
    """Collection of ``EtreeXMLDataSet`` objects built from a folder of XML files."""

    def __init__(self, housename: str,
                 folderpath: str, xsltstylesheet: str) -> None:
        """
        :param housename: name of the princely house this collection covers
        :param folderpath: folder scanned for ``*.xml`` files
        :param xsltstylesheet: stylesheet path handed to every child dataset
        """
        super().__init__(housename, folderpath)
        self.xsltstylesheet = xsltstylesheet

    # fixed return annotation: the method returns the collection itself,
    # not the datasets dict (the previous dict[str, EtreeXMLDataSet] was wrong)
    def _load(self) -> "XMLDataSetCollection":
        """kedro's API loader method: populate ``self.datasets`` with one
        ``EtreeXMLDataSet`` per XML file, keyed by file stem, and return self."""
        for filepath in sorted(self._folderpath.glob("*.xml")):
            self.datasets[filepath.stem] = EtreeXMLDataSet(str(filepath), self.xsltstylesheet)
        return self
class BsXMLDataSetCollection(DataSetCollection):
    """Collection of ``BsXMLDataSet`` objects built from a folder of XML files."""

    # fixed return annotation: the method returns the collection itself,
    # not the datasets dict (the previous dict[str, BsXMLDataSet] was wrong)
    def _load(self) -> "BsXMLDataSetCollection":
        """kedro's API loader method: (re)build ``self.datasets`` with one
        ``BsXMLDataSet`` per ``*.xml`` file, keyed by file stem, and return self."""
        self.datasets = dict()
        for filepath in sorted(self._folderpath.glob("*.xml")):
            self.datasets[filepath.stem] = BsXMLDataSet(
                filepath=str(filepath))
        return self
class JSONDataSet:
    """Loads/saves a JSON document from/to a local file (kedro-style API)."""

    def __init__(self, filepath: str):
        self._filepath = filepath

    def _load(self) -> Dict:
        """Read and deserialize the JSON file."""
        # utf-8 explicitly: the default text encoding is platform-dependent
        with open(self._filepath, 'r', encoding="utf-8") as fp:
            return json.load(fp)

    def _save(self, data: Dict) -> None:
        """Serialize ``data`` to the file as sorted, pretty-printed JSON."""
        with open(self._filepath, 'w', encoding="utf-8") as fp:
            json.dump(data, fp, sort_keys=True, indent=4)

    def _describe(self) -> Dict[str, Any]:
        """Kedro-API-like repr()."""
        return dict(filepath=self._filepath)
class JSONDataSetCollection(DataSetCollection):
    """Collection of ``JSONDataSet`` objects built from a folder of JSON files."""

    # fixed return annotation: the method returns the collection itself,
    # not the datasets dict (the previous dict[str, JSONDataSet] was wrong)
    def _load(self) -> "JSONDataSetCollection":
        """kedro's API loader method: (re)build ``self.datasets`` with one
        ``JSONDataSet`` per ``*.json`` file, keyed by file stem, and return self."""
        self.datasets = dict()
        for filepath in sorted(self._folderpath.glob("*.json")):
            self.datasets[filepath.stem] = JSONDataSet(
                filepath=str(filepath))
        return self
class TextDataSet:
    """Loads/saves data from/to a text file using an underlying filesystem.

    Example usage (note: the kedro-style methods are ``_save``/``_load``;
    the original docstring wrongly showed public ``save``/``load``):

    >>> string_to_write = "This will go in a file."
    >>> data_set = TextDataSet(filepath="test.md")
    >>> data_set._save(string_to_write)
    >>> reloaded = data_set._load()
    >>> assert string_to_write == reloaded
    """

    def __init__(self, filepath: str):
        self._filepath = filepath

    def _load(self) -> str:
        """Read the whole file as one string."""
        # utf-8 explicitly: the default text encoding is platform-dependent
        with open(self._filepath, 'r', encoding="utf-8") as fhandle:
            return fhandle.read()

    def _save(self, data: str) -> None:
        """Write ``data`` to the file, replacing any existing content."""
        with open(self._filepath, 'w', encoding="utf-8") as fhandle:
            fhandle.write(data)

    def _describe(self) -> Dict[str, Any]:
        """Kedro-API-like repr()."""
        return dict(filepath=self._filepath)
class TextDataSetCollection(DataSetCollection):
    """Collection of ``TextDataSet`` objects built from a folder of pseudo-XML files."""

    # fixed return annotation: the original declared dict[str, JSONDataSet]
    # although the method stores TextDataSet objects and returns self
    def _load(self) -> "TextDataSetCollection":
        """kedro's API loader method: (re)build ``self.datasets`` with one
        ``TextDataSet`` per ``*.pseudoxml`` file, keyed by file stem, and return self."""
        self.datasets = dict()
        for filepath in sorted(self._folderpath.glob("*.pseudoxml")):
            self.datasets[filepath.stem] = TextDataSet(
                filepath=str(filepath))
        return self
#class FoliumHTMLDataSet(AbstractDataSet):
# def __init__(self, filepath: str):
# self._filepath = filepath
#
# def _load(self) -> None:
# raise DataSetError('This dataset is WriteOnly')
#
# def _describe(self) -> Dict[str, Any]:
# return dict(filepath=self._filepath)
#
# def _save(self, data: Map) -> None:
# data.save(self._filepath)