|
|
|
|
@ -7,6 +7,7 @@ from abc import ABC, abstractmethod
|
|
|
|
|
|
|
|
|
|
from lxml import etree
|
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
#from folium import Map
|
|
|
|
|
|
|
|
|
|
from kedro.io import AbstractDataSet, DataSetError
|
|
|
|
|
from kedro.framework.session import KedroSession
|
|
|
|
|
@ -15,12 +16,12 @@ logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
class XMLDataSet(ABC):
    """Abstract base class for an XML dataset loader."""

    def __init__(self, filepath: str) -> None:
        """Remember where the XML file lives.

        Args:
            filepath: path of the XML file backing this dataset.
        """
        self._filepath = filepath

    @property
    def filepath(self) -> str:
        """Return the XML file path given at construction time (read-only)."""
        return self._filepath
|
|
|
|
|
|
|
|
|
|
@ -41,7 +42,7 @@ class EtreeXMLDataSet(XMLDataSet):
|
|
|
|
|
def __init__(self, filepath, params):
    """Store the XML file path and the stylesheet setting.

    Args:
        filepath: path of the XML file; kept in ``self._filepath``.
        params: kept unchanged in ``self.xsltstylesheet``
            (presumably the XSLT stylesheet to apply -- confirm
            against the callers).
    """
    self._filepath = filepath
    self.xsltstylesheet = params
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _load(self):
|
|
|
|
|
"from the xml file loads a internal xml repr (with element tree)"
|
|
|
|
|
# self.source_doc is an etree internal xml repr document
|
|
|
|
|
@ -100,7 +101,7 @@ class BsXMLDataSet(XMLDataSet):
|
|
|
|
|
else: # there is no analysis
|
|
|
|
|
ref_acte = "NS"
|
|
|
|
|
# prod_place = self.soup.find_all("placeName", {"type": "production_place"})[0].text
|
|
|
|
|
# //sourceDesc//msIdentifier/idno[@n='1'] is always the
# archive box or manuscript collection id
|
|
|
|
|
# #doc = self.soup.msIdentifier.find_all("idno", {"n": "1"})[0]
|
|
|
|
|
# #type_diplo = self.soup.body.div["subtype"]
|
|
|
|
|
@ -123,26 +124,26 @@ class DataSetCollection(AbstractDataSet):
|
|
|
|
|
"""Stores instances of ``DataSetCollection``
|
|
|
|
|
implementations to provide ``_load`` and ``_save`` capabilities.
|
|
|
|
|
"""
|
|
|
|
|
def __init__(self,
             housename: str,
             folderpath: str) -> None:
    """Set up an empty collection bound to a folder.

    Args:
        housename: name of the collection; reported as ``name``
            by ``_describe``.
        folderpath: folder associated with the collection; stored
            as a ``Path``.
    """
    self._housename = housename
    self._folderpath = Path(folderpath)
    # the collection itself -- key: file name, value: dataset object
    self.datasets = {}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _save(self, data) -> None:
    """kedro's API saver method.

    There is **nothing to save**, because this dataset collection
    is a *container* dataset. This method is here only because
    kedro requires it.

    Args:
        data: ignored.
    """
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _describe(self) -> dict[str, Any]:
    """kedro's API repr().

    Returns:
        A dict with the collection name under ``name`` and the
        folder path, converted to ``str``, under ``folderpath``.
    """
    return {
        "name": self._housename,
        "folderpath": str(self._folderpath),
    }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -206,18 +207,18 @@ class TextDataSet:
|
|
|
|
|
"""
|
|
|
|
|
def __init__(self, filepath: str) -> None:
    """Remember the path of the text file.

    Args:
        filepath: path of the file that ``_load`` reads and
            ``_save`` writes.
    """
    self._filepath = filepath
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _load(self) -> str:
    """Read the file at ``self._filepath`` and return its full text.

    Returns:
        The complete content of the file as one string.
    """
    # NOTE(review): no explicit encoding is passed, so the platform
    # default is used -- kept as-is to stay symmetric with ``_save``.
    with open(self._filepath, 'r') as fhandle:
        return fhandle.read()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _save(self, data: str) -> None:
    """Write ``data`` to the file at ``self._filepath``.

    The file is opened in ``'w'`` mode, so any previous content is
    replaced.

    Args:
        data: text to write.
    """
    # NOTE(review): no explicit encoding is passed, so the platform
    # default is used -- kept as-is to stay symmetric with ``_load``.
    with open(self._filepath, 'w') as fhandle:
        fhandle.write(data)
|
|
|
|
|
|
|
|
|
|
def _describe(self) -> dict[str, Any]:
    """Describe this dataset.

    Returns:
        A dict holding the file path under the ``filepath`` key.
    """
    return {"filepath": self._filepath}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TextDataSetCollection(DataSetCollection):
|
|
|
|
|
def _load(self) -> dict[str, JSONDataSet]:
|
|
|
|
|
"kedro's API loader method"
|
|
|
|
|
@ -226,4 +227,17 @@ class TextDataSetCollection(DataSetCollection):
|
|
|
|
|
self.datasets[filepath.stem] = TextDataSet(
|
|
|
|
|
filepath=str(filepath))
|
|
|
|
|
return self
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#class FoliumHTMLDataSet(AbstractDataSet):
|
|
|
|
|
# def __init__(self, filepath: str):
|
|
|
|
|
# self._filepath = filepath
|
|
|
|
|
#
|
|
|
|
|
# def _load(self) -> None:
|
|
|
|
|
# raise DataSetError('This dataset is WriteOnly')
|
|
|
|
|
#
|
|
|
|
|
# def _describe(self) -> Dict[str, Any]:
|
|
|
|
|
# return dict(filepath=self._filepath)
|
|
|
|
|
#
|
|
|
|
|
# def _save(self, data: Map) -> None:
|
|
|
|
|
# data.save(self._filepath)
|
|
|
|
|
#
|
|
|
|
|
|