|
|
|
@ -16,7 +16,7 @@ with KedroSession.create() as session:
|
|
|
|
xlststylesheet = context.params['xsltstylesheet']
|
|
|
|
xlststylesheet = context.params['xsltstylesheet']
|
|
|
|
|
|
|
|
|
|
|
|
#xlststylesheet = "templates/xsl/actes_princiers.xsl"
|
|
|
|
#xlststylesheet = "templates/xsl/actes_princiers.xsl"
|
|
|
|
# FIXME make this function a classmethod ?
|
|
|
|
# XXX is it useful to make this bunch of code a classmethod?
|
|
|
|
def _xslt(xsltstylesheet):
|
|
|
|
def _xslt(xsltstylesheet):
|
|
|
|
"performs XML transformation on each dataset"
|
|
|
|
"performs XML transformation on each dataset"
|
|
|
|
xslt_doc = etree.parse(xlststylesheet)
|
|
|
|
xslt_doc = etree.parse(xlststylesheet)
|
|
|
|
@ -63,6 +63,58 @@ class EtreeXMLDataSet(XMLDataSet):
|
|
|
|
def transform(self):
    """Run the prepared XSLT transformer on the source document.

    Returns the transformed document serialized as a string.
    """
    transformed = xslt_transformer(self.source_doc)
    return str(transformed)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class BsXMLDataSet(XMLDataSet):
    """XMLDataSet loader backed by BeautifulSoup."""

    def _load(self):
        """Parse the XML file at ``self._filepath`` into ``self.soup``.

        Stores the BeautifulSoup DOM on the instance; ``self.soup.prettify()``
        is the textual representation of the document.

        NOTE(review): kedro's ``_load`` is normally expected to *return* the
        loaded data; this implementation stores it on ``self`` and returns
        None -- confirm callers rely on ``self.soup`` rather than the return
        value before changing it.
        """
        with open(self._filepath, 'r', encoding="utf-8") as fhandle:
            self.soup = BeautifulSoup(fhandle, 'xml')

    # TODO(review): a `_save` (kedro API-like saver, json.dump to
    # self._filepath) was sketched here and removed as dead commented-out
    # code; reintroduce it when saving is actually needed.

    def transform(self):
        """Extract the acte's metadata from the loaded TEI document.

        Reads the BeautifulSoup DOM built by ``_load`` (``self.soup`` must
        already be populated) and returns a dict with the acte's id, dates,
        short analysis and archive reference.
        """
        # /TEI[@xml:id] is always the acte's ID
        numb = self.soup.TEI["xml:id"]
        # @when holds the machine-readable YYYY-MM-DD or YYYY-MM date
        date_time = self.soup.msItem.docDate["when"]
        date = self.soup.msItem.docDate.text  # verbose date
        analyse = self.soup.abstract.p.text  # acte's short analysis
        # //sourceDesc//msIdentifier/idno[@n='2'] is the doc id inside the
        # archive box or the page number inside a manuscript (see _create_doc)
        ref = self.soup.msIdentifier.find_all("idno", {"n": "2"})
        # warning: that idno may not have been written yet, in which case
        # find_all returns an empty list -- fall back to "NS" instead of
        # indexing it (avoids IndexError).
        ref_acte = ref[0].text if ref else "NS"
        # TODO(review): fields still to be extracted for a full record:
        # production place (//placeName[@type='production_place']),
        # archive box / collection id (idno[@n='1']),
        # diplomatic type and state (body/div @subtype, @type),
        # and the sequential acte counter.
        return {
            "filename": numb,
            "date_time": date_time,
            "date": date,
            "analysis": analyse,
            "ref_acte": ref_acte,
        }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class XMLDataSetCollection(AbstractDataSet):
|
|
|
|
class XMLDataSetCollection(AbstractDataSet):
|
|
|
|
"""Stores instances of ``XMLDataSet``
|
|
|
|
"""Stores instances of ``XMLDataSet``
|
|
|
|
implementations to provide ``_load`` and ``_save`` capabilities.
|
|
|
|
implementations to provide ``_load`` and ``_save`` capabilities.
|
|
|
|
@ -94,7 +146,6 @@ class XMLDataSetCollection(AbstractDataSet):
|
|
|
|
"kedro's API repr()"
|
|
|
|
"kedro's API repr()"
|
|
|
|
return dict(name=self._housename, folderpath=self._folderpath)
|
|
|
|
return dict(name=self._housename, folderpath=self._folderpath)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#class TextDataSet:
|
|
|
|
#class TextDataSet:
|
|
|
|
# """loads/saves data from/to a text file using an underlying filesystem
|
|
|
|
# """loads/saves data from/to a text file using an underlying filesystem
|
|
|
|
|
|
|
|
|
|
|
|
@ -121,74 +172,6 @@ class XMLDataSetCollection(AbstractDataSet):
|
|
|
|
# def _describe(self) -> Dict[str, Any]:
|
|
|
|
# def _describe(self) -> Dict[str, Any]:
|
|
|
|
# return dict(filepath=self._filepath)
|
|
|
|
# return dict(filepath=self._filepath)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#class BsXMLDataSet(XMLDataSet):
|
|
|
|
|
|
|
|
# "XMLDataSet loaded with BeautifulSoup"
|
|
|
|
|
|
|
|
# def _load(self) -> str:
|
|
|
|
|
|
|
|
# "kedro's API-like loader"
|
|
|
|
|
|
|
|
# self.source_doc = self._load_soup()
|
|
|
|
|
|
|
|
# return self.source_doc
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# def _load_soup(self):
|
|
|
|
|
|
|
|
# """open a xml file and return a BeautifulSoup object"""
|
|
|
|
|
|
|
|
# with open(self._filepath, 'r', encoding="utf-8") as opening:
|
|
|
|
|
|
|
|
# xml = BeautifulSoup(opening, 'xml')
|
|
|
|
|
|
|
|
# self.internal_xml = xml
|
|
|
|
|
|
|
|
# ## xml.prettify() -> str (source_doc)
|
|
|
|
|
|
|
|
# return xml.prettify()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# def get_internal_xml(self):
|
|
|
|
|
|
|
|
# "beautiful soup internal DOM"
|
|
|
|
|
|
|
|
# if hasattr(self, 'internal_xml'):
|
|
|
|
|
|
|
|
# return self.internal_xml
|
|
|
|
|
|
|
|
# else:
|
|
|
|
|
|
|
|
# attr_error_msg = str(self._describe())
|
|
|
|
|
|
|
|
# raise AttributeError(f"XMLDataSet object {attr_error_msg} has no attribute named : 'internal_xml'")
|
|
|
|
|
|
|
|
# return self.internal_xml
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# def _save(self, data:str) -> None:
|
|
|
|
|
|
|
|
# "kedro's API-like saver"
|
|
|
|
|
|
|
|
# raise NotImplementedError("This DataSet shall not be saved...")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# def _extract_data(self):
|
|
|
|
|
|
|
|
# # FIXME -> traitement à déplacer dans le nodes.py
|
|
|
|
|
|
|
|
# # make_soup -> _load_soup -> soup est déjà chargé
|
|
|
|
|
|
|
|
# #soup = make_soup(os.path.join(folder, acte))
|
|
|
|
|
|
|
|
# # 1.1/ Get all data from XML (9). counter is the id (= numb_acte)
|
|
|
|
|
|
|
|
# numb = soup.TEI["xml:id"] # /TEI[@xml:id] is always the acte's ID
|
|
|
|
|
|
|
|
# date_time = soup.msItem.docDate["when"] # YYYY-MM-DD or YYYY-MM date
|
|
|
|
|
|
|
|
# date = soup.msItem.docDate.text # verbose date
|
|
|
|
|
|
|
|
# analyse = soup.abstract.p.text # acte's short analysis
|
|
|
|
|
|
|
|
# ref = soup.msIdentifier.find_all("idno", {"n": "2"})
|
|
|
|
|
|
|
|
# # //sourceDesc//msIdentifier/idno[@n='2'] is the doc id inside the
|
|
|
|
|
|
|
|
# # archive box or the page number inside a manuscript (see _create_doc)
|
|
|
|
|
|
|
|
# # warning: the analysis may not have been written yet,
|
|
|
|
|
|
|
|
# # which would result in List Index Out of Range Error. Hence :
|
|
|
|
|
|
|
|
# if len(ref) > 0: # there is an analysis
|
|
|
|
|
|
|
|
# ref_acte = ref[0].text
|
|
|
|
|
|
|
|
# else: # there is no analysis
|
|
|
|
|
|
|
|
# ref_acte = "NS"
|
|
|
|
|
|
|
|
# prod_place = soup.find_all("placeName", {"type": "production_place"})[0].text
|
|
|
|
|
|
|
|
# # //sourceDesc//msIdentifier/idno[@n='1'] is always the
|
|
|
|
|
|
|
|
# # archive box or manuscript collection id
|
|
|
|
|
|
|
|
# doc = soup.msIdentifier.find_all("idno", {"n": "1"})[0]
|
|
|
|
|
|
|
|
# type_diplo = soup.body.div["subtype"]
|
|
|
|
|
|
|
|
# diplo_state = soup.body.div["type"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# # 2/ Make the data list
|
|
|
|
|
|
|
|
# actes.append({
|
|
|
|
|
|
|
|
# "num_acte": counter,
|
|
|
|
|
|
|
|
# "filename": numb,
|
|
|
|
|
|
|
|
# "date_time": date_time,
|
|
|
|
|
|
|
|
# "date": date,
|
|
|
|
|
|
|
|
# "prod_place_acte": place_query[0],
|
|
|
|
|
|
|
|
# "analysis": analyse,
|
|
|
|
|
|
|
|
# "doc_acte": doc_query[0],
|
|
|
|
|
|
|
|
# "ref_acte": ref_acte,
|
|
|
|
|
|
|
|
# "state_doc": state_query[0],
|
|
|
|
|
|
|
|
# "diplo_type_acte": diplo_query[0]
|
|
|
|
|
|
|
|
# })
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#class JSONDataSet(AbstractDataSet):
|
|
|
|
#class JSONDataSet(AbstractDataSet):
|
|
|
|
# def __init__(self, filepath: str):
|
|
|
|
# def __init__(self, filepath: str):
|
|
|
|
# self._filepath = filepath
|
|
|
|
# self._filepath = filepath
|
|
|
|
|