develop
gwen 3 years ago
parent 5b2894256e
commit e06f83a7f5

@ -1,3 +1,7 @@
#essai:
# type: actesdataset.TextDataSet
# filepath: data/01_raw/csv/actors.csv
# ________________________________________________________________________ # ________________________________________________________________________
bourbon: bourbon:

@ -17,28 +17,14 @@ class XMLDataSet:
def __init__(self, filepath: str) -> None: def __init__(self, filepath: str) -> None:
self._filepath = filepath self._filepath = filepath
# xml etree internal representation
self._dom = None
# xml as an str output
self._str = None
def _load(self): # def _load(self):
"kedro's API-like loader" # "kedro's API-like loader"
pass # pass
#
def _save(self, data:str) -> None: # def _save(self, data:str) -> None:
"kedro's API-like saver" # "kedro's API-like saver"
pass # pass
@property
def tostring(self) -> str:
    """Return the XML document as a string.

    Returns:
        The value stored in ``self._str``.

    Raises:
        AttributeError: if ``self._str`` is missing or still ``None``,
            i.e. no string representation has been stored yet.
    """
    # FIXME: load _dom first, then generate the str here
    # A default of None makes an instance that never had `_str` set fail
    # with the same descriptive message as an instance that is not loaded.
    if getattr(self, '_str', None) is not None:
        return self._str
    attr_error_msg = str(self._describe())
    raise AttributeError(f"XMLDataSet dom object {attr_error_msg} has not been loaded yet")
@property @property
def filepath(self) -> str: def filepath(self) -> str:
@ -67,9 +53,6 @@ class XMLDataSet:
class EtreeXMLDataSet(XMLDataSet): class EtreeXMLDataSet(XMLDataSet):
"XMLDataSet loader with lxml.etree (lxml.etree._ElementTree)" "XMLDataSet loader with lxml.etree (lxml.etree._ElementTree)"
def __str__(self):
    "Return the XML document as a string (delegates to the `tostring` property)."
    # The visible code defines no `str` attribute; the string accessor
    # is the `tostring` property, which raises AttributeError when unloaded.
    return self.tostring
def _transform_source_doc(self) -> etree._ElementTree: def _transform_source_doc(self) -> etree._ElementTree:
"xml transformer (with element tree)" "xml transformer (with element tree)"
self.source_doc = etree.parse(self._filepath) self.source_doc = etree.parse(self._filepath)
@ -88,74 +71,6 @@ class EtreeXMLDataSet(XMLDataSet):
"kedro's API-like saver" "kedro's API-like saver"
with open(self._filepath, 'w') as fhandle: with open(self._filepath, 'w') as fhandle:
fhandle.write(data) fhandle.write(data)
class BsXMLDataSet(XMLDataSet):
    "XMLDataSet loaded with BeautifulSoup"

    def _load(self) -> str:
        "kedro's API-like loader: parse the file and return the prettified XML string"
        self.source_doc = self._load_soup()
        return self.source_doc

    def _load_soup(self) -> str:
        """Parse the XML file at ``self._filepath`` with BeautifulSoup.

        The parsed soup is kept on ``self.internal_xml``; the return value
        is the prettified XML string (``xml.prettify()``), not the soup.
        """
        with open(self._filepath, 'r', encoding="utf-8") as opening:
            xml = BeautifulSoup(opening, 'xml')
        self.internal_xml = xml
        return xml.prettify()

    def get_internal_xml(self):
        """Return the BeautifulSoup internal DOM.

        Raises:
            AttributeError: if ``internal_xml`` has not been set yet.
        """
        if hasattr(self, 'internal_xml'):
            return self.internal_xml
        attr_error_msg = str(self._describe())
        raise AttributeError(f"XMLDataSet bject {attr_error_msg} has no attribute named : 'internal_xml'")

    def _save(self, data: str) -> None:
        "kedro's API-like saver (this dataset is read-only)"
        raise NotImplementedError("This DataSet shall not be saved...")

    def _extract_data(self, actes=None, counter=None):
        """Extract the acte's metadata from the loaded soup.

        Args:
            actes: optional list; when given, the record is appended to it.
            counter: optional value stored under ``"num_acte"``.

        Returns:
            The record dict built from the XML document.

        Raises:
            AttributeError: if the soup has not been loaded yet.
        """
        # FIXME: this processing should move to nodes.py
        # The soup is the one stored by _load_soup.
        soup = self.get_internal_xml()

        # 1/ Get all data from the XML
        numb = soup.TEI["xml:id"]  # /TEI[@xml:id] is always the acte's ID
        date_time = soup.msItem.docDate["when"]  # YYYY-MM-DD or YYYY-MM date
        date = soup.msItem.docDate.text  # verbose date
        analyse = soup.abstract.p.text  # acte's short analysis

        # //sourceDesc//msIdentifier/idno[@n='2'] is the doc id inside the
        # archive box or the page number inside a manuscript.
        # It may be missing, hence the "NS" fallback instead of an IndexError.
        ref = soup.msIdentifier.find_all("idno", {"n": "2"})
        ref_acte = ref[0].text if ref else "NS"

        prod_place = soup.find_all("placeName", {"type": "production_place"})[0].text
        # //sourceDesc//msIdentifier/idno[@n='1'] is always the
        # archive box or manuscript collection id
        doc = soup.msIdentifier.find_all("idno", {"n": "1"})[0]
        type_diplo = soup.body.div["subtype"]
        diplo_state = soup.body.div["type"]

        # 2/ Build the record
        # NOTE(review): the original referenced place_query, doc_query,
        # state_query and diplo_query, which are never defined here; the
        # values extracted above are used instead -- confirm against the
        # consumer of this record.
        record = {
            "num_acte": counter,
            "filename": numb,
            "date_time": date_time,
            "date": date,
            "prod_place_acte": prod_place,
            "analysis": analyse,
            "doc_acte": doc.text,
            "ref_acte": ref_acte,
            "state_doc": diplo_state,
            "diplo_type_acte": type_diplo,
        }
        if actes is not None:
            actes.append(record)
        return record
class XMLDataSetCollection(AbstractDataSet): class XMLDataSetCollection(AbstractDataSet):
@ -184,7 +99,7 @@ class XMLDataSetCollection(AbstractDataSet):
filepath=str(filepath)) filepath=str(filepath))
return self.datasets return self.datasets
def _save(self, datasets: dict[str, EtreeXMLDataSet]) -> None: def _save(self, datasets: dict[str, Any]) -> None:
"kedro's API saver method" "kedro's API saver method"
for stemfilename, dataset in datasets.items(): for stemfilename, dataset in datasets.items():
dataset._save(dataset.get_source_doc()) dataset._save(dataset.get_source_doc())
@ -194,6 +109,100 @@ class XMLDataSetCollection(AbstractDataSet):
return dict(name=self._housename, folderpath=self._folderpath) return dict(name=self._housename, folderpath=self._folderpath)
#class TextDataSet:
# """loads/saves data from/to a text file using an underlying filesystem
# example usage
# >>> string_to_write = "This will go in a file."
# >>>
# >>> data_set = TextDataSet(filepath="test.md")
# >>> data_set.save(string_to_write)
# >>> reloaded = data_set.load()
# >>> assert string_to_write == reloaded
# """
# def __init__(self, filepath: str):
# self._filepath = filepath
#
# def _load(self) -> str:
# with open(self._filepath, 'r') as fhandle:
# return fhandle.read()
# def _save(self, data: str) -> None:
# with open(self._filepath, 'w') as fhandle:
# fhandle.write(data)
# def _describe(self) -> Dict[str, Any]:
# return dict(filepath=self._filepath)
#class BsXMLDataSet(XMLDataSet):
# "XMLDataSet loaded with BeautifulSoup"
# def _load(self) -> str:
# "kedro's API-like loader"
# self.source_doc = self._load_soup()
# return self.source_doc
# def _load_soup(self):
# """open a xml file and return a BeautifulSoup object"""
# with open(self._filepath, 'r', encoding="utf-8") as opening:
# xml = BeautifulSoup(opening, 'xml')
# self.internal_xml = xml
# ## xml.prettify() -> str (source_doc)
# return xml.prettify()
# def get_internal_xml(self):
# "beautiful soup internal DOM"
# if hasattr(self, 'internal_xml'):
# return self.internal_xml
# else:
# attr_error_msg = str(self._describe())
# raise AttributeError(f"XMLDataSet bject {attr_error_msg} has no attribute named : 'internal_xml'")
# return self.internal_xml
# def _save(self, data:str) -> None:
# "kedro's API-like saver"
# raise NotImplementedError("This DataSet shall not be saved...")
# def _extract_data(self):
# # FIXME -> traitement à déplacer dans le nodes.py
# # make_soup -> _load_soup -> soup est déjà chargé
# #soup = make_soup(os.path.join(folder, acte))
# # 1.1/ Get all data from XML (9). counter is the id (= numb_acte)
# numb = soup.TEI["xml:id"] # /TEI[@xml:id] is always the acte's ID
# date_time = soup.msItem.docDate["when"] # YYYY-MM-DD or YYYY-MM date
# date = soup.msItem.docDate.text # verbose date
# analyse = soup.abstract.p.text # acte's short analysis
# ref = soup.msIdentifier.find_all("idno", {"n": "2"})
# # //sourceDesc//msIdentifier/idno[@n='2'] is the doc id inside the
# # archive box or the page number inside a manuscript (see _create_doc)
# # warning: the analysis may not have been written yet,
# # which would result in List Index Out of Range Error. Hence :
# if len(ref) > 0: # there is an analysis
# ref_acte = ref[0].text
# else: # there is no analysis
# ref_acte = "NS"
# prod_place = soup.find_all("placeName", {"type": "production_place"})[0].text
# # //sourceDesc//msIdentifier/idno[@n='1'] is always the
# # archive box or manuscript collection id
# doc = soup.msIdentifier.find_all("idno", {"n": "1"})[0]
# type_diplo = soup.body.div["subtype"]
# diplo_state = soup.body.div["type"]
# # 2/ Make the data list
# actes.append({
# "num_acte": counter,
# "filename": numb,
# "date_time": date_time,
# "date": date,
# "prod_place_acte": place_query[0],
# "analysis": analyse,
# "doc_acte": doc_query[0],
# "ref_acte": ref_acte,
# "state_doc": state_query[0],
# "diplo_type_acte": diplo_query[0]
# })
#class JSONDataSet(AbstractDataSet): #class JSONDataSet(AbstractDataSet):
# def __init__(self, filepath: str): # def __init__(self, filepath: str):
# self._filepath = filepath # self._filepath = filepath
@ -208,3 +217,4 @@ class XMLDataSetCollection(AbstractDataSet):
# def _describe(self) -> Dict[str, Any]: # def _describe(self) -> Dict[str, Any]:
# return dict(filepath=self._filepath) # return dict(filepath=self._filepath)

Loading…
Cancel
Save