|
|
|
@ -68,17 +68,18 @@ class BsXMLDataSet(XMLDataSet):
|
|
|
|
|
|
|
|
|
|
|
|
def _load(self):
|
|
|
|
def _load(self):
|
|
|
|
"from the xml file, loads a internal xml repr (with bsoup)"
|
|
|
|
"from the xml file, loads a internal xml repr (with bsoup)"
|
|
|
|
|
|
|
|
logger.info("------------------------- bsoup loader -------")
|
|
|
|
with open(self._filepath, 'r', encoding="utf-8") as fhandle:
|
|
|
|
with open(self._filepath, 'r', encoding="utf-8") as fhandle:
|
|
|
|
self.soup = BeautifulSoup(fhandle, 'xml')
|
|
|
|
self.soup = BeautifulSoup(fhandle, 'xml')
|
|
|
|
## xml.prettify() is the bsoup str(source_doc)
|
|
|
|
## xml.prettify() is the bsoup str(source_doc)
|
|
|
|
|
|
|
|
|
|
|
|
# FIXME
|
|
|
|
def _save(self, data: Dict) -> None:
|
|
|
|
# def _save(self, data: Dict) -> None:
|
|
|
|
"kedro's API-like saver"
|
|
|
|
# "kedro's API-like saver"
|
|
|
|
with open(self._filepath, 'w') as fp:
|
|
|
|
# with open(self._filepath, 'w') as fp:
|
|
|
|
json.dump(data, fp, sort_keys=True, indent=4)
|
|
|
|
# json.dump(data, fp, sort_keys=True, indent=4)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def transform(self):
|
|
|
|
def transform(self):
|
|
|
|
|
|
|
|
logger.info("---------------- transform --------------")
|
|
|
|
#soup = make_soup(os.path.join(folder, acte))
|
|
|
|
#soup = make_soup(os.path.join(folder, acte))
|
|
|
|
# 1.1/ Get all data from XML (9). counter is the id (= numb_acte)
|
|
|
|
# 1.1/ Get all data from XML (9). counter is the id (= numb_acte)
|
|
|
|
numb = self.soup.TEI["xml:id"] # /TEI[@xml:id] is always the acte's ID
|
|
|
|
numb = self.soup.TEI["xml:id"] # /TEI[@xml:id] is always the acte's ID
|
|
|
|
@ -114,9 +115,8 @@ class BsXMLDataSet(XMLDataSet):
|
|
|
|
# "diplo_type_acte": diplo_query[0]
|
|
|
|
# "diplo_type_acte": diplo_query[0]
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class DataSetCollection(AbstractDataSet):
|
|
|
|
class XMLDataSetCollection(AbstractDataSet):
|
|
|
|
"""Stores instances of ``DataSetCollection``
|
|
|
|
"""Stores instances of ``XMLDataSet``
|
|
|
|
|
|
|
|
implementations to provide ``_load`` and ``_save`` capabilities.
|
|
|
|
implementations to provide ``_load`` and ``_save`` capabilities.
|
|
|
|
"""
|
|
|
|
"""
|
|
|
|
def __init__(self,
|
|
|
|
def __init__(self,
|
|
|
|
@ -125,15 +125,6 @@ class XMLDataSetCollection(AbstractDataSet):
|
|
|
|
self._housename = housename
|
|
|
|
self._housename = housename
|
|
|
|
self._folderpath = Path(folderpath)
|
|
|
|
self._folderpath = Path(folderpath)
|
|
|
|
|
|
|
|
|
|
|
|
def _load(self) -> dict[str, EtreeXMLDataSet]:
|
|
|
|
|
|
|
|
"kedro's API loader method"
|
|
|
|
|
|
|
|
self.datasets = dict()
|
|
|
|
|
|
|
|
for filepath in sorted(self._folderpath.glob("*.xml")):
|
|
|
|
|
|
|
|
self.datasets[filepath.stem] = EtreeXMLDataSet(
|
|
|
|
|
|
|
|
filepath=str(filepath))
|
|
|
|
|
|
|
|
# return self.datasets
|
|
|
|
|
|
|
|
return self
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _save(self, data) -> None:
|
|
|
|
def _save(self, data) -> None:
|
|
|
|
"""kedro's API saver method
|
|
|
|
"""kedro's API saver method
|
|
|
|
|
|
|
|
|
|
|
|
@ -145,7 +136,27 @@ class XMLDataSetCollection(AbstractDataSet):
|
|
|
|
|
|
|
|
|
|
|
|
def _describe(self) -> dict[str, Any]:
|
|
|
|
def _describe(self) -> dict[str, Any]:
|
|
|
|
"kedro's API repr()"
|
|
|
|
"kedro's API repr()"
|
|
|
|
return dict(name=self._housename, folderpath=self._folderpath)
|
|
|
|
return dict(name=self._housename,
|
|
|
|
|
|
|
|
folderpath=str(self._folderpath))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class XMLDataSetCollection(DataSetCollection):
|
|
|
|
|
|
|
|
def _load(self) -> dict[str, EtreeXMLDataSet]:
|
|
|
|
|
|
|
|
"kedro's API loader method"
|
|
|
|
|
|
|
|
self.datasets = dict()
|
|
|
|
|
|
|
|
for filepath in sorted(self._folderpath.glob("*.xml")):
|
|
|
|
|
|
|
|
self.datasets[filepath.stem] = EtreeXMLDataSet(
|
|
|
|
|
|
|
|
filepath=str(filepath))
|
|
|
|
|
|
|
|
return self
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class JSONDataSetCollection(DataSetCollection):
|
|
|
|
|
|
|
|
def _load(self) -> dict[str, BsXMLDataSet]:
|
|
|
|
|
|
|
|
"kedro's API loader method"
|
|
|
|
|
|
|
|
self.datasets = dict()
|
|
|
|
|
|
|
|
for filepath in sorted(self._folderpath.glob("*.xml")):
|
|
|
|
|
|
|
|
self.datasets[filepath.stem] = BsXMLDataSet(
|
|
|
|
|
|
|
|
filepath=str(filepath))
|
|
|
|
|
|
|
|
return self
|
|
|
|
|
|
|
|
|
|
|
|
#class TextDataSet:
|
|
|
|
#class TextDataSet:
|
|
|
|
# """loads/saves data from/to a text file using an underlying filesystem
|
|
|
|
# """loads/saves data from/to a text file using an underlying filesystem
|
|
|
|
|