add bsoup

develop
gwen 3 years ago
parent 47f19eb93c
commit 29fab8bc02

@@ -1,7 +1,3 @@
-#essai:
-#  type: actesdataset.TextDataSet
-#  filepath: data/01_raw/csv/actors.csv
 # ________________________________________________________________________
 bourbon:
@@ -9,11 +5,18 @@ bourbon:
   housename: bourbon
   folderpath: data/01_raw/houses/bourbon
+# FIXME change the path to data/02_intermediate/houses/bourbon/xml
 bourbon_content:
   type: actesdataset.XMLDataSetCollection
   housename: bourbon
   folderpath: data/02_intermediate/houses/bourbon
+#bourbon_json:
+#  type: actesdataset.XMLDataSetCollection
+#  housename: bourbon
+#  folderpath: data/02_intermediate/houses/bourbon/json
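+# (assumption: bourbon_json is a placeholder for a future JSON output of the
+# bsoup-based loader, left commented out until that dataset exists)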
 # ________________________________________________________________________
 berry:

@@ -15,17 +15,16 @@ def parse_xml_collection(datasets: Dict[str, EtreeXMLDataSet], param: str) -> Di
         # the dataset **is not** registered in kedro's catalog
         dataset._load()
         descr = dataset._describe()
-        # logger.info(f"dataset {descr} loaded")
+        logger.info(f"dataset {descr} loaded")
         output_source_doc = dataset.transform()
         # set dataset's output filepath
         output_filepath = dataset.filepath.replace("01_raw", "02_intermediate")
         output_xmldataset = EtreeXMLDataSet(output_filepath)
         # let's create subfolders now, if they don't exist
         output_filepath = Path(output_filepath)
         output_xmldataset_dir = output_filepath.parent
         output_xmldataset_dir.mkdir(parents=True, exist_ok=True)
-        # save on file
         output_xmldataset._save(output_source_doc)
         output_datasets[dataset_filenamestem] = output_xmldataset
     return output_datasets

@ -16,7 +16,7 @@ with KedroSession.create() as session:
xlststylesheet = context.params['xsltstylesheet'] xlststylesheet = context.params['xsltstylesheet']
#xlststylesheet = "templates/xsl/actes_princiers.xsl" #xlststylesheet = "templates/xsl/actes_princiers.xsl"
# FIXME make this function a classmethod ? # XXX is it usefull to make this bunch of code a classmethod ?
def _xslt(xsltstylesheet): def _xslt(xsltstylesheet):
"performs XML transformation on each dataset" "performs XML transformation on each dataset"
xslt_doc = etree.parse(xlststylesheet) xslt_doc = etree.parse(xlststylesheet)
@@ -63,6 +63,58 @@ class EtreeXMLDataSet(XMLDataSet):
     def transform(self):
         return str(xslt_transformer(self.source_doc))
+
+
+class BsXMLDataSet(XMLDataSet):
+    "XMLDataSet loader with BeautifulSoup"
+    def _load(self):
+        "from the xml file, loads an internal xml repr (with bsoup)"
+        with open(self._filepath, 'r', encoding="utf-8") as fhandle:
+            self.soup = BeautifulSoup(fhandle, 'xml')
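+            # note: bs4's 'xml' parser is backed by lxml, so lxml must be
+            # installed and BeautifulSoup imported (from bs4 import BeautifulSoup)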
+            ## xml.prettify() is the bsoup str(source_doc)
+    # FIXME
+    # def _save(self, data: Dict) -> None:
+    #     "kedro's API-like saver"
+    #     with open(self._filepath, 'w') as fp:
+    #         json.dump(data, fp, sort_keys=True, indent=4)
+    def transform(self):
+        #soup = make_soup(os.path.join(folder, acte))
+        # 1.1/ Get all data from XML (9). counter is the id (= numb_acte)
+        numb = self.soup.TEI["xml:id"]  # /TEI[@xml:id] is always the acte's ID
+        date_time = self.soup.msItem.docDate["when"]  # YYYY-MM-DD or YYYY-MM date
+        date = self.soup.msItem.docDate.text  # verbose date
+        analyse = self.soup.abstract.p.text  # acte's short analysis
+        ref = self.soup.msIdentifier.find_all("idno", {"n": "2"})
+        # //sourceDesc//msIdentifier/idno[@n='2'] is the doc id inside the
+        # archive box or the page number inside a manuscript (see _create_doc)
+        # warning: the analysis may not have been written yet,
+        # which would result in an IndexError (list index out of range). Hence:
+        if len(ref) > 0:  # there is an analysis
+            ref_acte = ref[0].text
+        else:  # there is no analysis
+            ref_acte = "NS"
+        # prod_place = self.soup.find_all("placeName", {"type": "production_place"})[0].text
+        # //sourceDesc//msIdentifier/idno[@n='1'] is always the
+        # archive box or manuscript collection id
+        # #doc = self.soup.msIdentifier.find_all("idno", {"n": "1"})[0]
+        # #type_diplo = self.soup.body.div["subtype"]
+        # #diplo_state = self.soup.body.div["type"]
+        return {
+            # "num_acte": counter,
+            "filename": numb,
+            "date_time": date_time,
+            "date": date,
+            # "prod_place_acte": place_query[0],
+            "analysis": analyse,
+            # "doc_acte": doc_query[0],
+            "ref_acte": ref_acte,
+            # "state_doc": state_query[0],
+            # "diplo_type_acte": diplo_query[0]
+        }
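+
+# A minimal usage sketch (hypothetical filepath; assumes a TEI acte on disk):
+#     ds = BsXMLDataSet("data/01_raw/houses/bourbon/acte_0001.xml")
+#     ds._load()
+#     record = ds.transform()  # -> dict with filename, date_time, date, analysis, ref_acte
+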
 class XMLDataSetCollection(AbstractDataSet):
     """Stores instances of ``XMLDataSet``
     implementations to provide ``_load`` and ``_save`` capabilities.
@@ -94,7 +146,6 @@ class XMLDataSetCollection(AbstractDataSet):
         "kedro's API repr()"
         return dict(name=self._housename, folderpath=self._folderpath)
 #class TextDataSet:
-#    """loads/saves data from/to a text file using an underlying filesystem
@@ -121,74 +172,6 @@ class XMLDataSetCollection(AbstractDataSet):
 #    def _describe(self) -> Dict[str, Any]:
 #        return dict(filepath=self._filepath)
-#class BsXMLDataSet(XMLDataSet):
-#    "XMLDataSet loaded with BeautifulSoup"
-#    def _load(self) -> str:
-#        "kedro's API-like loader"
-#        self.source_doc = self._load_soup()
-#        return self.source_doc
-#    def _load_soup(self):
-#        """open a xml file and return a BeautifulSoup object"""
-#        with open(self._filepath, 'r', encoding="utf-8") as opening:
-#            xml = BeautifulSoup(opening, 'xml')
-#            self.internal_xml = xml
-#            ## xml.prettify() -> str (source_doc)
-#            return xml.prettify()
-#    def get_internal_xml(self):
-#        "beautiful soup internal DOM"
-#        if hasattr(self, 'internal_xml'):
-#            return self.internal_xml
-#        else:
-#            attr_error_msg = str(self._describe())
-#            raise AttributeError(f"XMLDataSet bject {attr_error_msg} has no attribute named : 'internal_xml'")
-#        return self.internal_xml
-#    def _save(self, data:str) -> None:
-#        "kedro's API-like saver"
-#        raise NotImplementedError("This DataSet shall not be saved...")
-#    def _extract_data(self):
-#        # FIXME -> processing to be moved into nodes.py
-#        # make_soup -> _load_soup -> soup is already loaded
-#        #soup = make_soup(os.path.join(folder, acte))
-#        # 1.1/ Get all data from XML (9). counter is the id (= numb_acte)
-#        numb = soup.TEI["xml:id"] # /TEI[@xml:id] is always the acte's ID
-#        date_time = soup.msItem.docDate["when"] # YYYY-MM-DD or YYYY-MM date
-#        date = soup.msItem.docDate.text # verbose date
-#        analyse = soup.abstract.p.text # acte's short analysis
-#        ref = soup.msIdentifier.find_all("idno", {"n": "2"})
-#        # //sourceDesc//msIdentifier/idno[@n='2'] is the doc id inside the
-#        # archive box or the page number inside a manuscript (see _create_doc)
-#        # warning: the analysis may not have been written yet,
-#        # which would result in List Index Out of Range Error. Hence :
-#        if len(ref) > 0: # there is an analysis
-#            ref_acte = ref[0].text
-#        else: # there is no analysis
-#            ref_acte = "NS"
-#        prod_place = soup.find_all("placeName", {"type": "production_place"})[0].text
-#        # //sourceDesc//msIdentifier/idno[@n='1'] is always the
-#        # archive box or manuscript collection id
-#        doc = soup.msIdentifier.find_all("idno", {"n": "1"})[0]
-#        type_diplo = soup.body.div["subtype"]
-#        diplo_state = soup.body.div["type"]
-#        # 2/ Make the data list
-#        actes.append({
-#            "num_acte": counter,
-#            "filename": numb,
-#            "date_time": date_time,
-#            "date": date,
-#            "prod_place_acte": place_query[0],
-#            "analysis": analyse,
-#            "doc_acte": doc_query[0],
-#            "ref_acte": ref_acte,
-#            "state_doc": state_query[0],
-#            "diplo_type_acte": diplo_query[0]
-#        })
 #class JSONDataSet(AbstractDataSet):
 #    def __init__(self, filepath: str):
 #        self._filepath = filepath
