add json output

develop
gwen 3 years ago
parent 021dcab8fb
commit c5238793c4

@ -11,7 +11,12 @@ bourbon_xmlcontent:
folderpath: data/02_intermediate/houses/bourbon/xml folderpath: data/02_intermediate/houses/bourbon/xml
bourbon_json: bourbon_json:
type: actesdataset.XMLDataSetCollection type: actesdataset.JSONDataSetCollection
housename: bourbon
folderpath: data/01_raw/houses/bourbon
bourbon_jsonoutput:
type: actesdataset.JSONDataSetCollection
housename: bourbon housename: bourbon
folderpath: data/02_intermediate/houses/bourbon/json folderpath: data/02_intermediate/houses/bourbon/json

@ -4,7 +4,8 @@ from typing import Dict
from kedro.framework.session import KedroSession from kedro.framework.session import KedroSession
from actesdataset import EtreeXMLDataSet, XMLDataSetCollection from actesdataset import EtreeXMLDataSet, BsXMLDataSet
from actesdataset import XMLDataSetCollection, JSONDataSetCollection
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -24,7 +25,6 @@ def parse_xml_collection(datasetcol: XMLDataSetCollection) -> Dict[str, EtreeXML
# a manual load is required here, because # a manual load is required here, because
# the dataset **is not** registered in kedro's catalog # the dataset **is not** registered in kedro's catalog
dataset._load() dataset._load()
descr = dataset._describe()
output_source_doc = dataset.transform() output_source_doc = dataset.transform()
# set dataset's output filepath # set dataset's output filepath
output_filepath = outputfolderpath / Path(dataset_filenamestem).with_suffix(".pseudoxml") output_filepath = outputfolderpath / Path(dataset_filenamestem).with_suffix(".pseudoxml")
@ -37,3 +37,28 @@ def parse_xml_collection(datasetcol: XMLDataSetCollection) -> Dict[str, EtreeXML
output_datasets[dataset_filenamestem] = output_xmldataset output_datasets[dataset_filenamestem] = output_xmldataset
return output_datasets return output_datasets
def parse_json_collection(datasetcol: JSONDataSetCollection) -> Dict[str, BsXMLDataSet]:
    """Node function entry point, performs batch processing.

    For every dataset in the collection: manually load it, transform it,
    and save the result as a ``.json`` file under the house's configured
    output folder.

    Args:
        datasetcol: the loaded collection; exposes ``datasets`` (a mapping
            of filename stem -> dataset) and the house name.

    Returns:
        Mapping of filename stem -> the saved output dataset.
    """
    datasets = datasetcol.datasets
    housename = datasetcol._housename
    # NOTE(review): `catalog` is not defined in this module's visible scope —
    # presumably injected/imported elsewhere; confirm it resolves at runtime.
    output_catalog = catalog[housename + '_jsonoutput']
    outputfolderpath = output_catalog['folderpath']
    output_datasets = {}
    for dataset_filenamestem, dataset in datasets.items():
        # lazy %-args: the argument is only rendered when INFO is enabled
        logger.info("-------------- parse_json ------------%s", type(dataset))
        # a manual load is required here, because
        # the dataset **is not** registered in kedro's catalog
        dataset._load()
        output_source_doc = dataset.transform()
        # set dataset's output filepath
        output_filepath = outputfolderpath / Path(dataset_filenamestem).with_suffix(".json")
        output_dataset = BsXMLDataSet(str(output_filepath))
        # let's create subfolders, if they don't exist
        output_filepath.parent.mkdir(parents=True, exist_ok=True)
        # save on file (BsXMLDataSet._save serializes `data` as JSON)
        output_dataset._save(output_source_doc)
        output_datasets[dataset_filenamestem] = output_dataset
    return output_datasets

@ -1,7 +1,7 @@
from kedro.pipeline import Pipeline, node, pipeline from kedro.pipeline import Pipeline, node, pipeline
from .nodes import parse_xml_collection from .nodes import parse_xml_collection, parse_json_collection
def create_pipeline(**kwargs) -> Pipeline: def create_pipeline(**kwargs) -> Pipeline:
@ -13,12 +13,12 @@ def create_pipeline(**kwargs) -> Pipeline:
outputs="bourbon_xmlcontent", outputs="bourbon_xmlcontent",
name="bourbon_ds_collection", name="bourbon_ds_collection",
), ),
# node( node(
# func=parse_json_collection, func=parse_json_collection,
# inputs="bourbon", inputs="bourbon_json",
# outputs="bourbon_json", outputs="bourbon_jsonoutput",
# name="bourbon_json_ds_collection", name="bourbon_json_ds_collection",
# ), ),
# node( # node(
# func=parse_xml_collection, # func=parse_xml_collection,
# inputs="berry", # inputs="berry",

@ -68,17 +68,18 @@ class BsXMLDataSet(XMLDataSet):
def _load(self): def _load(self):
"from the xml file, loads a internal xml repr (with bsoup)" "from the xml file, loads a internal xml repr (with bsoup)"
logger.info("------------------------- bsoup loader -------")
with open(self._filepath, 'r', encoding="utf-8") as fhandle: with open(self._filepath, 'r', encoding="utf-8") as fhandle:
self.soup = BeautifulSoup(fhandle, 'xml') self.soup = BeautifulSoup(fhandle, 'xml')
## xml.prettify() is the bsoup str(source_doc) ## xml.prettify() is the bsoup str(source_doc)
# FIXME def _save(self, data: Dict) -> None:
# def _save(self, data: Dict) -> None: "kedro's API-like saver"
# "kedro's API-like saver" with open(self._filepath, 'w') as fp:
# with open(self._filepath, 'w') as fp: json.dump(data, fp, sort_keys=True, indent=4)
# json.dump(data, fp, sort_keys=True, indent=4)
def transform(self): def transform(self):
logger.info("---------------- transform --------------")
#soup = make_soup(os.path.join(folder, acte)) #soup = make_soup(os.path.join(folder, acte))
# 1.1/ Get all data from XML (9). counter is the id (= numb_acte) # 1.1/ Get all data from XML (9). counter is the id (= numb_acte)
numb = self.soup.TEI["xml:id"] # /TEI[@xml:id] is always the acte's ID numb = self.soup.TEI["xml:id"] # /TEI[@xml:id] is always the acte's ID
@ -114,9 +115,8 @@ class BsXMLDataSet(XMLDataSet):
# "diplo_type_acte": diplo_query[0] # "diplo_type_acte": diplo_query[0]
} }
class DataSetCollection(AbstractDataSet):
class XMLDataSetCollection(AbstractDataSet): """Stores instances of ``DataSetCollection``
"""Stores instances of ``XMLDataSet``
implementations to provide ``_load`` and ``_save`` capabilities. implementations to provide ``_load`` and ``_save`` capabilities.
""" """
def __init__(self, def __init__(self,
@ -125,15 +125,6 @@ class XMLDataSetCollection(AbstractDataSet):
self._housename = housename self._housename = housename
self._folderpath = Path(folderpath) self._folderpath = Path(folderpath)
def _load(self) -> dict[str, EtreeXMLDataSet]:
"kedro's API loader method"
self.datasets = dict()
for filepath in sorted(self._folderpath.glob("*.xml")):
self.datasets[filepath.stem] = EtreeXMLDataSet(
filepath=str(filepath))
# return self.datasets
return self
def _save(self, data) -> None: def _save(self, data) -> None:
"""kedro's API saver method """kedro's API saver method
@ -145,7 +136,27 @@ class XMLDataSetCollection(AbstractDataSet):
def _describe(self) -> dict[str, Any]: def _describe(self) -> dict[str, Any]:
"kedro's API repr()" "kedro's API repr()"
return dict(name=self._housename, folderpath=self._folderpath) return dict(name=self._housename,
folderpath=str(self._folderpath))
class XMLDataSetCollection(DataSetCollection):
    """Collection whose ``_load`` discovers ``*.xml`` files under the
    configured folder and wraps each one in an ``EtreeXMLDataSet``."""

    def _load(self) -> "XMLDataSetCollection":
        """kedro's API loader method.

        Populates ``self.datasets`` (filename stem -> ``EtreeXMLDataSet``)
        and returns the collection itself — the original annotation claimed
        a ``dict`` return, but callers receive ``self`` and read
        ``.datasets`` from it.
        """
        self.datasets = {
            filepath.stem: EtreeXMLDataSet(filepath=str(filepath))
            for filepath in sorted(self._folderpath.glob("*.xml"))
        }
        return self
class JSONDataSetCollection(DataSetCollection):
    """Collection whose ``_load`` discovers source files and wraps each in
    a ``BsXMLDataSet``.

    NOTE(review): this globs ``*.xml`` on purpose — the collection reads
    raw XML sources which are later transformed and saved as JSON; confirm
    this is the intended behavior.
    """

    def _load(self) -> "JSONDataSetCollection":
        """kedro's API loader method.

        Populates ``self.datasets`` (filename stem -> ``BsXMLDataSet``)
        and returns the collection itself — the original annotation claimed
        a ``dict`` return, but the method returns ``self``.
        """
        self.datasets = {
            filepath.stem: BsXMLDataSet(filepath=str(filepath))
            for filepath in sorted(self._folderpath.glob("*.xml"))
        }
        return self
#class TextDataSet: #class TextDataSet:
# """loads/saves data from/to a text file using an underlying filesystem # """loads/saves data from/to a text file using an underlying filesystem

Loading…
Cancel
Save