docstrings and typing

develop
gwen 3 years ago
parent 5de2279d28
commit cc19bd0969

@ -1,15 +1,36 @@
# ________________________________________________________________________ # ________________________________________________________________________
# reading raw bourbon dataset
bourbon: bourbon:
type: actesdataset.XMLDataSetCollection type: actesdataset.XMLDataSetCollection
housename: bourbon housename: bourbon
folderpath: data/01_raw/houses/bourbon folderpath: data/01_raw/houses/bourbon
# writing bourbon xmlcontent document attribute
bourbon_content: bourbon_content:
type: actesdataset.XMLDataSetCollection type: actesdataset.XMLDataSetCollection
housename: bourbon housename: bourbon
folderpath: data/02_intermediate/houses/bourbon folderpath: data/02_intermediate/houses/bourbon
# ________________________________________________________________________ # ________________________________________________________________________
# reading raw berry dataset (input of the `berry_ds_collection` node)
berry:
  type: actesdataset.XMLDataSetCollection
  housename: berry
  folderpath: data/01_raw/houses/berry
# writing berry transformed XML content (output of the `berry_ds_collection` node)
berry_content:
  type: actesdataset.XMLDataSetCollection
  housename: berry
  folderpath: data/02_intermediate/houses/berry
# ________________________________________________________________________
# reading raw anjou dataset (input of the `anjou_ds_collection` node)
anjou:
  type: actesdataset.XMLDataSetCollection
  # was `berry`: copy-paste slip, this entry points at the anjou folder
  housename: anjou
  folderpath: data/01_raw/houses/anjou
# writing anjou transformed XML content (output of the `anjou_ds_collection` node)
anjou_content:
  type: actesdataset.XMLDataSetCollection
  # was `berry`: copy-paste slip, this entry points at the anjou folder
  housename: anjou
  folderpath: data/02_intermediate/houses/anjou

@ -10,26 +10,26 @@ logger = logging.getLogger(__name__)
def transform(source_doc: etree._ElementTree, xlststylesheet: str) -> str:
    """Apply an XSLT stylesheet to a parsed XML document.

    Args:
        source_doc: the parsed XML tree to transform.
        xlststylesheet: location of the XSLT stylesheet, handed as-is to
            ``etree.parse``.

    Returns:
        The result of the transformation, converted with ``str()``.
    """
    # the stylesheet is parsed and compiled on every call
    xslt_doc = etree.parse(xlststylesheet)
    xslt_transformer = etree.XSLT(xslt_doc)
    return str(xslt_transformer(source_doc))
def parse_xml_collection(datasets: Dict[str, XMLDataSet], param: str) -> Dict[str, XMLDataSet]: def parse_xml_collection(datasets: Dict[str, XMLDataSet], param: str) -> Dict[str, XMLDataSet]:
"node function entry point, performs batch processing"
output_datasets = dict() output_datasets = dict()
for dataset_filenamestem, dataset in datasets.items(): for dataset_filenamestem, dataset in datasets.items():
# manually loading the dataset because the collection **is not** # a manual load is required here, because
# registered in the catalog # the dataset **is not** registered in kedro's catalog
dataset._load() dataset._load()
# transformation on each dataset
output_source_doc = transform(dataset.get_source_doc(), param) output_source_doc = transform(dataset.get_source_doc(), param)
# set dataset's output filepath # set dataset's output filepath
output_filepath = dataset.get_filepath().replace("01_raw", "02_intermediate") output_filepath = dataset.get_filepath().replace("01_raw", "02_intermediate")
output_xmldataset = XMLDataSet(output_filepath) output_xmldataset = XMLDataSet(output_filepath)
output_xmldataset.set_source_doc(output_source_doc) output_xmldataset.set_source_doc(output_source_doc)
output_datasets[dataset_filenamestem] = output_xmldataset output_datasets[dataset_filenamestem] = output_xmldataset
# 02_intermediate : # let's create subfolders now, if they don't exist
# let's create subfolders if they don't exist
output_filepath = Path(output_filepath) output_filepath = Path(output_filepath)
output_xmldataset_dir = output_filepath.parent output_xmldataset_dir = output_filepath.parent
output_xmldataset_dir.mkdir(parents=True, exist_ok=True) output_xmldataset_dir.mkdir(parents=True, exist_ok=True)

@ -12,6 +12,19 @@ def create_pipeline(**kwargs) -> Pipeline:
outputs="bourbon_content", outputs="bourbon_content",
name="bourbon_ds_collection", name="bourbon_ds_collection",
), ),
node(
func=parse_xml_collection,
inputs=["berry", "params:xsltstylesheet"],
outputs="berry_content",
name="berry_ds_collection",
),
node(
func=parse_xml_collection,
inputs=["anjou", "params:xsltstylesheet"],
outputs="anjou_content",
name="anjou_ds_collection",
),
] ]
) )

@ -1,12 +1,11 @@
import logging
import json import json
from typing import Dict, Any from typing import Dict, Any
from pathlib import Path from pathlib import Path
import logging
from lxml import etree from lxml import etree
from kedro.io import AbstractDataSet, DataSetError from kedro.io import AbstractDataSet, DataSetError
from kedro.framework.session import KedroSession from kedro.framework.session import KedroSession
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -18,9 +17,11 @@ class XMLDataSet:
self._filepath = filepath self._filepath = filepath
def get_filepath(self) -> str:
    """Return the path of the XML file backing this dataset."""
    return self._filepath
def get_source_doc(self) -> str: def get_source_doc(self) -> str:
"XML source_doc (xml as a string) getter"
if hasattr(self, 'source_doc'): if hasattr(self, 'source_doc'):
return self.source_doc return self.source_doc
else: else:
@ -28,9 +29,11 @@ class XMLDataSet:
raise AttributeError(f"XMLDataSet bject {attr_error_msg} has no attribute named : 'source_doc'") raise AttributeError(f"XMLDataSet bject {attr_error_msg} has no attribute named : 'source_doc'")
def set_source_doc(self, source_doc: str) -> None:
    """Store ``source_doc`` (the XML content) on this dataset.

    Args:
        source_doc: the XML content to attach; it is kept as-is in the
            ``source_doc`` attribute.
    """
    self.source_doc = source_doc
def _transform_source_doc(self) -> etree._ElementTree: def _transform_source_doc(self) -> etree._ElementTree:
"xml transformer (with element tree)"
# removing namespace # removing namespace
query = "descendant-or-self::*[namespace-uri()!='']" query = "descendant-or-self::*[namespace-uri()!='']"
for element in self.source_doc.xpath(query): for element in self.source_doc.xpath(query):
@ -40,15 +43,18 @@ class XMLDataSet:
return self.source_doc return self.source_doc
def _load(self) -> etree._ElementTree:
    """Parse the XML file and return the resulting tree.

    Named after the loader hook of kedro's dataset API. The file at
    ``self._filepath`` is parsed with ``etree.parse``, the tree is stored in
    ``self.source_doc`` and then post-processed by
    ``self._transform_source_doc()``.

    Returns:
        The tree held in ``self.source_doc`` after post-processing.
    """
    self.source_doc = etree.parse(self._filepath)
    self._transform_source_doc()
    return self.source_doc
def _save(self, data:str) -> None: def _save(self, data:str) -> None:
"kedro's API-like saver"
with open(self._filepath, 'w') as fhandle: with open(self._filepath, 'w') as fhandle:
fhandle.write(data) fhandle.write(data)
def _describe(self) -> Dict[str, Any]: def _describe(self) -> Dict[str, Any]:
"kedro's API-like repr()"
return dict(filepath=self._filepath) return dict(filepath=self._filepath)
@ -63,6 +69,7 @@ class XMLDataSetCollection(AbstractDataSet):
self._folderpath = Path(folderpath) self._folderpath = Path(folderpath)
def get_datasets(self) -> Dict[str, Any]: def get_datasets(self) -> Dict[str, Any]:
"datasets mapper getter"
if hasattr(self, 'datasets'): if hasattr(self, 'datasets'):
return self.datasets return self.datasets
else: else:
@ -70,6 +77,7 @@ class XMLDataSetCollection(AbstractDataSet):
raise AttributeError(f"Object {attr_error_msg} has no attribute named : 'datasets'") raise AttributeError(f"Object {attr_error_msg} has no attribute named : 'datasets'")
def _load(self) -> dict[str, XMLDataSet]: def _load(self) -> dict[str, XMLDataSet]:
"kedro's API loader"
self.datasets = dict() self.datasets = dict()
for filepath in sorted(self._folderpath.glob("*.xml")): for filepath in sorted(self._folderpath.glob("*.xml")):
self.datasets[filepath.stem] = XMLDataSet( self.datasets[filepath.stem] = XMLDataSet(
@ -77,10 +85,12 @@ class XMLDataSetCollection(AbstractDataSet):
return self.datasets return self.datasets
def _save(self, datasets: dict[str, XMLDataSet]) -> None: def _save(self, datasets: dict[str, XMLDataSet]) -> None:
"kedro's API saver"
for stemfilename, dataset in datasets.items(): for stemfilename, dataset in datasets.items():
dataset._save(dataset.get_source_doc()) dataset._save(dataset.get_source_doc())
def _describe(self) -> dict[str, Any]: def _describe(self) -> dict[str, Any]:
"kedro's API repr()"
return dict(name=self._housename, folderpath=self._folderpath) return dict(name=self._housename, folderpath=self._folderpath)

Loading…
Cancel
Save