docstrings and typing

develop
gwen 3 years ago
parent 5de2279d28
commit cc19bd0969

@ -1,15 +1,36 @@
# ________________________________________________________________________
# bourbon house
# reading raw bourbon dataset (01_raw layer)
bourbon:
  type: actesdataset.XMLDataSetCollection
  housename: bourbon
  folderpath: data/01_raw/houses/bourbon
# writing bourbon xmlcontent document attribute (02_intermediate layer)
bourbon_content:
  type: actesdataset.XMLDataSetCollection
  housename: bourbon
  folderpath: data/02_intermediate/houses/bourbon
# ________________________________________________________________________
# berry house
# reading raw berry dataset (01_raw layer)
berry:
  type: actesdataset.XMLDataSetCollection
  housename: berry
  folderpath: data/01_raw/houses/berry
# writing berry xmlcontent document attribute (02_intermediate layer)
berry_content:
  type: actesdataset.XMLDataSetCollection
  housename: berry
  folderpath: data/02_intermediate/houses/berry
# ________________________________________________________________________
# anjou house
# reading raw anjou dataset (01_raw layer)
anjou:
  type: actesdataset.XMLDataSetCollection
  # fixed: was "berry" — copy-paste error from the berry entries above;
  # the housename must match the house this entry describes
  housename: anjou
  folderpath: data/01_raw/houses/anjou
# writing anjou xmlcontent document attribute (02_intermediate layer)
anjou_content:
  type: actesdataset.XMLDataSetCollection
  housename: anjou
  folderpath: data/02_intermediate/houses/anjou

@ -10,26 +10,26 @@ logger = logging.getLogger(__name__)
def transform(source_doc: etree._ElementTree, xlststylesheet: str) -> str:
    """Apply the XSLT stylesheet located at *xlststylesheet* to *source_doc*.

    Args:
        source_doc: parsed XML tree to transform.
        xlststylesheet: filesystem path of the XSLT stylesheet.

    Returns:
        The transformation result serialized as a string.
    """
    stylesheet_tree = etree.parse(xlststylesheet)
    apply_stylesheet = etree.XSLT(stylesheet_tree)
    result = apply_stylesheet(source_doc)
    return str(result)
def parse_xml_collection(datasets: Dict[str, XMLDataSet], param: str) -> Dict[str, XMLDataSet]:
    """Node function entry point: batch-transform a collection of XML datasets.

    Args:
        datasets: mapping of filename stem -> XMLDataSet (the raw collection).
        param: path of the XSLT stylesheet passed to transform().

    Returns:
        Mapping of filename stem -> transformed XMLDataSet, with filepaths
        relocated from the 01_raw layer to the 02_intermediate layer.
    """
    output_datasets = dict()
    for dataset_filenamestem, dataset in datasets.items():
        # a manual load is required here, because the per-file dataset
        # **is not** registered in kedro's catalog
        dataset._load()
        # XSLT transformation on each dataset
        output_source_doc = transform(dataset.get_source_doc(), param)
        # set dataset's output filepath by swapping the data-layer folder
        output_filepath = dataset.get_filepath().replace("01_raw", "02_intermediate")
        output_xmldataset = XMLDataSet(output_filepath)
        output_xmldataset.set_source_doc(output_source_doc)
        output_datasets[dataset_filenamestem] = output_xmldataset
        # 02_intermediate: create the output subfolders now, if they don't exist
        output_filepath = Path(output_filepath)
        output_xmldataset_dir = output_filepath.parent
        output_xmldataset_dir.mkdir(parents=True, exist_ok=True)

@ -12,6 +12,19 @@ def create_pipeline(**kwargs) -> Pipeline:
outputs="bourbon_content",
name="bourbon_ds_collection",
),
node(
func=parse_xml_collection,
inputs=["berry", "params:xsltstylesheet"],
outputs="berry_content",
name="berry_ds_collection",
),
node(
func=parse_xml_collection,
inputs=["anjou", "params:xsltstylesheet"],
outputs="anjou_content",
name="anjou_ds_collection",
),
]
)

@ -1,12 +1,11 @@
import logging
import json
from typing import Dict, Any
from pathlib import Path
import logging
from lxml import etree
from kedro.io import AbstractDataSet, DataSetError
from kedro.framework.session import KedroSession
logger = logging.getLogger(__name__)
@ -18,9 +17,11 @@ class XMLDataSet:
self._filepath = filepath
def get_filepath(self) -> str:
    """Return the full filepath of the underlying XML file."""
    return self._filepath
def get_source_doc(self) -> str:
"XML source_doc (xml as a string) getter"
if hasattr(self, 'source_doc'):
return self.source_doc
else:
@ -28,9 +29,11 @@ class XMLDataSet:
raise AttributeError(f"XMLDataSet bject {attr_error_msg} has no attribute named : 'source_doc'")
def set_source_doc(self, source_doc: str) -> None:
    """Store *source_doc* (the XML content as a string) on this dataset."""
    self.source_doc = source_doc
def _transform_source_doc(self) -> etree._ElementTree:
"xml transformer (with element tree)"
# removing namespace
query = "descendant-or-self::*[namespace-uri()!='']"
for element in self.source_doc.xpath(query):
@ -40,15 +43,18 @@ class XMLDataSet:
return self.source_doc
def _load(self) -> etree._ElementTree:
    """kedro's API-like loader: parse the XML file and return the tree.

    Also runs _transform_source_doc(), which rewrites self.source_doc
    in place before it is returned.
    """
    parsed_tree = etree.parse(self._filepath)
    self.source_doc = parsed_tree
    self._transform_source_doc()
    return self.source_doc
def _save(self, data:str) -> None:
"kedro's API-like saver"
with open(self._filepath, 'w') as fhandle:
fhandle.write(data)
def _describe(self) -> Dict[str, Any]:
"kedro's API-like repr()"
return dict(filepath=self._filepath)
@ -63,6 +69,7 @@ class XMLDataSetCollection(AbstractDataSet):
self._folderpath = Path(folderpath)
def get_datasets(self) -> Dict[str, Any]:
"datasets mapper getter"
if hasattr(self, 'datasets'):
return self.datasets
else:
@ -70,6 +77,7 @@ class XMLDataSetCollection(AbstractDataSet):
raise AttributeError(f"Object {attr_error_msg} has no attribute named : 'datasets'")
def _load(self) -> dict[str, XMLDataSet]:
"kedro's API loader"
self.datasets = dict()
for filepath in sorted(self._folderpath.glob("*.xml")):
self.datasets[filepath.stem] = XMLDataSet(
@ -77,10 +85,12 @@ class XMLDataSetCollection(AbstractDataSet):
return self.datasets
def _save(self, datasets: dict[str, XMLDataSet]) -> None:
    """kedro's API saver: persist every dataset of the collection.

    Args:
        datasets: mapping of filename stem -> XMLDataSet to write to disk.

    Each XMLDataSet writes its own source_doc to its own filepath. The
    stem keys are not needed here, so we iterate over values() only
    (the original unpacked items() and left the key unused).
    """
    for dataset in datasets.values():
        dataset._save(dataset.get_source_doc())
def _describe(self) -> dict[str, Any]:
"kedro's API repr()"
return dict(name=self._housename, folderpath=self._folderpath)

Loading…
Cancel
Save