add xslt stylesheet in the catalog collection

develop
gwen 3 years ago
parent 537003eefa
commit e1b34140f0

@ -5,12 +5,14 @@ bourbon:
type: actesdataset.XMLDataSetCollection type: actesdataset.XMLDataSetCollection
housename: bourbon housename: bourbon
folderpath: data/01_raw/xml/Bourbon folderpath: data/01_raw/xml/Bourbon
xsltstylesheet: templates/xsl/actes_princiers.xsl
# output (write) **pseudo xml** dataset # output (write) **pseudo xml** dataset
bourbon_xmlcontent: bourbon_xmlcontent:
type: actesdataset.XMLDataSetCollection type: actesdataset.XMLDataSetCollection
housename: bourbon housename: bourbon
folderpath: data/02_intermediate/xml/Bourbon/xml folderpath: data/02_intermediate/xml/Bourbon/xml
xsltstylesheet: templates/xsl/actes_princiers.xsl
# input (read) **pseudo xml** dataset # input (read) **pseudo xml** dataset
bourbon_pseudoxmlcontent: bourbon_pseudoxmlcontent:

@ -12,10 +12,9 @@ logger = logging.getLogger(__name__)
with KedroSession.create() as session: with KedroSession.create() as session:
context = session.load_context() context = session.load_context()
# catalog = context.get_catalog() # FIXME : ça porte à confusion de renvoyer un dict # catalog = context.get_catalog()
def parse_xml_collection(datasetcol: XMLDataSetCollection, params: str) -> XMLDataSetCollection:
def parse_xml_collection(datasetcol: XMLDataSetCollection) -> XMLDataSetCollection:
"node function entry point, performs batch processing" "node function entry point, performs batch processing"
datasets = datasetcol.datasets datasets = datasetcol.datasets
housename = datasetcol._housename housename = datasetcol._housename
@ -28,7 +27,7 @@ def parse_xml_collection(datasetcol: XMLDataSetCollection) -> XMLDataSetCollecti
output_source_doc = dataset.transform() output_source_doc = dataset.transform()
# set dataset's output filepath # set dataset's output filepath
output_filepath = outputfolderpath / Path(dataset_filenamestem).with_suffix(".pseudoxml") output_filepath = outputfolderpath / Path(dataset_filenamestem).with_suffix(".pseudoxml")
output_xmldataset = EtreeXMLDataSet(str(output_filepath)) output_xmldataset = EtreeXMLDataSet(str(output_filepath), params)
# let's create subfolders, if they don't exist # let's create subfolders, if they don't exist
output_xmldataset_dir = output_filepath.parent output_xmldataset_dir = output_filepath.parent
output_xmldataset_dir.mkdir(parents=True, exist_ok=True) output_xmldataset_dir.mkdir(parents=True, exist_ok=True)

@ -10,7 +10,7 @@ def create_pipeline(**kwargs) -> Pipeline:
[ [
node( node(
func=parse_xml_collection, func=parse_xml_collection,
inputs="bourbon", inputs=["bourbon", "params:xsltstylesheet"],
outputs="bourbon_xmlcontent", outputs="bourbon_xmlcontent",
name="bourbon_ds_collection", name="bourbon_ds_collection",
), ),

@ -13,20 +13,6 @@ from kedro.framework.session import KedroSession
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# Open a Kedro session and load the project context so the parameters can be read.
with KedroSession.create() as session:
context = session.load_context()
# NOTE(review): "xlst" looks like a misspelling of "xslt"; the name is read again
# further down, so renaming it means updating those uses as well.
xlststylesheet = context.params['xsltstylesheet']
# Hard-coded stylesheet path, left commented out:
#xlststylesheet = "templates/xsl/actes_princiers.xsl"
# XXX is it useful to make this bunch of code a classmethod?
def _xslt(xsltstylesheet):
    """Compile an XSLT stylesheet into a callable lxml transformer.

    Args:
        xsltstylesheet: stylesheet source handed to ``etree.parse``.

    Returns:
        The ``etree.XSLT`` object built from the parsed stylesheet.
    """
    # Parse the argument itself: the previous body ignored it and silently
    # read the module-level ``xlststylesheet`` (note the transposed letters),
    # so calling the helper with any other stylesheet had no effect.
    xslt_doc = etree.parse(xsltstylesheet)
    return etree.XSLT(xslt_doc)


# Built once, from the stylesheet named in the project parameters.
xslt_transformer = _xslt(xlststylesheet)
class XMLDataSet(ABC): class XMLDataSet(ABC):
"Abstract base class for an XML dataset loader" "Abstract base class for an XML dataset loader"
@ -52,6 +38,10 @@ class XMLDataSet(ABC):
class EtreeXMLDataSet(XMLDataSet): class EtreeXMLDataSet(XMLDataSet):
"XMLDataSet loader with lxml.etree (lxml.etree._ElementTree)" "XMLDataSet loader with lxml.etree (lxml.etree._ElementTree)"
def __init__(self, filepath, params):
    """Record the file backing this dataset and the stylesheet used by transform().

    filepath -- path of the file this dataset reads from / writes to
    params -- stored unchanged as ``self.xsltstylesheet``; callers pass the
              XSLT stylesheet reference here
    """
    self.xsltstylesheet = params
    self._filepath = filepath
def _load(self): def _load(self):
"from the xml file loads a internal xml repr (with element tree)" "from the xml file loads a internal xml repr (with element tree)"
# self.source_doc is an etree internal xml repr document # self.source_doc is an etree internal xml repr document
@ -68,7 +58,15 @@ class EtreeXMLDataSet(XMLDataSet):
with open(self._filepath, 'w') as fhandle: with open(self._filepath, 'w') as fhandle:
fhandle.write(data) fhandle.write(data)
@staticmethod
def _xslt(xsltstylesheet):
    """Build an lxml XSLT transformer from the given stylesheet.

    The stylesheet is parsed with ``etree.parse`` and the resulting document
    is wrapped in ``etree.XSLT``; the transformer is returned, nothing is
    transformed here.
    """
    return etree.XSLT(etree.parse(xsltstylesheet))
def transform(self): def transform(self):
xslt_transformer = self._xslt(self.xsltstylesheet)
return str(xslt_transformer(self.source_doc)) return str(xslt_transformer(self.source_doc))
class BsXMLDataSet(XMLDataSet): class BsXMLDataSet(XMLDataSet):
@ -149,11 +147,15 @@ class DataSetCollection(AbstractDataSet):
class XMLDataSetCollection(DataSetCollection): class XMLDataSetCollection(DataSetCollection):
def __init__(
    self,
    housename: str,
    folderpath: str,
    xsltstylesheet: str,
) -> None:
    """Initialise the collection and remember its XSLT stylesheet.

    ``housename`` and ``folderpath`` are forwarded to the parent
    initialiser; ``xsltstylesheet`` is kept on the instance so it can be
    handed to each dataset created by the loader.
    """
    # Parent set-up runs first, then the stylesheet is attached.
    super().__init__(housename, folderpath)
    self.xsltstylesheet = xsltstylesheet
def _load(self) -> dict[str, EtreeXMLDataSet]: def _load(self) -> dict[str, EtreeXMLDataSet]:
"kedro's API loader method" "kedro's API loader method"
for filepath in sorted(self._folderpath.glob("*.xml")): for filepath in sorted(self._folderpath.glob("*.xml")):
self.datasets[filepath.stem] = EtreeXMLDataSet( self.datasets[filepath.stem] = EtreeXMLDataSet(str(filepath), self.xsltstylesheet)
filepath=str(filepath))
return self return self

Loading…
Cancel
Save