diff --git a/actes-princiers/conf/base/catalog.yml b/actes-princiers/conf/base/catalog.yml index b839a02..f7d3b4a 100644 --- a/actes-princiers/conf/base/catalog.yml +++ b/actes-princiers/conf/base/catalog.yml @@ -5,13 +5,15 @@ bourbon: type: actesdataset.XMLDataSetCollection housename: bourbon folderpath: data/01_raw/xml/Bourbon - + xsltstylesheet: templates/xsl/actes_princiers.xsl + # output (write) **pseudo xml** dataset bourbon_xmlcontent: type: actesdataset.XMLDataSetCollection housename: bourbon folderpath: data/02_intermediate/xml/Bourbon/xml - + xsltstylesheet: templates/xsl/actes_princiers.xsl + # input (read) **pseudo xml** dataset bourbon_pseudoxmlcontent: type: actesdataset.TextDataSetCollection diff --git a/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py b/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py index 928f7a4..5187cb4 100755 --- a/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py +++ b/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py @@ -12,10 +12,9 @@ logger = logging.getLogger(__name__) with KedroSession.create() as session: context = session.load_context() -# catalog = context.get_catalog() # FIXME : ça porte à confusion de renvoyer un dict +# catalog = context.get_catalog() - -def parse_xml_collection(datasetcol: XMLDataSetCollection) -> XMLDataSetCollection: +def parse_xml_collection(datasetcol: XMLDataSetCollection, params: str) -> XMLDataSetCollection: "node function entry point, performs batch processing" datasets = datasetcol.datasets housename = datasetcol._housename @@ -28,7 +27,7 @@ def parse_xml_collection(datasetcol: XMLDataSetCollection) -> XMLDataSetCollecti output_source_doc = dataset.transform() # set dataset's output filepath output_filepath = outputfolderpath / Path(dataset_filenamestem).with_suffix(".pseudoxml") - output_xmldataset = EtreeXMLDataSet(str(output_filepath)) + output_xmldataset = EtreeXMLDataSet(str(output_filepath), params) # let's create subfolders, if they don't exist output_xmldataset_dir = output_filepath.parent output_xmldataset_dir.mkdir(parents=True, exist_ok=True) diff --git a/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py b/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py index aeb764d..4ee3b48 100755 --- a/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py +++ b/actes-princiers/src/actes_princiers/pipelines/xml_processing/pipeline.py @@ -10,7 +10,7 @@ def create_pipeline(**kwargs) -> Pipeline: [ node( func=parse_xml_collection, - inputs="bourbon", + inputs=["bourbon", "params:xsltstylesheet"], outputs="bourbon_xmlcontent", name="bourbon_ds_collection", ), diff --git a/actes-princiers/src/actesdataset.py b/actes-princiers/src/actesdataset.py index 77039c7..6ff8357 100644 --- a/actes-princiers/src/actesdataset.py +++ b/actes-princiers/src/actesdataset.py @@ -13,20 +13,6 @@ from kedro.framework.session import KedroSession logger = logging.getLogger(__name__) -with KedroSession.create() as session: - context = session.load_context() - xlststylesheet = context.params['xsltstylesheet'] - -#xlststylesheet = "templates/xsl/actes_princiers.xsl" -# XXX is it usefull to make this bunch of code a classmethod ? -def _xslt(xsltstylesheet): - "performs XML transformation on each dataset" - xslt_doc = etree.parse(xlststylesheet) - xslt_transformer = etree.XSLT(xslt_doc) - return xslt_transformer - -xslt_transformer = _xslt(xlststylesheet) - class XMLDataSet(ABC): "Abstract base class for an XML dataset loader" @@ -52,6 +38,10 @@ class XMLDataSet(ABC): class EtreeXMLDataSet(XMLDataSet): "XMLDataSet loader with lxml.etree (lxml.etree._ElementTree)" + def __init__(self, filepath, params): + self._filepath = filepath + self.xsltstylesheet = params + def _load(self): "from the xml file loads a internal xml repr (with element tree)" # self.source_doc is an etree internal xml repr document @@ -67,8 +57,16 @@ class EtreeXMLDataSet(XMLDataSet): "kedro's API-like saver" with open(self._filepath, 'w') as fhandle: fhandle.write(data) - + + @staticmethod + def _xslt(xsltstylesheet): + "performs XML transformation on each dataset" + xslt_doc = etree.parse(xsltstylesheet) + xslt_transformer = etree.XSLT(xslt_doc) + return xslt_transformer + def transform(self): + xslt_transformer = self._xslt(self.xsltstylesheet) return str(xslt_transformer(self.source_doc)) class BsXMLDataSet(XMLDataSet): @@ -149,11 +147,15 @@ class DataSetCollection(AbstractDataSet): class XMLDataSetCollection(DataSetCollection): + def __init__(self, housename: str, + folderpath: str, xsltstylesheet: str) -> None: + super().__init__(housename, folderpath) + self.xsltstylesheet = xsltstylesheet + def _load(self) -> dict[str, EtreeXMLDataSet]: "kedro's API loader method" for filepath in sorted(self._folderpath.glob("*.xml")): - self.datasets[filepath.stem] = EtreeXMLDataSet( - filepath=str(filepath)) + self.datasets[filepath.stem] = EtreeXMLDataSet(str(filepath), self.xsltstylesheet) return self