From 20cec1e2bd89b8ce9a7952f48455c280ea6c0b51 Mon Sep 17 00:00:00 2001 From: gwen Date: Sat, 8 Jul 2023 22:44:40 +0200 Subject: [PATCH] retrieve from catalog --- .../pipelines/xml_processing/nodes.py | 14 ++++++-------- actes-princiers/src/actesdataset.py | 4 +--- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py b/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py index 38a9b88..17b4ee6 100755 --- a/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py +++ b/actes-princiers/src/actes_princiers/pipelines/xml_processing/nodes.py @@ -12,16 +12,15 @@ logger = logging.getLogger(__name__) with KedroSession.create() as session: context = session.load_context() - catalog = context.get_catalog() +# catalog = context.get_catalog() # FIXME : ça porte à confusion de renvoyer un dict + def parse_xml_collection(datasetcol: XMLDataSetCollection) -> XMLDataSetCollection: "node function entry point, performs batch processing" datasets = datasetcol.datasets housename = datasetcol._housename -# outputfolderpath = f"data/02_intermediate/houses/{housename}/xml" - output_catalog = catalog[housename + '_xmlcontent'] - outputfolderpath = output_catalog['folderpath'] - output_datasets = XMLDataSetCollection(housename, str(outputfolderpath)) + output_datasets = context.catalog.load(housename + '_xmlcontent') + outputfolderpath = output_datasets._folderpath for dataset_filenamestem, dataset in datasets.items(): # a manual load is required here, because # the dataset **is not** registered in kedro's catalog @@ -43,9 +42,8 @@ def make_json_collection(datasetcol: BsXMLDataSetCollection) -> JSONDataSetColle "node function entry point, performs batch processing" datasets = datasetcol.datasets housename = datasetcol._housename - output_catalog = catalog[housename + '_jsonoutput'] - outputfolderpath = output_catalog['folderpath'] - output_datasets = JSONDataSetCollection(housename, str(outputfolderpath)) + output_datasets = context.catalog.load(housename + '_jsonoutput') + outputfolderpath = output_datasets._folderpath for dataset_filenamestem, dataset in datasets.items(): # a manual load is required here, because # the dataset **is not** registered in kedro's catalog diff --git a/actes-princiers/src/actesdataset.py b/actes-princiers/src/actesdataset.py index ee568ec..0bfda10 100644 --- a/actes-princiers/src/actesdataset.py +++ b/actes-princiers/src/actesdataset.py @@ -76,7 +76,6 @@ class BsXMLDataSet(XMLDataSet): def _load(self): "from the xml file, loads a internal xml repr (with bsoup)" - logger.info("------------------------- bsoup loader -------") with open(self._filepath, 'r', encoding="utf-8") as fhandle: self.soup = BeautifulSoup(fhandle, 'xml') ## xml.prettify() is the bsoup str(source_doc) @@ -87,7 +86,6 @@ class BsXMLDataSet(XMLDataSet): json.dump(data, fp, sort_keys=True, indent=4) def transform(self): - logger.info("---------------- transform --------------") #soup = make_soup(os.path.join(folder, acte)) # 1.1/ Get all data from XML (9). counter is the id (= numb_acte) numb = self.soup.TEI["xml:id"] # /TEI[@xml:id] is always the acte's ID @@ -169,7 +167,7 @@ class BsXMLDataSetCollection(DataSetCollection): return self -class JSONDataSet: #(AbstractDataSet): +class JSONDataSet: def __init__(self, filepath: str): self._filepath = filepath