retrieve from catalog

develop
gwen 3 years ago
parent 83f39b8986
commit 20cec1e2bd

@ -12,16 +12,15 @@ logger = logging.getLogger(__name__)
with KedroSession.create() as session: with KedroSession.create() as session:
context = session.load_context() context = session.load_context()
catalog = context.get_catalog() # catalog = context.get_catalog() # FIXME : ça porte à confusion de renvoyer un dict
def parse_xml_collection(datasetcol: XMLDataSetCollection) -> XMLDataSetCollection: def parse_xml_collection(datasetcol: XMLDataSetCollection) -> XMLDataSetCollection:
"node function entry point, performs batch processing" "node function entry point, performs batch processing"
datasets = datasetcol.datasets datasets = datasetcol.datasets
housename = datasetcol._housename housename = datasetcol._housename
# outputfolderpath = f"data/02_intermediate/houses/{housename}/xml" output_datasets = context.catalog.load(housename + '_xmlcontent')
output_catalog = catalog[housename + '_xmlcontent'] outputfolderpath = output_datasets._folderpath
outputfolderpath = output_catalog['folderpath']
output_datasets = XMLDataSetCollection(housename, str(outputfolderpath))
for dataset_filenamestem, dataset in datasets.items(): for dataset_filenamestem, dataset in datasets.items():
# a manual load is required here, because # a manual load is required here, because
# the dataset **is not** registered in kedro's catalog # the dataset **is not** registered in kedro's catalog
@ -43,9 +42,8 @@ def make_json_collection(datasetcol: BsXMLDataSetCollection) -> JSONDataSetColle
"node function entry point, performs batch processing" "node function entry point, performs batch processing"
datasets = datasetcol.datasets datasets = datasetcol.datasets
housename = datasetcol._housename housename = datasetcol._housename
output_catalog = catalog[housename + '_jsonoutput'] output_datasets = context.catalog.load(housename + '_jsonoutput')
outputfolderpath = output_catalog['folderpath'] outputfolderpath = output_datasets._folderpath
output_datasets = JSONDataSetCollection(housename, str(outputfolderpath))
for dataset_filenamestem, dataset in datasets.items(): for dataset_filenamestem, dataset in datasets.items():
# a manual load is required here, because # a manual load is required here, because
# the dataset **is not** registered in kedro's catalog # the dataset **is not** registered in kedro's catalog

@ -76,7 +76,6 @@ class BsXMLDataSet(XMLDataSet):
def _load(self): def _load(self):
"from the xml file, loads a internal xml repr (with bsoup)" "from the xml file, loads a internal xml repr (with bsoup)"
logger.info("------------------------- bsoup loader -------")
with open(self._filepath, 'r', encoding="utf-8") as fhandle: with open(self._filepath, 'r', encoding="utf-8") as fhandle:
self.soup = BeautifulSoup(fhandle, 'xml') self.soup = BeautifulSoup(fhandle, 'xml')
## xml.prettify() is the bsoup str(source_doc) ## xml.prettify() is the bsoup str(source_doc)
@ -87,7 +86,6 @@ class BsXMLDataSet(XMLDataSet):
json.dump(data, fp, sort_keys=True, indent=4) json.dump(data, fp, sort_keys=True, indent=4)
def transform(self): def transform(self):
logger.info("---------------- transform --------------")
#soup = make_soup(os.path.join(folder, acte)) #soup = make_soup(os.path.join(folder, acte))
# 1.1/ Get all data from XML (9). counter is the id (= numb_acte) # 1.1/ Get all data from XML (9). counter is the id (= numb_acte)
numb = self.soup.TEI["xml:id"] # /TEI[@xml:id] is always the acte's ID numb = self.soup.TEI["xml:id"] # /TEI[@xml:id] is always the acte's ID
@ -169,7 +167,7 @@ class BsXMLDataSetCollection(DataSetCollection):
return self return self
class JSONDataSet: #(AbstractDataSet): class JSONDataSet:
def __init__(self, filepath: str): def __init__(self, filepath: str):
self._filepath = filepath self._filepath = filepath

Loading…
Cancel
Save