retrieve from catalog

develop
gwen 3 years ago
parent 83f39b8986
commit 20cec1e2bd

@ -12,16 +12,15 @@ logger = logging.getLogger(__name__)
with KedroSession.create() as session:
context = session.load_context()
catalog = context.get_catalog()
# catalog = context.get_catalog() # FIXME: returning a dict here is confusing
def parse_xml_collection(datasetcol: XMLDataSetCollection) -> XMLDataSetCollection:
"node function entry point, performs batch processing"
datasets = datasetcol.datasets
housename = datasetcol._housename
# outputfolderpath = f"data/02_intermediate/houses/{housename}/xml"
output_catalog = catalog[housename + '_xmlcontent']
outputfolderpath = output_catalog['folderpath']
output_datasets = XMLDataSetCollection(housename, str(outputfolderpath))
output_datasets = context.catalog.load(housename + '_xmlcontent')
outputfolderpath = output_datasets._folderpath
for dataset_filenamestem, dataset in datasets.items():
# a manual load is required here, because
# the dataset **is not** registered in kedro's catalog
@ -43,9 +42,8 @@ def make_json_collection(datasetcol: BsXMLDataSetCollection) -> JSONDataSetColle
"node function entry point, performs batch processing"
datasets = datasetcol.datasets
housename = datasetcol._housename
output_catalog = catalog[housename + '_jsonoutput']
outputfolderpath = output_catalog['folderpath']
output_datasets = JSONDataSetCollection(housename, str(outputfolderpath))
output_datasets = context.catalog.load(housename + '_jsonoutput')
outputfolderpath = output_datasets._folderpath
for dataset_filenamestem, dataset in datasets.items():
# a manual load is required here, because
# the dataset **is not** registered in kedro's catalog

@ -76,7 +76,6 @@ class BsXMLDataSet(XMLDataSet):
def _load(self):
"from the xml file, loads a internal xml repr (with bsoup)"
logger.info("------------------------- bsoup loader -------")
with open(self._filepath, 'r', encoding="utf-8") as fhandle:
self.soup = BeautifulSoup(fhandle, 'xml')
## xml.prettify() is the bsoup str(source_doc)
@ -87,7 +86,6 @@ class BsXMLDataSet(XMLDataSet):
json.dump(data, fp, sort_keys=True, indent=4)
def transform(self):
logger.info("---------------- transform --------------")
#soup = make_soup(os.path.join(folder, acte))
# 1.1/ Get all data from XML (9). counter is the id (= numb_acte)
numb = self.soup.TEI["xml:id"] # /TEI[@xml:id] is always the acte's ID
@ -169,7 +167,7 @@ class BsXMLDataSetCollection(DataSetCollection):
return self
class JSONDataSet: #(AbstractDataSet):
class JSONDataSet:
def __init__(self, filepath: str):
self._filepath = filepath

Loading…
Cancel
Save