import os
import re

from bs4 import BeautifulSoup


def make_soup(file: str):
    """Open an XML file and return it as a BeautifulSoup object.

    :param file: path of the XML file; it is read as UTF-8.
    :return: the document parsed with BeautifulSoup's ``xml`` parser.
    """
    with open(file, 'r', encoding="utf-8") as opening:
        return BeautifulSoup(opening, 'xml')


def split_div(xml_file: str, prince: str) -> dict:
    """Split an XML file into its numbered ``div`` elements, keyed by id.

    Steps:
      1. parse ``xml_file`` with :func:`make_soup`;
      2. collect every ``div`` element that carries an ``n`` attribute;
      3. build one id per collected ``div``: ``prince`` + the ``when``
         attribute of ``div/docDate/date`` (looked up through the first
         nested ``div``) with ``-`` replaced by ``_`` + the ``n`` attribute
         of that same ``date`` element;
      4. pair each id with its ``div``.

    Two actes sharing the same date must be told apart by the ``n``
    attribute of their ``date`` element ('a', 'b', ...); otherwise they
    produce the same id and the later one replaces the earlier one in the
    returned dict.

    :param xml_file: path of the XML file to split.
    :param prince: prefix prepended to every generated id.
    :return: dict mapping each id to its ``div`` element.
    :raises ValueError: if the number of ids differs from the number of
        collected ``div`` elements.
    """
    # 1
    soup = make_soup(xml_file)

    # 2
    actes = list(soup.find_all('div', {'n': True}))

    # 3
    actes_id = []
    for acte in actes:
        date_tag = acte.div.docDate.date
        actes_id.append(
            prince + date_tag["when"].replace("-", "_") + date_tag["n"]
        )

    # Debugging aid: uncomment to print consecutive identical ids
    # (i.e. double dates that lack a distinguishing @n).
    # prev_acte = None
    # for id_acte in actes_id:
    #     if prev_acte == id_acte:
    #         print(prev_acte, id_acte)
    #     prev_acte = id_acte

    # 4
    if len(actes_id) != len(actes):
        # Previously this path only printed a message and then failed with
        # an UnboundLocalError on the return statement; fail explicitly.
        raise ValueError(
            f"actes_id == {len(actes_id)}, actes == {len(actes)}"
        )
    return dict(zip(actes_id, actes))


def create_file(actes, house, folder, tei_canvas):
    """Write each acte to its own XML file, then parse it back and print it.

    Files are written to ``../static/xml/<house>/<folder>/<id>.xml``,
    relative to the current working directory. The target directory is not
    created here.

    :param actes: dict mapping an acte id to its content; ``str()`` of the
        content is what gets written.
    :param house: sub-directory of ``../static/xml``.
    :param folder: sub-directory of ``house``.
    :param tei_canvas: currently unused.
        NOTE(review): presumably the acte is meant to be inserted into this
        template (it contains a ``++ACTE++`` marker) -- confirm the intent.
    """
    for acte_id, content in actes.items():
        filename = os.path.join(
            "..", "static", "xml", house, folder, acte_id + '.xml'
        )
        with open(filename, 'w', encoding='utf-8') as writing:
            writing.write(str(content))
        # Re-read the file just written so the output shown is the parsed
        # version of what is on disk.
        with open(filename, 'r', encoding='utf-8') as xml:
            soup = BeautifulSoup(xml, "xml")
        print(soup)


# Template text handed to create_file as ``tei_canvas``.
canvas = f""" Actes princiers Actes de Charles Ier de Bourbon transcribed by Jean-Damien Généro Acte édité dans le cadre du programme Actes princiers. 
direction scientifique Olivier Mattéoni direction technique Jean-Damien Généro direction technique Nicolas Perreaux Laboratoire de Médiévistique occidentale de Paris (UMR 8589), Centre de recherches historiques (UMR 8558) Olivier Mattéoni 2022 Distributed under an Open License 2.0 ++ACTE++ """

corpus = "../../bourbon-latex/charles-actes-latex.xml"

create_file(split_div(corpus, "brb_ch_i_"), "Bourbon", "Brb_5_Charles_Ier", canvas)