# Origin: patch 711604829dce359afb8109b6c6e10a7c021006c5
# Author: jgenero -- Wed, 12 Oct 2022 15:05:25 +0200
# Subject: initial commit -- split div
# File: app/cmd/split_div.py
"""Split a corpus XML file into one XML file per acte.

Every ``<div>`` carrying an ``@n`` attribute is treated as one acte. Each
acte gets an id built from a prefix and its ``docDate/date`` attributes, and
is written to ``../static/xml/<house>/<folder>/<id>.xml`` (relative to the
current working directory).
"""
import os
import re
from bs4 import BeautifulSoup


def make_soup(file: str) -> BeautifulSoup:
    """Open an XML file (UTF-8) and return it parsed as a BeautifulSoup object.

    :param file: path of the XML file to read.
    :return: the document parsed with BeautifulSoup's ``'xml'`` feature.
    """
    with open(file, 'r', encoding="utf-8") as opening:
        xml = BeautifulSoup(opening, 'xml')
    return xml


def split_div(xml_file: str, prince: str) -> dict:
    """Map every acte of *xml_file* to a unique id.

    Steps:
      1. parse the XML file with :func:`make_soup`;
      2. collect every ``<div>`` that has an ``@n`` attribute (one per acte);
      3. build each acte's id as ``prince + @when + @n`` where ``@when`` and
         ``@n`` are read from the ``date`` found through
         ``acte.div.docDate.date`` and the ``-`` of ``@when`` are replaced
         by ``_`` (e.g. ``brb_ch_i_`` + ``1434-05-12`` + ``a`` gives
         ``brb_ch_i_1434_05_12a``);
      4. pair ids and actes in a dict.

    Actes sharing the same date must be told apart by their ``date/@n``
    (``a``, ``b``, ...). If two actes still end up with the same id, one of
    them would silently overwrite the other in the dict, so this is reported
    as an error instead.

    :param xml_file: path of the corpus XML file.
    :param prince: prefix prepended to every acte id.
    :return: dict mapping acte id (str) to the acte's ``<div>`` tag.
    :raises ValueError: if several actes produce the same id.
    :raises KeyError: if a ``date`` lacks ``@when`` or ``@n`` (raised by bs4).
    """
    # 1
    soup = make_soup(xml_file)
    # 2
    actes = soup.find_all('div', {'n': True})
    # 3 + 4
    dict_actes = {}
    duplicated_ids = []
    for acte in actes:
        date_tag = acte.div.docDate.date
        when = date_tag["when"]
        date_nb = date_tag["n"]
        acte_id = prince + when.replace("-", "_") + date_nb
        if acte_id in dict_actes:
            # Remember the clash; raising after the loop reports all of them.
            duplicated_ids.append(acte_id)
        dict_actes[acte_id] = acte
    if duplicated_ids:
        raise ValueError(
            "duplicate acte ids (check //docDate/date/@n): "
            + ", ".join(sorted(set(duplicated_ids)))
        )
    return dict_actes


def create_file(actes: dict, house: str, folder: str, tei_canvas: str) -> None:
    """Write each acte to its own XML file, then re-parse and print it.

    Files are written to ``../static/xml/<house>/<folder>/<id>.xml``; the
    path is relative to the current working directory and the target
    directory must already exist.

    :param actes: dict mapping acte id to the acte markup, as returned by
        :func:`split_div`.
    :param house: sub-directory of ``../static/xml``.
    :param folder: sub-directory of *house* receiving the files.
    :param tei_canvas: template text. NOTE(review): currently unused -- the
        ``++ACTE++`` placeholder of the template is never substituted and
        only the bare acte is written; confirm the intended output.
    """
    for acte_id, acte in actes.items():
        filename = os.path.join("..", "static", "xml", house, folder, acte_id + '.xml')
        with open(filename, 'w', encoding='utf-8') as writing:
            writing.write(str(acte))
        # Read the file back and parse it so that what was written can be
        # inspected on stdout.
        with open(filename, 'r', encoding='utf-8') as xml:
            soup = BeautifulSoup(xml, "xml")
        print(soup)


# Template meant to wrap an acte; ``++ACTE++`` marks the insertion point.
# NOTE(review): no element markup is visible in this copy of the template --
# confirm its content against the repository before relying on it.
canvas = """




Actes princiers
Actes de Charles Ier de Bourbon


transcribed by
Jean-Damien Généro



Acte édité dans le cadre du programme Actes princiers.

direction scientifique
Olivier Mattéoni


direction technique
Jean-Damien Généro


direction technique
Nicolas Perreaux



Laboratoire de Médiévistique occidentale de Paris (UMR 8589), Centre de recherches historiques (UMR 8558)
Olivier Mattéoni
2022
Distributed under an Open License 2.0







++ACTE++


"""


corpus = "../../bourbon-latex/charles-actes-latex.xml"

# Only run the extraction when executed as a script, so that importing this
# module does not read the corpus or write files.
if __name__ == "__main__":
    create_file(split_div(corpus, "brb_ch_i_"), "Bourbon", "Brb_5_Charles_Ier", canvas)