initial commit -- split div
parent
484e1870a7
commit
711604829d
@ -0,0 +1,105 @@
|
|||||||
|
import os
|
||||||
|
import re
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
|
||||||
|
def make_soup(file: str):
    """Parse the XML file at path *file* and return the BeautifulSoup tree.

    The file is read as UTF-8 and handed to BeautifulSoup with the ``xml``
    parser.
    """
    with open(file, mode='r', encoding="utf-8") as handle:
        # The document is fully parsed by the constructor, so returning from
        # inside the ``with`` block is safe: the handle is closed on exit.
        return BeautifulSoup(handle, 'xml')
|
||||||
|
|
||||||
|
|
||||||
|
def split_div(xml_file: str, prince: str) -> dict:
    """Split a corpus file into its actes, keyed by a generated id.

    Steps:

    1) read the XML file with ``make_soup``;
    2) collect every ``<div>`` that carries an ``n`` attribute (the actes);
    3) build each acte's id as ``prince`` + the ``when`` attribute of the
       acte's ``div/docDate/date`` (with ``-`` replaced by ``_``) + the ``n``
       attribute of that same ``<date>``.  Two actes sharing a date must be
       told apart by that ``n`` attribute ('a', 'b', etc.);
    4) map each id to its ``<div>``.

    :param xml_file: path of the XML corpus to split.
    :param prince: prefix put in front of every acte id.
    :return: dict mapping acte id -> the acte's ``<div>`` tag, in document
        order.
    :raises KeyError: if a ``<date>`` lacks its ``when`` or ``n`` attribute.

    If several actes produce the same id, the last one wins (as in a plain
    dict) and the colliding ids are reported on stdout so the double dates
    can be fixed in the corpus.
    """
    # 1
    soup = make_soup(xml_file)
    # 2
    actes = soup.find_all('div', {'n': True})
    # 3 + 4: ids and the id -> div mapping are built in a single pass, so the
    # two can never get out of step.
    dict_actes = {}
    duplicated_ids = []
    for acte in actes:
        date_tag = acte.div.docDate.date
        acte_id = prince + date_tag["when"].replace("-", "_") + date_tag["n"]
        if acte_id in dict_actes:
            duplicated_ids.append(acte_id)
        dict_actes[acte_id] = acte
    if duplicated_ids:
        # A collision means fewer entries than actes: warn instead of
        # dropping actes silently.
        print(f"** ERROR **\nduplicated actes ids == {duplicated_ids}\n"
              f"actes_id == {len(dict_actes)}\nactes == {len(actes)}")
    return dict_actes
|
||||||
|
|
||||||
|
def create_file(actes, house, folder, tei_canvas):
    """Write each acte to its own XML file, then parse it back and print it.

    For every key of ``actes`` the associated value is converted with ``str``
    and written (UTF-8) to ``../static/xml/<house>/<folder>/<key>.xml``.  The
    file is then reopened, parsed with BeautifulSoup's ``xml`` parser and the
    resulting tree is printed.

    :param actes: mapping of acte id -> acte content.
    :param house: sub-directory name under ``../static/xml``.
    :param folder: sub-directory name under ``house``.
    :param tei_canvas: not used by this function.
        NOTE(review): presumably the TEI template the acte is meant to be
        inserted into -- confirm.
    """
    for acte_id in actes:
        target = os.path.join("..", "static", "xml", house, folder,
                              acte_id + '.xml')
        with open(target, mode='w', encoding='utf-8') as out_file:
            out_file.write(str(actes[acte_id]))
        # Read the file back to check that what was written parses as XML.
        with open(target, mode='r', encoding='utf-8') as in_file:
            print(BeautifulSoup(in_file, "xml"))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# TEI document template for a single acte.  The ``++ACTE++`` marker inside
# <body> is a placeholder for the acte's content.  This is a plain string,
# not an f-string: it has no replacement fields, so literal braces stay safe.
canvas = """<?xml version="1.0" encoding="utf-8"?>
<TEI xmlns="http://www.tei-c.org/ns/1.0">
<teiHeader>
<fileDesc>
<titleStmt>
<title level="s">Actes princiers</title>
<title level="m">Actes de Charles Ier de Bourbon</title>
<title level="a"></title>
<respStmt>
<resp>transcribed by</resp>
<name>Jean-Damien Généro</name>
</respStmt>
</titleStmt>
<editionStmt>
<edition>Acte édité dans le cadre du programme Actes princiers.</edition>
<respStmt>
<resp>direction scientifique</resp>
<name>Olivier Mattéoni</name>
</respStmt>
<respStmt>
<resp>direction technique</resp>
<name>Jean-Damien Généro</name>
</respStmt>
<respStmt>
<resp>direction technique</resp>
<name>Nicolas Perreaux</name>
</respStmt>
</editionStmt>
<publicationStmt>
<publisher>Laboratoire de Médiévistique occidentale de Paris (UMR 8589), Centre de recherches historiques (UMR 8558)</publisher>
<authority>Olivier Mattéoni</authority>
<date when="2022">2022</date>
<availability><licence source="https://github.com/etalab/licence-ouverte/blob/master/open-licence.md">Distributed under an Open License 2.0</licence></availability>
</publicationStmt>
<sourceDesc>
</sourceDesc>
</fileDesc>
</teiHeader>
<text>
<body>
++ACTE++
</body>
</text>
</TEI>"""
|
||||||
|
|
||||||
|
|
||||||
|
# Path of the XML corpus holding all the actes of Charles Ier de Bourbon.
corpus = "../../bourbon-latex/charles-actes-latex.xml"

# Split the corpus into actes and write one XML file per acte.
create_file(
    actes=split_div(xml_file=corpus, prince="brb_ch_i_"),
    house="Bourbon",
    folder="Brb_5_Charles_Ier",
    tei_canvas=canvas,
)
|
||||||
Loading…
Reference in New Issue