update charles ier

main
Jean-Damien Genero 4 years ago
parent 489d0b365b
commit 7673649c95

@ -22,52 +22,59 @@ def beautiful_clean_up(path):
# acts # acts
separator = r'<p rend=".+">\n +<seg rend="Aucun"><hi rend="sub">-\*<\/hi>-<\/seg>\n +<\/p>' separator = r'<p rend=".+">\n +<seg rend="Aucun"><hi rend="sub">-\*<\/hi>-<\/seg>\n +<\/p>'
reading = re.sub(separator, '</div><div>', reading) reading = re.sub(separator, '</div><div>', reading)
reading = reading.replace('<figure>', '<div>') reading = re.sub(r'<body>\n +<figure>', '<body>\n<div>', reading)
reading = reading.replace('</figure>', '</div>') reading = reading.replace(r'<\/figure>\n +<\/body>', '</div>\n</body>')
reading = reading.replace('<figure>', '')
reading = reading.replace('</figure>', '')
reading = reading.replace('<ident>', '')
reading = reading.replace('</ident>', '')
reading = reading.replace("", "'")
soup = BeautifulSoup(reading, 'xml') soup = BeautifulSoup(reading, 'xml')
# titre # titre
titleStmt = soup.titleStmt titleStmt = soup.titleStmt
title = soup.new_tag('title') # title = soup.new_tag('title')
author = soup.new_tag('author') author = soup.new_tag('author')
title.string = 'Actes d\'Agnès de Bourgogne' # title.string = 'Actes de Charles Ier'
author.string = 'Jean-Damien Généro' author.string = 'Jean-Damien Généro'
titleStmt.append(title) # titleStmt.append(title)
titleStmt.append(author) titleStmt.append(author)
# publicationStmt # publicationStmt
publicationStmt = soup.publicationStmt # publicationStmt = soup.publicationStmt
date = soup.new_tag('date') # date = soup.new_tag('date')
date['when'] = '2022' # date['when'] = '2022'
date.string = '2022' # date.string = '2022'
publisher = soup.new_tag('publisher') # publisher = soup.new_tag('publisher')
publisher.string = 'Laboratoire de Médiévistique occidentale de Paris (UMR 8589), Centres de recherches historiques (UMR 8558)' # publisher.string = 'Laboratoire de Médiévistique occidentale de Paris (UMR 8589), Centres de recherches historiques (UMR 8558)'
availability = soup.new_tag('availability') # availability = soup.new_tag('availability')
licence = soup.new_tag('licence') # licence = soup.new_tag('licence')
licence['source'] = 'https://github.com/etalab/licence-ouverte/blob/master/open-licence.md' # licence['source'] = 'https://github.com/etalab/licence-ouverte/blob/master/open-licence.md'
licence.string = 'Distributed under an Open License 2.0' # licence.string = 'Distributed under an Open License 2.0'
availability.append(licence) # availability.append(licence)
publicationStmt.append(publisher) # publicationStmt.append(publisher)
publicationStmt.append(date) # publicationStmt.append(date)
publicationStmt.append(availability) # publicationStmt.append(availability)
# sourceDesc # sourceDesc
sourceDesc = soup.sourceDesc # sourceDesc = soup.sourceDesc
sourceDesc_p = soup.new_tag('p') # sourceDesc_p = soup.new_tag('p')
sourceDesc_p.string = 'Les actes ci-dessous sont issus de plusieurs dépôts d\'archives.' # sourceDesc_p.string = 'Les actes ci-dessous sont issus de plusieurs dépôts d\'archives.'
sourceDesc.append(sourceDesc_p) # sourceDesc.append(sourceDesc_p)
# removing all empty tags # removing all empty tags
for empty_tag in soup.find_all(): for empty_tag in soup.find_all():
if len(empty_tag.get_text(strip=True)) == 0: if len(empty_tag.get_text(strip=True)) == 0:
empty_tag.extract() empty_tag.extract()
# number # number
for match in soup.find_all('div'): for match in soup.findAll('ident'):
match['xml:id'] = 'agnes-' + match.p.string match.replaceWithChildren()
match['n'] = match.p.string
# witness
for match in soup.findAll('seg'): for match in soup.findAll('seg'):
match.replaceWithChildren() match.replaceWithChildren()
for match in soup.findAll('emph'): for match in soup.findAll('emph'):
match.replaceWithChildren() match.replaceWithChildren()
for match in soup.findAll('name'): for match in soup.findAll('name'):
match.replaceWithChildren() match.replaceWithChildren()
for match in soup.find_all('div'):
match['xml:id'] = 'charles_ier_' + match.p.string.replace(' ', '')
match['n'] = match.p.string.replace(' ', '')
# witness
for tag in soup.find_all('p'): for tag in soup.find_all('p'):
try: try:
tag['rend'] tag['rend']
@ -85,17 +92,18 @@ def beautiful_clean_up(path):
result = result.replace('p rend="Corps ', 'p rend="') result = result.replace('p rend="Corps ', 'p rend="')
result = result.replace(' ', ' ') result = result.replace(' ', ' ')
result = result.replace('\n', '') result = result.replace('\n', '')
with open("/home/genero/Bureau/Bourbon/agnes_actes-2.xml", 'w', encoding='utf-8') as writting: result = result.replace('.replace(' ', '')', '')
writting.write(result)
newsoup = BeautifulSoup(result, 'xml') newsoup = BeautifulSoup(result, 'xml')
# with open('../static/xml/Bourbon/5-Charles-Ier/corpus2.xml', 'w', encoding='utf-8') as op:
# op.write(newsoup.prettify())
body = newsoup.body body = newsoup.body
for div in body.find_all('div'): for div in body.find_all('div'):
#print("\n\n" + div['xml:id']) #print("\n\n" + div['xml:id'])
filename = 'agnes_' + div['n'] + '.xml' filename = 'charles_ier_' + div['n'] + '.xml'
with open(os.path.join("./agnes_actes", filename), 'r', encoding='utf-8') as opening: with open(os.path.join("../static/xml/Bourbon/5-Charles-Ier", filename), 'r', encoding='utf-8') as opening:
soup_act = BeautifulSoup(opening, 'xml') soup_act = BeautifulSoup(opening, 'xml')
soup_act.body.append(div) soup_act.body.append(div)
with open(os.path.join("./agnes_actes", filename), 'w', encoding='utf-8') as writting: with open(os.path.join("../static/xml/Bourbon/5-Charles-Ier", filename), 'w', encoding='utf-8') as writting:
writting.write(soup_act.prettify()) writting.write(soup_act.prettify())
@ -105,11 +113,11 @@ def teiheader_making(file, tei_canvas):
with open(file, 'r', encoding='utf-8') as opening: with open(file, 'r', encoding='utf-8') as opening:
csvfile = csv.reader(opening, delimiter=";") csvfile = csv.reader(opening, delimiter=";")
for line in csvfile: for line in csvfile:
filename = 'agnes_' + line[0] + ".xml" filename = 'charles_ier_' + line[0] + ".xml"
soup.title.string = 'Acte {} d\'Agnès de Bourgogne ({})'.format(line[0], line[4]) soup.title.string = 'Acte {} de Charles Ier de Bourbon ({})'.format(line[0], line[4])
p_sourceDesc.string = f"""Ce fichier contient l'acte n°{line[0]} d'Agnès de Bourgogne, {line[9]} ({line[2]}, {line[1]}) daté du {line[4]} à {line[5]}.""" p_sourceDesc.string = f"""Ce fichier contient l'acte n°{line[0]} de Charles Ier de Bourbon, {line[9]} ({line[2]}, {line[1]}) daté du {line[4]} à {line[5]}."""
soup.sourceDesc.append(p_sourceDesc) soup.sourceDesc.append(p_sourceDesc)
with open(os.path.join("./agnes_actes", filename), 'w', encoding="utf-8") as writting: with open(os.path.join("../static/xml/Bourbon/5-Charles-Ier", filename), 'w', encoding="utf-8") as writting:
writting.write(str(soup)) writting.write(str(soup))
@ -136,6 +144,6 @@ canvas = f"""<?xml version="1.0" encoding="utf-8"?>
</text> </text>
</TEI>""" </TEI>"""
teiheader_making("./actes-ducs-bourbon/base-donnees-m2/agnes_actes.csv", canvas) teiheader_making("../static/csv/corpus-charles-i.csv", canvas)
beautiful_clean_up("/home/genero/Bureau/Bourbon/agnes_actes-1.xml") beautiful_clean_up("../static/xml/Bourbon/5-Charles-Ier/corpus-charles-i.xml")

Loading…
Cancel
Save