You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

142 lines
4.9 KiB
Python

This file contains invisible Unicode characters!

This file contains invisible Unicode characters that may be processed differently from what appears below. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to reveal hidden characters.

#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
Authors : Jean-Damien Généro, Bertrand Dumenieu
Affiliation : French National Center for Scientific Research (CNRS), École des hautes études en sciences sociales (EHESS)
Assigned at the Centre de recherches historiques (CRH, UMR 8558)
Date : 2021-12-31
Update : 2022-01-04
"""
import csv
import os
import re
from bs4 import BeautifulSoup
def _fill_tei_header(soup):
    """Add title, publication and source metadata to the header of *soup*.

    The document is modified in place; it must already contain
    ``titleStmt``, ``publicationStmt`` and ``sourceDesc`` elements.
    """
    # titleStmt
    titleStmt = soup.titleStmt
    title = soup.new_tag('title')
    title.string = 'Actes d\'Agnès de Bourgogne'
    author = soup.new_tag('author')
    author.string = 'Jean-Damien Généro'
    titleStmt.append(title)
    titleStmt.append(author)

    # publicationStmt
    publicationStmt = soup.publicationStmt
    publisher = soup.new_tag('publisher')
    # "Centre" (singular) matches the module docstring and the TEI canvas
    # used for the per-act files.
    publisher.string = 'Laboratoire de Médiévistique occidentale de Paris (UMR 8589), Centre de recherches historiques (UMR 8558)'
    date = soup.new_tag('date')
    date['when'] = '2022'
    date.string = '2022'
    licence = soup.new_tag('licence')
    licence['source'] = 'https://github.com/etalab/licence-ouverte/blob/master/open-licence.md'
    licence.string = 'Distributed under an Open License 2.0'
    availability = soup.new_tag('availability')
    availability.append(licence)
    publicationStmt.append(publisher)
    publicationStmt.append(date)
    publicationStmt.append(availability)

    # sourceDesc
    sourceDesc_p = soup.new_tag('p')
    sourceDesc_p.string = 'Les actes ci-dessous sont issus de plusieurs dépôts d\'archives.'
    soup.sourceDesc.append(sourceDesc_p)


def _clean_body(soup):
    """Strip empty tags, number every ``<div>`` and unwrap inline markup."""
    # Remove every element that carries no text at all.
    for empty_tag in soup.find_all():
        if not empty_tag.get_text(strip=True):
            empty_tag.extract()

    # Number each <div> from the string of its first <p>.
    # NOTE(review): assumes every <div> starts with a <p> whose only content
    # is the act number; a <div> without such a <p> raises here -- confirm
    # against the input data.
    for div in soup.find_all('div'):
        number = div.p.string
        div['xml:id'] = 'agnes-' + number
        div['n'] = number

    # Replace <seg>, <emph> and <name> by their children.
    for tag_name in ('seg', 'emph', 'name'):
        for tag in soup.find_all(tag_name):
            tag.unwrap()

    # rend="Corps" is dropped from the paragraphs that carry exactly it.
    for paragraph in soup.find_all('p'):
        if paragraph.get('rend') == 'Corps':
            del paragraph['rend']

    # Disabled step kept for reference: turn "X. O/C" paragraphs into
    # <witness n="X"> elements.
    # for match in soup.find_all('p'):
    #     if re.search(r'([A-Z])\. [OC]', match.text):
    #         witness = soup.new_tag('witness')
    #         witness.contents = match.contents
    #         witness['n'] = re.search(r'([A-Z])\. +[OC]', match.text).group(1)
    #         match.replaceWith(witness)


def _append_acts_to_files(xml, acts_dir):
    """Move each ``<div>`` of *xml* into the body of its per-act file.

    The target file is ``agnes_<n>.xml`` inside *acts_dir*, where ``<n>`` is
    the ``n`` attribute of the ``<div>``. The file must already exist and
    contain a ``<body>``; it is rewritten with ``prettify()``.
    """
    newsoup = BeautifulSoup(xml, 'xml')
    for div in newsoup.body.find_all('div'):
        act_path = os.path.join(acts_dir, 'agnes_' + div['n'] + '.xml')
        with open(act_path, 'r', encoding='utf-8') as opening:
            soup_act = BeautifulSoup(opening, 'xml')
        soup_act.body.append(div)
        with open(act_path, 'w', encoding='utf-8') as writting:
            writting.write(soup_act.prettify())


def beautiful_clean_up(path,
                       output_path="/home/genero/Bureau/Bourbon/agnes_actes-2.xml",
                       acts_dir="./agnes_actes"):
    """Clean a TEI export of the acts and dispatch each act to its own file.

    Steps, in order:

    1. read *path* and turn the ``-*-`` separator paragraphs into
       ``</div><div>`` boundaries; ``<figure>`` elements become ``<div>``;
    2. fill in the TEI header (title, author, publisher, date, licence,
       source description);
    3. remove empty elements, number each ``<div>`` (``xml:id`` and ``n``),
       unwrap ``<seg>``, ``<emph>`` and ``<name>``, drop ``rend="Corps"``;
    4. write the whole cleaned document, without line breaks, to
       *output_path*;
    5. append each ``<div>`` to the body of ``agnes_<n>.xml`` in *acts_dir*.

    :param path: path of the XML file to clean (read as UTF-8).
    :param output_path: where the cleaned, single-document XML is written.
    :param acts_dir: directory holding the per-act files, which must
        already exist (``teiheader_making`` in this file writes files with
        the same naming scheme).
    :raises FileNotFoundError: if *path* or a per-act file cannot be opened.
    """
    with open(path, 'r', encoding='utf-8') as opened:
        reading = opened.read()

    # acts: each separator paragraph closes one <div> and opens the next.
    separator = r'<p rend=".+">\n +<seg rend="Aucun"><hi rend="sub">-\*<\/hi>-<\/seg>\n +<\/p>'
    reading = re.sub(separator, '</div><div>', reading)
    reading = reading.replace('<figure>', '<div>')
    reading = reading.replace('</figure>', '</div>')

    soup = BeautifulSoup(reading, 'xml')
    _fill_tei_header(soup)
    _clean_body(soup)

    result = str(soup)
    # Remaining composite values such as rend="Corps xyz" lose the prefix.
    result = result.replace('p rend="Corps ', 'p rend="')
    # NOTE(review): this file is flagged as containing invisible Unicode
    # characters; the first argument below is presumably a non-breaking (or
    # similar) space normalised to a plain space -- confirm and replace it
    # with an explicit escape sequence.
    result = result.replace(' ', ' ')
    result = result.replace('\n', '')

    with open(output_path, 'w', encoding='utf-8') as writting:
        writting.write(result)

    _append_acts_to_files(result, acts_dir)
def teiheader_making(file, tei_canvas, output_dir="./agnes_actes"):
    """Write one TEI skeleton file per row of a ``;``-separated CSV file.

    For every non-empty row, the ``<title>`` of the canvas and a ``<p>``
    inside its ``<sourceDesc>`` are filled from the row, and the document is
    written to ``agnes_<row[0]>.xml`` in *output_dir*.

    Columns read from each row: 0 (act number, also used in the file name),
    4 (date) and 5 (place), plus 1, 2 and 9, which are inserted verbatim in
    the source description.
    NOTE(review): the meaning of columns 1, 2 and 9 is not visible from this
    function -- confirm against the CSV schema.

    :param file: path of the CSV file (UTF-8, ``;`` delimiter).
    :param tei_canvas: XML string containing ``<title>`` and ``<sourceDesc>``.
    :param output_dir: directory receiving the files; created if missing.
    :raises IndexError: if a non-empty row has fewer than ten columns.
    """
    soup = BeautifulSoup(tei_canvas, 'xml')
    # The same <p> is reused for every row, so it is attached only once.
    p_sourceDesc = soup.new_tag('p')
    soup.sourceDesc.append(p_sourceDesc)

    os.makedirs(output_dir, exist_ok=True)

    # newline='' lets the csv module handle line endings itself, as its
    # documentation requires for correct parsing of quoted fields.
    with open(file, 'r', encoding='utf-8', newline='') as opening:
        for line in csv.reader(opening, delimiter=";"):
            if not line:
                # csv.reader yields [] for blank lines; nothing to write.
                continue
            filename = 'agnes_' + line[0] + ".xml"
            soup.title.string = f"Acte {line[0]} d'Agnès de Bourgogne ({line[4]})"
            p_sourceDesc.string = f"""Ce fichier contient l'acte n°{line[0]} d'Agnès de Bourgogne, {line[9]} ({line[2]}, {line[1]}) daté du {line[4]} à {line[5]}."""
            with open(os.path.join(output_dir, filename), 'w', encoding="utf-8") as writting:
                writting.write(str(soup))
# TEI skeleton shared by all per-act files: teiheader_making() fills in
# <title> and <sourceDesc>, beautiful_clean_up() appends the act to <body>.
# Plain string on purpose: the template has no placeholders, and an f-string
# would choke on any literal brace added to the XML later.
canvas = """<?xml version="1.0" encoding="utf-8"?>
<TEI xmlns="http://www.tei-c.org/ns/1.0">
<teiHeader>
<fileDesc>
<titleStmt>
<title></title>
<author></author>
</titleStmt>
<publicationStmt>
<publisher>Laboratoire de Médiévistique occidentale de Paris (UMR 8589), Centre de recherches historiques (UMR 8558)</publisher>
<date when="2022">2022</date>
<availability><licence source="https://github.com/etalab/licence-ouverte/blob/master/open-licence.md">Distributed under an Open License 2.0</licence></availability>
</publicationStmt>
<sourceDesc>
</sourceDesc>
</fileDesc>
</teiHeader>
<text>
<body>
</body>
</text>
</TEI>"""

# Order matters: the per-act files must exist before the cleaned acts are
# appended to them.
# NOTE: both calls run at import time, as in the original script.
teiheader_making("./actes-ducs-bourbon/base-donnees-m2/agnes_actes.csv", canvas)
beautiful_clean_up("/home/genero/Bureau/Bourbon/agnes_actes-1.xml")