|
|
|
@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
#!/usr/bin/python
|
|
|
|
|
|
|
|
# -*- coding: UTF-8 -*-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
Authors : Jean-Damien Généro, Bertrand Dumenieu
|
|
|
|
|
|
|
|
Affiliation : French National Center for Scientific Research (CNRS), École des hautes études en sciences sociales (EHESS)
|
|
|
|
|
|
|
|
Assigned at the Centre de recherches historiques (CRH, UMR 8558)
|
|
|
|
|
|
|
|
Date : 2021-12-31
|
|
|
|
|
|
|
|
Update : 2022-01-04
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import csv
|
|
|
|
|
|
|
|
import os
|
|
|
|
|
|
|
|
import re
|
|
|
|
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _new_text_tag(soup, name, text, attrs=None):
    """Create a detached ``<name>`` element holding *text* and *attrs*."""
    tag = soup.new_tag(name)
    for key, value in (attrs or {}).items():
        tag[key] = value
    tag.string = text
    return tag


def _complete_tei_header(soup):
    """Append title, author, publication and source metadata to the header.

    The ``titleStmt``, ``publicationStmt`` and ``sourceDesc`` elements must
    already exist in *soup*; only their children are added here.
    """
    # titleStmt
    title_stmt = soup.titleStmt
    title_stmt.append(_new_text_tag(soup, 'title', "Actes d'Agnès de Bourgogne"))
    title_stmt.append(_new_text_tag(soup, 'author', 'Jean-Damien Généro'))

    # publicationStmt: publisher, date, then availability/licence
    publication_stmt = soup.publicationStmt
    publisher = _new_text_tag(
        soup,
        'publisher',
        # "Centre" (singular), consistent with the per-act TEI canvas.
        'Laboratoire de Médiévistique occidentale de Paris (UMR 8589), '
        'Centre de recherches historiques (UMR 8558)',
    )
    date = _new_text_tag(soup, 'date', '2022', {'when': '2022'})
    availability = soup.new_tag('availability')
    availability.append(_new_text_tag(
        soup,
        'licence',
        'Distributed under an Open License 2.0',
        {'source': 'https://github.com/etalab/licence-ouverte/blob/master/open-licence.md'},
    ))
    publication_stmt.append(publisher)
    publication_stmt.append(date)
    publication_stmt.append(availability)

    # sourceDesc
    soup.sourceDesc.append(_new_text_tag(
        soup,
        'p',
        "Les actes ci-dessous sont issus de plusieurs dépôts d'archives.",
    ))


def _normalise_markup(soup):
    """Drop text-less elements, number the acts and flatten inline markup."""
    # Remove every element that carries no text at all.
    for tag in soup.find_all():
        if not tag.get_text(strip=True):
            tag.extract()

    # The text of the first <p> of each <div> is used as the act number.
    for div in soup.find_all('div'):
        number = div.p.string
        div['xml:id'] = 'agnes-' + number
        div['n'] = number

    # Keep the content of these inline elements but discard the wrappers.
    # TODO: witness detection (paragraphs matching r'([A-Z])\. +[OC]' were to
    # be turned into <witness n="..."> elements) is not implemented yet.
    for name in ('seg', 'emph', 'name'):
        for tag in soup.find_all(name):
            tag.unwrap()

    # rend="Corps" is dropped from paragraphs.
    for paragraph in soup.find_all('p'):
        if paragraph.get('rend') == 'Corps':
            del paragraph['rend']


def beautiful_clean_up(path, *,
                       output_path="/home/genero/Bureau/Bourbon/agnes_actes-2.xml",
                       acts_dir="./agnes_actes"):
    """Clean up the TEI file at *path* and dispatch each act to its own file.

    Steps, in order:

    1. read *path*; act separators and ``<figure>`` elements become ``<div>``
       boundaries;
    2. complete the TEI header (title, author, publication statement,
       source description);
    3. remove text-less elements, number the ``<div>`` elements, unwrap
       ``<seg>``/``<emph>``/``<name>`` and drop ``rend="Corps"``;
    4. write the single-line result to *output_path*;
    5. move every ``<div>`` of the ``<body>`` into the already existing file
       ``agnes_<n>.xml`` of *acts_dir*, rewritten with ``prettify()``.

    Args:
        path: UTF-8 encoded XML file to clean up.
        output_path: where the cleaned-up, undivided document is written.
        acts_dir: directory holding one ``agnes_<n>.xml`` file per act.

    Raises:
        FileNotFoundError: if *path* or a per-act file cannot be opened.
    """
    with open(path, 'r', encoding='utf-8') as opened:
        reading = opened.read()

    # acts: a separator paragraph closes the current act and opens the next.
    separator = r'<p rend=".+">\n +<seg rend="Aucun"><hi rend="sub">-\*<\/hi>-<\/seg>\n +<\/p>'
    reading = re.sub(separator, '</div><div>', reading)
    reading = reading.replace('<figure>', '<div>')
    reading = reading.replace('</figure>', '</div>')

    soup = BeautifulSoup(reading, 'xml')
    # The header is completed first so that its new, non-empty children are
    # in place when text-less elements are removed.
    _complete_tei_header(soup)
    _normalise_markup(soup)

    result = str(soup)
    # Strip "Corps" when it is the first token of a longer rend value.
    result = result.replace('p rend="Corps ', 'p rend="')
    # NOTE(review): reproduced exactly as in the original; presumably the
    # first argument is a non-breaking or doubled space -- confirm.
    result = result.replace(' ', ' ')
    result = result.replace('\n', '')

    with open(output_path, 'w', encoding='utf-8') as writing:
        writing.write(result)

    # Re-parse the flattened text and hand each act over to its own file.
    newsoup = BeautifulSoup(result, 'xml')
    for div in newsoup.body.find_all('div'):
        act_path = os.path.join(acts_dir, 'agnes_' + div['n'] + '.xml')
        with open(act_path, 'r', encoding='utf-8') as act_file:
            soup_act = BeautifulSoup(act_file, 'xml')
        # Appending moves the <div> out of ``newsoup`` into the act document.
        soup_act.body.append(div)
        with open(act_path, 'w', encoding='utf-8') as act_file:
            act_file.write(soup_act.prettify())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def teiheader_making(file, tei_canvas, *, acts_dir="./agnes_actes"):
    """Write one TEI skeleton file per row of the acts CSV.

    For every non-empty row of *file* (``;``-separated), the title and the
    source description of the canvas are filled in and the document is saved
    as ``agnes_<number>.xml`` in *acts_dir*, where ``<number>`` is the first
    column of the row.

    Args:
        file: path of the UTF-8 encoded CSV describing the acts. Column 0 is
            the act number, column 4 the date and column 5 the place used in
            the generated sentences; columns 9, 2 and 1 are inserted as well
            (presumably a description and archive references -- TODO confirm
            against the CSV).
        tei_canvas: TEI document, as a string, containing ``<title>`` and
            ``<sourceDesc>`` elements.
        acts_dir: output directory; created if it does not exist.

    Raises:
        IndexError: if a non-empty row has fewer than ten columns.
    """
    soup = BeautifulSoup(tei_canvas, 'xml')

    # A single <p> is attached once and only its text changes per row, so
    # each written file holds exactly one source-description paragraph.
    p_source_desc = soup.new_tag('p')
    soup.sourceDesc.append(p_source_desc)

    if acts_dir:
        os.makedirs(acts_dir, exist_ok=True)

    # newline='' lets the csv module handle line endings itself.
    with open(file, 'r', encoding='utf-8', newline='') as opening:
        for line in csv.reader(opening, delimiter=";"):
            if not line:
                # csv.reader yields [] for blank lines; nothing to write.
                continue
            filename = f'agnes_{line[0]}.xml'
            soup.title.string = f"Acte {line[0]} d'Agnès de Bourgogne ({line[4]})"
            p_source_desc.string = (
                f"Ce fichier contient l'acte n°{line[0]} d'Agnès de Bourgogne, "
                f"{line[9]} ({line[2]}, {line[1]}) daté du {line[4]} à {line[5]}."
            )
            with open(os.path.join(acts_dir, filename), 'w', encoding="utf-8") as writing:
                writing.write(str(soup))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# TEI skeleton used for every per-act file: the header is pre-filled with the
# publication statement, while <title>, <sourceDesc> and <body> are completed
# by the functions called below. A plain string is enough here: the literal
# contains no placeholders.
canvas = """<?xml version="1.0" encoding="utf-8"?>
<TEI xmlns="http://www.tei-c.org/ns/1.0">
<teiHeader>
<fileDesc>
<titleStmt>
<title></title>
<author></author>
</titleStmt>
<publicationStmt>
<publisher>Laboratoire de Médiévistique occidentale de Paris (UMR 8589), Centre de recherches historiques (UMR 8558)</publisher>
<date when="2022">2022</date>
<availability><licence source="https://github.com/etalab/licence-ouverte/blob/master/open-licence.md">Distributed under an Open License 2.0</licence></availability>
</publicationStmt>
<sourceDesc>
</sourceDesc>
</fileDesc>
</teiHeader>
<text>
<body>
</body>
</text>
</TEI>"""

# Order matters: the per-act files must be created first, because the
# clean-up step opens each of them to append the matching <div>.
teiheader_making("./actes-ducs-bourbon/base-donnees-m2/agnes_actes.csv", canvas)

beautiful_clean_up("/home/genero/Bureau/Bourbon/agnes_actes-1.xml")
|