You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

150 lines
5.4 KiB
Python

This file contains invisible Unicode characters!

This file contains invisible Unicode characters that may be processed differently from what appears below. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to reveal hidden characters.

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
Authors : Jean-Damien Généro, Bertrand Dumenieu
Affiliation : French National Center for Scientific Research (CNRS), École des hautes études en sciences sociales (EHESS)
Assigned at the Centre de recherches historiques (CRH, UMR 8558)
Date : 2021-12-31
Update : 2022-01-04
"""
import csv
import os
import re
from bs4 import BeautifulSoup
# NOTE(review): in the reviewed copy of this file the search string of the
# apostrophe replacement was empty (the file is flagged as containing
# invisible Unicode characters, and the character did not survive). An empty
# search string makes str.replace insert the replacement between every
# character. U+0092 (the Windows-1252 apostrophe byte decoded as a C1 control
# character) is assumed here -- confirm against the original file.
_STRAY_APOSTROPHE = '\u0092'

# NOTE(review): the original replacement was indistinguishable from "replace a
# plain space with a plain space", which does nothing. No-break spaces are
# assumed to be the intended target -- confirm against the original file.
_NO_BREAK_SPACES = str.maketrans({'\u00a0': ' ', '\u202f': ' '})


def _preclean_markup(reading):
    """Fix the raw corpus markup with text substitutions before XML parsing."""
    # A paragraph holding only the "-*-" ornament becomes a boundary between
    # two <div> elements.
    separator = r'<p rend=".+">\n +<seg rend="Aucun"><hi rend="sub">-\*<\/hi>-<\/seg>\n +<\/p>'
    reading = re.sub(separator, '</div><div>', reading)
    reading = re.sub(r'<body>\n +<figure>', '<body>\n<div>', reading)
    # This pattern is a regular expression, so it has to go through re.sub:
    # str.replace searched for the pattern text literally, never matched, and
    # left the last <div> without a closing tag.
    reading = re.sub(r'<\/figure>\n +<\/body>', '</div>\n</body>', reading)
    for leftover in ('<figure>', '</figure>', '<ident>', '</ident>'):
        reading = reading.replace(leftover, '')
    return reading.replace(_STRAY_APOSTROPHE, "'")


def _act_number(div):
    """Return the text of the first <p> of *div* with all whitespace removed."""
    first_p = div.p
    label = None if first_p is None else first_p.string
    if label is None:
        raise ValueError(
            'cannot number a <div>: its first <p> is missing or does not '
            'contain exactly one text node'
        )
    # split()/join drops plain spaces as well as any other Unicode whitespace,
    # so the result is usable in an xml:id and in a file name.
    return ''.join(label.split())


def beautiful_clean_up(path, output_dir="../static/xml/Bourbon/5-Charles-Ier"):
    """Split the corpus XML file into acts and append each act to its own file.

    The file at *path* is cleaned up with text substitutions, parsed, and
    every resulting <div> receives an ``xml:id`` ("charles_ier_" + number) and
    an ``n`` attribute taken from the text of its first <p>. Each <div> is
    then appended to the <body> of ``charles_ier_<n>.xml`` in *output_dir*,
    and that file is rewritten with ``prettify()``.

    Args:
        path: path of the UTF-8 encoded corpus XML file.
        output_dir: directory holding the per-act files. They are opened for
            reading first, so they must already exist.

    Returns:
        None. The per-act files are modified in place.

    Raises:
        ValueError: if a <div> has no first <p> with a single text node.
        FileNotFoundError: if *path* or a per-act file does not exist.
    """
    with open(path, 'r', encoding='utf-8') as opened:
        reading = _preclean_markup(opened.read())
    soup = BeautifulSoup(reading, 'xml')

    # Header: add the author to <titleStmt>. Only the <div> elements of the
    # body are written out below, so this does not reach the per-act files.
    title_stmt = soup.titleStmt
    if title_stmt is not None:
        author = soup.new_tag('author')
        author.string = 'Jean-Damien Généro'
        title_stmt.append(author)

    # Remove every element that holds no text at all; this includes any
    # self-closing element.
    for tag in soup.find_all():
        if not tag.get_text(strip=True):
            tag.extract()

    # Drop these inline wrappers but keep their content.
    for wrapper in ('ident', 'seg', 'emph', 'name'):
        for tag in soup.find_all(wrapper):
            tag.unwrap()

    # Number each <div> after the text of its first paragraph.
    for div in soup.find_all('div'):
        number = _act_number(div)
        div['xml:id'] = 'charles_ier_' + number
        div['n'] = number

    # rend="Corps" carries no information: remove the attribute.
    for paragraph in soup.find_all('p'):
        if paragraph.get('rend') == 'Corps':
            del paragraph['rend']

    result = str(soup)
    # Same idea when "Corps" is only the first of several rend values.
    result = result.replace('p rend="Corps ', 'p rend="')
    result = result.translate(_NO_BREAK_SPACES)
    result = result.replace('\n', '')
    # NOTE(review): the original line was an implicit concatenation of three
    # string literals that evaluates to the text below; it looks like an
    # editing accident. Kept as is -- confirm it can be deleted.
    result = result.replace('.replace(, )', '')
    newsoup = BeautifulSoup(result, 'xml')

    for div in newsoup.body.find_all('div'):
        act_path = os.path.join(output_dir, 'charles_ier_' + div['n'] + '.xml')
        with open(act_path, 'r', encoding='utf-8') as opening:
            soup_act = BeautifulSoup(opening, 'xml')
        soup_act.body.append(div)
        with open(act_path, 'w', encoding='utf-8') as writing:
            writing.write(soup_act.prettify())
def teiheader_making(file, tei_canvas, output_dir="../static/xml/Bourbon/5-Charles-Ier"):
    """Write one XML file per row of a semicolon-separated CSV file.

    For every non-empty row, the <title> of *tei_canvas* and a <p> inside its
    <sourceDesc> are filled from the row, and the document is written to
    ``charles_ier_<column 0>.xml`` in *output_dir*, replacing any existing
    file of that name.

    Columns read from each row: 0 (act number), 4 (date) and 5 (place) as
    worded by the generated sentence, plus 1, 2 and 9, which are inserted in
    the description as they are.

    Args:
        file: path of the UTF-8 encoded CSV file (delimiter ";").
        tei_canvas: XML document used as a template; it must contain a
            <title> and a <sourceDesc> element.
        output_dir: directory in which the files are written.

    Returns:
        None.

    Raises:
        IndexError: if a non-empty row has fewer than 10 columns.
    """
    soup = BeautifulSoup(tei_canvas, 'xml')
    p_source_desc = soup.new_tag('p')
    # Attach the paragraph once; only its text changes from row to row.
    soup.sourceDesc.append(p_source_desc)
    # newline='' lets the csv module handle line endings itself.
    with open(file, 'r', encoding='utf-8', newline='') as opening:
        for line in csv.reader(opening, delimiter=";"):
            if not line:
                # csv.reader yields an empty list for a blank line.
                continue
            number = line[0]
            soup.title.string = f'Acte {number} de Charles Ier de Bourbon ({line[4]})'
            p_source_desc.string = (
                f"Ce fichier contient l'acte n°{number} de Charles Ier de Bourbon, "
                f"{line[9]} ({line[2]}, {line[1]}) daté du {line[4]} à {line[5]}."
            )
            target = os.path.join(output_dir, 'charles_ier_' + number + ".xml")
            with open(target, 'w', encoding="utf-8") as writing:
                writing.write(str(soup))
# Template shared by all per-act files: <title> and <sourceDesc> are filled in
# by teiheader_making, <body> receives content from beautiful_clean_up.
canvas = """<?xml version="1.0" encoding="utf-8"?>
<TEI xmlns="http://www.tei-c.org/ns/1.0">
<teiHeader>
<fileDesc>
<titleStmt>
<title></title>
<author></author>
</titleStmt>
<publicationStmt>
<publisher>Laboratoire de Médiévistique occidentale de Paris (UMR 8589), Centre de recherches historiques (UMR 8558)</publisher>
<date when="2022">2022</date>
<availability><licence source="https://github.com/etalab/licence-ouverte/blob/master/open-licence.md">Distributed under an Open License 2.0</licence></availability>
</publicationStmt>
<sourceDesc>
</sourceDesc>
</fileDesc>
</teiHeader>
<text>
<body>
</body>
</text>
</TEI>"""

metadata_csv = "../static/csv/corpus-charles-i.csv"
corpus_xml = "../static/xml/Bourbon/5-Charles-Ier/corpus-charles-i.xml"

# Order matters: the per-act files are written first, then opened and
# completed by beautiful_clean_up.
teiheader_making(metadata_csv, canvas)
beautiful_clean_up(corpus_xml)