actes-princiers/app/cmd/bourbon_clean_up.py

#!/usr/bin/python
# -*- coding: UTF-8 -*-

"""
Authors : Jean-Damien Généro, Bertrand Dumenieu
Affiliation : French National Center for Scientific Research (CNRS), École des hautes études en sciences sociales (EHESS)
Assigned at the Centre de recherches historiques (CRH, UMR 8558)
Date : 2021-12-31
Update : 2022-01-05

Regex dans les ODT :
- j'enlève tous les sauts de page (ctrl+A, puis Format, Paragraphe, onglet Enchainements, décocher Saut / Insérer.)
- je vérifie que tous les séparateurs sont sur une seule ligne : ` +\*\*` >> `\n$1`
- je transforme les séparateurs : `\*{3}` >> `-*-`
"""


import csv
import os
import re
from bs4 import BeautifulSoup


def _split_into_divs(raw):
	"""Rewrite the raw export text so that every act sits in its own <div>."""
	separator = r'<p rend=".+">\n +<seg rend="Aucun"><hi rend="sub">-\*<\/hi>-<\/seg>\n +<\/p>'
	raw = re.sub(separator, '</div><div>', raw)
	raw = re.sub(r'<body>\n +<figure>', '<body>\n<div>', raw)
	# This pattern is a regular expression, so it must go through re.sub:
	# str.replace would look for the pattern text literally, never match,
	# and leave the last <div> unclosed.
	raw = re.sub(r'<\/figure>\n +<\/body>', '</div>\n</body>', raw)
	for leftover in ('<figure>', '</figure>', '<ident>', '</ident>'):
		raw = raw.replace(leftover, '')
	# Typographic apostrophe (U+2019) -> plain ASCII apostrophe.
	return raw.replace('\u2019', "'")


def beautiful_clean_up(path, output_dir="../static/xml/Bourbon/5-Charles-Ier"):
	"""Split a TEI corpus into acts and append each act to its own file.

	The file at ``path`` is read, its act separators are turned into ``<div>``
	boundaries, an ``<author>`` is added to ``<titleStmt>``, empty elements
	are removed, the inline ``ident``/``seg``/``emph``/``name`` elements are
	unwrapped, and every ``<div>`` receives ``xml:id`` and ``n`` attributes
	taken from the text of its first ``<p>`` (spaces removed). Each ``<div>``
	is then appended to the ``<body>`` of ``charles_ier_<n>.xml`` inside
	``output_dir``.

	:param path: path of the corpus XML file to process.
	:param output_dir: directory holding the per-act files; they must already
		exist because each one is read before being rewritten.
	:raises FileNotFoundError: if ``path`` or a per-act file is missing.
	:raises AttributeError: if the corpus has no ``<titleStmt>``, or a
		``<div>`` has no ``<p>`` whose content is a single string.
	"""
	# Read everything up front so the input file is not kept open while
	# the tree is being transformed.
	with open(path, 'r', encoding='utf-8') as opened:
		reading = opened.read()

	soup = BeautifulSoup(_split_into_divs(reading), 'xml')

	# titleStmt: add the author.
	author = soup.new_tag('author')
	author.string = 'Jean-Damien Généro'
	soup.titleStmt.append(author)

	# Remove every element that carries no text at all.
	for tag in soup.find_all():
		if not tag.get_text(strip=True):
			tag.extract()

	# Unwrap inline elements, keeping their children in place.
	for tag_name in ('ident', 'seg', 'emph', 'name'):
		for tag in soup.find_all(tag_name):
			tag.unwrap()

	# Number each act from the text of its first <p>.
	for div in soup.find_all('div'):
		number = div.p.string.replace(' ', '')
		div['xml:id'] = 'charles_ier_' + number
		div['n'] = number

	# Drop the rend attribute when its whole value is "Corps".
	for tag in soup.find_all('p'):
		if tag.get('rend') == 'Corps':
			del tag['rend']

	result = str(soup)
	# Strip the "Corps " prefix from multi-valued rend attributes.
	result = result.replace('p rend="Corps ', 'p rend="')
	# NOTE(review): as written this swaps a space for a space; presumably the
	# first argument was meant to be a non-breaking space -- confirm.
	result = result.replace(' ', ' ')
	result = result.replace('\n', '')
	newsoup = BeautifulSoup(result, 'xml')

	# Move every act into the body of its own, already existing, file.
	for div in newsoup.body.find_all('div'):
		act_path = os.path.join(output_dir, 'charles_ier_' + div['n'] + '.xml')
		with open(act_path, 'r', encoding='utf-8') as act_file:
			soup_act = BeautifulSoup(act_file, 'xml')
		soup_act.body.append(div)
		with open(act_path, 'w', encoding='utf-8') as act_file:
			act_file.write(soup_act.prettify())


def teiheader_making(file, tei_canvas, output_dir="../static/xml/Bourbon/5-Charles-Ier"):
	"""Write one TEI file per CSV row, with the header filled from that row.

	``tei_canvas`` is parsed once. For every non-empty row of the
	semicolon-delimited CSV, the ``<title>`` and a ``<p>`` inside
	``<sourceDesc>`` are refilled and the document is written to
	``charles_ier_<row[0]>.xml`` in ``output_dir``, overwriting any existing
	file of that name.

	Columns 0, 1, 2, 4, 5 and 9 of each row are interpolated into the text.
	From the sentences built here, column 0 is the act number, column 4 the
	date and column 5 the place; the meaning of columns 1, 2 and 9 is not
	visible in this function (TODO confirm against the CSV).

	:param file: path of the CSV file.
	:param tei_canvas: TEI template (string) containing ``<title>`` and
		``<sourceDesc>`` elements.
	:param output_dir: directory where the per-act files are written.
	:raises IndexError: if a non-empty row has fewer than ten columns.
	"""
	soup = BeautifulSoup(tei_canvas, 'xml')
	# A single <p> is attached once and refilled for each row.
	p_sourceDesc = soup.new_tag('p')
	soup.sourceDesc.append(p_sourceDesc)
	# newline='' lets the csv module handle line endings itself, as its
	# documentation requires for files given to csv.reader.
	with open(file, 'r', encoding='utf-8', newline='') as opening:
		for line in csv.reader(opening, delimiter=";"):
			if not line:
				# Blank lines yield an empty row; there is nothing to write.
				continue
			filename = f'charles_ier_{line[0]}.xml'
			soup.title.string = f'Acte {line[0]} de Charles Ier de Bourbon ({line[4]})'
			p_sourceDesc.string = f"""Ce fichier contient l'acte n°{line[0]} de Charles Ier de Bourbon, {line[9]} ({line[2]}, {line[1]}) daté du {line[4]} à {line[5]}."""
			with open(os.path.join(output_dir, filename), 'w', encoding="utf-8") as written:
				written.write(str(soup))


# TEI skeleton used for every per-act file: the <title>, <author>,
# <sourceDesc> and <body> elements are left empty here. It is a plain string
# (no interpolation), so it does not need the f-string prefix.
canvas = """<?xml version="1.0" encoding="utf-8"?>
<TEI xmlns="http://www.tei-c.org/ns/1.0">
<teiHeader>
<fileDesc>
<titleStmt>
<title></title>
<author></author>
</titleStmt>
<publicationStmt>
<publisher>Laboratoire de Médiévistique occidentale de Paris (UMR 8589), Centre de recherches historiques (UMR 8558)</publisher>
<date when="2022">2022</date>
<availability><licence source="https://github.com/etalab/licence-ouverte/blob/master/open-licence.md">Distributed under an Open License 2.0</licence></availability>
</publicationStmt>
<sourceDesc>
</sourceDesc>
</fileDesc>
</teiHeader>
<text>
<body>
</body>
</text>
</TEI>"""

# Step 1: write one charles_ier_<n>.xml file per CSV row from the TEI canvas.
teiheader_making("../static/csv/corpus-charles-i.csv", canvas)

# Step 2: split the corpus into acts and append each act to the matching
# charles_ier_<n>.xml file, which is opened for reading first and so has to
# exist already; hence this call comes after the one above.
beautiful_clean_up("../static/xml/Bourbon/5-Charles-Ier/corpus-charles-i.xml")