From 12fea204ec1583146c33d8016b1e42517d461f99 Mon Sep 17 00:00:00 2001
From: Jean-Damien Genero <jean-damien.genero@ehess.fr>
Date: Wed, 5 Jan 2022 11:08:48 +0100
Subject: [PATCH] initial commit

---
 app/cmd/bourbon_clean_up.py | 141 ++++++++++++++++++++++++++++++++++++
 1 file changed, 141 insertions(+)
 create mode 100644 app/cmd/bourbon_clean_up.py
diff --git a/app/cmd/bourbon_clean_up.py b/app/cmd/bourbon_clean_up.py
new file mode 100644
index 0000000..73029b1
--- /dev/null
+++ b/app/cmd/bourbon_clean_up.py
@@ -0,0 +1,141 @@
+#!/usr/bin/python
+# -*- coding: UTF-8 -*-
+
+"""
+Authors : Jean-Damien Généro, Bertrand Dumenieu
+Affiliation : French National Center for Scientific Research (CNRS), École des hautes études en sciences sociales (EHESS)
+Assigned at the Centre de recherches historiques (CRH, UMR 8558)
+Date : 2021-12-31
+Update : 2022-01-04
+"""
+
+
+import csv
+import os
+import re
+from bs4 import BeautifulSoup
+
+
+def beautiful_clean_up(path, cleaned_path="/home/genero/Bureau/Bourbon/agnes_actes-2.xml", acts_dir="./agnes_actes"):
+	"""Clean the XML/TEI file at ``path`` and dispatch its acts into one file per act.
+
+	The text is cut into one <div> per act, titleStmt, publicationStmt and
+	sourceDesc are filled in, tags without text are removed, each <div> is
+	numbered after the string of its first <p>, and <seg>, <emph> and <name>
+	are unwrapped. The result is written to ``cleaned_path``; every <div> of its
+	<body> is then appended to the existing file ``agnes_<n>.xml`` of ``acts_dir``.
+
+	:param path: XML file to clean, read as UTF-8.
+	:param cleaned_path: destination of the cleaned, single-file version.
+	:param acts_dir: folder of the per-act files, which are rewritten in place.
+	:raises FileNotFoundError: if ``path`` or a per-act file does not exist.
+	:raises ValueError: if a <div> has no <p> holding a single string to number it with.
+	"""
+	with open(path, 'r', encoding='utf-8') as opened:
+		reading = opened.read()
+	# acts: each "-*-" separator paragraph closes one <div> and opens the next
+	separator = r'<p rend=".+">\n +<seg rend="Aucun"><hi rend="sub">-\*<\/hi>-<\/seg>\n +<\/p>'
+	reading = re.sub(separator, '</div><div>', reading)
+	reading = reading.replace('<figure>', '<div>')
+	reading = reading.replace('</figure>', '</div>')
+	soup = BeautifulSoup(reading, 'xml')
+	# titleStmt
+	title = soup.new_tag('title')
+	title.string = 'Actes d\'Agnès de Bourgogne'
+	author = soup.new_tag('author')
+	author.string = 'Jean-Damien Généro'
+	soup.titleStmt.append(title)
+	soup.titleStmt.append(author)
+	# publicationStmt
+	publisher = soup.new_tag('publisher')
+	publisher.string = 'Laboratoire de Médiévistique occidentale de Paris (UMR 8589), Centre de recherches historiques (UMR 8558)'
+	date = soup.new_tag('date')
+	date['when'] = '2022'
+	date.string = '2022'
+	licence = soup.new_tag('licence')
+	licence['source'] = 'https://github.com/etalab/licence-ouverte/blob/master/open-licence.md'
+	licence.string = 'Distributed under an Open License 2.0'
+	availability = soup.new_tag('availability')
+	availability.append(licence)
+	for child in (publisher, date, availability):
+		soup.publicationStmt.append(child)
+	# sourceDesc
+	sourceDesc_p = soup.new_tag('p')
+	sourceDesc_p.string = 'Les actes ci-dessous sont issus de plusieurs dépôts d\'archives.'
+	soup.sourceDesc.append(sourceDesc_p)
+	# removing every tag that holds no text
+	for empty_tag in soup.find_all():
+		if not empty_tag.get_text(strip=True):
+			empty_tag.extract()
+	# number: the string of the first <p> of each <div> is used as the act number
+	for div in soup.find_all('div'):
+		number = div.p.string if div.p is not None else None
+		if number is None:
+			raise ValueError('<div> without a first <p> holding a single string: cannot number the act')
+		div['xml:id'] = 'agnes-' + number
+		div['n'] = number
+	# unwrapping <seg>, <emph> and <name>: the tag goes, its content stays
+	for wrapper in soup.find_all(['seg', 'emph', 'name']):
+		wrapper.unwrap()
+	# rend="Corps" is dropped from the <p> that carry exactly that value
+	for tag in soup.find_all('p'):
+		if tag.get('rend') == 'Corps':
+			del tag['rend']
+	# the remaining clean-up is done on the serialised text
+	result = str(soup)
+	result = result.replace('p rend="Corps ', 'p rend="')
+	result = result.replace(' ', ' ')  # NOTE(review): a no-op if both literals are plain spaces; the first may be a no-break space, confirm
+	result = result.replace('\n', '')
+	with open(cleaned_path, 'w', encoding='utf-8') as writting:
+		writting.write(result)
+	# dispatch: each <div> of the cleaned <body> joins the <body> of its own file
+	newsoup = BeautifulSoup(result, 'xml')
+	for div in newsoup.body.find_all('div'):
+		act_path = os.path.join(acts_dir, 'agnes_' + div['n'] + '.xml')
+		with open(act_path, 'r', encoding='utf-8') as opening:
+			soup_act = BeautifulSoup(opening, 'xml')
+		soup_act.body.append(div)
+		with open(act_path, 'w', encoding='utf-8') as writting:
+			writting.write(soup_act.prettify())
+
+
+def teiheader_making(file, tei_canvas):
+	"""Write one TEI file per row of the ';'-delimited CSV ``file``: ``tei_canvas`` with title and sourceDesc filled in, saved as ./agnes_actes/agnes_<row[0]>.xml."""
+	soup = BeautifulSoup(tei_canvas, 'xml')
+	p_sourceDesc = soup.new_tag('p')
+	soup.sourceDesc.append(p_sourceDesc)  # attached once; only its text changes from row to row
+	os.makedirs("./agnes_actes", exist_ok=True)  # the output folder may not exist on a first run
+	with open(file, 'r', encoding='utf-8', newline='') as opening:  # newline='' as the csv documentation recommends
+		for line in csv.reader(opening, delimiter=";"):  # row fields used: 0 (act number), 1, 2, 4, 5 and 9
+			soup.title.string = 'Acte {} d\'Agnès de Bourgogne ({})'.format(line[0], line[4])
+			p_sourceDesc.string = f"""Ce fichier contient l'acte n°{line[0]} d'Agnès de Bourgogne, {line[9]} ({line[2]}, {line[1]}) daté du {line[4]} à {line[5]}."""
+			with open(os.path.join("./agnes_actes", 'agnes_' + line[0] + ".xml"), 'w', encoding="utf-8") as writting:
+				writting.write(str(soup))
+
+
+canvas = """<?xml version="1.0" encoding="utf-8"?>
+<TEI xmlns="http://www.tei-c.org/ns/1.0">
+<teiHeader>
+<fileDesc>
+<titleStmt>
+<title></title>
+<author></author>
+</titleStmt>
+<publicationStmt>
+<publisher>Laboratoire de Médiévistique occidentale de Paris (UMR 8589), Centre de recherches historiques (UMR 8558)</publisher>
+<date when="2022">2022</date>
+<availability><licence source="https://github.com/etalab/licence-ouverte/blob/master/open-licence.md">Distributed under an Open License 2.0</licence></availability>
+</publicationStmt>
+<sourceDesc>
+</sourceDesc>
+</fileDesc>
+</teiHeader>
+<text>
+<body>
+</body>
+</text>
+</TEI>"""  # TEI skeleton with empty title, author, sourceDesc and body; a plain string, since it holds no placeholder
+
+teiheader_making("./actes-ducs-bourbon/base-donnees-m2/agnes_actes.csv", canvas)  # first: writes the per-act files into ./agnes_actes
+
+beautiful_clean_up("/home/genero/Bureau/Bourbon/agnes_actes-1.xml")  # then: reads those files back and appends each act to its file