initial commit

5 years ago · 12fea204ec
parent d5335ddf47
commit 12fea204ec
1 changed files with 141 additions and 0 deletions
--- a/app/cmd/bourbon_clean_up.py
+++ b/app/cmd/bourbon_clean_up.py
@@ -0,0 +1,141 @@
 #!/usr/bin/python
 # -*- coding: UTF-8 -*-
 """
 Authors : Jean-Damien Généro, Bertrand Dumenieu
 Affiliation : French National Center for Scientific Research (CNRS), École des hautes études en sciences sociales (EHESS)
 Assigned at the Centre de recherches historiques (CRH, UMR 8558)
 Date : 2021-12-31
 Update : 2022-01-04
 """
 import csv
 import os
 import re
 from bs4 import BeautifulSoup
 def beautiful_clean_up(path, output_path="/home/genero/Bureau/Bourbon/agnes_actes-2.xml", acts_dir="./agnes_actes"):
 	"""Clean up an XML export of the acts and dispatch each act to its own file.

 	The file at ``path`` is read as text; the act separators and the
 	``<figure>`` elements are turned into ``<div>`` boundaries; the header is
 	filled in; empty elements and ``seg``/``emph``/``name`` wrappers are
 	removed. The cleaned document is written on a single line to
 	``output_path``. Every ``<div>`` found in the ``<body>`` of that result is
 	then appended to the ``<body>`` of the existing file ``agnes_<n>.xml`` in
 	``acts_dir``, which is rewritten prettified.

 	Args:
 		path: XML file to clean. It must contain ``titleStmt``,
 			``publicationStmt``, ``sourceDesc`` and ``body`` elements.
 		output_path: Where the cleaned single-line document is written.
 			Defaults to the location that was previously hard-coded.
 		acts_dir: Directory holding the per-act files ``agnes_<n>.xml``.
 			The files must already exist. Defaults to the directory that
 			was previously hard-coded.

 	Raises:
 		ValueError: If a ``<div>`` does not start with a ``<p>`` containing
 			only text (the act number is read from it).
 		FileNotFoundError: If ``path`` or a per-act file is missing.
 	"""
 	# Only the read needs the file handle; everything else works on the string.
 	with open(path, 'r', encoding='utf-8') as opened:
 		reading = opened.read()
 	# acts: the "-*-" separator paragraph closes one act and opens the next
 	separator = r'<p rend=".+">\n +<seg rend="Aucun"><hi rend="sub">-\*<\/hi>-<\/seg>\n +<\/p>'
 	reading = re.sub(separator, '</div><div>', reading)
 	reading = reading.replace('<figure>', '<div>')
 	reading = reading.replace('</figure>', '</div>')
 	soup = BeautifulSoup(reading, 'xml')
 	_fill_tei_header(soup)
 	_simplify_acts(soup)
 	# TODO: witness tagging is disabled. An earlier draft replaced each <p>
 	# whose text matches r'([A-Z])\. +[OC]' by a <witness n="<letter>"> element.
 	result = str(soup)
 	result = result.replace('p rend="Corps ', 'p rend="')
 	# NOTE(review): the original literal was visually indistinguishable from a
 	# plain space; assumed to be a non-breaking space and made explicit - confirm.
 	result = result.replace('\u00a0', ' ')
 	result = result.replace('\n', '')
 	with open(output_path, 'w', encoding='utf-8') as writting:
 		writting.write(result)
 	_dispatch_acts(result, acts_dir)


 def _fill_tei_header(soup):
 	"""Append title, author, publication and source statements to the header."""
 	# titleStmt
 	titleStmt = soup.titleStmt
 	title = soup.new_tag('title')
 	title.string = "Actes d'Agnès de Bourgogne"
 	author = soup.new_tag('author')
 	author.string = 'Jean-Damien Généro'
 	titleStmt.append(title)
 	titleStmt.append(author)
 	# publicationStmt
 	publicationStmt = soup.publicationStmt
 	publisher = soup.new_tag('publisher')
 	# "Centre" (singular), consistent with the per-act canvas and the module header.
 	publisher.string = 'Laboratoire de Médiévistique occidentale de Paris (UMR 8589), Centre de recherches historiques (UMR 8558)'
 	date = soup.new_tag('date')
 	date['when'] = '2022'
 	date.string = '2022'
 	licence = soup.new_tag('licence')
 	licence['source'] = 'https://github.com/etalab/licence-ouverte/blob/master/open-licence.md'
 	licence.string = 'Distributed under an Open License 2.0'
 	availability = soup.new_tag('availability')
 	availability.append(licence)
 	publicationStmt.append(publisher)
 	publicationStmt.append(date)
 	publicationStmt.append(availability)
 	# sourceDesc
 	sourceDesc_p = soup.new_tag('p')
 	sourceDesc_p.string = "Les actes ci-dessous sont issus de plusieurs dépôts d'archives."
 	soup.sourceDesc.append(sourceDesc_p)


 def _simplify_acts(soup):
 	"""Drop empty elements, number the divs and unwrap inline wrappers, in place."""
 	# removing all empty tags (must run before numbering, which reads the first <p>)
 	for empty_tag in soup.find_all():
 		if not empty_tag.get_text(strip=True):
 			empty_tag.extract()
 	# number: the first paragraph of each div holds the act number
 	for div in soup.find_all('div'):
 		first_p = div.p
 		number = None if first_p is None else first_p.string
 		if number is None:
 			raise ValueError('cannot number a <div> whose first <p> is missing or is not plain text')
 		div['xml:id'] = 'agnes-' + number
 		div['n'] = number
 	# keep the content of these wrappers, drop the tags themselves
 	for wrapper_name in ('seg', 'emph', 'name'):
 		for wrapper in soup.find_all(wrapper_name):
 			wrapper.unwrap()
 	# rend="Corps" is removed from paragraphs where it is the whole attribute value
 	for paragraph in soup.find_all('p'):
 		if paragraph.get('rend') == 'Corps':
 			del paragraph['rend']


 def _dispatch_acts(result, acts_dir):
 	"""Append every body <div> of ``result`` to its ``agnes_<n>.xml`` file."""
 	body = BeautifulSoup(result, 'xml').body
 	for div in body.find_all('div'):
 		act_path = os.path.join(acts_dir, 'agnes_' + div['n'] + '.xml')
 		with open(act_path, 'r', encoding='utf-8') as opening:
 			soup_act = BeautifulSoup(opening, 'xml')
 		soup_act.body.append(div)
 		with open(act_path, 'w', encoding='utf-8') as writting:
 			writting.write(soup_act.prettify())
 def teiheader_making(file, tei_canvas, output_dir="./agnes_actes"):
 	"""Write one XML file per CSV row, built from the ``tei_canvas`` template.

 	For each non-empty row of the semicolon-delimited CSV ``file``, the
 	template's ``<title>`` and a ``<p>`` inside ``<sourceDesc>`` are filled from
 	the row, and the document is written to ``agnes_<row[0]>.xml`` in
 	``output_dir``.

 	Args:
 		file: Path of the UTF-8 CSV file. Columns 0, 1, 2, 4, 5 and 9 of each
 			row are used.
 		tei_canvas: XML template as a string. It must contain ``title`` and
 			``sourceDesc`` elements.
 		output_dir: Directory receiving the generated files; created if
 			missing. Defaults to the directory that was previously hard-coded.

 	Raises:
 		IndexError: If a non-empty row has fewer than ten columns.
 	"""
 	soup = BeautifulSoup(tei_canvas, 'xml')
 	p_sourceDesc = soup.new_tag('p')
 	# Attached once; only its text changes from one row to the next.
 	soup.sourceDesc.append(p_sourceDesc)
 	os.makedirs(output_dir, exist_ok=True)
 	# newline='' lets the csv module handle line endings itself.
 	with open(file, 'r', encoding='utf-8', newline='') as opening:
 		for line in csv.reader(opening, delimiter=";"):
 			if not line:
 				# csv.reader yields [] for blank lines
 				continue
 			filename = 'agnes_' + line[0] + ".xml"
 			soup.title.string = f"Acte {line[0]} d'Agnès de Bourgogne ({line[4]})"
 			p_sourceDesc.string = f"""Ce fichier contient l'acte n°{line[0]} d'Agnès de Bourgogne, {line[9]} ({line[2]}, {line[1]}) daté du {line[4]} à {line[5]}."""
 			with open(os.path.join(output_dir, filename), 'w', encoding="utf-8") as writting:
 				writting.write(str(soup))
 # Empty TEI document used as the template for every per-act file: the header
 # has blank <title>/<author> and an empty <sourceDesc>, and the <body> is empty.
 # Plain string literal: the template contains no placeholders.
 canvas = """<?xml version="1.0" encoding="utf-8"?>
 <TEI xmlns="http://www.tei-c.org/ns/1.0">
 <teiHeader>
 <fileDesc>
 <titleStmt>
 <title></title>
 <author></author>
 </titleStmt>
 <publicationStmt>
 <publisher>Laboratoire de Médiévistique occidentale de Paris (UMR 8589), Centre de recherches historiques (UMR 8558)</publisher>
 <date when="2022">2022</date>
 <availability><licence source="https://github.com/etalab/licence-ouverte/blob/master/open-licence.md">Distributed under an Open License 2.0</licence></availability>
 </publicationStmt>
 <sourceDesc>
 </sourceDesc>
 </fileDesc>
 </teiHeader>
 <text>
 <body>
 </body>
 </text>
 </TEI>"""
 # NOTE(review): both steps run at import time and use machine-specific paths.
 # Step 1 writes one header-only file per CSV row; step 2 cleans the source
 # document and appends each act to the file written in step 1.
 teiheader_making(
 	file="./actes-ducs-bourbon/base-donnees-m2/agnes_actes.csv",
 	tei_canvas=canvas,
 )
 beautiful_clean_up(path="/home/genero/Bureau/Bourbon/agnes_actes-1.xml")