Delete bourbon_clean_up.py

5 years ago · c4249c8032
parent 12fea204ec
commit c4249c8032
1 changed files with 0 additions and 141 deletions
--- a/scripts/bourbon_clean_up.py
+++ b/scripts/bourbon_clean_up.py
@ -1,141 +0,0 @@
-#!/usr/bin/python
-# -*- coding: UTF-8 -*-
-
-"""
-Authors : Jean-Damien Généro, Bertrand Dumenieu
-Affiliation : French National Center for Scientific Research (CNRS), École des hautes études en sciences sociales (EHESS)
-Assigned at the Centre de recherches historiques (CRH, UMR 8558)
-Date : 2021-12-31
-Update : 2022-01-04
-"""
-
-
-import csv
-import os
-import re
-from bs4 import BeautifulSoup
-
-
-def beautiful_clean_up(path):
-	with open(path, 'r', encoding='utf-8') as opened:
-		reading = opened.read()
-		# acts
-		separator = r'<p rend=".+">\n +<seg rend="Aucun"><hi rend="sub">-\*<\/hi>-<\/seg>\n +<\/p>'
-		reading = re.sub(separator, '</div><div>', reading)
-		reading = reading.replace('<figure>', '<div>')
-		reading = reading.replace('</figure>', '</div>')
-		soup = BeautifulSoup(reading, 'xml')
-		# titre
-		titleStmt = soup.titleStmt
-		title = soup.new_tag('title')
-		author = soup.new_tag('author')
-		title.string = 'Actes d\'Agnès de Bourgogne'
-		author.string = 'Jean-Damien Généro'
-		titleStmt.append(title)
-		titleStmt.append(author)
-		# publicationStmt
-		publicationStmt = soup.publicationStmt
-		date = soup.new_tag('date')
-		date['when'] = '2022'
-		date.string = '2022'
-		publisher = soup.new_tag('publisher')
-		publisher.string = 'Laboratoire de Médiévistique occidentale de Paris (UMR 8589), Centres de recherches historiques (UMR 8558)'
-		availability = soup.new_tag('availability')
-		licence = soup.new_tag('licence')
-		licence['source'] = 'https://github.com/etalab/licence-ouverte/blob/master/open-licence.md'
-		licence.string = 'Distributed under an Open License 2.0'
-		availability.append(licence)
-		publicationStmt.append(publisher)
-		publicationStmt.append(date)
-		publicationStmt.append(availability)
-		# sourceDesc
-		sourceDesc = soup.sourceDesc
-		sourceDesc_p = soup.new_tag('p')
-		sourceDesc_p.string = 'Les actes ci-dessous sont issus de plusieurs dépôts d\'archives.'
-		sourceDesc.append(sourceDesc_p)
-		# removing all empty tags
-		for empty_tag in soup.find_all():
-			if len(empty_tag.get_text(strip=True)) == 0:
-				empty_tag.extract()
-		# number
-		for match in soup.find_all('div'):
-			match['xml:id'] = 'agnes-' + match.p.string
-			match['n'] = match.p.string
-		# witness
-		for match in soup.findAll('seg'):
-			match.replaceWithChildren()
-		for match in soup.findAll('emph'):
-			match.replaceWithChildren()
-		for match in soup.findAll('name'):
-			match.replaceWithChildren()
-		for tag in soup.find_all('p'):
-			try:
-				tag['rend']
-			except KeyError:
-				continue
-			if tag['rend'] == 'Corps':
-				del tag['rend']
-		# for match in soup.find_all('p'):
-		# 	if re.search(r'([A-Z])\. [OC]', match.text):
-		# 		witness = soup.new_tag('witness')
-		# 		witness.contents = match.contents
-		# 		witness['n'] = re.search(r'([A-Z])\. +[OC]', match.text).group(1)
-		# 		match.replaceWith(witness)
-		result = str(soup)
-		result = result.replace('p rend="Corps ', 'p rend="')
-		result = result.replace(' ', ' ')
-		result = result.replace('\n', '')
-		with open("/home/genero/Bureau/Bourbon/agnes_actes-2.xml", 'w', encoding='utf-8') as writting:
-			writting.write(result)
-		newsoup = BeautifulSoup(result, 'xml')
-		body = newsoup.body
-		for div in body.find_all('div'):
-			#print("\n\n" + div['xml:id'])
-			filename = 'agnes_' + div['n'] + '.xml'
-			with open(os.path.join("./agnes_actes", filename), 'r', encoding='utf-8') as opening:
-				soup_act = BeautifulSoup(opening, 'xml')
-				soup_act.body.append(div)
-			with open(os.path.join("./agnes_actes", filename), 'w', encoding='utf-8') as writting:
-				writting.write(soup_act.prettify())
-
-
-def teiheader_making(file, tei_canvas):
-	soup = BeautifulSoup(tei_canvas, 'xml')
-	p_sourceDesc = soup.new_tag('p')
-	with open(file, 'r', encoding='utf-8') as opening:
-		csvfile = csv.reader(opening, delimiter=";")
-		for line in csvfile:
-			filename = 'agnes_' + line[0] + ".xml"
-			soup.title.string = 'Acte {} d\'Agnès de Bourgogne ({})'.format(line[0], line[4])
-			p_sourceDesc.string = f"""Ce fichier contient l'acte n°{line[0]} d'Agnès de Bourgogne, {line[9]} ({line[2]}, {line[1]}) daté du {line[4]} à {line[5]}."""
-			soup.sourceDesc.append(p_sourceDesc)
-			with open(os.path.join("./agnes_actes", filename), 'w', encoding="utf-8") as writting:
-				writting.write(str(soup))
-
-
-canvas = f"""<?xml version="1.0" encoding="utf-8"?>
-<TEI xmlns="http://www.tei-c.org/ns/1.0">
-<teiHeader>
-<fileDesc>
-<titleStmt>
-<title></title>
-<author></author>
-</titleStmt>
-<publicationStmt>
-<publisher>Laboratoire de Médiévistique occidentale de Paris (UMR 8589), Centre de recherches historiques (UMR 8558)</publisher>
-<date when="2022">2022</date>
-<availability><licence source="https://github.com/etalab/licence-ouverte/blob/master/open-licence.md">Distributed under an Open License 2.0</licence></availability>
-</publicationStmt>
-<sourceDesc>
-</sourceDesc>
-</fileDesc>
-</teiHeader>
-<text>
-<body>
-</body>
-</text>
-</TEI>"""
-
-teiheader_making("./actes-ducs-bourbon/base-donnees-m2/agnes_actes.csv", canvas)
-
-beautiful_clean_up("/home/genero/Bureau/Bourbon/agnes_actes-1.xml")