From c4249c80320516921dd6a44def97364e1c3194de Mon Sep 17 00:00:00 2001 From: Jean-Damien Genero Date: Wed, 5 Jan 2022 10:10:13 +0000 Subject: [PATCH] Delete bourbon_clean_up.py --- scripts/bourbon_clean_up.py | 141 ------------------------------------ 1 file changed, 141 deletions(-) delete mode 100644 scripts/bourbon_clean_up.py diff --git a/scripts/bourbon_clean_up.py b/scripts/bourbon_clean_up.py deleted file mode 100644 index 73029b1..0000000 --- a/scripts/bourbon_clean_up.py +++ /dev/null @@ -1,141 +0,0 @@ -#!/usr/bin/python -# -*- coding: UTF-8 -*- - -""" -Authors : Jean-Damien Généro, Bertrand Dumenieu -Affiliation : French National Center for Scientific Research (CNRS), École des hautes études en sciences sociales (EHESS) -Assigned at the Centre de recherches historiques (CRH, UMR 8558) -Date : 2021-12-31 -Update : 2022-01-04 -""" - - -import csv -import os -import re -from bs4 import BeautifulSoup - - -def beautiful_clean_up(path): - with open(path, 'r', encoding='utf-8') as opened: - reading = opened.read() - # acts - separator = r'

\n +-\*<\/hi>-<\/seg>\n +<\/p>' - reading = re.sub(separator, '

', reading) - reading = reading.replace('
', '
') - reading = reading.replace('
', '
') - soup = BeautifulSoup(reading, 'xml') - # titre - titleStmt = soup.titleStmt - title = soup.new_tag('title') - author = soup.new_tag('author') - title.string = 'Actes d\'Agnès de Bourgogne' - author.string = 'Jean-Damien Généro' - titleStmt.append(title) - titleStmt.append(author) - # publicationStmt - publicationStmt = soup.publicationStmt - date = soup.new_tag('date') - date['when'] = '2022' - date.string = '2022' - publisher = soup.new_tag('publisher') - publisher.string = 'Laboratoire de Médiévistique occidentale de Paris (UMR 8589), Centres de recherches historiques (UMR 8558)' - availability = soup.new_tag('availability') - licence = soup.new_tag('licence') - licence['source'] = 'https://github.com/etalab/licence-ouverte/blob/master/open-licence.md' - licence.string = 'Distributed under an Open License 2.0' - availability.append(licence) - publicationStmt.append(publisher) - publicationStmt.append(date) - publicationStmt.append(availability) - # sourceDesc - sourceDesc = soup.sourceDesc - sourceDesc_p = soup.new_tag('p') - sourceDesc_p.string = 'Les actes ci-dessous sont issus de plusieurs dépôts d\'archives.' - sourceDesc.append(sourceDesc_p) - # removing all empty tags - for empty_tag in soup.find_all(): - if len(empty_tag.get_text(strip=True)) == 0: - empty_tag.extract() - # number - for match in soup.find_all('div'): - match['xml:id'] = 'agnes-' + match.p.string - match['n'] = match.p.string - # witness - for match in soup.findAll('seg'): - match.replaceWithChildren() - for match in soup.findAll('emph'): - match.replaceWithChildren() - for match in soup.findAll('name'): - match.replaceWithChildren() - for tag in soup.find_all('p'): - try: - tag['rend'] - except KeyError: - continue - if tag['rend'] == 'Corps': - del tag['rend'] - # for match in soup.find_all('p'): - # if re.search(r'([A-Z])\. [OC]', match.text): - # witness = soup.new_tag('witness') - # witness.contents = match.contents - # witness['n'] = re.search(r'([A-Z])\. +[OC]', match.text).group(1) - # match.replaceWith(witness) - result = str(soup) - result = result.replace('p rend="Corps ', 'p rend="') - result = result.replace(' ', ' ') - result = result.replace('\n', '') - with open("/home/genero/Bureau/Bourbon/agnes_actes-2.xml", 'w', encoding='utf-8') as writting: - writting.write(result) - newsoup = BeautifulSoup(result, 'xml') - body = newsoup.body - for div in body.find_all('div'): - #print("\n\n" + div['xml:id']) - filename = 'agnes_' + div['n'] + '.xml' - with open(os.path.join("./agnes_actes", filename), 'r', encoding='utf-8') as opening: - soup_act = BeautifulSoup(opening, 'xml') - soup_act.body.append(div) - with open(os.path.join("./agnes_actes", filename), 'w', encoding='utf-8') as writting: - writting.write(soup_act.prettify()) - - -def teiheader_making(file, tei_canvas): - soup = BeautifulSoup(tei_canvas, 'xml') - p_sourceDesc = soup.new_tag('p') - with open(file, 'r', encoding='utf-8') as opening: - csvfile = csv.reader(opening, delimiter=";") - for line in csvfile: - filename = 'agnes_' + line[0] + ".xml" - soup.title.string = 'Acte {} d\'Agnès de Bourgogne ({})'.format(line[0], line[4]) - p_sourceDesc.string = f"""Ce fichier contient l'acte n°{line[0]} d'Agnès de Bourgogne, {line[9]} ({line[2]}, {line[1]}) daté du {line[4]} à {line[5]}.""" - soup.sourceDesc.append(p_sourceDesc) - with open(os.path.join("./agnes_actes", filename), 'w', encoding="utf-8") as writting: - writting.write(str(soup)) - - -canvas = f""" - - - - - - - - -Laboratoire de Médiévistique occidentale de Paris (UMR 8589), Centre de recherches historiques (UMR 8558) -2022 -Distributed under an Open License 2.0 - - - - - - - - - -""" - -teiheader_making("./actes-ducs-bourbon/base-donnees-m2/agnes_actes.csv", canvas) - -beautiful_clean_up("/home/genero/Bureau/Bourbon/agnes_actes-1.xml")