#!/usr/bin/python
# -*- coding: UTF-8 -*-

"""
Clean up the TEI transcription of the acts of Agnès de Bourgogne and split it
into one TEI file per act.

Authors : Jean-Damien Généro, Bertrand Dumenieu
Affiliation : French National Center for Scientific Research (CNRS), École des hautes études en sciences sociales (EHESS)
Assigned at the Centre de recherches historiques (CRH, UMR 8558)
Date : 2021-12-31
Update : 2022-01-04
"""

# NOTE(review): this module was recovered from a whitespace-collapsed copy of
# patch 12fea204ec1583146c33d8016b1e42517d461f99 ("initial commit",
# Jean-Damien Genero, Wed, 5 Jan 2022 11:08:48 +0100), which created
# app/cmd/bourbon_clean_up.py.  In that copy every XML tag written inside a
# string literal had been stripped.  The literals marked "reconstructed" below
# are best-effort restorations and MUST be checked against the repository
# before this script is run on real data.

import csv
import os
import re
from bs4 import BeautifulSoup


# Directory holding one TEI file per act (agnes_<n>.xml).
DEFAULT_ACTS_DIR = "./agnes_actes"
# Where the cleaned, single-file transcription is written.
DEFAULT_CLEAN_OUTPUT = "/home/genero/Bureau/Bourbon/agnes_actes-2.xml"

# Paragraph that separates two acts in the raw export: a <p> holding only a
# "-*-" ornament.
# NOTE(review): reconstructed.  Only the fragment
# r'\n +-\*<\/hi>-<\/seg>\n +<\/p>' survived; the opening <p>, <seg> and <hi>
# tags (and their attributes) were lost, so the pattern accepts any attributes
# and either position of the opening <hi>.
_ACT_SEPARATOR = re.compile(
    r'<p(?:\s[^>]*)?>\n +'
    r'<seg(?:\s[^>]*)?>(?:<hi(?:\s[^>]*)?>)?-(?:<hi(?:\s[^>]*)?>)?\*</hi>-</seg>'
    r'\n +</p>'
)
# NOTE(review): reconstructed.  The replacement literal was entirely markup
# and is lost; closing one <div> and opening the next is assumed because the
# rest of the script expects one <div> per act.
_ACT_BOUNDARY = '</div><div>'
# NOTE(review): reconstructed.  Both arguments of the two str.replace() calls
# were lost; opening the first <div> and closing the last one is assumed.
_BODY_WRAPPERS = (
    ('<body>', '<body><div>'),
    ('</body>', '</div></body>'),
)


def _new_tag(soup, name, text=None, **attrs):
    """Create a detached tag *name* with optional text content and attributes."""
    tag = soup.new_tag(name)
    for key, value in attrs.items():
        tag[key] = value
    if text is not None:
        tag.string = text
    return tag


def _wrap_acts_in_divs(raw):
    """Turn the "-*-" separator paragraphs of *raw* into <div> boundaries."""
    raw = _ACT_SEPARATOR.sub(_ACT_BOUNDARY, raw)
    for old, new in _BODY_WRAPPERS:
        raw = raw.replace(old, new)
    return raw


def _fill_header(soup):
    """Append title, publication and source statements to the teiHeader.

    The document must already contain titleStmt, publicationStmt and
    sourceDesc elements; a missing one raises AttributeError.
    """
    # titleStmt
    title_stmt = soup.titleStmt
    title_stmt.append(_new_tag(soup, 'title', 'Actes d\'Agnès de Bourgogne'))
    title_stmt.append(_new_tag(soup, 'author', 'Jean-Damien Généro'))
    # publicationStmt
    licence = _new_tag(
        soup, 'licence', 'Distributed under an Open License 2.0',
        source='https://github.com/etalab/licence-ouverte/blob/master/open-licence.md',
    )
    availability = soup.new_tag('availability')
    availability.append(licence)
    publication_stmt = soup.publicationStmt
    # "Centre" (singular), consistent with the module docstring and `canvas`;
    # the original literal read "Centres de recherches historiques".
    publication_stmt.append(_new_tag(
        soup, 'publisher',
        'Laboratoire de Médiévistique occidentale de Paris (UMR 8589), '
        'Centre de recherches historiques (UMR 8558)',
    ))
    publication_stmt.append(_new_tag(soup, 'date', '2022', when='2022'))
    publication_stmt.append(availability)
    # sourceDesc
    soup.sourceDesc.append(_new_tag(
        soup, 'p',
        'Les actes ci-dessous sont issus de plusieurs dépôts d\'archives.',
    ))


def _clean_body(soup):
    """Drop empty tags, number the act <div>s and flatten inline markup."""
    # Remove every element that carries no text at all.
    for tag in soup.find_all():
        if not tag.get_text(strip=True):
            tag.extract()
    # The first paragraph of each <div> holds the act number.
    for div in soup.find_all('div'):
        first_p = div.p
        if first_p is None:
            # Nothing to number this act with; leave the <div> untouched
            # instead of crashing on `None.string`.
            continue
        number = first_p.get_text(strip=True)
        div['xml:id'] = 'agnes-' + number
        div['n'] = number
    # Keep the text of these inline elements but drop the elements themselves.
    for name in ('seg', 'emph', 'name'):
        for tag in soup.find_all(name):
            tag.unwrap()
    # rend="Corps" is the default paragraph style and carries no information.
    for paragraph in soup.find_all('p'):
        if paragraph.get('rend') == 'Corps':
            del paragraph['rend']
    # TODO: paragraphs matching r'([A-Z])\. [OC]' were meant to be turned into
    # <witness n="<letter>"> elements; this step was disabled in the original.


def _serialise(soup):
    """Return *soup* as a single-line string with residual styles removed."""
    result = str(soup)
    result = result.replace('p rend="Corps ', 'p rend="')
    # NOTE(review): in the recovered copy both arguments render as a plain
    # space, which makes this call a no-op; the first one was presumably a
    # non-breaking or doubled space.  Restore it from the repository.
    result = result.replace(' ', ' ')
    return result.replace('\n', '')


def _distribute_acts(xml, acts_dir):
    """Move each numbered <div> of *xml* into the <body> of its act file.

    The act files (agnes_<n>.xml) must already exist in *acts_dir*; they are
    created by teiheader_making().
    """
    body = BeautifulSoup(xml, 'xml').body
    # Only divs that received a number can be mapped to a file name.
    for div in body.find_all('div', attrs={'n': True}):
        act_path = os.path.join(acts_dir, 'agnes_' + div['n'] + '.xml')
        with open(act_path, 'r', encoding='utf-8') as act_file:
            act_soup = BeautifulSoup(act_file, 'xml')
        act_soup.body.append(div)
        with open(act_path, 'w', encoding='utf-8') as act_file:
            act_file.write(act_soup.prettify())


def beautiful_clean_up(path, output_path=DEFAULT_CLEAN_OUTPUT,
                       acts_dir=DEFAULT_ACTS_DIR):
    """Clean the raw TEI file at *path* and dispatch its acts to per-act files.

    Steps: wrap each act in a <div>, complete the teiHeader, remove empty and
    purely typographic markup, write the cleaned document to *output_path*,
    then append every numbered <div> to ``<acts_dir>/agnes_<n>.xml``.

    :param path: raw TEI/XML file to read (UTF-8).
    :param output_path: destination of the cleaned single-file document.
    :param acts_dir: directory containing the per-act files to complete.
    :raises FileNotFoundError: if *path* or a per-act file does not exist.
    :raises AttributeError: if the document lacks one of the expected
        elements (titleStmt, publicationStmt, sourceDesc, body).
    """
    with open(path, 'r', encoding='utf-8') as source:
        raw = source.read()
    soup = BeautifulSoup(_wrap_acts_in_divs(raw), 'xml')
    _fill_header(soup)
    _clean_body(soup)
    result = _serialise(soup)
    with open(output_path, 'w', encoding='utf-8') as out:
        out.write(result)
    _distribute_acts(result, acts_dir)


def teiheader_making(file, tei_canvas, acts_dir=DEFAULT_ACTS_DIR):
    """Create one TEI file per CSV row, with a header describing the act.

    Each row of the ``;``-separated CSV *file* yields
    ``<acts_dir>/agnes_<row[0]>.xml``: a copy of *tei_canvas* whose <title>
    and <sourceDesc> are filled from columns 0, 1, 2, 4, 5 and 9 of the row.

    :param file: path of the CSV file (UTF-8, ``;`` delimiter).
    :param tei_canvas: TEI skeleton (string) containing <title> and
        <sourceDesc> elements.
    :param acts_dir: directory in which the act files are written; created
        if it does not exist.
    :raises IndexError: if a non-empty row has fewer than ten columns.
    """
    os.makedirs(acts_dir, exist_ok=True)
    # newline='' lets the csv module handle line endings itself.
    with open(file, 'r', encoding='utf-8', newline='') as csv_file:
        for row in csv.reader(csv_file, delimiter=";"):
            if not row:
                # Blank line in the CSV.
                continue
            # Parse a fresh skeleton per act so no state leaks between rows.
            soup = BeautifulSoup(tei_canvas, 'xml')
            soup.title.string = f"Acte {row[0]} d'Agnès de Bourgogne ({row[4]})"
            description = soup.new_tag('p')
            description.string = (
                f"Ce fichier contient l'acte n°{row[0]} d'Agnès de Bourgogne, "
                f"{row[9]} ({row[2]}, {row[1]}) daté du {row[4]} à {row[5]}."
            )
            soup.sourceDesc.append(description)
            act_path = os.path.join(acts_dir, 'agnes_' + row[0] + ".xml")
            with open(act_path, 'w', encoding="utf-8") as out:
                out.write(str(soup))


# TEI skeleton used for every per-act file.
# NOTE(review): reconstructed.  Only the three text nodes (publisher, date,
# licence) survived; the element structure is inferred from what the functions
# above read (title, sourceDesc, body) and from the header built in
# _fill_header().  Attributes of the original skeleton (for instance a TEI
# namespace declaration) could not be recovered.
canvas = """
<TEI>
<teiHeader>
<fileDesc>
<titleStmt>
<title></title>
</titleStmt>
<publicationStmt>
<publisher>Laboratoire de Médiévistique occidentale de Paris (UMR 8589), Centre de recherches historiques (UMR 8558)</publisher>
<date when="2022">2022</date>
<availability><licence source="https://github.com/etalab/licence-ouverte/blob/master/open-licence.md">Distributed under an Open License 2.0</licence></availability>
</publicationStmt>
<sourceDesc>
</sourceDesc>
</fileDesc>
</teiHeader>
<text>
<body>
</body>
</text>
</TEI>
"""


def main():
    """Build the per-act headers, then clean the transcription and split it."""
    teiheader_making("./actes-ducs-bourbon/base-donnees-m2/agnes_actes.csv", canvas)
    beautiful_clean_up("/home/genero/Bureau/Bourbon/agnes_actes-1.xml")


if __name__ == "__main__":
    main()