#!/usr/bin/python
# -*- coding: UTF-8 -*-
r"""Build one TEI file per act of Charles I of Bourbon.

Authors : Jean-Damien Généro, Bertrand Dumenieu
Affiliation : French National Center for Scientific Research (CNRS),
    École des hautes études en sciences sociales (EHESS)
Assigned at the Centre de recherches historiques (CRH, UMR 8558)
Date : 2021-12-31
Update : 2022-01-05

Regex preparation done by hand in the ODT sources:
- remove every page break (ctrl+A, then Format, Paragraph, "Text Flow" tab,
  untick Break / Insert);
- make sure every separator sits on a single line: ` +\*\*` >> `\n$1`;
- rewrite the separators: `\*{3}` >> `-*-`.
"""

import csv
import os
import re

from bs4 import BeautifulSoup

# Directory holding one ``charles_ier_<n>.xml`` file per act.
ACTS_DIR = "../static/xml/Bourbon/5-Charles-Ier"

# Inline elements whose tag is dropped while their content is kept.
_UNWRAPPED_TAGS = ('ident', 'seg', 'emph', 'name')


def beautiful_clean_up(path: str) -> None:
    """Clean the corpus XML at *path* and append each act to its own file.

    The corpus is normalised with string/regex substitutions, parsed, stripped
    of empty and purely inline elements, then every ``<div>`` is numbered from
    the text of its first ``<p>`` and appended to the ``<body>`` of the
    already existing ``charles_ier_<n>.xml`` file in ``ACTS_DIR``.

    Raises whatever ``open`` raises when *path* or a per-act file is missing,
    and ``AttributeError`` when a ``<div>`` has no ``<p>`` with a single
    string child.
    """
    with open(path, 'r', encoding='utf-8') as opened:
        reading = opened.read()

    # acts
    # NOTE(review): several literals below look as if XML markup was lost
    # from them (empty patterns, bare newlines). They are kept exactly as
    # they read here; confirm them against the repository before running.
    separator = r'\n\n\n +-\*<\/hi>-<\/seg>\n +<\/p>'
    reading = re.sub(separator, '\n\n', reading)
    reading = re.sub(r'\n +\n', '\n\n', reading)
    # NOTE(review): str.replace treats this pattern literally (backslashes
    # included), so it presumably never matches; re.sub may have been meant.
    reading = reading.replace(r'<\/figure>\n +<\/body>', '\n\n')
    reading = reading.replace('\n', '')
    reading = reading.replace('\n', '')
    reading = reading.replace('', '')
    reading = reading.replace('', '')
    reading = reading.replace("’", "'")

    soup = BeautifulSoup(reading, 'xml')

    # title statement: only the author is added here. Filling <title>,
    # <publicationStmt> and <sourceDesc> from this function was drafted at
    # some point but is disabled.
    author = soup.new_tag('author')
    author.string = 'Jean-Damien Généro'
    soup.titleStmt.append(author)

    # removing all empty tags
    for candidate in soup.find_all():
        if not candidate.get_text(strip=True):
            candidate.extract()

    # number: drop inline wrappers, then label each act
    for tag_name in _UNWRAPPED_TAGS:
        for wrapper in soup.find_all(tag_name):
            wrapper.unwrap()
    for act in soup.find_all('div'):
        # The first paragraph of an act holds its number.
        number = act.p.string.replace(' ', '')
        act['xml:id'] = 'charles_ier_' + number
        act['n'] = number

    # witness: drop the default paragraph style
    for paragraph in soup.find_all('p'):
        if paragraph.get('rend') == 'Corps':
            del paragraph['rend']
    # A pass wrapping paragraphs that carry a witness siglum in
    # <witness n="..."> was drafted here but is disabled.

    result = str(soup)
    result = result.replace('p rend="Corps ', 'p rend="')
    # NOTE(review): as written this replaces a space with a space; the first
    # literal may originally have been a different whitespace sequence.
    result = result.replace(' ', ' ')
    result = result.replace('\n', '')
    # NOTE(review): adjacent literals concatenate to '.replace(, )'; this
    # line looks damaged. Kept verbatim.
    result = result.replace('.replace(' ', '')', '')

    newsoup = BeautifulSoup(result, 'xml')
    for div in newsoup.body.find_all('div'):
        target = os.path.join(ACTS_DIR, 'charles_ier_' + div['n'] + '.xml')
        with open(target, 'r', encoding='utf-8') as opening:
            soup_act = BeautifulSoup(opening, 'xml')
        soup_act.body.append(div)
        with open(target, 'w', encoding='utf-8') as writing:
            writing.write(soup_act.prettify())


def teiheader_making(file: str, tei_canvas: str) -> None:
    """Write one TEI skeleton per row of the ``;``-separated CSV *file*.

    *tei_canvas* is parsed once as XML; for every row its ``<title>`` is
    rewritten and a ``<p>`` describing the act is placed in ``<sourceDesc>``,
    then the document is saved as ``charles_ier_<row[0]>.xml`` in
    ``ACTS_DIR``. Columns 0, 1, 2, 4, 5 and 9 of each row are read; column 0
    is used as the act number (their other meanings are not visible here).
    """
    soup = BeautifulSoup(tei_canvas, 'xml')
    # The same <p> is reused for every row: appending it again moves it, so
    # <sourceDesc> never accumulates more than one copy.
    p_sourceDesc = soup.new_tag('p')
    with open(file, 'r', encoding='utf-8') as opening:
        for line in csv.reader(opening, delimiter=";"):
            filename = 'charles_ier_' + line[0] + ".xml"
            soup.title.string = (
                f'Acte {line[0]} de Charles Ier de Bourbon ({line[4]})'
            )
            p_sourceDesc.string = (
                f"Ce fichier contient l'acte n°{line[0]} de Charles Ier de "
                f"Bourbon, {line[9]} ({line[2]}, {line[1]}) daté du "
                f"{line[4]} à {line[5]}."
            )
            soup.sourceDesc.append(p_sourceDesc)
            with open(os.path.join(ACTS_DIR, filename), 'w',
                      encoding="utf-8") as writing:
                writing.write(str(soup))


# NOTE(review): only the text content of the TEI skeleton is visible here;
# the XML markup (teiHeader, titleStmt, title, sourceDesc, body, ...) that
# the functions above rely on appears to be missing from this literal and
# must be restored from the repository.
canvas = (
    " Laboratoire de Médiévistique occidentale de Paris (UMR 8589), "
    "Centre de recherches historiques (UMR 8558) 2022 "
    "Distributed under an Open License 2.0 "
)

# Headers first: beautiful_clean_up appends to the files created here.
teiheader_making("../static/csv/corpus-charles-i.csv", canvas)
beautiful_clean_up("../static/xml/Bourbon/5-Charles-Ier/corpus-charles-i.xml")