#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
Authors : Jean - Damien Généro , Bertrand Dumenieu
Affiliation : French National Center for Scientific Research ( CNRS ) , École des hautes études en sciences sociales ( EHESS )
Assigned at the Centre de recherches historiques ( CRH , UMR 8558 )
Date : 2021 - 12 - 31
Update : 2022 - 01 - 05
Regex dans les ODT :
- j ' enlève tous les sauts de page (ctrl+A, puis Format, Paragraphe, onglet Enchainements, décocher Saut / Insérer.)
- je vérifie que tous les séparateurs sont sur une seule ligne : ` + \* \* ` >> ` \n $ 1 `
- je transforme les séparateurs : ` \* { 3 } ` >> ` - * - `
"""
import csv
import os
import re
from bs4 import BeautifulSoup
# Paragraph that separates two acts in the converted corpus: it holds only "-*-".
_ACT_SEPARATOR = re.compile(
    r'<p rend=".+">\n +<seg rend="Aucun"><hi rend="sub">-\*</hi>-</seg>\n +</p>'
)
# Inline elements whose tags are dropped while their content is kept in place.
_UNWRAPPED_TAGS = ['ident', 'seg', 'emph', 'name']


def _wrap_acts_in_divs(raw_xml):
    """Return *raw_xml* with the text between act separators wrapped in <div>s."""
    marked = _ACT_SEPARATOR.sub('</div><div>', raw_xml)
    marked = re.sub(r'<body>\n +<figure>', '<body>\n<div>', marked)
    # This is a pattern, so it needs re.sub: with str.replace it never matched
    # and the last <div> was left without its closing tag.
    marked = re.sub(r'</figure>\n +</body>', '</div>\n</body>', marked)
    for leftover in ('<figure>', '</figure>', '<ident>', '</ident>'):
        marked = marked.replace(leftover, '')
    # Typographic apostrophes become plain ones.
    return marked.replace('’', "'")


def _act_number(div):
    """Return the text of the first <p> of *div* with all whitespace removed."""
    first_paragraph = div.p
    if first_paragraph is None:
        raise ValueError('act <div> without any <p>: cannot read its number')
    # get_text() also works when the paragraph holds several text fragments,
    # a case in which .string is None.
    return ''.join(first_paragraph.get_text().split())


def beautiful_clean_up(path, output_dir="../static/xml/Bourbon/5-Charles-Ier"):
    """Split the converted corpus into acts and append each act to its TEI file.

    The file at *path* is read as text and the acts are delimited with <div>
    elements. Elements without any text are removed, and <ident>, <seg>,
    <emph> and <name> are unwrapped. Each <div> is numbered from the text of
    its first <p> (``n`` and ``xml:id="charles_ier_<n>"``). A leading "Corps"
    is stripped from the ``rend`` attribute of <p> elements. Every <div> is
    then appended to the <body> of ``charles_ier_<n>.xml`` in *output_dir*,
    which is rewritten pretty-printed.

    The append is unconditional: running the function twice on the same
    files adds every act twice.

    Args:
        path: Path of the XML corpus to split.
        output_dir: Directory holding the per-act files; each file is read
            before being rewritten, so it must already exist.

    Raises:
        FileNotFoundError: If *path* or one of the per-act files is missing.
        ValueError: If an act <div> contains no <p> to take its number from.
    """
    with open(path, 'r', encoding='utf-8') as opened:
        reading = opened.read()

    soup = BeautifulSoup(_wrap_acts_in_divs(reading), 'xml')

    # Credit line in the corpus header; skipped when there is no <titleStmt>.
    title_stmt = soup.titleStmt
    if title_stmt is not None:
        author = soup.new_tag('author')
        author.string = 'Jean-Damien Généro'
        title_stmt.append(author)

    # Remove every element that carries no text at all.
    for element in soup.find_all():
        if not element.get_text(strip=True):
            element.extract()

    for wrapper in soup.find_all(_UNWRAPPED_TAGS):
        wrapper.unwrap()

    # The first paragraph of each act provides its number.
    for div in soup.find_all('div'):
        number = _act_number(div)
        div['xml:id'] = 'charles_ier_' + number
        div['n'] = number

    # rend="Corps" is dropped; rend="Corps X" keeps only "X".
    for paragraph in soup.find_all('p'):
        rend = paragraph.get('rend')
        if rend is None:
            continue
        prefix, _, remainder = rend.partition(' ')
        if prefix != 'Corps':
            continue
        remainder = remainder.strip()
        if remainder:
            paragraph['rend'] = remainder
        else:
            del paragraph['rend']

    # NOTE: an earlier revision sketched here the conversion of paragraphs
    # matching r'([A-Z])\. +[OC]' into <witness n="..."> elements; it was
    # never enabled.

    # Serialise, collapse each run of spaces, tabs and line breaks into one
    # space (non-breaking spaces are left alone), then parse again to work on
    # a fresh tree.
    flattened = re.sub(r'[ \t\r\n]+', ' ', str(soup))
    newsoup = BeautifulSoup(flattened, 'xml')

    for div in newsoup.body.find_all('div'):
        act_path = os.path.join(output_dir, 'charles_ier_' + div['n'] + '.xml')
        with open(act_path, 'r', encoding='utf-8') as opening:
            soup_act = BeautifulSoup(opening, 'xml')
        soup_act.body.append(div)
        with open(act_path, 'w', encoding='utf-8') as writing:
            writing.write(soup_act.prettify())
def teiheader_making(file, tei_canvas, output_dir="../static/xml/Bourbon/5-Charles-Ier"):
    """Write one TEI file per CSV row, with its title and source note filled in.

    *tei_canvas* is parsed once. For every row of the semicolon-delimited CSV,
    the <title> and a <p> inside <sourceDesc> are rewritten and the whole
    document is saved as ``charles_ier_<column 0>.xml`` in *output_dir*,
    overwriting any existing file.

    Columns 0, 1, 2, 4, 5 and 9 of each row are interpolated into the text.
    Column 0 is used as the act number and column 4 is introduced by
    "daté du". NOTE(review): every non-blank row is processed, so a header
    row, if the CSV has one, also produces a file; confirm against the data.

    Args:
        file: Path of the CSV file describing the acts.
        tei_canvas: TEI template (XML string) containing <title> and
            <sourceDesc> elements.
        output_dir: Directory in which the per-act files are written.

    Raises:
        IndexError: If a non-blank row has fewer than ten columns.
    """
    soup = BeautifulSoup(tei_canvas, 'xml')
    source_note = soup.new_tag('p')
    # Attached once; only its text changes from one row to the next.
    soup.sourceDesc.append(source_note)

    # newline='' leaves the handling of line endings to the csv module.
    with open(file, 'r', newline='', encoding='utf-8') as opening:
        for line in csv.reader(opening, delimiter=';'):
            if not line:
                # csv.reader yields an empty list for a blank line.
                continue
            number = line[0]
            soup.title.string = f'Acte {number} de Charles Ier de Bourbon ({line[4]})'
            source_note.string = (
                f"Ce fichier contient l'acte n° {number} de Charles Ier de Bourbon, "
                f"{line[9]} ({line[2]}, {line[1]}) daté du {line[4]} à {line[5]}."
            )
            act_path = os.path.join(output_dir, f'charles_ier_{number}.xml')
            with open(act_path, 'w', encoding='utf-8') as writing:
                writing.write(str(soup))
# Empty TEI document used as the template for every per-act file. It is a
# plain string: there is nothing to interpolate, so no f-prefix is needed.
canvas = """<?xml version="1.0" encoding="utf-8"?>
<TEI xmlns="http://www.tei-c.org/ns/1.0">
    <teiHeader>
        <fileDesc>
            <titleStmt>
                <title></title>
                <author></author>
            </titleStmt>
            <publicationStmt>
                <publisher>Laboratoire de Médiévistique occidentale de Paris (UMR 8589), Centre de recherches historiques (UMR 8558)</publisher>
                <date when="2022">2022</date>
                <availability><licence source="https://github.com/etalab/licence-ouverte/blob/master/open-licence.md">Distributed under an Open License 2.0</licence></availability>
            </publicationStmt>
            <sourceDesc>
            </sourceDesc>
        </fileDesc>
    </teiHeader>
    <text>
        <body>
        </body>
    </text>
</TEI>"""

# Order matters: the first call writes the per-act files, the second one
# reads them back to append the text of each act.
teiheader_making("../static/csv/corpus-charles-i.csv", canvas)
beautiful_clean_up("../static/xml/Bourbon/5-Charles-Ier/corpus-charles-i.xml")