@ -22,52 +22,59 @@ def beautiful_clean_up(path):
# acts
# acts
separator = r ' <p rend= " .+ " > \ n +<seg rend= " Aucun " ><hi rend= " sub " >- \ *< \ /hi>-< \ /seg> \ n +< \ /p> '
separator = r ' <p rend= " .+ " > \ n +<seg rend= " Aucun " ><hi rend= " sub " >- \ *< \ /hi>-< \ /seg> \ n +< \ /p> '
reading = re . sub ( separator , ' </div><div> ' , reading )
reading = re . sub ( separator , ' </div><div> ' , reading )
reading = reading . replace ( ' <figure> ' , ' <div> ' )
reading = re . sub ( r ' <body> \ n +<figure> ' , ' <body> \n <div> ' , reading )
reading = reading . replace ( ' </figure> ' , ' </div> ' )
reading = reading . replace ( r ' < \ /figure> \ n +< \ /body> ' , ' </div> \n </body> ' )
reading = reading . replace ( ' <figure> ' , ' ' )
reading = reading . replace ( ' </figure> ' , ' ' )
reading = reading . replace ( ' <ident> ' , ' ' )
reading = reading . replace ( ' </ident> ' , ' ' )
reading = reading . replace ( " ’ " , " ' " )
soup = BeautifulSoup ( reading , ' xml ' )
soup = BeautifulSoup ( reading , ' xml ' )
# titre
# titre
titleStmt = soup . titleStmt
titleStmt = soup . titleStmt
title = soup . new_tag ( ' title ' )
# title = soup.new_tag('title' )
author = soup . new_tag ( ' author ' )
author = soup . new_tag ( ' author ' )
title . string = ' Actes d \' Agnès de Bourgogne '
# title.string = 'Actes de Charles Ier '
author . string = ' Jean-Damien Généro '
author . string = ' Jean-Damien Généro '
titleStmt . append ( title )
# titleStmt.append(title )
titleStmt . append ( author )
titleStmt . append ( author )
# publicationStmt
# publicationStmt
publicationStmt = soup . publicationStmt
# publicationStmt = soup. publicationStmt
date = soup . new_tag ( ' date ' )
# date = soup.new_tag('date' )
date [ ' when ' ] = ' 2022 '
# date['when'] = '2022 '
date . string = ' 2022 '
# date.string = '2022 '
publisher = soup . new_tag ( ' publisher ' )
# publisher = soup.new_tag('publisher' )
publisher . string = ' Laboratoire de Médiévistique occidentale de Paris (UMR 8589), Centres de recherches historiques (UMR 8558) '
# publisher.string = ' Laboratoire de Médiévistique occidentale de Paris (UMR 8589), Centres de recherches historiques (UMR 8558)'
availability = soup . new_tag ( ' availability ' )
# availability = soup.new_tag('availability' )
licence = soup . new_tag ( ' licence ' )
# licence = soup.new_tag('licence' )
licence [ ' source ' ] = ' https://github.com/etalab/licence-ouverte/blob/master/open-licence.md '
# licence['source'] = ' https://github.com/etalab/licence-ouverte/blob/master/open-licence.md'
licence . string = ' Distributed under an Open License 2.0 '
# licence.string = 'Distributed under an Open License 2.0 '
availability . append ( licence )
# availability.append(licence )
publicationStmt . append ( publisher )
# publicationStmt.append(publisher )
publicationStmt . append ( date )
# publicationStmt.append(date )
publicationStmt . append ( availability )
# publicationStmt.append(availability )
# sourceDesc
# sourceDesc
sourceDesc = soup . sourceDesc
# sourceDesc = soup. sourceDesc
sourceDesc_p = soup . new_tag ( ' p ' )
# sourceDesc_p = soup.new_tag('p' )
sourceDesc_p . string = ' Les actes ci-dessous sont issus de plusieurs dépôts d \' archives. '
# sourceDesc_p.string = ' Les actes ci-dessous sont issus de plusieurs dépôts d\'archives.'
sourceDesc . append ( sourceDesc_p )
# sourceDesc.append(sourceDesc_p )
# removing all empty tags
# removing all empty tags
for empty_tag in soup . find_all ( ) :
for empty_tag in soup . find_all ( ) :
if len ( empty_tag . get_text ( strip = True ) ) == 0 :
if len ( empty_tag . get_text ( strip = True ) ) == 0 :
empty_tag . extract ( )
empty_tag . extract ( )
# number
# number
for match in soup . find_all ( ' div ' ) :
for match in soup . findAll ( ' ident ' ) :
match [ ' xml:id ' ] = ' agnes- ' + match . p . string
match . replaceWithChildren ( )
match [ ' n ' ] = match . p . string
# witness
for match in soup . findAll ( ' seg ' ) :
for match in soup . findAll ( ' seg ' ) :
match . replaceWithChildren ( )
match . replaceWithChildren ( )
for match in soup . findAll ( ' emph ' ) :
for match in soup . findAll ( ' emph ' ) :
match . replaceWithChildren ( )
match . replaceWithChildren ( )
for match in soup . findAll ( ' name ' ) :
for match in soup . findAll ( ' name ' ) :
match . replaceWithChildren ( )
match . replaceWithChildren ( )
for match in soup . find_all ( ' div ' ) :
match [ ' xml:id ' ] = ' charles_ier_ ' + match . p . string . replace ( ' ' , ' ' )
match [ ' n ' ] = match . p . string . replace ( ' ' , ' ' )
# witness
for tag in soup . find_all ( ' p ' ) :
for tag in soup . find_all ( ' p ' ) :
try :
try :
tag [ ' rend ' ]
tag [ ' rend ' ]
@ -85,17 +92,18 @@ def beautiful_clean_up(path):
result = result . replace ( ' p rend= " Corps ' , ' p rend= " ' )
result = result . replace ( ' p rend= " Corps ' , ' p rend= " ' )
result = result . replace ( ' ' , ' ' )
result = result . replace ( ' ' , ' ' )
result = result . replace ( ' \n ' , ' ' )
result = result . replace ( ' \n ' , ' ' )
with open ( " /home/genero/Bureau/Bourbon/agnes_actes-2.xml " , ' w ' , encoding = ' utf-8 ' ) as writting :
result = result . replace ( ' .replace( ' ' , ' ' ) ' , ' ' )
writting . write ( result )
newsoup = BeautifulSoup ( result , ' xml ' )
newsoup = BeautifulSoup ( result , ' xml ' )
# with open('../static/xml/Bourbon/5-Charles-Ier/corpus2.xml', 'w', encoding='utf-8') as op:
# op.write(newsoup.prettify())
body = newsoup . body
body = newsoup . body
for div in body . find_all ( ' div ' ) :
for div in body . find_all ( ' div ' ) :
#print("\n\n" + div['xml:id'])
#print("\n\n" + div['xml:id'])
filename = ' agnes _' + div [ ' n ' ] + ' .xml '
filename = ' charles_ier _' + div [ ' n ' ] + ' .xml '
with open ( os . path . join ( " . /agnes_actes " , filename ) , ' r ' , encoding = ' utf-8 ' ) as opening :
with open ( os . path . join ( " . ./static/xml/Bourbon/5-Charles-Ier " , filename ) , ' r ' , encoding = ' utf-8 ' ) as opening :
soup_act = BeautifulSoup ( opening , ' xml ' )
soup_act = BeautifulSoup ( opening , ' xml ' )
soup_act . body . append ( div )
soup_act . body . append ( div )
with open ( os . path . join ( " . /agnes_actes " , filename ) , ' w ' , encoding = ' utf-8 ' ) as writting :
with open ( os . path . join ( " . ./static/xml/Bourbon/5-Charles-Ier " , filename ) , ' w ' , encoding = ' utf-8 ' ) as writting :
writting . write ( soup_act . prettify ( ) )
writting . write ( soup_act . prettify ( ) )
@ -105,11 +113,11 @@ def teiheader_making(file, tei_canvas):
with open ( file , ' r ' , encoding = ' utf-8 ' ) as opening :
with open ( file , ' r ' , encoding = ' utf-8 ' ) as opening :
csvfile = csv . reader ( opening , delimiter = " ; " )
csvfile = csv . reader ( opening , delimiter = " ; " )
for line in csvfile :
for line in csvfile :
filename = ' agnes _' + line [ 0 ] + " .xml "
filename = ' charles_ier _' + line [ 0 ] + " .xml "
soup . title . string = ' Acte {} d \' Agnès de Bourgogne ({} ) ' . format ( line [ 0 ] , line [ 4 ] )
soup . title . string = ' Acte {} d e Charles Ier de Bourbon ({} ) ' . format ( line [ 0 ] , line [ 4 ] )
p_sourceDesc . string = f """ Ce fichier contient l ' acte n° { line [ 0 ] } d ' Agnès de Bourgogne , { line [ 9 ] } ( { line [ 2 ] } , { line [ 1 ] } ) daté du { line [ 4 ] } à { line [ 5 ] } . """
p_sourceDesc . string = f """ Ce fichier contient l ' acte n° { line [ 0 ] } d e Charles Ier de Bourbon , { line [ 9 ] } ( { line [ 2 ] } , { line [ 1 ] } ) daté du { line [ 4 ] } à { line [ 5 ] } . """
soup . sourceDesc . append ( p_sourceDesc )
soup . sourceDesc . append ( p_sourceDesc )
with open ( os . path . join ( " . /agnes_actes " , filename ) , ' w ' , encoding = " utf-8 " ) as writting :
with open ( os . path . join ( " . ./static/xml/Bourbon/5-Charles-Ier " , filename ) , ' w ' , encoding = " utf-8 " ) as writting :
writting . write ( str ( soup ) )
writting . write ( str ( soup ) )
@ -136,6 +144,6 @@ canvas = f"""<?xml version="1.0" encoding="utf-8"?>
< / text >
< / text >
< / TEI > """
< / TEI > """
teiheader_making ( " . /actes-ducs-bourbon/base-donnees-m2/agnes_actes .csv" , canvas )
teiheader_making ( " . ./static/csv/corpus-charles-i .csv" , canvas )
beautiful_clean_up ( " /home/genero/Bureau/Bourbon/agnes_actes-1 .xml" )
beautiful_clean_up ( " ../static/xml/Bourbon/5-Charles-Ier/corpus-charles-i .xml" )