diff --git a/app/cmd/bourbon_clean_up.py b/app/cmd/bourbon_clean_up.py index 73029b1..ec47554 100644 --- a/app/cmd/bourbon_clean_up.py +++ b/app/cmd/bourbon_clean_up.py @@ -22,52 +22,59 @@ def beautiful_clean_up(path): # acts separator = r'

\n +-\*<\/hi>-<\/seg>\n +<\/p>' reading = re.sub(separator, '

', reading) - reading = reading.replace('
', '
') - reading = reading.replace('
', '
') + reading = re.sub(r'\n +
', '\n
', reading) + reading = reading.replace(r'<\/figure>\n +<\/body>', '
\n') + reading = reading.replace('
', '') + reading = reading.replace('
', '') + reading = reading.replace('', '') + reading = reading.replace('', '') + reading = reading.replace("’", "'") soup = BeautifulSoup(reading, 'xml') # titre titleStmt = soup.titleStmt - title = soup.new_tag('title') + # title = soup.new_tag('title') author = soup.new_tag('author') - title.string = 'Actes d\'Agnès de Bourgogne' + # title.string = 'Actes de Charles Ier' author.string = 'Jean-Damien Généro' - titleStmt.append(title) + # titleStmt.append(title) titleStmt.append(author) # publicationStmt - publicationStmt = soup.publicationStmt - date = soup.new_tag('date') - date['when'] = '2022' - date.string = '2022' - publisher = soup.new_tag('publisher') - publisher.string = 'Laboratoire de Médiévistique occidentale de Paris (UMR 8589), Centres de recherches historiques (UMR 8558)' - availability = soup.new_tag('availability') - licence = soup.new_tag('licence') - licence['source'] = 'https://github.com/etalab/licence-ouverte/blob/master/open-licence.md' - licence.string = 'Distributed under an Open License 2.0' - availability.append(licence) - publicationStmt.append(publisher) - publicationStmt.append(date) - publicationStmt.append(availability) + # publicationStmt = soup.publicationStmt + # date = soup.new_tag('date') + # date['when'] = '2022' + # date.string = '2022' + # publisher = soup.new_tag('publisher') + # publisher.string = 'Laboratoire de Médiévistique occidentale de Paris (UMR 8589), Centres de recherches historiques (UMR 8558)' + # availability = soup.new_tag('availability') + # licence = soup.new_tag('licence') + # licence['source'] = 'https://github.com/etalab/licence-ouverte/blob/master/open-licence.md' + # licence.string = 'Distributed under an Open License 2.0' + # availability.append(licence) + # publicationStmt.append(publisher) + # publicationStmt.append(date) + # publicationStmt.append(availability) # sourceDesc - sourceDesc = soup.sourceDesc - sourceDesc_p = soup.new_tag('p') - sourceDesc_p.string = 'Les actes ci-dessous sont issus de plusieurs dépôts d\'archives.' - sourceDesc.append(sourceDesc_p) + # sourceDesc = soup.sourceDesc + # sourceDesc_p = soup.new_tag('p') + # sourceDesc_p.string = 'Les actes ci-dessous sont issus de plusieurs dépôts d\'archives.' + # sourceDesc.append(sourceDesc_p) # removing all empty tags for empty_tag in soup.find_all(): if len(empty_tag.get_text(strip=True)) == 0: empty_tag.extract() # number - for match in soup.find_all('div'): - match['xml:id'] = 'agnes-' + match.p.string - match['n'] = match.p.string - # witness + for match in soup.findAll('ident'): + match.replaceWithChildren() for match in soup.findAll('seg'): match.replaceWithChildren() for match in soup.findAll('emph'): match.replaceWithChildren() for match in soup.findAll('name'): match.replaceWithChildren() + for match in soup.find_all('div'): + match['xml:id'] = 'charles_ier_' + match.p.string.replace(' ', '') + match['n'] = match.p.string.replace(' ', '') + # witness for tag in soup.find_all('p'): try: tag['rend'] @@ -85,17 +92,18 @@ def beautiful_clean_up(path): result = result.replace('p rend="Corps ', 'p rend="') result = result.replace(' ', ' ') result = result.replace('\n', '') - with open("/home/genero/Bureau/Bourbon/agnes_actes-2.xml", 'w', encoding='utf-8') as writting: - writting.write(result) + result = result.replace('.replace(' ', '')', '') newsoup = BeautifulSoup(result, 'xml') + # with open('../static/xml/Bourbon/5-Charles-Ier/corpus2.xml', 'w', encoding='utf-8') as op: + # op.write(newsoup.prettify()) body = newsoup.body for div in body.find_all('div'): #print("\n\n" + div['xml:id']) - filename = 'agnes_' + div['n'] + '.xml' - with open(os.path.join("./agnes_actes", filename), 'r', encoding='utf-8') as opening: + filename = 'charles_ier_' + div['n'] + '.xml' + with open(os.path.join("../static/xml/Bourbon/5-Charles-Ier", filename), 'r', encoding='utf-8') as opening: soup_act = BeautifulSoup(opening, 'xml') soup_act.body.append(div) - with open(os.path.join("./agnes_actes", filename), 'w', encoding='utf-8') as writting: + with open(os.path.join("../static/xml/Bourbon/5-Charles-Ier", filename), 'w', encoding='utf-8') as writting: writting.write(soup_act.prettify()) @@ -105,11 +113,11 @@ def teiheader_making(file, tei_canvas): with open(file, 'r', encoding='utf-8') as opening: csvfile = csv.reader(opening, delimiter=";") for line in csvfile: - filename = 'agnes_' + line[0] + ".xml" - soup.title.string = 'Acte {} d\'Agnès de Bourgogne ({})'.format(line[0], line[4]) - p_sourceDesc.string = f"""Ce fichier contient l'acte n°{line[0]} d'Agnès de Bourgogne, {line[9]} ({line[2]}, {line[1]}) daté du {line[4]} à {line[5]}.""" + filename = 'charles_ier_' + line[0] + ".xml" + soup.title.string = 'Acte {} de Charles Ier de Bourbon ({})'.format(line[0], line[4]) + p_sourceDesc.string = f"""Ce fichier contient l'acte n°{line[0]} de Charles Ier de Bourbon, {line[9]} ({line[2]}, {line[1]}) daté du {line[4]} à {line[5]}.""" soup.sourceDesc.append(p_sourceDesc) - with open(os.path.join("./agnes_actes", filename), 'w', encoding="utf-8") as writting: + with open(os.path.join("../static/xml/Bourbon/5-Charles-Ier", filename), 'w', encoding="utf-8") as writting: writting.write(str(soup)) @@ -136,6 +144,6 @@ canvas = f""" """ -teiheader_making("./actes-ducs-bourbon/base-donnees-m2/agnes_actes.csv", canvas) +teiheader_making("../static/csv/corpus-charles-i.csv", canvas) -beautiful_clean_up("/home/genero/Bureau/Bourbon/agnes_actes-1.xml") +beautiful_clean_up("../static/xml/Bourbon/5-Charles-Ier/corpus-charles-i.xml")