You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

66 lines
1.7 KiB
Python

#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
Author : Jean-Damien Généro
Affiliation : French National Center for Scientific Research (CNRS)
Assigned at the Centre de recherches historiques (CRH, UMR 8558)
Date : 2022-09-29
Update :
"""
from bs4 import BeautifulSoup
import spacy
def spacy_bb(xml):
    """Print the person entities spaCy detects in each act of an XML file.

    ``<note>`` elements are removed from the document first, so their text
    is not passed to the model. For every ``<div type="acte">`` a separator
    line is printed, then each ``<p>`` of the act is run through the
    ``fr_core_news_sm`` pipeline and every entity labelled ``PER`` is printed.

    :param xml: path to the XML file, read as UTF-8.
    :return: None; results are written to standard output.
    """
    # BeautifulSoup consumes the whole stream when it is constructed, so the
    # file handle can be released before the (slow) NLP work starts.
    with open(xml, 'r', encoding='utf-8') as corpus:
        soup = BeautifulSoup(corpus, 'xml')

    for note in soup.find_all('note'):
        note.decompose()

    actes = soup.find_all('div', {'type': 'acte'})
    nlp = spacy.load('fr_core_news_sm')

    for div in actes:
        print("\n\n\n ###############")
        for p in div.find_all('p'):
            doc = nlp(p.text)
            for ent in doc.ents:
                if ent.label_ == 'PER':
                    print(ent)
def clean_training_data(xml):
    """Build entity-tagged training strings from the acts of an XML file.

    The parsed document is simplified before serialisation: ``ref``
    attributes are dropped from ``<placeName>`` and ``<persName>``,
    ``<note>`` and ``<pb>`` elements are deleted together with their
    content, and ``<hi>`` wrappers are replaced by their children.

    :param xml: path to the XML file, read as UTF-8.
    :return: one string per ``<div type="acte">`` that contains a ``<p>``:
        the serialised first ``<p>`` of the act, in which every occurrence
        of ``persName`` is replaced by ``PERSON`` and every occurrence of
        ``placeName`` by ``LOC``. Acts without any ``<p>`` are skipped.
    :rtype: list[str]
    """
    # BeautifulSoup reads the whole stream when it is constructed, so the
    # file does not need to stay open while the tree is edited.
    with open(xml, 'r', encoding='utf-8') as opening:
        soup = BeautifulSoup(opening, 'xml')

    # Drop the ref attribute so only the bare entity tags remain.
    for name in ('placeName', 'persName'):
        for element in soup.find_all(name, {'ref': True}):
            del element['ref']

    # Remove these elements entirely, content included.
    for name in ('note', 'pb'):
        for element in soup.find_all(name):
            element.decompose()

    # Keep the content of <hi> but discard the wrapper itself.
    for hi in soup.find_all('hi'):
        hi.unwrap()

    examples = []
    for div in soup.find_all('div', {'type': 'acte'}):
        paragraph = div.p
        if paragraph is None:
            # str(None) would otherwise add the literal 'None' as an example.
            continue
        # The surrounding <p>...</p> tags are kept in the serialised string.
        txt = str(paragraph)
        txt = txt.replace('persName', 'PERSON')
        txt = txt.replace('placeName', 'LOC')
        examples.append(txt)
    return examples
# spacy_bb("../bourbon-latex/charles-actes-latex.xml")
# nlp = spacy.load('fr_core_news_lg')
# ner = nlp.get_pipe("ner")