You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
66 lines
1.7 KiB
Python
66 lines
1.7 KiB
Python
#!/usr/bin/python
|
|
# -*- coding: UTF-8 -*-
|
|
|
|
"""
Author : Jean-Damien Généro
Affiliation : French National Center for Scientific Research (CNRS)
Assigned at the Centre de recherches historiques (CRH, UMR 8558)
Date : 2022-09-29
Update :
"""
|
|
|
|
from bs4 import BeautifulSoup
|
|
import spacy
|
|
|
|
|
|
def spacy_bb(xml):
    """Print the ``PER`` entities spaCy finds in each act of an XML file.

    The file is parsed with BeautifulSoup's ``xml`` parser, every ``<note>``
    element is removed from the tree, and the text of each ``<p>`` found in
    every ``<div type="acte">`` is run through the ``fr_core_news_sm``
    pipeline. A separator line is printed before each act, followed by one
    line per entity whose label is ``PER``.

    :param xml: path of the XML file, opened as UTF-8 text.
    :return: None; all results are written to standard output.
    """
    with open(xml, 'r', encoding='utf-8') as corpus:
        soup = BeautifulSoup(corpus, 'xml')

    # Remove <note> elements first so their text is not part of p.text below.
    for note in soup.find_all('note'):
        note.decompose()

    actes = soup.find_all('div', {'type': 'acte'})
    # Load the pipeline once; it is reused for every paragraph.
    nlp = spacy.load('fr_core_news_sm')

    for div in actes:
        print("\n\n\n ###############")
        for p in div.find_all('p'):
            doc = nlp(p.text)
            for ent in doc.ents:
                if ent.label_ == 'PER':
                    print(ent)
|
|
|
|
|
|
def clean_training_data(xml):
    """Return the first ``<p>`` of each act, serialized with renamed entity tags.

    The file is parsed with BeautifulSoup's ``xml`` parser and cleaned in place:

    * the ``ref`` attribute is deleted from every ``<placeName>`` and
      ``<persName>`` element that carries one;
    * every ``<note>`` and ``<pb>`` element is removed with its content;
    * every ``<hi>`` element is replaced by its own children.

    For each ``<div type="acte">`` the first ``<p>`` it contains is then
    serialized to a string in which ``persName`` becomes ``PERSON`` and
    ``placeName`` becomes ``LOC``.

    :param xml: path of the XML file, opened as UTF-8 text.
    :return: list of strings, one per act that contains a ``<p>``. Acts
        without any ``<p>`` are skipped.
    """
    with open(xml, 'r', encoding='utf-8') as opening:
        soup = BeautifulSoup(opening, 'xml')

    for tag_name in ('placeName', 'persName'):
        for entity in soup.find_all(tag_name, {'ref': True}):
            del entity['ref']

    for tag_name in ('note', 'pb'):
        for tag in soup.find_all(tag_name):
            tag.decompose()

    # unwrap() keeps the children of <hi> in place and drops only the tag.
    for hi in soup.find_all('hi'):
        hi.unwrap()

    examples = []
    for div in soup.find_all('div', {'type': 'acte'}):
        paragraph = div.p
        if paragraph is None:
            # No <p> in this act: str(None) would add the literal 'None'.
            continue
        # The enclosing <p>...</p> markup is kept in the serialized string.
        txt = str(paragraph)
        # NOTE: plain string replacement, so it rewrites the opening and
        # closing tag names and also any occurrence of these words in text.
        txt = txt.replace('persName', 'PERSON')
        txt = txt.replace('placeName', 'LOC')
        examples.append(txt)
    return examples
|
|
|
|
|
|
|
|
# spacy_bb("../bourbon-latex/charles-actes-latex.xml")
|
|
|
|
|
|
|
|
|
|
# nlp = spacy.load('fr_core_news_lg')
|
|
|
|
# ner = nlp.get_pipe("ner")
|