actes-princiers/corpus-nlp/spacy_bourbon.py

#!/usr/bin/python
# -*- coding: UTF-8 -*-

"""
Author : Jean-Damien Généro
Affiliation : French National Center for Scientific Research (CNRS)
Assigned at the Centre de recherches historiques (CRH, UMR 8558)
Date : 2022-09-29
Update :
"""

from bs4 import BeautifulSoup
import spacy


def spacy_bb(xml, *, model='fr_core_news_sm', labels=('PER',)):
    """Print the entities spaCy finds in each act of an XML corpus.

    The file is parsed with BeautifulSoup's XML parser, every ``<note>``
    element is removed, and each ``<p>`` found inside a
    ``<div type="acte">`` is run through the spaCy pipeline. Entities whose
    label is in *labels* are printed one per line; a separator line is
    printed before each act.

    :param xml: path to the XML file to analyse.
    :param model: name of the spaCy pipeline to load (keyword-only).
    :param labels: entity label, or collection of entity labels, to print
        (keyword-only). Defaults to persons only (``'PER'``).
    :return: None; results are written to standard output.
    """
    # A bare string would otherwise be tested by substring membership.
    if isinstance(labels, str):
        labels = (labels,)

    with open(xml, 'r', encoding='utf-8') as corpus:
        soup = BeautifulSoup(corpus, 'xml')

    # Drop <note> elements so their text is not fed to the NER model.
    for note in soup.find_all('note'):
        note.decompose()

    actes = soup.find_all('div', {'type': 'acte'})
    nlp = spacy.load(model)

    for div in actes:
        # Visual separator between two acts in the console output.
        print("\n\n\n ###############")
        for p in div.find_all('p'):
            doc = nlp(p.text)
            for ent in doc.ents:
                if ent.label_ in labels:
                    print(ent)


def clean_training_data(xml):
    """Build a list of entity-annotated paragraphs from an XML corpus.

    The file is parsed with BeautifulSoup's XML parser, then:

    * the ``ref`` attribute is removed from ``<placeName>`` and
      ``<persName>`` elements that carry one;
    * ``<note>`` and ``<pb>`` elements are deleted with their content;
    * ``<hi>`` elements are unwrapped (their children are kept in place);
    * for each ``<div type="acte">``, the first ``<p>`` is serialised and
      ``persName`` / ``placeName`` are renamed to ``PERSON`` / ``LOC``.

    The surrounding ``<p>`` tag is kept in each returned string. A div that
    contains no ``<p>`` is skipped, so the literal string ``'None'`` is
    never added to the examples.

    :param xml: path to the XML file to read.
    :return: list of strings, one serialised paragraph per act.
    """
    with open(xml, 'r', encoding='utf-8') as opening:
        soup = BeautifulSoup(opening, 'xml')

    # Strip the @ref attribute from the entity elements.
    for tag_name in ('placeName', 'persName'):
        for element in soup.find_all(tag_name, {'ref': True}):
            del element['ref']

    # Delete these elements together with everything they contain.
    for tag_name in ('note', 'pb'):
        for element in soup.find_all(tag_name):
            element.decompose()

    # Keep the content of <hi> but drop the tag itself.
    for hi in soup.find_all('hi'):
        hi.unwrap()

    examples = []
    for div in soup.find_all('div', {'type': 'acte'}):
        paragraph = div.p
        if paragraph is None:
            # No <p> in this act: nothing to export.
            continue
        txt = str(paragraph)
        # NOTE: plain textual replacement; it also rewrites any occurrence
        # of these names inside text content or attribute values.
        txt = txt.replace('persName', 'PERSON')
        txt = txt.replace('placeName', 'LOC')
        examples.append(txt)
    return examples


# spacy_bb("../bourbon-latex/charles-actes-latex.xml")


# nlp = spacy.load('fr_core_news_lg')

# ner = nlp.get_pipe("ner")