#!/usr/bin/python
# -*- coding: UTF-8 -*-

"""
Author : Jean-Damien Généro
Affiliation : French National Center for Scientific Research (CNRS)
Assigned at the Centre de recherches historiques (CRH, UMR 8558)
Date : 2022-09-29
Update :
"""

from bs4 import BeautifulSoup
import spacy


def spacy_bb(xml):
    """Print and collect the person entities spaCy finds in each acte.

    The XML file is parsed, every <note> element is removed, and each <p>
    of every <div type="acte"> is run through the ``fr_core_news_sm``
    pipeline. Entities labelled ``PER`` are printed (one banner line is
    printed before each acte) and their text is accumulated.

    :param xml: path to the XML file to analyse (read as UTF-8).
    :return: list of the text of every ``PER`` entity, in document order.
    """
    result = []
    with open(xml, 'r', encoding='utf-8') as corpus:
        soup = BeautifulSoup(corpus, 'xml')

    # Notes are dropped before analysis so their text never reaches spaCy.
    for note in soup.find_all('note'):
        note.decompose()

    actes = soup.find_all('div', {'type': 'acte'})
    nlp = spacy.load('fr_core_news_sm')

    for div in actes:
        print("\n\n\n ###############")
        for p in div.find_all('p'):
            doc = nlp(p.text)
            for ent in doc.ents:
                if ent.label_ == 'PER':
                    print(ent)
                    result.append(ent.text)
    return result


def clean_training_data(xml):
    """Return the first <p> of each acte as a string with relabelled tags.

    Processing steps applied to the parsed document:

    * the ``ref`` attribute is deleted from <placeName> and <persName>;
    * <note> and <pb> elements are removed together with their content;
    * <hi> elements are unwrapped (their children are kept in place);
    * for each <div type="acte">, its first <p> is serialised and the
      substrings ``persName`` / ``placeName`` are replaced by ``PERSON`` /
      ``LOC``.

    Divs that contain no <p> are skipped instead of contributing the
    string ``'None'``.

    :param xml: path to the XML file to clean (read as UTF-8).
    :return: list of serialised <p> elements, one per acte that has one.
    """
    with open(xml, 'r', encoding='utf-8') as opening:
        soup = BeautifulSoup(opening, 'xml')

    # Strip the "ref" attribute from the entity elements.
    for tag_name in ('placeName', 'persName'):
        for element in soup.find_all(tag_name, {'ref': True}):
            del element['ref']

    # Each tag name is searched afresh so that elements already destroyed
    # by an earlier decompose() are never visited again.
    for tag_name in ('note', 'pb'):
        for element in soup.find_all(tag_name):
            element.decompose()

    for hi in soup.find_all('hi'):
        hi.unwrap()

    examples = []
    for div in soup.find_all('div', {'type': 'acte'}):
        first_p = div.p
        if first_p is None:
            # str(None) would otherwise add a bogus 'None' example.
            continue
        txt = str(first_p)
        # NOTE(review): two disabled ``txt.replace(<line-break char>, '')``
        # calls used to sit here; the exact characters could not be
        # recovered -- confirm whether line breaks should be stripped.
        # Plain substring replacement: it renames the opening and closing
        # tags, and would also hit the same words inside the text content.
        txt = txt.replace('persName', 'PERSON')
        txt = txt.replace('placeName', 'LOC')
        examples.append(txt)
    return examples


# spacy_bb("../bourbon-latex/charles-actes-latex.xml")

# nlp = spacy.load('fr_core_news_lg')
# ner = nlp.get_pipe("ner")