You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

80 lines
2.6 KiB
Python

3 years ago
import re
from random import shuffle
from typing import Iterator, Tuple

import spacy
from bs4 import BeautifulSoup
from spacy import displacy
from spacy.tokens import DocBin
from spacy.training import Example
# Entity labels that may appear as inline <LABEL>...</LABEL> markup.
ENTITIES = ["PERSON", "LOC"]
# Regex alternation of the labels, shared by both patterns below.
entities_disjunction = '|'.join(ENTITIES)
# Raw f-strings: the backslash in `\/` must reach the regex engine as-is.
# In a non-raw literal it is an invalid escape sequence, which Python warns
# about. The resulting pattern text is unchanged.
# Groups of rgx_entity: 1 = opening label, 2 = enclosed text, 3 = closing label.
rgx_entity = re.compile(rf"<({entities_disjunction})>(.+?)<\/({entities_disjunction})>")
# Matches a single opening or closing tag; group 1 is the label.
rgx_tag = re.compile(rf"<\/?({entities_disjunction})>")
def multi_re_search(pattern, string):
    """Yield successive matches of *pattern*, re-searching the unconsumed tail.

    After each match, everything up to and including the match is dropped and
    the search restarts on what is left. Every yielded match therefore has
    its span (and its ``.string``) relative to the remainder that was
    searched at that step, not to the original *string*.

    A zero-length match at position 0 consumes nothing. To guarantee
    termination, one character is skipped after such a match, and iteration
    stops once the remainder is empty.
    """
    while True:
        m = re.search(pattern, string)
        if not m:
            break
        yield m
        if m.end() > 0:
            string = string[m.end():]
        elif string:
            # Empty match at the very start: step over one character so the
            # next search cannot return the same empty match again.
            string = string[1:]
        else:
            # Empty match on an empty remainder: nothing left to search.
            break
def rm_tags(string_with_tags: str) -> str:
    """Return *string_with_tags* with every opening and closing entity tag removed."""
    return rgx_tag.sub("", string_with_tags)
def get_entities(string_with_tags: str) -> Iterator[Tuple[int, int, str]]:
    """Yield ``(start, end, label)`` for each tagged entity in *string_with_tags*.

    ``start`` and ``end`` are character offsets into the tag-free text, i.e.
    into ``rm_tags(string_with_tags)``. ``label`` is the name of the
    entity's opening tag.

    Offsets are measured on tag-stripped text rather than by subtracting an
    assumed tag length. They therefore stay correct when the closing tag
    differs from the opening one, or when stray tags occur before or inside
    an entity.
    """
    # Number of tag-free characters preceding the part still to be scanned.
    consumed = 0
    for match in multi_re_search(rgx_entity, string_with_tags):
        # multi_re_search reports spans relative to the remainder it searched;
        # that remainder is available as match.string.
        leading_text = rm_tags(match.string[:match.start()])
        entity_text = rm_tags(match.group(2))
        start = consumed + len(leading_text)
        end = start + len(entity_text)
        consumed = end
        yield start, end, match.group(1)
# Project-local module that provides the annotated samples.
import examples as samples
# Samples to convert. Each item is passed to get_entities()/rm_tags() below,
# so items are presumably strings with inline <PERSON>/<LOC> tags -- verify
# against the examples module.
examples = samples.examples
# spaCy pipeline used by create_set() to build the Doc objects.
nlp = spacy.load("fr_core_news_lg")
def create_set(nlp, examples, output):
    """Build a ``DocBin`` from tagged *examples* and write it to *output*.

    Each example is parsed for inline entity tags. Examples without any
    entity are skipped. Otherwise the tags are stripped, the clean text is
    run through *nlp*, and the entities are attached to the resulting doc as
    ``doc.ents``.

    Args:
        nlp: Callable that turns a string into a spaCy ``Doc``.
        examples: Iterable of strings with inline entity tags.
        output: Path passed to ``DocBin.to_disk``.
    """
    db = DocBin()
    for text in examples:
        entities = list(get_entities(text))
        if not entities:
            continue
        doc = nlp(rm_tags(text))
        ents = []
        for start, end, label in entities:
            span = doc.char_span(start, end, label=label)
            if span is None:
                # char_span returns None when the offsets do not fall on
                # token boundaries; a None entry would make the doc.ents
                # assignment fail, so report and drop it instead.
                print(f"Skipping misaligned entity {label!r} [{start}:{end}]: {doc.text[start:end]!r}")
                continue
            ents.append(span)
        doc.ents = ents
        print(ents)
        db.add(doc)
    db.to_disk(output)
# Shuffle in place so the train/dev split is random.
shuffle(examples)
# First 80 % of the shuffled examples train the model; the rest is held out.
ntrain = int(0.8 * len(examples))
ndev = len(examples) - ntrain
print(len(examples), "train", ntrain, "dev", ndev)
create_set(nlp, examples[:ntrain], "train.spacy")
# The dev set starts where the training slice ends. Slicing from `ndev`
# would take the last ~80 % of the data and overlap the training set.
create_set(nlp, examples[ntrain:], "dev.spacy")
# nlp = spacy.load("./output/model-best")
# doc = nlp("fol. 10 v° et 11 r° Transaction par laquelle il fut convenu que (Pierre) Fromond paierait 8 livres 6 sols parisis pour lods et ventes du plâtre qui était tiré dans une vigne située au territoire de Dives, du côté du gibet qui était en la censive du Cens-Commun, dans laquelle ledit Fromond avait permis de faire une carrière de plâtre sans le consentement du chapitre et dudit Bouchard (de Brétigny), lequel plâtre ne pourrait être tiré que pendant quatre ans, passé lesquels ledit Fromond serait tenu de replacer et mettre ladite vigne en bon état, etc.")
# displacy.serve(doc, style="ent")