You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
80 lines
2.6 KiB
Python
80 lines
2.6 KiB
Python
import re
from random import shuffle
from typing import Iterator, Tuple

import spacy
from bs4 import BeautifulSoup
from spacy import displacy
from spacy.tokens import DocBin
from spacy.training import Example
|
|
|
|
# Entity labels recognised as inline <LABEL>...</LABEL> markup in the texts.
ENTITIES = ["PERSON", "LOC"]

# Alternation of all labels, e.g. "PERSON|LOC", spliced into the patterns below.
entities_disjunction = '|'.join(ENTITIES)

# Raw f-strings keep the regex free of invalid string escapes: "/" needs no
# escaping in a Python regex, and "\/" in a normal string literal triggers an
# invalid-escape-sequence warning.
# Groups of rgx_entity: 1 = opening label, 2 = enclosed text, 3 = closing label.
rgx_entity = re.compile(rf"<({entities_disjunction})>(.+?)</({entities_disjunction})>")
# Matches a single opening or closing tag; group 1 is the label.
rgx_tag = re.compile(rf"</?({entities_disjunction})>")
|
|
|
|
|
|
def multi_re_search(pattern, string):
    """Yield successive matches of ``pattern`` in ``string``.

    After each match the searched text is cut at the end of that match and
    the search restarts on the remainder. As a consequence, the span of
    every yielded match is relative to the remainder that was searched,
    not to the original ``string``.

    Args:
        pattern: A regular expression, as a string or a compiled pattern.
        string: The text to search.

    Yields:
        ``re.Match`` objects, in order of appearance.
    """
    while True:
        m = re.search(pattern, string)
        if m is None:
            break
        yield m
        consumed = m.end()
        if consumed == 0:
            # Zero-width match at the very start: slicing by 0 would search
            # the same text forever, so force one character of progress.
            if not string:
                break
            consumed = 1
        string = string[consumed:]
|
|
|
|
|
|
def rm_tags(string_with_tags: str) -> str:
    """Return ``string_with_tags`` with every match of ``rgx_tag`` deleted."""
    cleaned = rgx_tag.sub("", string_with_tags)
    return cleaned
|
|
|
|
def get_entities(string_with_tags: str) -> Iterator[Tuple[int, int, str]]:
    """Yield ``(start, end, label)`` for each tagged entity in the input.

    The offsets are character positions in the text as it looks once every
    match of ``rgx_tag`` has been removed, so they can be applied directly
    to the tag-free version of ``string_with_tags``.

    The number of removed characters is measured with ``rgx_tag`` itself
    rather than derived from the label length. This keeps the offsets
    correct when the closing label differs from the opening one, or when
    stray tags appear between or inside entities.

    Args:
        string_with_tags: Text containing inline ``<LABEL>...</LABEL>`` markup.

    Yields:
        Tuples of start offset, end offset (exclusive) and the label taken
        from the opening tag.
    """
    clean_pos = 0  # length of the tag-free text accounted for so far
    cursor = 0     # position in the tagged input matching ``clean_pos``
    for match in rgx_entity.finditer(string_with_tags):
        # Untagged text between the previous entity and this one.
        gap = string_with_tags[cursor:match.start()]
        start = clean_pos + len(rgx_tag.sub("", gap))
        # Group 2 is the text enclosed by the opening and closing tags.
        end = start + len(rgx_tag.sub("", match.group(2)))

        clean_pos = end
        cursor = match.end()

        yield start, end, match.group(1)
|
|
|
|
# Sample texts come from the project-local ``examples`` module
# (presumably strings carrying inline entity tags; verify against that module).
import examples as samples

examples = samples.examples

# spaCy pipeline used to turn the tag-free texts into Doc objects.
nlp = spacy.load("fr_core_news_lg")
|
|
|
|
def create_set(nlp, examples, output):
    """Build a ``DocBin`` from tagged texts and write it to ``output``.

    Each text is scanned for tagged entities; texts without any are skipped.
    The tags are then stripped, the clean text is run through ``nlp`` and the
    entities are attached to the resulting doc as character spans.

    Args:
        nlp: Callable spaCy pipeline producing a doc from a string.
        examples: Iterable of texts with inline entity tags.
        output: Path of the file the ``DocBin`` is written to.
    """
    db = DocBin()
    for text in examples:
        entities = list(get_entities(text))
        if not entities:
            continue

        doc = nlp(rm_tags(text))
        ents = []
        for start, end, label in entities:
            span = doc.char_span(start, end, label=label)
            if span is None:
                # char_span returns None when the offsets do not line up with
                # token boundaries; putting None into doc.ents would raise.
                print(
                    f"Skipping misaligned entity {label!r} "
                    f"[{start}:{end}] {doc.text[start:end]!r}"
                )
                continue
            ents.append(span)

        if not ents:
            # Same policy as for texts without tagged entities: do not store
            # a doc that carries no entity annotation at all.
            continue

        doc.ents = ents
        print(ents)
        db.add(doc)

    db.to_disk(output)
|
|
|
|
|
|
# Shuffle in place, then split 80% / 20% into training and development sets.
shuffle(examples)
ntrain = int(0.8 * len(examples))
ndev = len(examples) - ntrain
print(len(examples), "train", ntrain, "dev", ndev)

create_set(nlp, examples[:ntrain], "train.spacy")
# The dev set is everything after the first ``ntrain`` items, so the two sets
# are disjoint. Slicing from ``ndev`` would reuse most of the training data.
create_set(nlp, examples[ntrain:], "dev.spacy")
|
|
|
|
|
|
# nlp = spacy.load("./output/model-best")
|
|
# doc = nlp("fol. 10 v° et 11 r° Transaction par laquelle il fut convenu que (Pierre) Fromond paierait 8 livres 6 sols parisis pour lods et ventes du plâtre qui était tiré dans une vigne située au territoire de Dives, du côté du gibet qui était en la censive du Cens-Commun, dans laquelle ledit Fromond avait permis de faire une carrière de plâtre sans le consentement du chapitre et dudit Bouchard (de Brétigny), lequel plâtre ne pourrait être tiré que pendant quatre ans, passé lesquels ledit Fromond serait tenu de replacer et mettre ladite vigne en bon état, etc.")
|
|
# displacy.serve(doc, style="ent")
|