initial commit
parent
0c637a5907
commit
6b2103d618
@ -0,0 +1,79 @@
|
|||||||
|
from random import shuffle
|
||||||
|
import spacy
|
||||||
|
import re
|
||||||
|
from spacy.training import Example
|
||||||
|
from spacy.tokens import DocBin
|
||||||
|
from spacy import displacy
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
# Entity labels extracted from the pseudo-XML training annotations.
ENTITIES = ["PERSON", "LOC"]

# Alternation of all entity labels ("PERSON|LOC"), reused by both regexes.
entities_disjunction = '|'.join(ENTITIES)

# Matches one whole tagged entity, e.g. "<PERSON>Jean</PERSON>".
# Group 1: opening tag name, group 2: inner text, group 3: closing tag name.
# NOTE(review): the pattern does not force the opening and closing names to
# match ("<PERSON>x</LOC>" would also match) — confirm the corpus never mixes.
# Raw strings fix the invalid "\/" escape sequence, which is a
# DeprecationWarning/SyntaxWarning on modern Python; "/" needs no escaping
# in a regex, so the matched language is unchanged.
rgx_entity = re.compile(rf"<({entities_disjunction})>(.+?)</({entities_disjunction})>")

# Matches any single opening or closing entity tag, e.g. "<LOC>" or "</LOC>".
rgx_tag = re.compile(rf"</?({entities_disjunction})>")
|
||||||
|
|
||||||
|
|
||||||
|
def multi_re_search(pattern, string):
    """Yield successive matches of *pattern* over *string*.

    After each match, the text still to be searched is cut down to the tail
    that follows the match.  Consequently every yielded match object's span
    is relative to that remaining substring, NOT to the original *string*;
    callers (e.g. get_entities) must track the truncation offset themselves.
    """
    remainder = string
    match = re.search(pattern, remainder)
    while match is not None:
        yield match
        remainder = remainder[match.end():]
        match = re.search(pattern, remainder)
|
||||||
|
|
||||||
|
|
||||||
|
def rm_tags(string_with_tags: str) -> str:
    """Return *string_with_tags* with every entity tag removed.

    Strips both opening and closing tags matched by the module-level
    ``rgx_tag`` pattern; the text between the tags is kept.
    """
    return rgx_tag.sub("", string_with_tags)
|
||||||
|
|
||||||
|
def get_entities(string_with_tags: str) -> "Iterator[tuple[int, int, str]]":
    """Yield (start, end, label) character spans for each tagged entity.

    The offsets are expressed in the coordinates of the *tag-stripped* text
    (i.e. what rm_tags() returns), so they can be passed straight to spaCy's
    Doc.char_span.  The label is taken from the entity's opening tag.
    """
    entities = multi_re_search(rgx_entity, string_with_tags)

    # multi_re_search yields spans relative to a progressively truncated
    # string; `offset` tracks the tag-stripped absolute position of the
    # current truncation point so spans can be converted back.
    offset = 0
    for e in entities:
        group = e.group(0)  # the full "<TAG>text</TAG>" match
        tag = re.search(rgx_tag, group).group(1)  # opening tag name = label
        span = e.span()

        # The enclosing "<TAG>" + "</TAG>" occupy len(tag) * 2 + 5 chars;
        # subtract them so `end` points just past the inner text.
        start = max(0, span[0] + offset)
        end = max(0, span[1] + offset - len(tag) * 2 - 5)

        offset += e.end() - len(tag) * 2 - 5  # last entity len - length of the enclosing tags

        yield start, end, tag
|
||||||
|
|
||||||
|
# Training examples live in a sibling module; aliased so the local name
# `examples` can then hold the list itself.
import examples as samples

# List of tagged example strings (pseudo-XML with <PERSON>/<LOC> markers).
examples = samples.examples

# French pipeline used by create_set() to tokenize the example texts.
nlp = spacy.load("fr_core_news_lg")
|
||||||
|
|
||||||
|
def create_set(nlp, examples, output):
    """Serialize tagged example strings into a spaCy DocBin on disk.

    Parameters
    ----------
    nlp : spaCy Language pipeline, used to tokenize the raw (tag-stripped) text.
    examples : iterable of strings carrying <PERSON>/<LOC> pseudo-XML tags.
    output : path of the .spacy file written to disk.

    Examples without any tagged entity are skipped entirely.
    """
    db = DocBin()
    for text in examples:
        entities = list(get_entities(text))

        # Nothing to learn from an untagged example.
        if not entities:
            continue

        text = rm_tags(text)
        doc = nlp(text)
        ents = []
        for start, end, label in entities:
            span = doc.char_span(start, end, label=label)
            # char_span returns None when (start, end) does not align with
            # token boundaries; appending None would make `doc.ents = ents`
            # raise later, so report and skip misaligned annotations instead.
            if span is None:
                print(f"skipping misaligned span {start}:{end} ({label})")
                continue
            ents.append(span)
        doc.ents = ents
        print(ents)
        db.add(doc)
    db.to_disk(output)
|
||||||
|
|
||||||
|
|
||||||
|
# Shuffle once, then split 80/20 into train and dev sets.
shuffle(examples)
ntrain = int(0.8 * len(examples))
ndev = len(examples) - ntrain
print(len(examples), "train", ntrain, "dev", ndev)
create_set(nlp, examples[:ntrain], "train.spacy")
# BUG FIX: the dev set must be the examples *after* the training slice.
# The previous `examples[ndev:]` started at index ndev (~20% in), so the
# dev set overlapped most of the training data and inflated evaluation.
create_set(nlp, examples[ntrain:], "dev.spacy")
|
||||||
|
|
||||||
|
|
||||||
|
# nlp = spacy.load("./output/model-best")
|
||||||
|
# doc = nlp("fol. 10 v° et 11 r° Transaction par laquelle il fut convenu que (Pierre) Fromond paierait 8 livres 6 sols parisis pour lods et ventes du plâtre qui était tiré dans une vigne située au territoire de Dives, du côté du gibet qui était en la censive du Cens-Commun, dans laquelle ledit Fromond avait PERSONmis de faire une carrière de plâtre sans le consentement du chapitre et dudit Bouchard (de Brétigny), lequel plâtre ne pourrait être tiré que pendant quatre ans, passé lesquels ledit Fromond serait tenu de replacer et mettre ladite vigne en bon état, etc.")
|
||||||
|
# displacy.serve(doc, style="ent")
|
||||||
File diff suppressed because one or more lines are too long
@ -0,0 +1,143 @@
|
|||||||
|
[paths]
|
||||||
|
train = null
|
||||||
|
dev = null
|
||||||
|
vectors = "fr_core_news_lg"
|
||||||
|
init_tok2vec = null
|
||||||
|
|
||||||
|
[system]
|
||||||
|
gpu_allocator = null
|
||||||
|
seed = 0
|
||||||
|
|
||||||
|
[nlp]
|
||||||
|
lang = "fr"
|
||||||
|
pipeline = ["tok2vec","ner"]
|
||||||
|
batch_size = 1000
|
||||||
|
disabled = []
|
||||||
|
before_creation = null
|
||||||
|
after_creation = null
|
||||||
|
after_pipeline_creation = null
|
||||||
|
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
|
||||||
|
|
||||||
|
[components]
|
||||||
|
|
||||||
|
[components.ner]
|
||||||
|
factory = "ner"
|
||||||
|
incorrect_spans_key = null
|
||||||
|
moves = null
|
||||||
|
scorer = {"@scorers":"spacy.ner_scorer.v1"}
|
||||||
|
update_with_oracle_cut_size = 100
|
||||||
|
|
||||||
|
[components.ner.model]
|
||||||
|
@architectures = "spacy.TransitionBasedParser.v2"
|
||||||
|
state_type = "ner"
|
||||||
|
extra_state_tokens = false
|
||||||
|
hidden_width = 64
|
||||||
|
maxout_pieces = 2
|
||||||
|
use_upper = true
|
||||||
|
nO = null
|
||||||
|
|
||||||
|
[components.ner.model.tok2vec]
|
||||||
|
@architectures = "spacy.Tok2VecListener.v1"
|
||||||
|
width = ${components.tok2vec.model.encode.width}
|
||||||
|
upstream = "*"
|
||||||
|
|
||||||
|
[components.tok2vec]
|
||||||
|
factory = "tok2vec"
|
||||||
|
|
||||||
|
[components.tok2vec.model]
|
||||||
|
@architectures = "spacy.Tok2Vec.v2"
|
||||||
|
|
||||||
|
[components.tok2vec.model.embed]
|
||||||
|
@architectures = "spacy.MultiHashEmbed.v2"
|
||||||
|
width = ${components.tok2vec.model.encode.width}
|
||||||
|
attrs = ["ORTH","SHAPE"]
|
||||||
|
rows = [5000,2500]
|
||||||
|
include_static_vectors = true
|
||||||
|
|
||||||
|
[components.tok2vec.model.encode]
|
||||||
|
@architectures = "spacy.MaxoutWindowEncoder.v2"
|
||||||
|
width = 256
|
||||||
|
depth = 8
|
||||||
|
window_size = 1
|
||||||
|
maxout_pieces = 3
|
||||||
|
|
||||||
|
[corpora]
|
||||||
|
|
||||||
|
[corpora.dev]
|
||||||
|
@readers = "spacy.Corpus.v1"
|
||||||
|
path = ${paths.dev}
|
||||||
|
max_length = 0
|
||||||
|
gold_preproc = false
|
||||||
|
limit = 0
|
||||||
|
augmenter = null
|
||||||
|
|
||||||
|
[corpora.train]
|
||||||
|
@readers = "spacy.Corpus.v1"
|
||||||
|
path = ${paths.train}
|
||||||
|
max_length = 0
|
||||||
|
gold_preproc = false
|
||||||
|
limit = 0
|
||||||
|
augmenter = null
|
||||||
|
|
||||||
|
[training]
|
||||||
|
dev_corpus = "corpora.dev"
|
||||||
|
train_corpus = "corpora.train"
|
||||||
|
seed = ${system.seed}
|
||||||
|
gpu_allocator = ${system.gpu_allocator}
|
||||||
|
dropout = 0.1
|
||||||
|
accumulate_gradient = 1
|
||||||
|
patience = 1600
|
||||||
|
max_epochs = 0
|
||||||
|
max_steps = 20000
|
||||||
|
eval_frequency = 200
|
||||||
|
frozen_components = []
|
||||||
|
annotating_components = []
|
||||||
|
before_to_disk = null
|
||||||
|
|
||||||
|
[training.batcher]
|
||||||
|
@batchers = "spacy.batch_by_words.v1"
|
||||||
|
discard_oversize = false
|
||||||
|
tolerance = 0.2
|
||||||
|
get_length = null
|
||||||
|
|
||||||
|
[training.batcher.size]
|
||||||
|
@schedules = "compounding.v1"
|
||||||
|
start = 100
|
||||||
|
stop = 1000
|
||||||
|
compound = 1.001
|
||||||
|
t = 0.0
|
||||||
|
|
||||||
|
[training.logger]
|
||||||
|
@loggers = "spacy.ConsoleLogger.v1"
|
||||||
|
progress_bar = false
|
||||||
|
|
||||||
|
[training.optimizer]
|
||||||
|
@optimizers = "Adam.v1"
|
||||||
|
beta1 = 0.9
|
||||||
|
beta2 = 0.999
|
||||||
|
L2_is_weight_decay = true
|
||||||
|
L2 = 0.01
|
||||||
|
grad_clip = 1.0
|
||||||
|
use_averages = false
|
||||||
|
eps = 0.00000001
|
||||||
|
learn_rate = 0.001
|
||||||
|
|
||||||
|
[training.score_weights]
|
||||||
|
ents_f = 1.0
|
||||||
|
ents_p = 0.0
|
||||||
|
ents_r = 0.0
|
||||||
|
ents_per_type = null
|
||||||
|
|
||||||
|
[pretraining]
|
||||||
|
|
||||||
|
[initialize]
|
||||||
|
vectors = ${paths.vectors}
|
||||||
|
init_tok2vec = ${paths.init_tok2vec}
|
||||||
|
vocab_data = null
|
||||||
|
lookups = null
|
||||||
|
before_init = null
|
||||||
|
after_init = null
|
||||||
|
|
||||||
|
[initialize.components]
|
||||||
|
|
||||||
|
[initialize.tokenizer]
|
||||||
Binary file not shown.
File diff suppressed because one or more lines are too long
@ -0,0 +1,98 @@
|
|||||||
|
[paths]
|
||||||
|
train = "./train.spacy"
|
||||||
|
dev = "./dev.spacy"
|
||||||
|
vectors = "fr_core_news_sm"
|
||||||
|
init_tok2vec = null
|
||||||
|
|
||||||
|
[system]
|
||||||
|
seed = 0
|
||||||
|
gpu_allocator = null
|
||||||
|
|
||||||
|
[nlp]
|
||||||
|
lang = "fr"
|
||||||
|
pipeline = []
|
||||||
|
disabled = []
|
||||||
|
before_creation = null
|
||||||
|
after_creation = null
|
||||||
|
after_pipeline_creation = null
|
||||||
|
batch_size = 1000
|
||||||
|
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
|
||||||
|
|
||||||
|
[components]
|
||||||
|
|
||||||
|
[corpora]
|
||||||
|
|
||||||
|
[corpora.dev]
|
||||||
|
@readers = "spacy.Corpus.v1"
|
||||||
|
path = ${paths.dev}
|
||||||
|
gold_preproc = false
|
||||||
|
max_length = 0
|
||||||
|
limit = 0
|
||||||
|
augmenter = null
|
||||||
|
|
||||||
|
[corpora.train]
|
||||||
|
@readers = "spacy.Corpus.v1"
|
||||||
|
path = ${paths.train}
|
||||||
|
gold_preproc = false
|
||||||
|
max_length = 0
|
||||||
|
limit = 0
|
||||||
|
augmenter = null
|
||||||
|
|
||||||
|
[training]
|
||||||
|
seed = ${system.seed}
|
||||||
|
gpu_allocator = ${system.gpu_allocator}
|
||||||
|
dropout = 0.1
|
||||||
|
accumulate_gradient = 1
|
||||||
|
patience = 1600
|
||||||
|
max_epochs = 0
|
||||||
|
max_steps = 20000
|
||||||
|
eval_frequency = 200
|
||||||
|
frozen_components = []
|
||||||
|
annotating_components = []
|
||||||
|
dev_corpus = "corpora.dev"
|
||||||
|
train_corpus = "corpora.train"
|
||||||
|
before_to_disk = null
|
||||||
|
|
||||||
|
[training.batcher]
|
||||||
|
@batchers = "spacy.batch_by_words.v1"
|
||||||
|
discard_oversize = false
|
||||||
|
tolerance = 0.2
|
||||||
|
get_length = null
|
||||||
|
|
||||||
|
[training.batcher.size]
|
||||||
|
@schedules = "compounding.v1"
|
||||||
|
start = 100
|
||||||
|
stop = 1000
|
||||||
|
compound = 1.001
|
||||||
|
t = 0.0
|
||||||
|
|
||||||
|
[training.logger]
|
||||||
|
@loggers = "spacy.ConsoleLogger.v1"
|
||||||
|
progress_bar = false
|
||||||
|
|
||||||
|
[training.optimizer]
|
||||||
|
@optimizers = "Adam.v1"
|
||||||
|
beta1 = 0.9
|
||||||
|
beta2 = 0.999
|
||||||
|
L2_is_weight_decay = true
|
||||||
|
L2 = 0.01
|
||||||
|
grad_clip = 1.0
|
||||||
|
use_averages = false
|
||||||
|
eps = 0.00000001
|
||||||
|
learn_rate = 0.001
|
||||||
|
|
||||||
|
[training.score_weights]
|
||||||
|
|
||||||
|
[pretraining]
|
||||||
|
|
||||||
|
[initialize]
|
||||||
|
vectors = ${paths.vectors}
|
||||||
|
init_tok2vec = ${paths.init_tok2vec}
|
||||||
|
vocab_data = null
|
||||||
|
lookups = null
|
||||||
|
before_init = null
|
||||||
|
after_init = null
|
||||||
|
|
||||||
|
[initialize.components]
|
||||||
|
|
||||||
|
[initialize.tokenizer]
|
||||||
@ -0,0 +1,34 @@
|
|||||||
|
{
|
||||||
|
"lang":"fr",
|
||||||
|
"name":"pipeline",
|
||||||
|
"version":"0.0.0",
|
||||||
|
"spacy_version":">=3.4.1,<3.5.0",
|
||||||
|
"description":"",
|
||||||
|
"author":"",
|
||||||
|
"email":"",
|
||||||
|
"url":"",
|
||||||
|
"license":"",
|
||||||
|
"spacy_git_version":"Unknown",
|
||||||
|
"vectors":{
|
||||||
|
"width":0,
|
||||||
|
"vectors":0,
|
||||||
|
"keys":0,
|
||||||
|
"name":null,
|
||||||
|
"mode":"default"
|
||||||
|
},
|
||||||
|
"labels":{
|
||||||
|
|
||||||
|
},
|
||||||
|
"pipeline":[
|
||||||
|
|
||||||
|
],
|
||||||
|
"components":[
|
||||||
|
|
||||||
|
],
|
||||||
|
"disabled":[
|
||||||
|
|
||||||
|
],
|
||||||
|
"performance":{
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
Binary file not shown.
@ -0,0 +1,3 @@
|
|||||||
|
{
|
||||||
|
"mode":"default"
|
||||||
|
}
|
||||||
@ -0,0 +1,98 @@
|
|||||||
|
[paths]
|
||||||
|
train = "./train.spacy"
|
||||||
|
dev = "./dev.spacy"
|
||||||
|
vectors = "fr_core_news_sm"
|
||||||
|
init_tok2vec = null
|
||||||
|
|
||||||
|
[system]
|
||||||
|
seed = 0
|
||||||
|
gpu_allocator = null
|
||||||
|
|
||||||
|
[nlp]
|
||||||
|
lang = "fr"
|
||||||
|
pipeline = []
|
||||||
|
disabled = []
|
||||||
|
before_creation = null
|
||||||
|
after_creation = null
|
||||||
|
after_pipeline_creation = null
|
||||||
|
batch_size = 1000
|
||||||
|
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
|
||||||
|
|
||||||
|
[components]
|
||||||
|
|
||||||
|
[corpora]
|
||||||
|
|
||||||
|
[corpora.dev]
|
||||||
|
@readers = "spacy.Corpus.v1"
|
||||||
|
path = ${paths.dev}
|
||||||
|
gold_preproc = false
|
||||||
|
max_length = 0
|
||||||
|
limit = 0
|
||||||
|
augmenter = null
|
||||||
|
|
||||||
|
[corpora.train]
|
||||||
|
@readers = "spacy.Corpus.v1"
|
||||||
|
path = ${paths.train}
|
||||||
|
gold_preproc = false
|
||||||
|
max_length = 0
|
||||||
|
limit = 0
|
||||||
|
augmenter = null
|
||||||
|
|
||||||
|
[training]
|
||||||
|
seed = ${system.seed}
|
||||||
|
gpu_allocator = ${system.gpu_allocator}
|
||||||
|
dropout = 0.1
|
||||||
|
accumulate_gradient = 1
|
||||||
|
patience = 1600
|
||||||
|
max_epochs = 0
|
||||||
|
max_steps = 20000
|
||||||
|
eval_frequency = 200
|
||||||
|
frozen_components = []
|
||||||
|
annotating_components = []
|
||||||
|
dev_corpus = "corpora.dev"
|
||||||
|
train_corpus = "corpora.train"
|
||||||
|
before_to_disk = null
|
||||||
|
|
||||||
|
[training.batcher]
|
||||||
|
@batchers = "spacy.batch_by_words.v1"
|
||||||
|
discard_oversize = false
|
||||||
|
tolerance = 0.2
|
||||||
|
get_length = null
|
||||||
|
|
||||||
|
[training.batcher.size]
|
||||||
|
@schedules = "compounding.v1"
|
||||||
|
start = 100
|
||||||
|
stop = 1000
|
||||||
|
compound = 1.001
|
||||||
|
t = 0.0
|
||||||
|
|
||||||
|
[training.logger]
|
||||||
|
@loggers = "spacy.ConsoleLogger.v1"
|
||||||
|
progress_bar = false
|
||||||
|
|
||||||
|
[training.optimizer]
|
||||||
|
@optimizers = "Adam.v1"
|
||||||
|
beta1 = 0.9
|
||||||
|
beta2 = 0.999
|
||||||
|
L2_is_weight_decay = true
|
||||||
|
L2 = 0.01
|
||||||
|
grad_clip = 1.0
|
||||||
|
use_averages = false
|
||||||
|
eps = 0.00000001
|
||||||
|
learn_rate = 0.001
|
||||||
|
|
||||||
|
[training.score_weights]
|
||||||
|
|
||||||
|
[pretraining]
|
||||||
|
|
||||||
|
[initialize]
|
||||||
|
vectors = ${paths.vectors}
|
||||||
|
init_tok2vec = ${paths.init_tok2vec}
|
||||||
|
vocab_data = null
|
||||||
|
lookups = null
|
||||||
|
before_init = null
|
||||||
|
after_init = null
|
||||||
|
|
||||||
|
[initialize.components]
|
||||||
|
|
||||||
|
[initialize.tokenizer]
|
||||||
@ -0,0 +1,34 @@
|
|||||||
|
{
|
||||||
|
"lang":"fr",
|
||||||
|
"name":"pipeline",
|
||||||
|
"version":"0.0.0",
|
||||||
|
"spacy_version":">=3.4.1,<3.5.0",
|
||||||
|
"description":"",
|
||||||
|
"author":"",
|
||||||
|
"email":"",
|
||||||
|
"url":"",
|
||||||
|
"license":"",
|
||||||
|
"spacy_git_version":"Unknown",
|
||||||
|
"vectors":{
|
||||||
|
"width":0,
|
||||||
|
"vectors":0,
|
||||||
|
"keys":0,
|
||||||
|
"name":null,
|
||||||
|
"mode":"default"
|
||||||
|
},
|
||||||
|
"labels":{
|
||||||
|
|
||||||
|
},
|
||||||
|
"pipeline":[
|
||||||
|
|
||||||
|
],
|
||||||
|
"components":[
|
||||||
|
|
||||||
|
],
|
||||||
|
"disabled":[
|
||||||
|
|
||||||
|
],
|
||||||
|
"performance":{
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
Binary file not shown.
@ -0,0 +1,3 @@
|
|||||||
|
{
|
||||||
|
"mode":"default"
|
||||||
|
}
|
||||||
@ -0,0 +1,65 @@
|
|||||||
|
#!/usr/bin/python
|
||||||
|
# -*- coding: UTF-8 -*-
|
||||||
|
|
||||||
|
"""
|
||||||
|
Author : Jean-Damien Généro
|
||||||
|
Affiliation : French National Center for Scientific Research (CNRS)
|
||||||
|
Assigned at the Centre de recherches historiques (CRH, UMR 8558)
|
||||||
|
Date : 2022-09-29
|
||||||
|
Update :
|
||||||
|
"""
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import spacy
|
||||||
|
|
||||||
|
|
||||||
|
def spacy_bb(xml):
    """Print the PER (person) entities spaCy finds in each <div type="acte">.

    Parameters
    ----------
    xml : path to a TEI-like XML corpus file (read as UTF-8).

    <note> elements are removed before tagging so editorial apparatus does
    not pollute the extraction.  Output goes to stdout; returns None.
    """
    with open(xml, 'r', encoding='utf-8') as corpus:
        soup = BeautifulSoup(corpus, 'xml')
    # Drop editorial notes entirely before running NER.
    for note in soup.find_all('note'):
        note.decompose()
    actes = soup.find_all('div', {'type': 'acte'})
    # Load the model once, outside the document loop.
    nlp = spacy.load('fr_core_news_sm')
    for div in actes:
        print("\n\n\n ###############")
        for p in div.find_all('p'):
            doc = nlp(p.text)
            for ent in doc.ents:
                # Fix: removed the dead `result` accumulator the original
                # initialized but never used; membership test on a one-item
                # list replaced by a plain equality check.
                if ent.label_ == 'PER':
                    print(ent)
|
||||||
|
|
||||||
|
|
||||||
|
def clean_training_data(xml):
    """Extract <div type="acte"> paragraphs as pseudo-XML training strings.

    Parameters
    ----------
    xml : path to a TEI-like XML corpus file (read as UTF-8).

    Returns
    -------
    list[str] : one string per acte's first <p>, with persName/placeName
    tags renamed to PERSON/LOC and editorial markup (notes, page breaks,
    <hi> wrappers, @ref attributes) removed.
    """
    examples = []
    with open(xml, 'r', encoding='utf-8') as opening:
        soup = BeautifulSoup(opening, 'xml')
    # Strip @ref attributes so only the bare entity tags remain.
    # (Fix: the original reused `entity` as both the outer and inner loop
    # variable — same for `tag` below — which works but is error-prone.)
    entity_tags = ['placeName', 'persName']
    for tag_name in entity_tags:
        for entity in soup.find_all(tag_name, {'ref': True}):
            del entity['ref']
    # Remove editorial apparatus entirely.
    removed = ['note', 'pb']
    for tag_name in removed:
        for node in soup.find_all(tag_name):
            node.decompose()
    # <hi> is presentational: keep its children, drop the tag.
    # (unwrap() is the modern name of the deprecated replaceWithChildren().)
    for hi in soup.find_all('hi'):
        hi.unwrap()
    for div in soup.find_all('div', {'type': 'acte'}):
        txt = str(div.p)
        txt = txt.replace('persName', 'PERSON')
        txt = txt.replace('placeName', 'LOC')
        examples.append(txt)
    return examples
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# spacy_bb("../bourbon-latex/charles-actes-latex.xml")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# nlp = spacy.load('fr_core_news_lg')
|
||||||
|
|
||||||
|
# ner = nlp.get_pipe("ner")
|
||||||
Binary file not shown.
Loading…
Reference in New Issue