rm corpus-nlp

main
jgenero 3 years ago
parent 4929d64e22
commit 16e5fac7db

@ -1,79 +0,0 @@
from random import shuffle
import spacy
import re
from spacy.training import Example
from spacy.tokens import DocBin
from spacy import displacy
from bs4 import BeautifulSoup
# Entity labels annotated in the corpus with XML-like tags,
# e.g. "<PERSON>Jean</PERSON>" or "<LOC>Paris</LOC>".
ENTITIES = ["PERSON", "LOC"]
entities_disjunction = '|'.join(ENTITIES)
# Full tagged entity: group(1) = opening tag name, group(2) = entity text,
# group(3) = closing tag name.  NOTE: opening and closing tag names are
# matched independently, so "<PERSON>x</LOC>" is also accepted.
rgx_entity = re.compile(rf"<({entities_disjunction})>(.+?)</({entities_disjunction})>")
# A lone opening or closing tag: group(1) = tag name.
rgx_tag = re.compile(rf"</?({entities_disjunction})>")
def multi_re_search(pattern, string):
    """Yield every successive non-overlapping match of *pattern* in *string*.

    NOTE: each search is run against the remainder of the string that
    follows the previous match, so every yielded match object's span()
    is relative to that remainder, NOT to the original string.  Callers
    (see get_entities) compensate with a running offset.
    """
    remainder = string
    match = re.search(pattern, remainder)
    while match:
        yield match
        remainder = remainder[match.end():]
        match = re.search(pattern, remainder)
def rm_tags(string_with_tags: str) -> str:
    """Return the string with every entity tag ("<PERSON>", "</LOC>", ...) stripped."""
    return rgx_tag.sub("", string_with_tags)
def get_entities(string_with_tags: str):
    """Yield (start, end, label) character offsets for each tagged entity.

    The offsets refer to the text AFTER the tags have been removed
    (see rm_tags), which is what spaCy's Doc.char_span expects.

    Fixes vs. original: the return annotation claimed ``str`` although
    this is a generator of tuples; the tag name is taken directly from
    the capture group instead of re-searching the matched text.
    """
    entities = multi_re_search(rgx_entity, string_with_tags)
    offset = 0
    for e in entities:
        # group(1) is the opening tag name captured by rgx_entity.
        tag = e.group(1)
        span = e.span()
        # Spans from multi_re_search are relative to the remainder after
        # the previous match; `offset` maps them back to positions in the
        # tag-less text.
        start = max(0, span[0] + offset)
        # len(tag) * 2 + 5 == len("<TAG>") + len("</TAG>"): the markup
        # removed around this one entity.
        end = max(0, span[1] + offset - len(tag) * 2 - 5)
        offset += e.end() - len(tag) * 2 - 5
        yield start, end, tag
# Training sentences live in a sibling project module `examples`; import it
# under an alias so the module-level name `examples` can hold the list itself.
import examples as samples
examples = samples.examples
# Large French pipeline: provides the tokenizer used to build training docs.
nlp = spacy.load("fr_core_news_lg")
def create_set(nlp, examples, output):
    """Serialize tagged examples into a spaCy DocBin file on disk.

    Args:
        nlp: loaded spaCy pipeline used for tokenization.
        examples: iterable of strings still carrying <PERSON>/<LOC> tags.
        output: path of the .spacy file to write.
    """
    db = DocBin()
    for text in examples:
        entities = list(get_entities(text))
        if not entities:
            # Nothing annotated in this example: skip it entirely.
            continue
        text = rm_tags(text)
        doc = nlp(text)
        ents = []
        for start, end, label in entities:
            span = doc.char_span(start, end, label=label)
            # FIX: char_span returns None when (start, end) does not align
            # with token boundaries; the original appended None and
            # `doc.ents = ents` then raised.  Skip misaligned spans instead.
            if span is None:
                print(f"skipping misaligned span ({start}, {end}, {label})")
                continue
            ents.append(span)
        doc.ents = ents
        db.add(doc)
    db.to_disk(output)
# Shuffle, then split 80% train / 20% dev.
shuffle(examples)
ntrain = int(0.8 * len(examples))
ndev = len(examples) - ntrain
print(len(examples), "train", ntrain, "dev", ndev)
create_set(nlp, examples[:ntrain], "train.spacy")
# BUG FIX: the dev set must be the examples NOT used for training.  The
# original sliced `examples[ndev:]`, which overlaps the training slice
# (ndev < ntrain), leaking training data into evaluation.
create_set(nlp, examples[ntrain:], "dev.spacy")
# nlp = spacy.load("./output/model-best")
# doc = nlp("fol. 10 v° et 11 r° Transaction par laquelle il fut convenu que (Pierre) Fromond paierait 8 livres 6 sols parisis pour lods et ventes du plâtre qui était tiré dans une vigne située au territoire de Dives, du côté du gibet qui était en la censive du Cens-Commun, dans laquelle ledit Fromond avait PERSONmis de faire une carrière de plâtre sans le consentement du chapitre et dudit Bouchard (de Brétigny), lequel plâtre ne pourrait être tiré que pendant quatre ans, passé lesquels ledit Fromond serait tenu de replacer et mettre ladite vigne en bon état, etc.")
# displacy.serve(doc, style="ent")

File diff suppressed because one or more lines are too long

@ -1,143 +0,0 @@
[paths]
train = null
dev = null
vectors = "fr_core_news_lg"
init_tok2vec = null
[system]
gpu_allocator = null
seed = 0
[nlp]
lang = "fr"
pipeline = ["tok2vec","ner"]
batch_size = 1000
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
[components]
[components.ner]
factory = "ner"
incorrect_spans_key = null
moves = null
scorer = {"@scorers":"spacy.ner_scorer.v1"}
update_with_oracle_cut_size = 100
[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = true
nO = null
[components.ner.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
upstream = "*"
[components.tok2vec]
factory = "tok2vec"
[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v2"
[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = ${components.tok2vec.model.encode.width}
attrs = ["ORTH","SHAPE"]
rows = [5000,2500]
include_static_vectors = true
[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 256
depth = 8
window_size = 1
maxout_pieces = 3
[corpora]
[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null
[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null
[training]
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
accumulate_gradient = 1
patience = 1600
max_epochs = 0
max_steps = 20000
eval_frequency = 200
frozen_components = []
annotating_components = []
before_to_disk = null
[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2
get_length = null
[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
t = 0.0
[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
progress_bar = false
[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 0.00000001
learn_rate = 0.001
[training.score_weights]
ents_f = 1.0
ents_p = 0.0
ents_r = 0.0
ents_per_type = null
[pretraining]
[initialize]
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
lookups = null
before_init = null
after_init = null
[initialize.components]
[initialize.tokenizer]

Binary file not shown.

File diff suppressed because one or more lines are too long

@ -1,98 +0,0 @@
[paths]
train = "./train.spacy"
dev = "./dev.spacy"
vectors = "fr_core_news_sm"
init_tok2vec = null
[system]
seed = 0
gpu_allocator = null
[nlp]
lang = "fr"
pipeline = []
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
batch_size = 1000
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
[components]
[corpora]
[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
gold_preproc = false
max_length = 0
limit = 0
augmenter = null
[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
gold_preproc = false
max_length = 0
limit = 0
augmenter = null
[training]
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
accumulate_gradient = 1
patience = 1600
max_epochs = 0
max_steps = 20000
eval_frequency = 200
frozen_components = []
annotating_components = []
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
before_to_disk = null
[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2
get_length = null
[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
t = 0.0
[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
progress_bar = false
[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 0.00000001
learn_rate = 0.001
[training.score_weights]
[pretraining]
[initialize]
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
lookups = null
before_init = null
after_init = null
[initialize.components]
[initialize.tokenizer]

@ -1,34 +0,0 @@
{
"lang":"fr",
"name":"pipeline",
"version":"0.0.0",
"spacy_version":">=3.4.1,<3.5.0",
"description":"",
"author":"",
"email":"",
"url":"",
"license":"",
"spacy_git_version":"Unknown",
"vectors":{
"width":0,
"vectors":0,
"keys":0,
"name":null,
"mode":"default"
},
"labels":{
},
"pipeline":[
],
"components":[
],
"disabled":[
],
"performance":{
}
}

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

@ -1,98 +0,0 @@
[paths]
train = "./train.spacy"
dev = "./dev.spacy"
vectors = "fr_core_news_sm"
init_tok2vec = null
[system]
seed = 0
gpu_allocator = null
[nlp]
lang = "fr"
pipeline = []
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
batch_size = 1000
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
[components]
[corpora]
[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
gold_preproc = false
max_length = 0
limit = 0
augmenter = null
[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
gold_preproc = false
max_length = 0
limit = 0
augmenter = null
[training]
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
accumulate_gradient = 1
patience = 1600
max_epochs = 0
max_steps = 20000
eval_frequency = 200
frozen_components = []
annotating_components = []
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
before_to_disk = null
[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2
get_length = null
[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
t = 0.0
[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
progress_bar = false
[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 0.00000001
learn_rate = 0.001
[training.score_weights]
[pretraining]
[initialize]
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
lookups = null
before_init = null
after_init = null
[initialize.components]
[initialize.tokenizer]

@ -1,34 +0,0 @@
{
"lang":"fr",
"name":"pipeline",
"version":"0.0.0",
"spacy_version":">=3.4.1,<3.5.0",
"description":"",
"author":"",
"email":"",
"url":"",
"license":"",
"spacy_git_version":"Unknown",
"vectors":{
"width":0,
"vectors":0,
"keys":0,
"name":null,
"mode":"default"
},
"labels":{
},
"pipeline":[
],
"components":[
],
"disabled":[
],
"performance":{
}
}

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

@ -1,65 +0,0 @@
#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
Author : Jean-Damien Généro
Affiliation : French National Center for Scientific Research (CNRS)
Assigned at the Centre de recherches historiques (CRH, UMR 8558)
Date : 2022-09-29
Update :
"""
from bs4 import BeautifulSoup
import spacy
def spacy_bb(xml):
    """Print every PER entity found in the <div type="acte"> sections.

    Parses the given XML corpus file, drops all <note> elements, then runs
    the small French spaCy pipeline over each <p> of each acte and prints
    the person entities it recognizes.
    """
    with open(xml, 'r', encoding='utf-8') as corpus:
        soup = BeautifulSoup(corpus, 'xml')
    # Editorial notes are noise for NER; remove them from the tree.
    for note in soup.find_all('note'):
        note.decompose()
    nlp = spacy.load('fr_core_news_sm')
    for acte in soup.find_all('div', {'type': 'acte'}):
        print("\n\n\n ###############")
        for paragraph in acte.find_all('p'):
            for ent in nlp(paragraph.text).ents:
                if ent.label_ == 'PER':
                    print(ent)
def clean_training_data(xml):
    """Extract <div type="acte"> paragraphs as tagged training strings.

    Reads the XML corpus, strips @ref attributes from entity tags, drops
    notes and page breaks, unwraps <hi> highlights, then returns a list of
    paragraph strings in which persName/placeName tag names have been
    rewritten to PERSON/LOC.
    """
    with open(xml, 'r', encoding='utf-8') as opening:
        soup = BeautifulSoup(opening, 'xml')
    # Keep only the bare entity tags: the @ref attributes are not needed.
    for tag_name in ('placeName', 'persName'):
        for occurrence in soup.find_all(tag_name, {'ref': True}):
            del occurrence['ref']
    # Notes and page breaks are noise for NER training data.
    for tag_name in ('note', 'pb'):
        for occurrence in soup.find_all(tag_name):
            occurrence.decompose()
    # Unwrap typographic highlights, keeping their text content.
    for hi in soup.findAll('hi'):
        hi.replaceWithChildren()
    training_strings = []
    for div in soup.find_all('div', {'type': 'acte'}):
        txt = str(div.p)
        txt = txt.replace('persName', 'PERSON').replace('placeName', 'LOC')
        training_strings.append(txt)
    return training_strings
# spacy_bb("../bourbon-latex/charles-actes-latex.xml")
# nlp = spacy.load('fr_core_news_lg')
# ner = nlp.get_pipe("ner")

Binary file not shown.
Loading…
Cancel
Save