initial commit
parent
0c637a5907
commit
6b2103d618
@ -0,0 +1,79 @@
|
|||||||
|
from random import shuffle
|
||||||
|
import spacy
|
||||||
|
import re
|
||||||
|
from spacy.training import Example
|
||||||
|
from spacy.tokens import DocBin
|
||||||
|
from spacy import displacy
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
# Entity labels extracted from the pseudo-XML training annotations.
ENTITIES = ["PERSON", "LOC"]

# Alternation of all entity labels ("PERSON|LOC"), reused by both regexes.
entities_disjunction = '|'.join(ENTITIES)

# Matches one whole tagged entity, e.g. "<PERSON>Jean</PERSON>".
# Group 1: opening tag name, group 2: inner text, group 3: closing tag name.
# NOTE(review): the pattern does not force the opening and closing names to
# match ("<PERSON>x</LOC>" would also match) — confirm the corpus never mixes.
# Raw strings fix the invalid "\/" escape sequence, which is a
# DeprecationWarning/SyntaxWarning on modern Python; "/" needs no escaping
# in a regex, so the matched language is unchanged.
rgx_entity = re.compile(rf"<({entities_disjunction})>(.+?)</({entities_disjunction})>")

# Matches any single opening or closing entity tag, e.g. "<LOC>" or "</LOC>".
rgx_tag = re.compile(rf"</?({entities_disjunction})>")
|
||||||
|
|
||||||
|
|
||||||
|
def multi_re_search(pattern, string):
    """Yield successive matches of *pattern* over *string*.

    After each match, the text still to be searched is cut down to the tail
    that follows the match.  Consequently every yielded match object's span
    is relative to that remaining substring, NOT to the original *string*;
    callers (e.g. get_entities) must track the truncation offset themselves.
    """
    remainder = string
    match = re.search(pattern, remainder)
    while match is not None:
        yield match
        remainder = remainder[match.end():]
        match = re.search(pattern, remainder)
|
||||||
|
|
||||||
|
|
||||||
|
def rm_tags(string_with_tags: str) -> str:
    """Return *string_with_tags* with every entity tag removed.

    Strips both opening and closing tags matched by the module-level
    ``rgx_tag`` pattern; the text between the tags is kept.
    """
    return rgx_tag.sub("", string_with_tags)
|
||||||
|
|
||||||
|
def get_entities(string_with_tags: str) -> "Iterator[tuple[int, int, str]]":
    """Yield (start, end, label) character spans for each tagged entity.

    The offsets are expressed in the coordinates of the *tag-stripped* text
    (i.e. what rm_tags() returns), so they can be passed straight to spaCy's
    Doc.char_span.  The label is taken from the entity's opening tag.
    """
    entities = multi_re_search(rgx_entity, string_with_tags)

    # multi_re_search yields spans relative to a progressively truncated
    # string; `offset` tracks the tag-stripped absolute position of the
    # current truncation point so spans can be converted back.
    offset = 0
    for e in entities:
        group = e.group(0)  # the full "<TAG>text</TAG>" match
        tag = re.search(rgx_tag, group).group(1)  # opening tag name = label
        span = e.span()

        # The enclosing "<TAG>" + "</TAG>" occupy len(tag) * 2 + 5 chars;
        # subtract them so `end` points just past the inner text.
        start = max(0, span[0] + offset)
        end = max(0, span[1] + offset - len(tag) * 2 - 5)

        offset += e.end() - len(tag) * 2 - 5  # last entity len - length of the enclosing tags

        yield start, end, tag
|
||||||
|
|
||||||
|
# Training examples live in a sibling module; aliased so the local name
# `examples` can then hold the list itself.
import examples as samples

# List of tagged example strings (pseudo-XML with <PERSON>/<LOC> markers).
examples = samples.examples

# French pipeline used by create_set() to tokenize the example texts.
nlp = spacy.load("fr_core_news_lg")
|
||||||
|
|
||||||
|
def create_set(nlp, examples, output):
    """Serialize tagged example strings into a spaCy DocBin on disk.

    Parameters
    ----------
    nlp : spaCy Language pipeline, used to tokenize the raw (tag-stripped) text.
    examples : iterable of strings carrying <PERSON>/<LOC> pseudo-XML tags.
    output : path of the .spacy file written to disk.

    Examples without any tagged entity are skipped entirely.
    """
    db = DocBin()
    for text in examples:
        entities = list(get_entities(text))

        # Nothing to learn from an untagged example.
        if not entities:
            continue

        text = rm_tags(text)
        doc = nlp(text)
        ents = []
        for start, end, label in entities:
            span = doc.char_span(start, end, label=label)
            # char_span returns None when (start, end) does not align with
            # token boundaries; appending None would make `doc.ents = ents`
            # raise later, so report and skip misaligned annotations instead.
            if span is None:
                print(f"skipping misaligned span {start}:{end} ({label})")
                continue
            ents.append(span)
        doc.ents = ents
        print(ents)
        db.add(doc)
    db.to_disk(output)
|
||||||
|
|
||||||
|
|
||||||
|
# Shuffle once, then split 80/20 into train and dev sets.
shuffle(examples)
ntrain = int(0.8 * len(examples))
ndev = len(examples) - ntrain
print(len(examples), "train", ntrain, "dev", ndev)
create_set(nlp, examples[:ntrain], "train.spacy")
# BUG FIX: the dev set must be the examples *after* the training slice.
# The previous `examples[ndev:]` started at index ndev (~20% in), so the
# dev set overlapped most of the training data and inflated evaluation.
create_set(nlp, examples[ntrain:], "dev.spacy")
|
||||||
|
|
||||||
|
|
||||||
|
# nlp = spacy.load("./output/model-best")
|
||||||
|
# doc = nlp("fol. 10 v° et 11 r° Transaction par laquelle il fut convenu que (Pierre) Fromond paierait 8 livres 6 sols parisis pour lods et ventes du plâtre qui était tiré dans une vigne située au territoire de Dives, du côté du gibet qui était en la censive du Cens-Commun, dans laquelle ledit Fromond avait PERSONmis de faire une carrière de plâtre sans le consentement du chapitre et dudit Bouchard (de Brétigny), lequel plâtre ne pourrait être tiré que pendant quatre ans, passé lesquels ledit Fromond serait tenu de replacer et mettre ladite vigne en bon état, etc.")
|
||||||
|
# displacy.serve(doc, style="ent")
|
||||||
File diff suppressed because one or more lines are too long
@ -0,0 +1,143 @@
|
|||||||
|
[paths]
|
||||||
|
train = null
|
||||||
|
dev = null
|
||||||
|
vectors = "fr_core_news_lg"
|
||||||
|
init_tok2vec = null
|
||||||
|
|
||||||
|
[system]
|
||||||
|
gpu_allocator = null
|
||||||
|
seed = 0
|
||||||
|
|
||||||
|
[nlp]
|
||||||
|
lang = "fr"
|
||||||
|
pipeline = ["tok2vec","ner"]
|
||||||
|
batch_size = 1000
|
||||||
|
disabled = []
|
||||||
|
before_creation = null
|
||||||
|
after_creation = null
|
||||||
|
after_pipeline_creation = null
|
||||||
|
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
|
||||||
|
|
||||||
|
[components]
|
||||||
|
|
||||||
|
[components.ner]
|
||||||
|
factory = "ner"
|
||||||
|
incorrect_spans_key = null
|
||||||
|
moves = null
|
||||||
|
scorer = {"@scorers":"spacy.ner_scorer.v1"}
|
||||||
|
update_with_oracle_cut_size = 100
|
||||||
|
|
||||||
|
[components.ner.model]
|
||||||
|
@architectures = "spacy.TransitionBasedParser.v2"
|
||||||
|
state_type = "ner"
|
||||||
|
extra_state_tokens = false
|
||||||
|
hidden_width = 64
|
||||||
|
maxout_pieces = 2
|
||||||
|
use_upper = true
|
||||||
|
nO = null
|
||||||
|
|
||||||
|
[components.ner.model.tok2vec]
|
||||||
|
@architectures = "spacy.Tok2VecListener.v1"
|
||||||
|
width = ${components.tok2vec.model.encode.width}
|
||||||
|
upstream = "*"
|
||||||
|
|
||||||
|
[components.tok2vec]
|
||||||
|
factory = "tok2vec"
|
||||||
|
|
||||||
|
[components.tok2vec.model]
|
||||||
|
@architectures = "spacy.Tok2Vec.v2"
|
||||||
|
|
||||||
|
[components.tok2vec.model.embed]
|
||||||
|
@architectures = "spacy.MultiHashEmbed.v2"
|
||||||
|
width = ${components.tok2vec.model.encode.width}
|
||||||
|
attrs = ["ORTH","SHAPE"]
|
||||||
|
rows = [5000,2500]
|
||||||
|
include_static_vectors = true
|
||||||
|
|
||||||
|
[components.tok2vec.model.encode]
|
||||||
|
@architectures = "spacy.MaxoutWindowEncoder.v2"
|
||||||
|
width = 256
|
||||||
|
depth = 8
|
||||||
|
window_size = 1
|
||||||
|
maxout_pieces = 3
|
||||||
|
|
||||||
|
[corpora]
|
||||||
|
|
||||||
|
[corpora.dev]
|
||||||
|
@readers = "spacy.Corpus.v1"
|
||||||
|
path = ${paths.dev}
|
||||||
|
max_length = 0
|
||||||
|
gold_preproc = false
|
||||||
|
limit = 0
|
||||||
|
augmenter = null
|
||||||
|
|
||||||
|
[corpora.train]
|
||||||
|
@readers = "spacy.Corpus.v1"
|
||||||
|
path = ${paths.train}
|
||||||
|
max_length = 0
|
||||||
|
gold_preproc = false
|
||||||
|
limit = 0
|
||||||
|
augmenter = null
|
||||||
|
|
||||||
|
[training]
|
||||||
|
dev_corpus = "corpora.dev"
|
||||||
|
train_corpus = "corpora.train"
|
||||||
|
seed = ${system.seed}
|
||||||
|
gpu_allocator = ${system.gpu_allocator}
|
||||||
|
dropout = 0.1
|
||||||
|
accumulate_gradient = 1
|
||||||
|
patience = 1600
|
||||||
|
max_epochs = 0
|
||||||
|
max_steps = 20000
|
||||||
|
eval_frequency = 200
|
||||||
|
frozen_components = []
|
||||||
|
annotating_components = []
|
||||||
|
before_to_disk = null
|
||||||
|
|
||||||
|
[training.batcher]
|
||||||
|
@batchers = "spacy.batch_by_words.v1"
|
||||||
|
discard_oversize = false
|
||||||
|
tolerance = 0.2
|
||||||
|
get_length = null
|
||||||
|
|
||||||
|
[training.batcher.size]
|
||||||
|
@schedules = "compounding.v1"
|
||||||
|
start = 100
|
||||||
|
stop = 1000
|
||||||
|
compound = 1.001
|
||||||
|
t = 0.0
|
||||||
|
|
||||||
|
[training.logger]
|
||||||
|
@loggers = "spacy.ConsoleLogger.v1"
|
||||||
|
progress_bar = false
|
||||||
|
|
||||||
|
[training.optimizer]
|
||||||
|
@optimizers = "Adam.v1"
|
||||||
|
beta1 = 0.9
|
||||||
|
beta2 = 0.999
|
||||||
|
L2_is_weight_decay = true
|
||||||
|
L2 = 0.01
|
||||||
|
grad_clip = 1.0
|
||||||
|
use_averages = false
|
||||||
|
eps = 0.00000001
|
||||||
|
learn_rate = 0.001
|
||||||
|
|
||||||
|
[training.score_weights]
|
||||||
|
ents_f = 1.0
|
||||||
|
ents_p = 0.0
|
||||||
|
ents_r = 0.0
|
||||||
|
ents_per_type = null
|
||||||
|
|
||||||
|
[pretraining]
|
||||||
|
|
||||||
|
[initialize]
|
||||||
|
vectors = ${paths.vectors}
|
||||||
|
init_tok2vec = ${paths.init_tok2vec}
|
||||||
|
vocab_data = null
|
||||||
|
lookups = null
|
||||||
|
before_init = null
|
||||||
|
after_init = null
|
||||||
|
|
||||||
|
[initialize.components]
|
||||||
|
|
||||||
|
[initialize.tokenizer]
|
||||||
Binary file not shown.
File diff suppressed because one or more lines are too long
@ -0,0 +1,98 @@
|
|||||||
|
[paths]
|
||||||
|
train = "./train.spacy"
|
||||||
|
dev = "./dev.spacy"
|
||||||
|
vectors = "fr_core_news_sm"
|
||||||
|
init_tok2vec = null
|
||||||
|
|
||||||
|
[system]
|
||||||
|
seed = 0
|
||||||
|
gpu_allocator = null
|
||||||
|
|
||||||
|
[nlp]
|
||||||
|
lang = "fr"
|
||||||
|
pipeline = []
|
||||||
|
disabled = []
|
||||||
|
before_creation = null
|
||||||
|
after_creation = null
|
||||||
|
after_pipeline_creation = null
|
||||||
|
batch_size = 1000
|
||||||
|
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
|
||||||
|
|
||||||
|
[components]
|
||||||
|
|
||||||
|
[corpora]
|
||||||
|
|
||||||
|
[corpora.dev]
|
||||||
|
@readers = "spacy.Corpus.v1"
|
||||||
|
path = ${paths.dev}
|
||||||
|
gold_preproc = false
|
||||||
|
max_length = 0
|
||||||
|
limit = 0
|
||||||
|
augmenter = null
|
||||||
|
|
||||||
|
[corpora.train]
|
||||||
|
@readers = "spacy.Corpus.v1"
|
||||||
|
path = ${paths.train}
|
||||||
|
gold_preproc = false
|
||||||
|
max_length = 0
|
||||||
|
limit = 0
|
||||||
|
augmenter = null
|
||||||
|
|
||||||
|
[training]
|
||||||
|
seed = ${system.seed}
|
||||||
|
gpu_allocator = ${system.gpu_allocator}
|
||||||
|
dropout = 0.1
|
||||||
|
accumulate_gradient = 1
|
||||||
|
patience = 1600
|
||||||
|
max_epochs = 0
|
||||||
|
max_steps = 20000
|
||||||
|
eval_frequency = 200
|
||||||
|
frozen_components = []
|
||||||
|
annotating_components = []
|
||||||
|
dev_corpus = "corpora.dev"
|
||||||
|
train_corpus = "corpora.train"
|
||||||
|
before_to_disk = null
|
||||||
|
|
||||||
|
[training.batcher]
|
||||||
|
@batchers = "spacy.batch_by_words.v1"
|
||||||
|
discard_oversize = false
|
||||||
|
tolerance = 0.2
|
||||||
|
get_length = null
|
||||||
|
|
||||||
|
[training.batcher.size]
|
||||||
|
@schedules = "compounding.v1"
|
||||||
|
start = 100
|
||||||
|
stop = 1000
|
||||||
|
compound = 1.001
|
||||||
|
t = 0.0
|
||||||
|
|
||||||
|
[training.logger]
|
||||||
|
@loggers = "spacy.ConsoleLogger.v1"
|
||||||
|
progress_bar = false
|
||||||
|
|
||||||
|
[training.optimizer]
|
||||||
|
@optimizers = "Adam.v1"
|
||||||
|
beta1 = 0.9
|
||||||
|
beta2 = 0.999
|
||||||
|
L2_is_weight_decay = true
|
||||||
|
L2 = 0.01
|
||||||
|
grad_clip = 1.0
|
||||||
|
use_averages = false
|
||||||
|
eps = 0.00000001
|
||||||
|
learn_rate = 0.001
|
||||||
|
|
||||||
|
[training.score_weights]
|
||||||
|
|
||||||
|
[pretraining]
|
||||||
|
|
||||||
|
[initialize]
|
||||||
|
vectors = ${paths.vectors}
|
||||||
|
init_tok2vec = ${paths.init_tok2vec}
|
||||||
|
vocab_data = null
|
||||||
|
lookups = null
|
||||||
|
before_init = null
|
||||||
|
after_init = null
|
||||||
|
|
||||||
|
[initialize.components]
|
||||||
|
|
||||||
|
[initialize.tokenizer]
|
||||||
@ -0,0 +1,34 @@
|
|||||||
|
{
|
||||||
|
"lang":"fr",
|
||||||
|
"name":"pipeline",
|
||||||
|
"version":"0.0.0",
|
||||||
|
"spacy_version":">=3.4.1,<3.5.0",
|
||||||
|
"description":"",
|
||||||
|
"author":"",
|
||||||
|
"email":"",
|
||||||
|
"url":"",
|
||||||
|
"license":"",
|
||||||
|
"spacy_git_version":"Unknown",
|
||||||
|
"vectors":{
|
||||||
|
"width":0,
|
||||||
|
"vectors":0,
|
||||||
|
"keys":0,
|
||||||
|
"name":null,
|
||||||
|
"mode":"default"
|
||||||
|
},
|
||||||
|
"labels":{
|
||||||
|
|
||||||
|
},
|
||||||
|
"pipeline":[
|
||||||
|
|
||||||
|
],
|
||||||
|
"components":[
|
||||||
|
|
||||||
|
],
|
||||||
|
"disabled":[
|
||||||
|
|
||||||
|
],
|
||||||
|
"performance":{
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
Binary file not shown.
@ -0,0 +1,3 @@
|
|||||||
|
{
|
||||||
|
"mode":"default"
|
||||||
|
}
|
||||||
@ -0,0 +1,98 @@
|
|||||||
|
[paths]
|
||||||
|
train = "./train.spacy"
|
||||||
|
dev = "./dev.spacy"
|
||||||
|
vectors = "fr_core_news_sm"
|
||||||
|
init_tok2vec = null
|
||||||
|
|
||||||
|
[system]
|
||||||
|
seed = 0
|
||||||
|
gpu_allocator = null
|
||||||
|
|
||||||
|
[nlp]
|
||||||
|
lang = "fr"
|
||||||
|
pipeline = []
|
||||||
|
disabled = []
|
||||||
|
before_creation = null
|
||||||
|
after_creation = null
|
||||||
|
after_pipeline_creation = null
|
||||||
|
batch_size = 1000
|
||||||
|
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
|
||||||
|
|
||||||
|
[components]
|
||||||
|
|
||||||
|
[corpora]
|
||||||
|
|
||||||
|
[corpora.dev]
|
||||||
|
@readers = "spacy.Corpus.v1"
|
||||||
|
path = ${paths.dev}
|
||||||
|
gold_preproc = false
|
||||||
|
max_length = 0
|
||||||
|
limit = 0
|
||||||
|
augmenter = null
|
||||||
|
|
||||||
|
[corpora.train]
|
||||||
|
@readers = "spacy.Corpus.v1"
|
||||||
|
path = ${paths.train}
|
||||||
|
gold_preproc = false
|
||||||
|
max_length = 0
|
||||||
|
limit = 0
|
||||||
|
augmenter = null
|
||||||
|
|
||||||
|
[training]
|
||||||
|
seed = ${system.seed}
|
||||||
|
gpu_allocator = ${system.gpu_allocator}
|
||||||
|
dropout = 0.1
|
||||||
|
accumulate_gradient = 1
|
||||||
|
patience = 1600
|
||||||
|
max_epochs = 0
|
||||||
|
max_steps = 20000
|
||||||
|
eval_frequency = 200
|
||||||
|
frozen_components = []
|
||||||
|
annotating_components = []
|
||||||
|
dev_corpus = "corpora.dev"
|
||||||
|
train_corpus = "corpora.train"
|
||||||
|
before_to_disk = null
|
||||||
|
|
||||||
|
[training.batcher]
|
||||||
|
@batchers = "spacy.batch_by_words.v1"
|
||||||
|
discard_oversize = false
|
||||||
|
tolerance = 0.2
|
||||||
|
get_length = null
|
||||||
|
|
||||||
|
[training.batcher.size]
|
||||||
|
@schedules = "compounding.v1"
|
||||||
|
start = 100
|
||||||
|
stop = 1000
|
||||||
|
compound = 1.001
|
||||||
|
t = 0.0
|
||||||
|
|
||||||
|
[training.logger]
|
||||||
|
@loggers = "spacy.ConsoleLogger.v1"
|
||||||
|
progress_bar = false
|
||||||
|
|
||||||
|
[training.optimizer]
|
||||||
|
@optimizers = "Adam.v1"
|
||||||
|
beta1 = 0.9
|
||||||
|
beta2 = 0.999
|
||||||
|
L2_is_weight_decay = true
|
||||||
|
L2 = 0.01
|
||||||
|
grad_clip = 1.0
|
||||||
|
use_averages = false
|
||||||
|
eps = 0.00000001
|
||||||
|
learn_rate = 0.001
|
||||||
|
|
||||||
|
[training.score_weights]
|
||||||
|
|
||||||
|
[pretraining]
|
||||||
|
|
||||||
|
[initialize]
|
||||||
|
vectors = ${paths.vectors}
|
||||||
|
init_tok2vec = ${paths.init_tok2vec}
|
||||||
|
vocab_data = null
|
||||||
|
lookups = null
|
||||||
|
before_init = null
|
||||||
|
after_init = null
|
||||||
|
|
||||||
|
[initialize.components]
|
||||||
|
|
||||||
|
[initialize.tokenizer]
|
||||||
@ -0,0 +1,34 @@
|
|||||||
|
{
|
||||||
|
"lang":"fr",
|
||||||
|
"name":"pipeline",
|
||||||
|
"version":"0.0.0",
|
||||||
|
"spacy_version":">=3.4.1,<3.5.0",
|
||||||
|
"description":"",
|
||||||
|
"author":"",
|
||||||
|
"email":"",
|
||||||
|
"url":"",
|
||||||
|
"license":"",
|
||||||
|
"spacy_git_version":"Unknown",
|
||||||
|
"vectors":{
|
||||||
|
"width":0,
|
||||||
|
"vectors":0,
|
||||||
|
"keys":0,
|
||||||
|
"name":null,
|
||||||
|
"mode":"default"
|
||||||
|
},
|
||||||
|
"labels":{
|
||||||
|
|
||||||
|
},
|
||||||
|
"pipeline":[
|
||||||
|
|
||||||
|
],
|
||||||
|
"components":[
|
||||||
|
|
||||||
|
],
|
||||||
|
"disabled":[
|
||||||
|
|
||||||
|
],
|
||||||
|
"performance":{
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
Binary file not shown.
@ -0,0 +1,3 @@
|
|||||||
|
{
|
||||||
|
"mode":"default"
|
||||||
|
}
|
||||||
@ -0,0 +1,65 @@
|
|||||||
|
#!/usr/bin/python
|
||||||
|
# -*- coding: UTF-8 -*-
|
||||||
|
|
||||||
|
"""
|
||||||
|
Author : Jean-Damien Généro
|
||||||
|
Affiliation : French National Center for Scientific Research (CNRS)
|
||||||
|
Assigned at the Centre de recherches historiques (CRH, UMR 8558)
|
||||||
|
Date : 2022-09-29
|
||||||
|
Update :
|
||||||
|
"""
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import spacy
|
||||||
|
|
||||||
|
|
||||||
|
def spacy_bb(xml):
    """Print the PER (person) entities spaCy finds in each <div type="acte">.

    Parameters
    ----------
    xml : path to a TEI-like XML corpus file (read as UTF-8).

    <note> elements are removed before tagging so editorial apparatus does
    not pollute the extraction.  Output goes to stdout; returns None.
    """
    with open(xml, 'r', encoding='utf-8') as corpus:
        soup = BeautifulSoup(corpus, 'xml')
    # Drop editorial notes entirely before running NER.
    for note in soup.find_all('note'):
        note.decompose()
    actes = soup.find_all('div', {'type': 'acte'})
    # Load the model once, outside the document loop.
    nlp = spacy.load('fr_core_news_sm')
    for div in actes:
        print("\n\n\n ###############")
        for p in div.find_all('p'):
            doc = nlp(p.text)
            for ent in doc.ents:
                # Fix: removed the dead `result` accumulator the original
                # initialized but never used; membership test on a one-item
                # list replaced by a plain equality check.
                if ent.label_ == 'PER':
                    print(ent)
|
||||||
|
|
||||||
|
|
||||||
|
def clean_training_data(xml):
    """Extract <div type="acte"> paragraphs as pseudo-XML training strings.

    Parameters
    ----------
    xml : path to a TEI-like XML corpus file (read as UTF-8).

    Returns
    -------
    list[str] : one string per acte's first <p>, with persName/placeName
    tags renamed to PERSON/LOC and editorial markup (notes, page breaks,
    <hi> wrappers, @ref attributes) removed.
    """
    examples = []
    with open(xml, 'r', encoding='utf-8') as opening:
        soup = BeautifulSoup(opening, 'xml')
    # Strip @ref attributes so only the bare entity tags remain.
    # (Fix: the original reused `entity` as both the outer and inner loop
    # variable — same for `tag` below — which works but is error-prone.)
    entity_tags = ['placeName', 'persName']
    for tag_name in entity_tags:
        for entity in soup.find_all(tag_name, {'ref': True}):
            del entity['ref']
    # Remove editorial apparatus entirely.
    removed = ['note', 'pb']
    for tag_name in removed:
        for node in soup.find_all(tag_name):
            node.decompose()
    # <hi> is presentational: keep its children, drop the tag.
    # (unwrap() is the modern name of the deprecated replaceWithChildren().)
    for hi in soup.find_all('hi'):
        hi.unwrap()
    for div in soup.find_all('div', {'type': 'acte'}):
        txt = str(div.p)
        txt = txt.replace('persName', 'PERSON')
        txt = txt.replace('placeName', 'LOC')
        examples.append(txt)
    return examples
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# spacy_bb("../bourbon-latex/charles-actes-latex.xml")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# nlp = spacy.load('fr_core_news_lg')
|
||||||
|
|
||||||
|
# ner = nlp.get_pipe("ner")
|
||||||
Binary file not shown.
Loading…
Reference in New Issue