You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

158 lines
5.2 KiB
Python

#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
Authors : Jean-Damien Généro
Affiliation : French National Center for Scientific Research (CNRS)
Assigned at the Centre de recherches historiques (CRH, UMR 8558)
Date : 2022-10-11
Update : 2022-10-13
"""
import os
import re
from bs4 import BeautifulSoup
from peewee import *
from tqdm import tqdm
from modeles.princes_db_tables import db, Institution, State, Production_place, Diplo_type, Document, Acte
from data.institution_data import institution
from data.state_data import state
from data.diplo_type_data import diplomatic_type
def make_soup(file):
    """Parse an XML file and return the resulting BeautifulSoup object.

    :param file: path of the XML file; it is opened as UTF-8 text.
    :return: the BeautifulSoup tree built with the 'xml' parser.
    """
    with open(file, 'r', encoding="utf-8") as handle:
        return BeautifulSoup(handle, 'xml')
def _create_institution(data_lst: list) -> None:
    """Insert one Institution row per entry of data_lst.

    :param data_lst: dicts whose items are passed as keyword arguments
        to Institution.create().
    """
    progress = tqdm(data_lst, desc="Populating Institution...")
    for row in progress:
        Institution.create(**row)
def _create_state(data_lst: list) -> None:
    """Insert one State row per entry of data_lst.

    :param data_lst: dicts whose items are passed as keyword arguments
        to State.create().
    """
    progress = tqdm(data_lst, desc="Populating State...")
    for row in progress:
        State.create(**row)
def _create_diplo_type(data_lst: list) -> None:
    """Insert one Diplo_type row per entry of data_lst.

    :param data_lst: dicts whose items are passed as keyword arguments
        to Diplo_type.create().
    """
    progress = tqdm(data_lst, desc="Populating Diplo_type...")
    for row in progress:
        Diplo_type.create(**row)
def _create_produc_place(xml_file: str, folder: str) -> None:
    """Populate the Production_place table from the XML files of a folder.

    Every entry of ``folder`` is parsed with make_soup() and the text of its
    first ``<placeName type="production_place">`` element is collected. One
    row is then created per distinct place name.

    :param xml_file: not used by this function; kept so that existing
        callers keep working.
    :param folder: directory whose entries are all parsed as XML files.
    """
    place_names = set()
    for acte in os.listdir(folder):
        soup = make_soup(os.path.join(folder, acte))
        # find() returns a single element (or None), not a list of matches.
        # Iterating over it would walk the element's children and raise
        # TypeError when nothing matches, so read its text instead; this is
        # the same value _create_acte() uses to look the place up.
        place = soup.find('placeName', {'type': 'production_place'})
        if place is None:
            continue
        place_names.add(place.text)
    # Sorted so that rows are inserted in the same order on every run.
    production_places = [{"placename": name} for name in sorted(place_names)]
    for data in tqdm(production_places, desc="Populating Place..."):
        Production_place.create(**data)
def _create_doc(xml_file: str, folder: str) -> None:
    """Populate the Document table from the XML files of a folder.

    For every file, the text of ``<repository>`` and of the first
    ``<idno n="1">`` inside ``<msIdentifier>`` are read. Each distinct
    (repository, idno) pair becomes one Document row whose ``inst_doc`` is
    the id of the Institution with a matching ``full_label``.

    :param xml_file: not used by this function; kept so that existing
        callers keep working.
    :param folder: directory whose entries are all parsed as XML files.
    :raises IndexError: if a file has no ``<idno n="1">`` or if no
        Institution row matches a repository label.
    """
    # 1/ collect the distinct (repository, collection) pairs.
    # Tuples are used instead of a joined string so that values containing
    # a newline or the former " == " separator are kept intact.
    details_doc = set()
    for acte in os.listdir(folder):
        soup = make_soup(os.path.join(folder, acte))
        inst_doc = soup.repository.text
        nb_doc_1 = soup.msIdentifier.find_all("idno", {"n": "1"})[0].text
        details_doc.add((inst_doc, nb_doc_1))
    # 2/ query the Institution table to get the institution id,
    # then shape the data for the Document table.
    # Sorted so that rows are inserted in the same order on every run.
    infos_doc = []
    for doc_archives, doc_cote in sorted(details_doc):
        inst_query = [t.id_institution for t in Institution.select().where(
            Institution.full_label == doc_archives)]
        infos_doc.append({
            "inst_doc": inst_query[0],
            "collection_doc": doc_cote,
        })
    # 3/ insert the rows
    for data in tqdm(infos_doc, desc="Populating Document..."):
        Document.create(**data)
def _create_acte(xml_file: str, folder: str) -> None:
    """Populate the Acte table from the XML files of a folder.

    Each file is parsed with make_soup(); its identifier, date, abstract,
    reference, and the ids of the related Document, State and Diplo_type
    rows are gathered into one dict per file, then inserted.

    :param xml_file: not used by this function.
    :param folder: directory whose entries are all parsed as XML files.
    """
    rows = []
    for filename in os.listdir(folder):
        soup = make_soup(os.path.join(folder, filename))

        # values read straight from the XML tree
        identifier = soup.TEI["xml:id"]
        when = soup.msItem.docDate["when"]
        date_label = soup.msItem.docDate.text
        abstract = soup.abstract.p.text
        refs = soup.msIdentifier.find_all("idno", {"n": "2"})
        # "NS" is stored when the file has no <idno n="2">
        reference = refs[0].text if refs else "NS"
        place_name = soup.find_all(
            "placeName", {"type": "production_place"})[0].text
        shelfmark = soup.msIdentifier.find_all("idno", {"n": "1"})[0]
        subtype = soup.body.div["subtype"]
        div_type = soup.body.div["type"]

        # ids of the rows created earlier in the related tables
        # NOTE(review): place_ids is computed but not stored in the row
        # below; confirm whether Acte should reference the place.
        place_ids = [
            row.id_place
            for row in Production_place.select().where(
                Production_place.placename == place_name)
        ]
        document_ids = [
            row.id_document
            for row in Document.select().where(
                Document.collection_doc == shelfmark.text)
        ]
        diplo_ids = [
            row.id_diplo_type
            for row in Diplo_type.select().where(
                Diplo_type.diplo_label == subtype)
        ]
        state_ids = [
            row.id_state
            for row in State.select().where(
                State.state_label == div_type)
        ]

        rows.append({
            "numb_acte": identifier,
            "date_time": when,
            "date": date_label,
            "analysis": abstract,
            "doc_acte": document_ids[0],
            "ref_acte": reference,
            "state_doc": state_ids[0],
            "diplo_type_acte": diplo_ids[0]
        })

    for row in tqdm(rows, desc="Populating Actes..."):
        Acte.create(**row)
def __find_indiv(xml_soup, role: str, indiv_lst: list) -> None:
    """Collect the persons listed under a given role into indiv_lst.

    Looks inside ``sourceDesc`` for every ``listPerson`` whose ``type``
    equals ``role`` and appends the text of each ``person`` found there,
    with newline characters removed.

    :param xml_soup: parsed document exposing ``sourceDesc.find_all``.
    :param role: value of the ``type`` attribute to match.
    :param indiv_lst: list extended in place with the names found.
    """
    for group in xml_soup.sourceDesc.find_all("listPerson", {"type": role}):
        indiv_lst.extend(
            person.text.replace("\n", "")
            for person in group.find_all("person")
        )
def _create_individual(xml_file: str, folder: str) -> None:
    """Print the distinct signatories and princes found in a folder.

    Every entry of ``folder`` is parsed with make_soup() and scanned with
    __find_indiv() for the "prince" and "signatory" roles. Nothing is
    written to the database; the two sets are only printed, signatories
    first.

    :param xml_file: not used by this function.
    :param folder: directory whose entries are all parsed as XML files.
    """
    princes = []
    signatories = []
    targets = (("prince", princes), ("signatory", signatories))
    for filename in os.listdir(folder):
        soup = make_soup(os.path.join(folder, filename))
        for role, bucket in targets:
            __find_indiv(soup, role, bucket)
    print(set(signatories))
    print(set(princes))
def init():
    """Drop, re-create and repopulate every table of the database.

    Reads the module-level ``xml`` and ``xml_folder`` values when it is
    called. The population order matters: _create_doc() queries
    Institution, and _create_acte() queries Production_place, Document,
    Diplo_type and State.
    """
    tables = [Institution, State, Production_place, Diplo_type, Document, Acte]

    db.connect()
    print("Dropping existing DB...")
    db.drop_tables(tables)
    print("Re-creating schema...")
    db.create_tables(tables)

    # reference tables filled from the imported data lists
    static_seeds = (
        (_create_institution, institution),
        (_create_state, state),
        (_create_diplo_type, diplomatic_type),
    )
    for populate, rows in static_seeds:
        populate(rows)

    # tables filled from the XML files
    for populate in (_create_produc_place, _create_doc, _create_acte):
        populate(xml, xml_folder)
# Module-level settings: init() reads both names, and they are passed to
# _create_individual() below.
# NOTE: the functions in this file receive this path as `xml_file` but none
# of them reads it.
xml = "../bourbon-latex/charles-actes-latex.xml"
# Directory whose entries are each parsed as one XML file.
xml_folder = "./static/xml/Bourbon/Brb_5_Charles_Ier"
# The database rebuild is disabled: the call below is commented out.
# init()
# Runs whenever this module is executed or imported: prints the sets of
# signatory and prince names found in xml_folder.
_create_individual(xml, xml_folder)