#!/usr/bin/python
# -*- coding: UTF-8 -*-

"""
Populate the database tables from the XML files of the actes.

Authors : Jean-Damien Généro
Affiliation : French National Center for Scientific Research (CNRS)
Assigned at the Centre de recherches historiques (CRH, UMR 8558)
Date : 2022-10-11
Update : 2022-10-13
"""

import os
import re

from bs4 import BeautifulSoup
from peewee import *
from tqdm import tqdm

from modeles.princes_db_tables import db, Institution, State, Production_place, Diplo_type, Document, Acte
from data.institution_data import institution
from data.state_data import state
from data.diplo_type_data import diplomatic_type


def make_soup(file):
    """Open an XML file and return it parsed as a BeautifulSoup object."""
    with open(file, 'r', encoding="utf-8") as opening:
        soup = BeautifulSoup(opening, 'xml')
    return soup


def _xml_paths(folder: str) -> list:
    """Return the paths of the regular, non-hidden files of *folder*, sorted by name.

    ``os.listdir`` returns entries in arbitrary order and also lists hidden
    files and sub-directories, which cannot be parsed; sorting makes the
    order in which rows are inserted reproducible from one run to the next.
    """
    paths = []
    for name in sorted(os.listdir(folder)):
        path = os.path.join(folder, name)
        if name.startswith(".") or not os.path.isfile(path):
            continue
        paths.append(path)
    return paths


def _first_id(ids: list, description: str):
    """Return the first element of *ids*.

    :raises IndexError: if *ids* is empty; the message names the row that
        could not be found instead of the bare "list index out of range".
    """
    if not ids:
        raise IndexError("no database row found for {}".format(description))
    return ids[0]


def _create_institution(data_lst: list) -> None:
    """Insert one Institution row per dict of *data_lst*."""
    for data in tqdm(data_lst, desc="Populating Institution..."):
        Institution.create(**data)


def _create_state(data_lst: list) -> None:
    """Insert one State row per dict of *data_lst*."""
    for data in tqdm(data_lst, desc="Populating State..."):
        State.create(**data)


def _create_diplo_type(data_lst: list) -> None:
    """Insert one Diplo_type row per dict of *data_lst*."""
    for data in tqdm(data_lst, desc="Populating Diplo_type..."):
        Diplo_type.create(**data)


def _create_produc_place(xml_file: str, folder: str) -> None:
    """Insert one Production_place row per distinct production place.

    Every file of *folder* is parsed and the text of each
    ``<placeName type="production_place">`` element is collected.

    :param xml_file: unused; kept so the signature matches the other loaders.
    :param folder: directory holding the XML files of the actes.
    """
    places = set()
    for path in _xml_paths(folder):
        soup = make_soup(path)
        # find_all() returns a (possibly empty) list of elements, so a file
        # without a production place is simply skipped; .text is the same
        # value _create_acte reads from this element.
        for place in soup.find_all('placeName', {'type': 'production_place'}):
            places.add(place.text)

    production_places = [{"placename": place} for place in sorted(places)]
    for data in tqdm(production_places, desc="Populating Place..."):
        Production_place.create(**data)


def _create_doc(xml_file: str, folder: str) -> None:
    """Insert one Document row per distinct (repository, shelfmark) pair.

    The repository is the text of ``<repository>``; the shelfmark is the text
    of the first ``<idno n="1">`` inside ``<msIdentifier>``. The repository is
    resolved to an Institution id through ``Institution.full_label``.

    :param xml_file: unused; kept so the signature matches the other loaders.
    :param folder: directory holding the XML files of the actes.
    :raises IndexError: if a file has no ``<idno n="1">`` or if a repository
        matches no Institution row.
    """
    # 1/ get repository (doc archives) + doc collection, de-duplicated
    details_doc = set()
    for path in _xml_paths(folder):
        soup = make_soup(path)
        inst_doc = soup.repository.text
        nb_doc_1 = soup.msIdentifier.find_all("idno", {"n": "1"})[0].text
        details_doc.add((inst_doc, nb_doc_1))

    # 2/ make a query on table Institution to get the institution id,
    # then shape the data for the table Document
    infos_doc = []
    for doc_archives, doc_cote in sorted(details_doc):
        inst_query = [t.id_institution for t in Institution.select().where(
            Institution.full_label == doc_archives)]
        infos_doc.append({
            "inst_doc": _first_id(
                inst_query, "institution {!r}".format(doc_archives)),
            "collection_doc": doc_cote,
        })

    # 3/ create the table
    for data in tqdm(infos_doc, desc="Populating Document..."):
        Document.create(**data)


def _create_acte(xml_file: str, folder: str) -> None:
    """Insert one Acte row per file of *folder*.

    For each file the identifier, dates, abstract and reference are read from
    the XML, and the document, diplomatic type and state are resolved to the
    ids of rows already present in Document, Diplo_type and State.

    :param xml_file: unused; kept so the signature matches the other loaders.
    :param folder: directory holding the XML files of the actes.
    :raises IndexError: if a file has no ``<idno n="1">`` or if one of the
        lookups matches no row.
    """
    actes = []
    for path in _xml_paths(folder):
        soup = make_soup(path)
        numb = soup.TEI["xml:id"]
        date_time = soup.msItem.docDate["when"]
        date = soup.msItem.docDate.text
        analyse = soup.abstract.p.text
        ref = soup.msIdentifier.find_all("idno", {"n": "2"})
        # "NS" is stored when the acte has no <idno n="2">.
        ref_acte = ref[0].text if ref else "NS"
        doc = soup.msIdentifier.find_all("idno", {"n": "1"})[0]
        type_diplo = soup.body.div["subtype"]
        diplo_state = soup.body.div["type"]

        # NOTE(review): the production place used to be looked up here in
        # Production_place but its id was never written to the Acte row; the
        # name of the matching Acte field is not visible from this file, so
        # the link still has to be added once it is confirmed.
        doc_query = [t.id_document for t in Document.select().where(
            Document.collection_doc == doc.text)]
        diplo_query = [t.id_diplo_type for t in Diplo_type.select().where(
            Diplo_type.diplo_label == type_diplo)]
        state_query = [t.id_state for t in State.select().where(
            State.state_label == diplo_state)]

        actes.append({
            "numb_acte": numb,
            "date_time": date_time,
            "date": date,
            "analysis": analyse,
            "doc_acte": _first_id(
                doc_query, "document {!r}".format(doc.text)),
            "ref_acte": ref_acte,
            "state_doc": _first_id(
                state_query, "state {!r}".format(diplo_state)),
            "diplo_type_acte": _first_id(
                diplo_query, "diplomatic type {!r}".format(type_diplo)),
        })

    for data in tqdm(actes, desc="Populating Actes..."):
        Acte.create(**data)


def __find_indiv(xml_soup, role: str, indiv_lst: list) -> None:
    """Append to *indiv_lst* the persons listed under a given role.

    Looks, inside ``<sourceDesc>``, for every ``<listPerson type=role>`` and
    appends the text of each of its ``<person>`` elements, newlines removed.

    :param xml_soup: parsed XML document.
    :param role: value of the ``type`` attribute of the ``<listPerson>``.
    :param indiv_lst: list extended in place.
    """
    for person_list in xml_soup.sourceDesc.find_all("listPerson", {"type": role}):
        for person in person_list.find_all("person"):
            indiv_lst.append(person.text.replace("\n", ""))


def _create_individual(xml_file: str, folder: str) -> None:
    """Print the distinct signatories, then the distinct princes, of the actes.

    :param xml_file: unused; kept so the signature matches the other loaders.
    :param folder: directory holding the XML files of the actes.
    """
    indiv_prince = []
    indiv_secret = []
    for path in _xml_paths(folder):
        soup = make_soup(path)
        __find_indiv(soup, "prince", indiv_prince)
        __find_indiv(soup, "signatory", indiv_secret)
    print(set(indiv_secret))
    print(set(indiv_prince))


def init():
    """Drop, re-create and populate every table of the database.

    Reads the module-level ``xml`` and ``xml_folder`` settings. The database
    connection is closed when the function returns, even on error.
    """
    tables = [Institution, State, Production_place, Diplo_type, Document, Acte]
    db.connect()
    try:
        print("Dropping existing DB...")
        db.drop_tables(tables)
        print("Re-creating schema...")
        db.create_tables(tables)
        _create_institution(institution)
        _create_state(state)
        _create_diplo_type(diplomatic_type)
        _create_produc_place(xml, xml_folder)
        _create_doc(xml, xml_folder)
        _create_acte(xml, xml_folder)
    finally:
        db.close()


# Kept at module level: init() reads both names as globals.
xml = "../bourbon-latex/charles-actes-latex.xml"
xml_folder = "./static/xml/Bourbon/Brb_5_Charles_Ier"


if __name__ == "__main__":
    # init()
    _create_individual(xml, xml_folder)