diff --git a/app/db_maker.py b/app/db_maker.py index 00243fe..70ea930 100644 --- a/app/db_maker.py +++ b/app/db_maker.py @@ -6,12 +6,12 @@ Authors : Jean-Damien Généro Affiliation : French National Center for Scientific Research (CNRS) Assigned at the Centre de recherches historiques (CRH, UMR 8558) Date : 2022-10-11 -Update : +Update : 2022-10-13 """ import os +import re from bs4 import BeautifulSoup -from collections import OrderedDict from peewee import * from tqdm import tqdm @@ -30,18 +30,22 @@ def make_soup(file): def _create_institution(data_lst: list)-> None: + """create institution table""" for data in tqdm(data_lst, desc="Populating Institution..."): Institution.create(**data) def _create_state(data_lst: list)-> None: + """create state table""" for data in tqdm(data_lst, desc="Populating State..."): State.create(**data) def _create_diplo_type(data_lst: list)-> None: + """create diplo type table""" for data in tqdm(data_lst, desc="Populating Diplo_type..."): Diplo_type.create(**data) def _create_produc_place(xml_file: str, folder: str)-> None: + """create production place table""" places_xtract = [] production_places = [] for acte in os.listdir(folder): @@ -53,42 +57,32 @@ def _create_produc_place(xml_file: str, folder: str)-> None: Production_place.create(**data) def _create_doc(xml_file: str, folder: str)-> None: - details_doc = {} + """create doc table""" + details_doc = [] infos_doc = [] + # 1/ get repository (doc archives) + doc collection in a list for acte in os.listdir(folder): soup = make_soup(os.path.join(folder, acte)) - # doc_id = soup.TEI["xml:id"] - inst_doc = soup.repository - nb_doc_1 = soup.msIdentifier.find_all("idno", {"n": "1"}) - diplo_state = soup.body.div["type"] - # details_doc.append([inst_doc, nb_doc_1, diplo_state]) - details_doc[nb_doc_1]= [inst_doc, diplo_state] - print(details_doc) - """ - for item in details_doc: + inst_doc = soup.repository.text + nb_doc_1 = soup.msIdentifier.find_all("idno", {"n": "1"})[0].text + details_doc.append(inst_doc + " == " + nb_doc_1) + # 2/ make a query on table Inst to get inst id + # then pretiffy data for the table Doc + for doc in set(details_doc): + doc_archives = re.sub('(.+) == .+', '\\1', doc) + doc_cote = re.sub('.+ == (.+)', '\\1', doc) inst_query = [t.id_institution for t in Institution.select().where( - Institution.full_label == item[0].text)] - state_query = [t.id_state for t in State.select().where( - State.state_label == item[2])] - if len(item[1]) > 0: - # if there is a collection - infos_doc.append({ + Institution.full_label == doc_archives)] + infos_doc.append({ "inst_doc": inst_query[0], - "collection_doc": item[1][0].text, - "state_doc": state_query[0], - }) - else: - # if collection is missing - infos_doc.append({ - "inst_doc": inst_query[0], - "collection_doc": "", - "state_doc": state_query[0], + "collection_doc": doc_cote, }) + # 3/ create the table for data in tqdm(infos_doc, desc="Populating Document..."): - print(data) Document.create(**data) -""" + def init(): + """initializing db""" db.connect() print("Dropping existing DB...") db.drop_tables([Institution, State, Production_place, Diplo_type, Document, Acte]) @@ -98,11 +92,11 @@ def init(): _create_state(state) _create_diplo_type(diplomatic_type) _create_produc_place(xml, "./static/xml/Bourbon/Brb_5_Charles_Ier") - print("ok") _create_doc(xml, "./static/xml/Bourbon/Brb_5_Charles_Ier") xml = "../bourbon-latex/charles-actes-latex.xml" -# init() +init() -_create_doc(xml, "./static/xml/Bourbon/Brb_5_Charles_Ier") +# diplo_state = soup.body.div["type"] +# state_query = [t.id_state for t in State.select().where(State.state_label == item[2])] \ No newline at end of file