"""``db`` command group: rebuild and populate the database.

The ``init`` command drops and re-creates every table, then fills them from
the static data in ``app.data_actes``, the XML files found in the corpus
folder, and ``static/csv/actors.csv``.
"""
import csv
import os
import re
import typing as t

from bs4 import BeautifulSoup
from flask.cli import AppGroup
from lxml import etree
from tqdm import tqdm

from app.app import APPPATH, db
from app.data_actes import diplomatic_type, institution, state
from app.modeles import (
    Acte,
    Diplo_type,
    Document,
    Duke,
    Individual,
    Institution,
    Produced_by,
    Production_place,
    State,
)

# from app.data import institution
# from app.data import state
# from app.data import diplomatic_type


db_cli = AppGroup("db")


def _nonempty(s: str) -> t.Optional[str]:
    """Return *s* when it is non-empty, otherwise ``None``."""
    return s if s else None


def _capitalize_first(s: t.Optional[str]) -> t.Optional[str]:
    """Return *s* with its first character upper-cased, or ``None`` if empty."""
    return (s[0].upper() + s[1:]) if s else None


def _first_or_fail(values: t.Sequence, description: str) -> t.Any:
    """Return the first element of *values*.

    Raises:
        IndexError: if *values* is empty. The exception type matches what a
            bare ``values[0]`` would raise, but the message names
            *description* so the missing item can be identified.
    """
    if not values:
        raise IndexError(f"no match found for {description}")
    return values[0]


def make_soup(file):
    """Open an XML file and return it parsed as a BeautifulSoup object."""
    with open(file, "r", encoding="utf-8") as opening:
        return BeautifulSoup(opening, "xml")


def _iter_soups(folder: str) -> t.Iterator[t.Tuple[str, BeautifulSoup]]:
    """Yield ``(filename, soup)`` for every entry of *folder*.

    Entries are visited in sorted order so that rows are inserted in the same
    order on every run (``os.listdir`` gives no ordering guarantee).
    """
    for filename in sorted(os.listdir(folder)):
        yield filename, make_soup(os.path.join(folder, filename))


def _create_institution(data_lst: list) -> None:
    """Insert one ``Institution`` row per dict in *data_lst*."""
    for data in tqdm(data_lst, desc="Populating Institution..."):
        Institution.create(**data)


def _create_state(data_lst: list) -> None:
    """Insert one ``State`` row per dict in *data_lst*."""
    for data in tqdm(data_lst, desc="Populating State..."):
        State.create(**data)


def _create_diplo_type(data_lst: list) -> None:
    """Insert one ``Diplo_type`` row per dict in *data_lst*."""
    for data in tqdm(data_lst, desc="Populating Diplo_type..."):
        Diplo_type.create(**data)


def _create_produc_place(xml_file: str, folder: str) -> None:
    """Insert one ``Production_place`` row per distinct production place.

    Args:
        xml_file: unused; kept so the signature matches the other loaders.
        folder: directory holding the XML files to scan.
    """
    places = set()
    for _filename, soup in _iter_soups(folder):
        # ``find`` returns a single element or None. Store its text, which is
        # the value ``_create_acte`` looks up, and skip files without one.
        place = soup.find("placeName", {"type": "production_place"})
        if place is not None:
            places.add(place.text)
    for placename in tqdm(sorted(places), desc="Populating Place..."):
        Production_place.create(placename=placename)


def _create_doc(xml_file: str, folder: str) -> None:
    """Insert one ``Document`` row per distinct (repository, shelfmark) pair.

    Args:
        xml_file: unused; kept so the signature matches the other loaders.
        folder: directory holding the XML files to scan.

    Raises:
        IndexError: if a file has no ``<idno n="1">`` or a repository label
            matches no ``Institution.full_label``.
    """
    # 1/ collect the distinct (repository, shelfmark) pairs
    pairs = set()
    for filename, soup in _iter_soups(folder):
        repository = soup.repository.text
        shelfmark = _first_or_fail(
            soup.msIdentifier.find_all("idno", {"n": "1"}),
            f'<idno n="1"> in {filename}',
        ).text
        pairs.add((repository, shelfmark))
    # 2/ resolve each repository label to its institution id
    infos_doc = []
    for repository, shelfmark in sorted(pairs):
        inst_ids = [
            inst.id_institution
            for inst in Institution.select().where(
                Institution.full_label == repository)
        ]
        infos_doc.append({
            "inst_doc": _first_or_fail(
                inst_ids, f"institution labelled {repository!r}"),
            "collection_doc": shelfmark,
        })
    # 3/ insert the rows
    for data in tqdm(infos_doc, desc="Populating Document..."):
        Document.create(**data)


def _create_acte(xml_file: str, folder: str) -> None:
    """Insert one ``Acte`` row per XML file in *folder*.

    Each row links to previously inserted ``Production_place``, ``Document``,
    ``Diplo_type`` and ``State`` rows, so those tables must be filled first.

    Args:
        xml_file: unused; kept so the signature matches the other loaders.
        folder: directory holding the XML files to scan.

    Raises:
        IndexError: if a required element is missing from a file or a lookup
            finds no matching row.
    """
    actes = []
    for filename, soup in _iter_soups(folder):
        doc_date = soup.msItem.docDate
        references = soup.msIdentifier.find_all("idno", {"n": "2"})
        prod_place = _first_or_fail(
            soup.find_all("placeName", {"type": "production_place"}),
            f"production place in {filename}",
        ).text
        shelfmark = _first_or_fail(
            soup.msIdentifier.find_all("idno", {"n": "1"}),
            f'<idno n="1"> in {filename}',
        ).text
        type_diplo = soup.body.div["subtype"]
        diplo_state = soup.body.div["type"]

        place_ids = [
            place.id_place
            for place in Production_place.select().where(
                Production_place.placename == prod_place)
        ]
        # NOTE(review): documents are matched on shelfmark alone; two
        # institutions sharing a shelfmark would be ambiguous - confirm.
        doc_ids = [
            doc.id_document
            for doc in Document.select().where(
                Document.collection_doc == shelfmark)
        ]
        diplo_ids = [
            diplo.id_diplo_type
            for diplo in Diplo_type.select().where(
                Diplo_type.diplo_label == type_diplo)
        ]
        state_ids = [
            row.id_state
            for row in State.select().where(
                State.state_label == diplo_state)
        ]
        actes.append({
            "numb_acte": soup.TEI["xml:id"],
            "date_time": doc_date["when"],
            "date": doc_date.text,
            "prod_place_acte": _first_or_fail(
                place_ids, f"place {prod_place!r} ({filename})"),
            "analysis": soup.abstract.p.text,
            "doc_acte": _first_or_fail(
                doc_ids, f"document {shelfmark!r} ({filename})"),
            # "NS" is stored when the file carries no <idno n="2">.
            "ref_acte": references[0].text if references else "NS",
            "state_doc": _first_or_fail(
                state_ids, f"state {diplo_state!r} ({filename})"),
            "diplo_type_acte": _first_or_fail(
                diplo_ids, f"diplomatic type {type_diplo!r} ({filename})"),
        })
    for data in tqdm(actes, desc="Populating Actes..."):
        Acte.create(**data)


def __find_indiv(xml_soup, role: str, indiv_lst: list) -> None:
    """Append to *indiv_lst* the text of every person listed under *role*.

    Looks inside ``sourceDesc`` for ``listPerson`` elements whose ``type`` is
    *role*; newlines are removed from each person's text. The list is
    modified in place.
    """
    for group in xml_soup.sourceDesc.find_all("listPerson", {"type": role}):
        for person in group.find_all("person"):
            indiv_lst.append(person.text.replace("\n", ""))


# Disabled debugging helper, kept for reference:
#
# def _create_individual(xml_file: str, folder: str)-> None:
#     indiv_prince = []
#     indiv_secret = []
#     for acte in os.listdir(folder):
#         soup = make_soup(os.path.join(folder, acte))
#         __find_indiv(soup, "prince", indiv_prince)
#         __find_indiv(soup, "signatory", indiv_secret)
#     print(set(indiv_secret))
#     print(set(indiv_prince))


def __indiv_infos(indiv_type):
    """Return the rows of ``actors.csv`` whose second column is *indiv_type*.

    The file is ``;``-delimited and its first row is skipped as a header.
    Rows with fewer than two columns (e.g. blank lines) are ignored.
    """
    csv_path = os.path.join(APPPATH, "static", "csv", "actors.csv")
    # newline="" is what the csv module expects for files it reads.
    with open(csv_path, "r", encoding="utf-8", newline="") as opening:
        actors_csv = csv.reader(opening, delimiter=";")
        next(actors_csv, None)
        return [
            row for row in actors_csv
            if len(row) > 1 and row[1] == indiv_type
        ]


def _create_indiv():
    """Insert one ``Individual`` row per "secret" or "prince" CSV actor."""
    actors = [*__indiv_infos("secret"), *__indiv_infos("prince")]
    individuals = [
        {"name_indiv": actor[0], "role_indiv": actor[1]} for actor in actors
    ]
    for data in tqdm(individuals, desc="Populating Individual..."):
        Individual.create(**data)


def _create_duke():
    """Insert one ``Duke`` row per "prince" CSV actor.

    Each row links to the ``Individual`` with the same name, so
    ``_create_indiv`` must have run first.

    Raises:
        IndexError: if a prince has no matching ``Individual`` row.
    """
    dukes = []
    for info in __indiv_infos("prince"):
        indiv_ids = [
            indiv.id_indiv
            for indiv in Individual.select().where(
                Individual.name_indiv == info[0])
        ]
        dukes.append({
            "house": info[2],
            "indiv_duke": _first_or_fail(
                indiv_ids, f"individual named {info[0]!r}"),
            "birth": info[3],
            "reign": info[4],
            # NOTE(review): "death" reads the same column as "reign"; this
            # looks like a copy-paste slip - confirm against actors.csv.
            "death": info[4],
        })
    for data in tqdm(dukes, desc="Populating Duke..."):
        Duke.create(**data)


def _create_produced_by(xml_file: str, folder: str):
    """Insert one ``Produced_by`` row per (acte, prince) pair found in the XML.

    The acte is looked up by file name with ``.xml`` removed; each prince is
    resolved to an ``Individual`` by name and then to its ``Duke`` row.

    Args:
        xml_file: unused; kept so the signature matches the other loaders.
        folder: directory holding the XML files to scan.

    Raises:
        IndexError: if an acte, individual or duke lookup finds no row.
    """
    princes_actes = []
    for filename, soup in _iter_soups(folder):
        acte_numb = filename.replace(".xml", "")
        acte_ids = [
            acte.id_acte
            for acte in Acte.select().where(Acte.numb_acte == acte_numb)
        ]
        prince_names = []
        __find_indiv(soup, "prince", prince_names)
        for name in prince_names:
            indiv_ids = [
                indiv.id_indiv
                for indiv in Individual.select().where(
                    Individual.name_indiv == name)
            ]
            indiv_id = _first_or_fail(indiv_ids, f"individual named {name!r}")
            duke_ids = [
                duke.id_duke
                for duke in Duke.select().where(Duke.indiv_duke == indiv_id)
            ]
            princes_actes.append({
                "produced_by_acte": _first_or_fail(
                    acte_ids, f"acte numbered {acte_numb!r}"),
                "produced_by_prince": _first_or_fail(
                    duke_ids, f"duke for individual {name!r}"),
            })
    for data in tqdm(princes_actes, desc="Populating Produced_by..."):
        Produced_by.create(**data)


@db_cli.command()
def init() -> None:
    """Initialization of the database"""
    xml_folder = os.path.join(APPPATH, "static", "xml",
                              "Bourbon", "Brb_5_Charles_Ier")
    # Append the suffix to the path; a stray comma here previously turned
    # this value into a (path, ".xml") tuple.
    xml = xml_folder + ".xml"
    tables = [Institution, State, Production_place,
              Diplo_type, Document, Acte, Individual, Duke,
              Produced_by]

    print("Dropping existing DB...")
    db.drop_tables(tables)
    print("Re-creating schema...")
    db.create_tables(tables)
    # Order matters: later loaders look up ids of rows inserted by earlier ones.
    _create_institution(institution)
    _create_state(state)
    _create_diplo_type(diplomatic_type)
    _create_produc_place(xml, xml_folder)
    _create_doc(xml, xml_folder)
    _create_acte(xml, xml_folder)
    _create_indiv()
    _create_duke()
    _create_produced_by(xml, xml_folder)