import csv
import os
import re
import typing as t

from bs4 import BeautifulSoup
from flask.cli import AppGroup
from lxml import etree
from tqdm import tqdm

from app.app import APPPATH, db
from app.data_actes import (
    diplomatic_type,
    institution,
    state,
    houses,
    interventions,
)
from app.modeles import (
    Institution,
    State,
    House,
    Intervention_type,
    Production_place,
    Diplo_type,
    Document,
    Acte,
    Individual,
    Involved_in,
)

db_cli = AppGroup("db")


def _nonempty(s: str) -> t.Optional[str]:
    """Return ``s`` unchanged when it is non-empty, otherwise ``None``."""
    return s if s else None


def _capitalize_first(s: t.Optional[str]) -> t.Optional[str]:
    """Upper-case the first character of ``s``; return ``None`` for empty/None."""
    return (s[0].upper() + s[1:]) if s else None


def make_soup(file):
    """Open an XML file (UTF-8) and return it parsed as a BeautifulSoup object."""
    with open(file, 'r', encoding="utf-8") as opening:
        xml = BeautifulSoup(opening, 'xml')
    return xml


def _create_institution(data_lst: list) -> None:
    """Insert one Institution row per dict of ``data_lst``."""
    for data in tqdm(data_lst, desc="Populating Institution..."):
        Institution.create(**data)


def _create_state(data_lst: list) -> None:
    """Insert one State row per dict of ``data_lst``."""
    for data in tqdm(data_lst, desc="Populating State..."):
        State.create(**data)


def _create_house(data_lst: list) -> None:
    """Insert one House row per dict of ``data_lst``."""
    for data in tqdm(data_lst, desc="Populating House..."):
        House.create(**data)


def _create_interv_type(data_lst: list) -> None:
    """Insert one Intervention_type row per dict of ``data_lst``."""
    for data in tqdm(data_lst, desc="Populating Intervention_type..."):
        Intervention_type.create(**data)


def _create_diplo_type(data_lst: list) -> None:
    """Insert one Diplo_type row per dict of ``data_lst``."""
    for data in tqdm(data_lst, desc="Populating Diplo_type..."):
        Diplo_type.create(**data)


def _create_produc_place(xml_file: str, folder: str) -> None:
    """Populate Production_place with the distinct production places.

    Every file of ``folder`` is parsed and the text of its first
    ``<placeName type="production_place">`` element is collected; one row is
    created per distinct text.

    :param xml_file: unused, kept for signature compatibility.
    :param folder: directory containing the XML files to read.
    """
    placenames = set()
    for acte in os.listdir(folder):
        soup = make_soup(os.path.join(folder, acte))
        place_tag = soup.find('placeName', {'type': 'production_place'})
        # ``find`` returns a single tag or None.  Store its ``.text`` (the
        # value ``_create_acte`` looks up) instead of iterating its children.
        if place_tag is not None:
            placenames.add(place_tag.text)

    production_places = [{"placename": name} for name in placenames]
    for data in tqdm(production_places, desc="Populating Place..."):
        Production_place.create(**data)


def _create_doc(xml_file: str, folder: str) -> None:
    """Populate Document with the distinct (repository, shelfmark) pairs.

    The repository is the text of ``<repository>``; the shelfmark is the text
    of the first ``<idno n="1">`` found under ``<msIdentifier>``.

    :param xml_file: unused, kept for signature compatibility.
    :param folder: directory containing the XML files to read.
    :raises IndexError: when a file has no ``<idno n="1">`` or when no
        Institution row has a ``full_label`` equal to the repository text.
    """
    # 1/ collect the distinct (repository, shelfmark) pairs; tuples avoid
    # gluing both values into one string and splitting it again with a regex
    documents = set()
    for acte in os.listdir(folder):
        soup = make_soup(os.path.join(folder, acte))
        repository = soup.repository.text
        shelfmark = soup.msIdentifier.find_all("idno", {"n": "1"})[0].text
        documents.add((repository, shelfmark))

    # 2/ resolve the institution id of each repository
    infos_doc = []
    for repository, shelfmark in documents:
        inst_ids = [row.id_institution for row in Institution.select().where(
            Institution.full_label == repository)]
        infos_doc.append({
            "inst_doc": inst_ids[0],
            "collection_doc": shelfmark,
        })

    # 3/ create the rows
    for data in tqdm(infos_doc, desc="Populating Document..."):
        Document.create(**data)


def _create_acte(xml_file: str, folder: str) -> None:
    """Populate Acte with one row per XML file of ``folder``.

    Files are processed in sorted name order and numbered from 1
    (``num_acte``).  Foreign keys are resolved by label against the
    Production_place, Document, Diplo_type and State tables.

    :param xml_file: unused, kept for signature compatibility.
    :param folder: directory containing the XML files to read.
    :raises IndexError: when an expected element is missing from a file or a
        label has no matching row in its lookup table.
    """
    actes = []
    for counter, acte in enumerate(sorted(os.listdir(folder)), start=1):
        soup = make_soup(os.path.join(folder, acte))
        numb = soup.TEI["xml:id"]
        date_time = soup.msItem.docDate["when"]
        date = soup.msItem.docDate.text
        analyse = soup.abstract.p.text

        # the second idno is optional; "NS" is stored when it is absent
        ref = soup.msIdentifier.find_all("idno", {"n": "2"})
        ref_acte = ref[0].text if ref else "NS"

        prod_place = soup.find_all(
            "placeName", {"type": "production_place"})[0].text
        doc = soup.msIdentifier.find_all("idno", {"n": "1"})[0]
        type_diplo = soup.body.div["subtype"]
        diplo_state = soup.body.div["type"]

        place_query = [row.id_place for row in Production_place.select().where(
            Production_place.placename == prod_place)]
        # NOTE(review): the document is matched on its shelfmark only, not on
        # its repository -- confirm shelfmarks are unique across institutions.
        doc_query = [row.id_document for row in Document.select().where(
            Document.collection_doc == doc.text)]
        diplo_query = [row.id_diplo_type for row in Diplo_type.select().where(
            Diplo_type.diplo_label == type_diplo)]
        state_query = [row.id_state for row in State.select().where(
            State.state_label == diplo_state)]

        actes.append({
            "num_acte": counter,
            "filename": numb,
            "date_time": date_time,
            "date": date,
            "prod_place_acte": place_query[0],
            "analysis": analyse,
            "doc_acte": doc_query[0],
            "ref_acte": ref_acte,
            "state_doc": state_query[0],
            "diplo_type_acte": diplo_query[0]
        })

    for data in tqdm(actes, desc="Populating Actes..."):
        Acte.create(**data)


def __find_indiv(folder: str, role: str) -> t.Set[str]:
    """Return the distinct person names listed under ``role`` in ``folder``.

    Names are the text of every ``<person>`` inside a
    ``<listPerson type=role>`` of ``<sourceDesc>``, with newlines removed.
    """
    indiv_lst = []
    for acte in os.listdir(folder):
        soup = make_soup(os.path.join(folder, acte))
        xml_indivs = soup.sourceDesc.find_all("listPerson", {"type": role})
        for xml_indiv in xml_indivs:
            for person in xml_indiv.find_all("person"):
                indiv_lst.append(person.text.replace("\n", ""))
    return set(indiv_lst)


def __csv_indiv_infos(indiv_type):
    """Return the rows of ``static/csv/actors.csv`` whose 2nd column is ``indiv_type``.

    The file is ``;``-delimited and its first row (header) is skipped.
    """
    csv_path = os.path.join(APPPATH, "static", "csv", "actors.csv")
    # newline="" is what the csv module expects from its input file
    with open(csv_path, 'r', encoding="utf-8", newline="") as opening:
        actors_csv = csv.reader(opening, delimiter=";")
        next(actors_csv, None)
        # blank or truncated rows have no second column: skip them instead of
        # failing with IndexError
        lst_of_indiv = [row for row in actors_csv
                        if len(row) > 1 and row[1] == indiv_type]
    return lst_of_indiv


def __compareList(l1, l2):
    """Return "Equal" when both lists hold the same items, else "Non equal".

    The comparison ignores order; sorted copies are compared so the
    caller's lists are left untouched.
    """
    return "Equal" if sorted(l1) == sorted(l2) else "Non equal"


def _create_indiv(list_csv):
    """Populate Individual from CSV rows.

    Each row is read positionally: name, role, house label, then three date
    columns.  The house label is resolved to its id in the House table.

    :raises IndexError: when a row is too short or its house label has no
        matching House row.
    """
    individuals = [
        {
            "name_indiv": actor[0],
            "role_indiv": actor[1],
            "house_indiv": [row.id_house for row in House.select().where(
                House.house_label == actor[2])][0],
            "date1": actor[3],
            "date2": actor[4],
            "date3": actor[5],
        }
        for actor in list_csv
    ]
    for data in tqdm(individuals, desc="Populating Individual..."):
        Individual.create(**data)


def __grape_indiv(list_person, role: str):
    """Debug helper: print ``name == id`` for every person of ``list_person``.

    ``role`` is currently unused.  Raises IndexError when a name has no
    matching Individual row.
    """
    for persons in list_person:
        for person_tag in persons.find_all("person"):
            person_text = person_tag.text.replace("\n", "")
            if person_text != "None":
                prince_q = [row.id_indiv for row in Individual.select().where(
                    Individual.name_indiv == person_text)]
                print(person_text, "==", prince_q[0])


def _create_involved_in(xml_file: str, folder: str):
    """Populate Involved_in, linking each acte to the individuals it lists.

    For every file, persons found under ``<listPerson type="prince">`` are
    linked with the "producer" intervention type and those under
    ``<listPerson type="signatory">`` with the "signatory" one.  A name
    absent from the Individual table is reported on stdout and skipped.

    :param xml_file: unused, kept for signature compatibility.
    :param folder: directory containing the XML files to read.
    :raises IndexError: when the acte or the intervention type of a link to
        create cannot be found in the database.
    """
    # (listPerson @type in the XML, label in the Intervention_type table)
    roles = (("prince", "producer"), ("signatory", "signatory"))
    # the intervention ids do not depend on the file nor on the person:
    # query them once per role rather than once per person
    interv_ids = {
        xml_role: [row.id_intev for row in Intervention_type.select().where(
            Intervention_type.interv_label == label)]
        for xml_role, label in roles
    }

    princes_actes = []
    for acte in os.listdir(folder):
        acte_q = [row.id_acte for row in Acte.select().where(
            Acte.filename == acte.replace(".xml", ""))]
        soup = make_soup(os.path.join(folder, acte))

        for xml_role, _label in roles:
            interv_q = interv_ids[xml_role]
            for persons in soup.sourceDesc.find_all(
                    "listPerson", {"type": xml_role}):
                for person_tag in persons.find_all("person"):
                    person_text = person_tag.text.replace("\n", "")
                    if person_text == "None":
                        continue
                    prince_q = [
                        row.id_indiv for row in Individual.select().where(
                            Individual.name_indiv == person_text)]
                    if not prince_q:
                        print("!! name " + person_text + " (" + xml_role
                              + ") not found in /app/static/csv/actors.csv")
                        continue
                    princes_actes.append({
                        "involved_in_acte": acte_q[0],
                        "involved_in_prince": prince_q[0],
                        "invol_in_interv": interv_q[0],
                    })

    for data in tqdm(princes_actes, desc="Populating involved_in..."):
        Involved_in.create(**data)


@db_cli.command()
def init() -> None:
    """Initialization of the database"""
    xml_folder = os.path.join(
        APPPATH, "static", "xml", "Bourbon", "Brb_5_Charles_Ier")
    # a stray comma used to turn this value into a (path, ".xml") tuple
    xml = xml_folder + ".xml"

    tables = [Institution, State, House, Intervention_type, Production_place,
              Diplo_type, Document, Acte, Individual, Involved_in]
    print("Dropping existing DB...")
    db.drop_tables(tables)
    print("Re-creating schema...")
    db.create_tables(tables)

    _create_institution(institution)
    _create_state(state)
    _create_house(houses)
    _create_interv_type(interventions)
    _create_diplo_type(diplomatic_type)
    _create_produc_place(xml, xml_folder)
    _create_doc(xml, xml_folder)
    _create_acte(xml, xml_folder)

    # names present in the XML but missing from actors.csv are reported by
    # _create_involved_in
    actors = [*__csv_indiv_infos("secret"), *__csv_indiv_infos("prince")]
    _create_indiv(actors)
    _create_involved_in(xml, xml_folder)