"""Flask CLI commands that (re)build the database.

The ``db init`` command drops and re-creates every table, then fills them from
the static data lists in ``app.data_actes``, from ``static/csv/actors.csv`` and
from the XML files stored under ``static/xml/<house>``.
"""
import csv
import os
import re
import typing as t

from bs4 import BeautifulSoup
from flask.cli import AppGroup
from lxml import etree
from tqdm import tqdm

from app.app import APPPATH, db
from app.data_actes import diplomatic_type, institution, state, houses, interventions
from app.modeles import (
    Institution,
    State,
    House,
    Intervention_type,
    Production_place,
    Diplo_type,
    Document,
    Agent,
    Acte,
    Transcribed_by,
    Individual,
    Involved_in,
)

db_cli = AppGroup("db")


def _nonempty(s: str) -> t.Optional[str]:
    """Return ``s`` only if it is non-empty; otherwise return None."""
    return s or None


def _capitalize_first(s: t.Optional[str]) -> t.Optional[str]:
    """Return ``s`` with its first character upper-cased, or None if ``s`` is empty/None."""
    return (s[0].upper() + s[1:]) if s else None


def make_soup(file):
    """Open an XML file (read as UTF-8) and return it as a BeautifulSoup object."""
    with open(file, 'r', encoding="utf-8") as opening:
        return BeautifulSoup(opening, 'xml')


def _create_institution(data_lst: list) -> None:
    """Create one Institution row per dict of ``data_lst``."""
    for data in tqdm(data_lst, desc="Populating Institution..."):
        Institution.create(**data)


def _create_state(data_lst: list) -> None:
    """Create one State row per dict of ``data_lst``."""
    for data in tqdm(data_lst, desc="Populating State..."):
        State.create(**data)


def _create_house(data_lst: list) -> None:
    """Create one House row per dict of ``data_lst``."""
    for data in tqdm(data_lst, desc="Populating House..."):
        House.create(**data)


def _create_interv_type(data_lst: list) -> None:
    """Create one Intervention_type row per dict of ``data_lst``."""
    for data in tqdm(data_lst, desc="Populating Intervention_type..."):
        Intervention_type.create(**data)


def _create_diplo_type(data_lst: list) -> None:
    """Create one Diplo_type row per dict of ``data_lst``."""
    for data in tqdm(data_lst, desc="Populating Diplo_type..."):
        Diplo_type.create(**data)


def _create_produc_place(folder: str) -> None:
    """Create one Production_place row per distinct place found in ``folder``.

    For every ``*.xml`` file, the first
    ``placeName[@type='production_place']`` element is read
    (//body//div[@type='front']/docdate/placeName[@type='production_place'])
    and its text is collected. Files without such an element, or with an
    empty one, contribute nothing.
    """
    places_xtract = set()
    for acte in os.listdir(folder):
        if not acte.endswith(".xml"):
            continue
        soup = make_soup(os.path.join(folder, acte))
        place = soup.find('placeName', {'type': 'production_place'})
        if place is None:
            continue
        # Store the element text: this is the value _create_acte later uses
        # to look the place up (it compares against ``.text``).
        placename = place.text
        if placename:
            places_xtract.add(placename)

    # NOTE(review): this function is called once per house folder, so a place
    # present in several folders is inserted once per folder -- confirm that
    # this is intended.
    # Sorted so that insertion order (and generated ids) is reproducible.
    production_places = [{"placename": xtraction} for xtraction in sorted(places_xtract)]
    for data in tqdm(production_places, desc="Populating Place..."):
        Production_place.create(**data)


def _create_doc(folder: str) -> None:
    """Create one Document row per distinct (repository, box id) pair in ``folder``.

    Raises:
        IndexError: if a file has no ``idno[@n='1']`` or if its repository
            label matches no ``Institution.full_label``.
    """
    # 1/ get repository (doc Archives) + doc number as a set of pairs
    details_doc = set()
    for acte in os.listdir(folder):
        if not acte.endswith(".xml"):
            continue
        soup = make_soup(os.path.join(folder, acte))
        inst_doc = soup.repository.text  # //sourceDesc//msIdentifier/repository
        # //sourceDesc//msIdentifier/idno[@n='1'] is always the
        # archive box or manuscript collection id
        # (//sourceDesc//msIdentifier/idno[@n='2'] is the doc id inside
        # the box or the page number inside a manuscript)
        nb_doc_1 = soup.msIdentifier.find_all("idno", {"n": "1"})[0].text
        details_doc.add((inst_doc, nb_doc_1))

    # 2/ make the data list
    infos_doc = []
    for doc_archives, doc_cote in sorted(details_doc):
        # query on table institution to get the corresponding institution key
        inst_query = [row.id_institution for row in Institution.select().where(
            Institution.full_label == doc_archives)]
        infos_doc.append({
            "inst_doc": inst_query[0],
            "collection_doc": doc_cote,
        })

    # 3/ create the table
    for data in tqdm(infos_doc, desc="Populating Document..."):
        Document.create(**data)


def __find_transcribers(folder: str) -> t.Set[str]:
    """Return the set of transcriber names found in the XML files of ``folder``.

    A name is the text of the ``name`` element of each ``respStmt`` found under
    ``fileDesc/titleStmt``.
    """
    transcribers = set()
    for acte in sorted(os.listdir(folder)):
        if not acte.endswith(".xml"):
            continue
        soup = make_soup(os.path.join(folder, acte))
        for item in soup.fileDesc.titleStmt.find_all("respStmt"):
            transcribers.add(item.find("name").text)
    return transcribers


def _create_agent(name_lst: list) -> None:
    """Create one Agent row per dict of ``name_lst``."""
    for data in tqdm(name_lst, desc="Populating Agent..."):
        Agent.create(**data)


def _create_acte(folder: str) -> None:
    """Create one Acte row per XML file of ``folder`` (files taken in sorted order).

    Raises:
        IndexError: if a file lacks a production place or an ``idno[@n='1']``,
            or if one of the foreign-key look-ups returns no row.
    """
    actes = []
    xml_files = [name for name in sorted(os.listdir(folder)) if name.endswith(".xml")]
    # NOTE(review): the counter restarts at 1 for every folder, so ``num_acte``
    # is only unique within one house -- confirm that this is intended.
    for counter, acte in enumerate(xml_files, start=1):
        soup = make_soup(os.path.join(folder, acte))

        # 1.1/ Get all data from XML. counter is the acte number (num_acte)
        numb = soup.TEI["xml:id"]  # /TEI[@xml:id] is always the acte's ID
        date_time = soup.msItem.docDate["when"]  # YYYY-MM-DD or YYYY-MM date
        date = soup.msItem.docDate.text  # verbose date
        analyse = soup.abstract.p.text  # acte's short analysis
        # //sourceDesc//msIdentifier/idno[@n='2'] is the doc id inside the
        # archive box or the page number inside a manuscript (see _create_doc).
        # It may be absent, in which case "NS" is stored instead of failing
        # with a list-index error.
        ref = soup.msIdentifier.find_all("idno", {"n": "2"})
        ref_acte = ref[0].text if ref else "NS"
        prod_place = soup.find_all("placeName", {"type": "production_place"})[0].text
        # //sourceDesc//msIdentifier/idno[@n='1'] is always the
        # archive box or manuscript collection id
        doc = soup.msIdentifier.find_all("idno", {"n": "1"})[0]
        type_diplo = soup.body.div["subtype"]
        diplo_state = soup.body.div["type"]

        # 1.2/ For some data, we need to make queries to get foreign keys
        place_query = [row.id_place for row in Production_place.select().where(
            Production_place.placename == prod_place)]
        # NOTE(review): the document is matched on its box id only, not on the
        # repository -- confirm that box ids are unique across institutions.
        doc_query = [row.id_document for row in Document.select().where(
            Document.collection_doc == doc.text)]
        diplo_query = [row.id_diplo_type for row in Diplo_type.select().where(
            Diplo_type.diplo_label == type_diplo)]
        state_query = [row.id_state for row in State.select().where(
            State.state_label == diplo_state)]

        # 2/ Make the data list
        actes.append({
            "num_acte": counter,
            "filename": numb,
            "date_time": date_time,
            "date": date,
            "prod_place_acte": place_query[0],
            "analysis": analyse,
            "doc_acte": doc_query[0],
            "ref_acte": ref_acte,
            "state_doc": state_query[0],
            "diplo_type_acte": diplo_query[0]
        })

    # 3/ create the table
    for data in tqdm(actes, desc="Populating Actes..."):
        Acte.create(**data)


def _create_transcribed_by(folder: str) -> None:
    """Create the Transcribed_by rows linking each acte of ``folder`` to its transcribers.

    The acte is looked up by ``Acte.filename`` using the file name without
    ``.xml``; each transcriber is looked up by ``Agent.agent_name``.

    Raises:
        IndexError: if the acte or one of its transcribers is not in the database.
    """
    transcribed = []
    for acte in os.listdir(folder):
        if not acte.endswith(".xml"):
            continue
        soup = make_soup(os.path.join(folder, acte))
        acte_q = [row.id_acte for row in Acte.select().where(
            Acte.filename == acte.replace(".xml", ""))]
        for item in soup.fileDesc.titleStmt.find_all("respStmt"):
            agent_q = [row.id_agent for row in Agent.select().where(
                Agent.agent_name == item.find("name").text)]
            transcribed.append({"transcr_acte": acte_q[0],
                                "transcr_agent": agent_q[0]})

    for data in tqdm(transcribed, desc="Populating Transcribed_by..."):
        Transcribed_by.create(**data)


def __find_indiv(folder: str, role: str) -> t.Set[str]:
    """Return the set of person names listed with ``role`` in the XML files of ``folder``.

    Names are the text (newlines removed) of every ``person`` element found in
    a ``sourceDesc//listPerson[@type=role]``.
    """
    indiv_lst = set()
    for acte in os.listdir(folder):
        if not acte.endswith(".xml"):
            continue
        soup = make_soup(os.path.join(folder, acte))
        for xml_indiv in soup.sourceDesc.find_all("listPerson", {"type": role}):
            for person in xml_indiv.find_all("person"):
                indiv_lst.add(person.text.replace("\n", ""))
    return indiv_lst


def __csv_indiv_infos(indiv_type):
    """Return the rows of ``static/csv/actors.csv`` whose second column equals ``indiv_type``.

    The file is read as ``;``-delimited and its first line (header) is skipped.
    Rows with fewer than two columns, such as blank lines, are ignored.
    """
    csv_path = os.path.join(APPPATH, "static", "csv", "actors.csv")
    # newline="" lets the csv module handle line endings itself.
    with open(csv_path, 'r', encoding="utf-8", newline="") as opening:
        actors_csv = csv.reader(opening, delimiter=";")
        next(actors_csv, None)
        return [row for row in actors_csv
                if len(row) > 1 and row[1] == indiv_type]


def _create_indiv(list_csv):
    """Create one Individual row per CSV row of ``list_csv``.

    Columns are read by position: name, role, house label, then three dates.
    The house label is resolved to its key through ``House.house_label``.

    Raises:
        IndexError: if a row is too short or its house label matches no House.
    """
    individuals = []
    for actor in list_csv:
        house_q = [row.id_house for row in House.select().where(
            House.house_label == actor[2])]
        individuals.append({
            "name_indiv": actor[0],
            "role_indiv": actor[1],
            "house_indiv": house_q[0],
            "date1": actor[3],
            "date2": actor[4],
            "date3": actor[5],
        })

    for data in tqdm(individuals, desc="Populating Individual..."):
        Individual.create(**data)


def __grape_indiv(list_person, role: str):
    """Debug helper: print the Individual key of every person of ``list_person``.

    ``list_person`` is an iterable of ``listPerson`` elements. Persons whose
    text is "None" are skipped; a name missing from the Individual table is
    reported instead of raising.
    """
    for persons in list_person:
        for person_tag in persons.find_all("person"):
            person_text = person_tag.text.replace("\n", "")
            if person_text == "None":
                continue
            prince_q = [row.id_indiv for row in Individual.select().where(
                Individual.name_indiv == person_text)]
            if not prince_q:
                print("!! name " + person_text + " (" + role
                      + ") not found in /app/static/csv/actors.csv")
                continue
            print(person_text, "==", prince_q[0])


def _create_involved_in(folder: str):
    """Create the Involved_in rows linking each acte of ``folder`` to its persons.

    Persons listed under ``listPerson[@type='prince']`` are linked with the
    "producer" intervention, those under ``listPerson[@type='signatory']`` with
    the "signatory" intervention. A person whose text is "None" is skipped, and
    a name missing from the Individual table is reported and skipped.

    Raises:
        IndexError: if a link must be created but the acte or the intervention
            type is not in the database.
    """
    # (listPerson type in the XML, intervention label in the database)
    role_to_interv = (("prince", "producer"), ("signatory", "signatory"))

    # The intervention keys do not depend on the file: query them once.
    # NOTE(review): the attribute name ``id_intev`` is kept as found -- confirm
    # it matches the Intervention_type model.
    interv_ids = {}
    for _, label in role_to_interv:
        interv_ids[label] = [row.id_intev for row in Intervention_type.select().where(
            Intervention_type.interv_label == label)]

    princes_actes = []
    for acte in os.listdir(folder):
        if not acte.endswith(".xml"):
            continue
        acte_q = [row.id_acte for row in Acte.select().where(
            Acte.filename == acte.replace(".xml", ""))]
        soup = make_soup(os.path.join(folder, acte))

        for role, label in role_to_interv:
            for persons in soup.sourceDesc.find_all("listPerson", {"type": role}):
                for person_tag in persons.find_all("person"):
                    person_text = person_tag.text.replace("\n", "")
                    if person_text == "None":
                        continue
                    prince_q = [row.id_indiv for row in Individual.select().where(
                        Individual.name_indiv == person_text)]
                    if not prince_q:
                        print("!! name " + person_text + " (" + role
                              + ") not found in /app/static/csv/actors.csv")
                        continue
                    princes_actes.append({"involved_in_acte": acte_q[0],
                                          "involved_in_prince": prince_q[0],
                                          "invol_in_interv": interv_ids[label][0]})

    for data in tqdm(princes_actes, desc="Populating involved_in..."):
        Involved_in.create(**data)


@db_cli.command()
def init() -> None:
    """Initialization of the database"""
    tables = [Institution, State, House, Intervention_type, Production_place,
              Diplo_type, Document, Agent, Acte, Transcribed_by, Individual,
              Involved_in]
    print("Dropping existing DB...")
    db.drop_tables(tables)

    print("Re-creating schema...")
    db.create_tables(tables)

    _create_institution(institution)
    _create_state(state)
    _create_house(houses)
    _create_interv_type(interventions)
    _create_diplo_type(diplomatic_type)

    actors = [*__csv_indiv_infos("secret"), *__csv_indiv_infos("prince")]
    _create_indiv(actors)
    # The CSV names do not change between houses: build the lookup set once.
    names_in_csv = {actor[0] for actor in actors}

    # if new houses were to be added, princes_houses need to be completed
    princes_houses = ["Berry", "Bourbon", "Anjou"]
    agents_names = []

    # iteration over houses
    for prince_house in princes_houses:
        # 1/ make path to house xml folder
        xml_folder = os.path.join(APPPATH, "static", "xml", prince_house)
        print("\n\n**** HOUSE ", prince_house, " ****")

        # 2/ add all transcribers names to list agents_names
        agents_names.extend(__find_transcribers(xml_folder))

        # 3/ check which names need to be added to the actors.csv
        names_in_xml = __find_indiv(xml_folder, "signatory")
        for name in sorted(names_in_xml - names_in_csv):
            print("!! name " + name + " not found in /app/static/csv/actors.csv")

        # 4/ create tables
        _create_produc_place(xml_folder)
        _create_doc(xml_folder)
        _create_acte(xml_folder)

    # table Agent (sorted so that insertion order is reproducible)
    agents = [{"agent_name": agent} for agent in sorted(set(agents_names))]
    _create_agent(agents)

    for prince_house in princes_houses:
        print("\n\n**** INVOLVED IN ", prince_house, " ****")
        xml_folder = os.path.join(APPPATH, "static", "xml", prince_house)
        _create_transcribed_by(xml_folder)
        _create_involved_in(xml_folder)