From b68c50abfad1a2b03a9589acc2f98b655a025d5f Mon Sep 17 00:00:00 2001 From: jgenero Date: Fri, 21 Oct 2022 11:51:40 +0200 Subject: [PATCH] delete list compare function + remove argument xml_file + add os.endwith --- app/cmd/db.py | 213 ++++++++++++++++++++++++-------------------------- 1 file changed, 100 insertions(+), 113 deletions(-) diff --git a/app/cmd/db.py b/app/cmd/db.py index 98684e9..acbc926 100644 --- a/app/cmd/db.py +++ b/app/cmd/db.py @@ -60,28 +60,30 @@ def _create_diplo_type(data_lst: list)-> None: for data in tqdm(data_lst, desc="Populating Diplo_type..."): Diplo_type.create(**data) -def _create_produc_place(xml_file: str, folder: str)-> None: +def _create_produc_place(folder: str)-> None: """create production place table""" places_xtract = [] production_places = [] for acte in os.listdir(folder): - soup = make_soup(os.path.join(folder, acte)) - for place in soup.find('placeName', {'type': 'production_place'}): - places_xtract.append(place) + if acte.endswith(".xml"): + soup = make_soup(os.path.join(folder, acte)) + for place in soup.find('placeName', {'type': 'production_place'}): + places_xtract.append(place) production_places = [{"placename": xtraction} for xtraction in set(places_xtract)] for data in tqdm(production_places, desc="Populating Place..."): Production_place.create(**data) -def _create_doc(xml_file: str, folder: str)-> None: +def _create_doc(folder: str)-> None: """create doc table""" details_doc = [] infos_doc = [] # 1/ get repository (doc archives) + doc collection in a list for acte in os.listdir(folder): - soup = make_soup(os.path.join(folder, acte)) - inst_doc = soup.repository.text - nb_doc_1 = soup.msIdentifier.find_all("idno", {"n": "1"})[0].text - details_doc.append(inst_doc + " == " + nb_doc_1) + if acte.endswith(".xml"): + soup = make_soup(os.path.join(folder, acte)) + inst_doc = soup.repository.text + nb_doc_1 = soup.msIdentifier.find_all("idno", {"n": "1"})[0].text + details_doc.append(inst_doc + " == " + nb_doc_1) # 2/ make a query on table Inst to get inst id # then pretiffy data for the table Doc for doc in set(details_doc): @@ -97,57 +99,59 @@ def _create_doc(xml_file: str, folder: str)-> None: for data in tqdm(infos_doc, desc="Populating Document..."): Document.create(**data) -def _create_acte(xml_file: str, folder: str)-> None: +def _create_acte(folder: str)-> None: actes = [] counter = 0 for acte in sorted(os.listdir(folder)): - counter += 1 - soup = make_soup(os.path.join(folder, acte)) - numb = soup.TEI["xml:id"] - date_time = soup.msItem.docDate["when"] - date = soup.msItem.docDate.text - analyse = soup.abstract.p.text - ref = soup.msIdentifier.find_all("idno", {"n": "2"}) - if len(ref) > 0: - ref_acte = ref[0].text - else: - ref_acte = "NS" - prod_place = soup.find_all("placeName", {"type": "production_place"})[0].text - doc = soup.msIdentifier.find_all("idno", {"n": "1"})[0] - type_diplo = soup.body.div["subtype"] - diplo_state = soup.body.div["type"] - place_query = [t.id_place for t in Production_place.select().where( - Production_place.placename == prod_place)] - doc_query = [t.id_document for t in Document.select().where( - Document.collection_doc == doc.text)] - diplo_query = [t.id_diplo_type for t in Diplo_type.select().where( - Diplo_type.diplo_label == type_diplo)] - state_query = [t.id_state for t in State.select().where( - State.state_label == diplo_state)] - actes.append({ - "num_acte": counter, - "filename": numb, - "date_time": date_time, - "date": date, - "prod_place_acte": place_query[0], - "analysis": analyse, - "doc_acte": doc_query[0], - "ref_acte": ref_acte, - "state_doc": state_query[0], - "diplo_type_acte": diplo_query[0] - }) + if acte.endswith(".xml"): + counter += 1 + soup = make_soup(os.path.join(folder, acte)) + numb = soup.TEI["xml:id"] + date_time = soup.msItem.docDate["when"] + date = soup.msItem.docDate.text + analyse = soup.abstract.p.text + ref = soup.msIdentifier.find_all("idno", {"n": "2"}) + if len(ref) > 0: + ref_acte = ref[0].text + else: + ref_acte = "NS" + prod_place = soup.find_all("placeName", {"type": "production_place"})[0].text + doc = soup.msIdentifier.find_all("idno", {"n": "1"})[0] + type_diplo = soup.body.div["subtype"] + diplo_state = soup.body.div["type"] + place_query = [t.id_place for t in Production_place.select().where( + Production_place.placename == prod_place)] + doc_query = [t.id_document for t in Document.select().where( + Document.collection_doc == doc.text)] + diplo_query = [t.id_diplo_type for t in Diplo_type.select().where( + Diplo_type.diplo_label == type_diplo)] + state_query = [t.id_state for t in State.select().where( + State.state_label == diplo_state)] + actes.append({ + "num_acte": counter, + "filename": numb, + "date_time": date_time, + "date": date, + "prod_place_acte": place_query[0], + "analysis": analyse, + "doc_acte": doc_query[0], + "ref_acte": ref_acte, + "state_doc": state_query[0], + "diplo_type_acte": diplo_query[0] + }) for data in tqdm(actes, desc="Populating Actes..."): Acte.create(**data) def __find_indiv(folder: str, role: str)-> None: indiv_lst = [] for acte in os.listdir(folder): - soup = make_soup(os.path.join(folder, acte)) - xml_indivs = soup.sourceDesc.find_all("listPerson", {"type": role}) - for xml_indiv in xml_indivs: - persons = xml_indiv.find_all("person") - for person in persons: - indiv_lst.append(person.text.replace("\n", "")) + if acte.endswith(".xml"): + soup = make_soup(os.path.join(folder, acte)) + xml_indivs = soup.sourceDesc.find_all("listPerson", {"type": role}) + for xml_indiv in xml_indivs: + persons = xml_indiv.find_all("person") + for person in persons: + indiv_lst.append(person.text.replace("\n", "")) return set(indiv_lst) def __csv_indiv_infos(indiv_type): @@ -157,14 +161,6 @@ def __csv_indiv_infos(indiv_type): lst_of_indiv = [row for row in actors_csv if row[1] == indiv_type] return lst_of_indiv -def __compareList(l1,l2): - l1.sort() - l2.sort() - if(l1==l2): - return "Equal" - else: - return "Non equal" - def _create_indiv(list_csv): individuals = [{"name_indiv": actor[0], "role_indiv": actor[1], "house_indiv": [t.id_house for t in House.select().where( @@ -187,51 +183,48 @@ def __grape_indiv(list_person, role: str): -def _create_involved_in(xml_file: str, folder: str): +def _create_involved_in(folder: str): princes_actes = [] for acte in os.listdir(folder): - acte_q = [t.id_acte for t in Acte.select().where( - Acte.filename == acte.replace(".xml", ""))] - # print(acte, "==", acte_q[0]) - soup = make_soup(os.path.join(folder, acte)) - for persons in soup.sourceDesc.find_all("listPerson", {"type": "prince"}): - for person_tag in persons.find_all("person"): - person_text = person_tag.text.replace("\n", "") - if person_text != "None": - prince_q = [t.id_indiv for t in Individual.select().where( - Individual.name_indiv == person_text)] - interv_q = [t.id_intev for t in Intervention_type.select().where( - Intervention_type.interv_label == "producer")] - # print(person_text, "==", prince_q[0]) - - try: - prince_q[0] - except IndexError: - print("!! name " + person_text + " (prince) not found in /app/static/csv/actors.csv") - continue - - princes_actes.append({"involved_in_acte": acte_q[0], - "involved_in_prince": prince_q[0], - "invol_in_interv": interv_q[0]}) - for persons in soup.sourceDesc.find_all("listPerson", {"type": "signatory"}): - for person_tag in persons.find_all("person"): - person_text = person_tag.text.replace("\n", "") - if person_text != "None": - prince_q = [t.id_indiv for t in Individual.select().where( - Individual.name_indiv == person_text)] - interv_q = [t.id_intev for t in Intervention_type.select().where( - Intervention_type.interv_label == "signatory")] - # print(person_text, "==", prince_q[0]) - - try: - prince_q[0] - except IndexError: - print("!! name " + person_text + " (signatory) not found in /app/static/csv/actors.csv") - continue - - princes_actes.append({"involved_in_acte": acte_q[0], - "involved_in_prince": prince_q[0], - "invol_in_interv": interv_q[0]}) + if acte.endswith(".xml"): + acte_q = [t.id_acte for t in Acte.select().where( + Acte.filename == acte.replace(".xml", ""))] + # print(acte, "==", acte_q[0]) + soup = make_soup(os.path.join(folder, acte)) + for persons in soup.sourceDesc.find_all("listPerson", {"type": "prince"}): + for person_tag in persons.find_all("person"): + person_text = person_tag.text.replace("\n", "") + if person_text != "None": + prince_q = [t.id_indiv for t in Individual.select().where( + Individual.name_indiv == person_text)] + interv_q = [t.id_intev for t in Intervention_type.select().where( + Intervention_type.interv_label == "producer")] + # print(person_text, "==", prince_q[0]) + try: + prince_q[0] + except IndexError: + print("!! name " + person_text + " (prince) not found in /app/static/csv/actors.csv") + continue + princes_actes.append({"involved_in_acte": acte_q[0], + "involved_in_prince": prince_q[0], + "invol_in_interv": interv_q[0]}) + for persons in soup.sourceDesc.find_all("listPerson", {"type": "signatory"}): + for person_tag in persons.find_all("person"): + person_text = person_tag.text.replace("\n", "") + if person_text != "None": + prince_q = [t.id_indiv for t in Individual.select().where( + Individual.name_indiv == person_text)] + interv_q = [t.id_intev for t in Intervention_type.select().where( + Intervention_type.interv_label == "signatory")] + # print(person_text, "==", prince_q[0]) + try: + prince_q[0] + except IndexError: + print("!! name " + person_text + " (signatory) not found in /app/static/csv/actors.csv") + continue + princes_actes.append({"involved_in_acte": acte_q[0], + "involved_in_prince": prince_q[0], + "invol_in_interv": interv_q[0]}) for data in tqdm(princes_actes, desc="Populating involved_in..."): Involved_in.create(**data) @@ -239,10 +232,8 @@ def _create_involved_in(xml_file: str, folder: str): @db_cli.command() def init() -> None: """Initialization of the database""" - xml = os.path.join(APPPATH, "static", "xml", - "Bourbon", "Brb_5_Charles_Ier"), ".xml" xml_folder = os.path.join(APPPATH, "static", "xml", - "Bourbon", "Brb_5_Charles_Ier") + "Bourbon") print("Dropping existing DB...") db.drop_tables([Institution, State, House, Intervention_type, @@ -256,14 +247,10 @@ def init() -> None: _create_house(houses) _create_interv_type(interventions) _create_diplo_type(diplomatic_type) - _create_produc_place(xml, xml_folder) - _create_doc(xml, xml_folder) - _create_acte(xml, xml_folder) + _create_produc_place(xml_folder) + _create_doc(xml_folder) + _create_acte(xml_folder) # check which names need to be add to the actors.csv actors = [*__csv_indiv_infos("secret"), *__csv_indiv_infos("prince")] - # names_in_csv = [actor[0] for actor in actors] - # names_in_xml = __find_indiv(xml_folder, "signatory") - # for name in [x for x in names_in_xml if x not in names_in_csv]: - # print("!! name " + name + " not found in /app/static/csv/actors.csv") _create_indiv(actors) - _create_involved_in(xml, xml_folder) + _create_involved_in(xml_folder)