You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

325 lines
14 KiB
Python

import csv
import os
import re
import typing as t
from bs4 import BeautifulSoup
from flask.cli import AppGroup
from lxml import etree
from tqdm import tqdm
from app.app import APPPATH, db
from app.data_actes import diplomatic_type, institution, state, houses, interventions
from app.modeles import Institution, State, House, Intervention_type, Production_place, Diplo_type, Document, Agent, Acte, Transcribed_by, Individual, Involved_in
db_cli = AppGroup("db")
def _nonempty(s: str) -> t.Optional[str]:
"""Returns content only if non-empty; otherwise returns None"""
if s:
return s
else:
return None
def _capitalize_first(s: t.Optional[str]) -> t.Optional[str]:
return (s[0].upper() + s[1:]) if s else None
def make_soup(file):
    """Open the XML file at path ``file`` (UTF-8) and return it parsed
    as a BeautifulSoup object built with the 'xml' parser."""
    with open(file, 'r', encoding="utf-8") as handle:
        return BeautifulSoup(handle, 'xml')
def _create_institution(data_lst: list)-> None:
    """Insert one Institution row per dict of ``data_lst``.

    Each dict is unpacked as keyword arguments of ``Institution.create``;
    a tqdm progress bar is displayed while iterating.
    """
    progress = tqdm(data_lst, desc="Populating Institution...")
    for fields in progress:
        Institution.create(**fields)
def _create_state(data_lst: list)-> None:
    """Insert one State row per dict of ``data_lst``.

    Each dict is unpacked as keyword arguments of ``State.create``;
    a tqdm progress bar is displayed while iterating.
    """
    progress = tqdm(data_lst, desc="Populating State...")
    for fields in progress:
        State.create(**fields)
def _create_house(data_lst: list)-> None:
    """Insert one House row per dict of ``data_lst``.

    Each dict is unpacked as keyword arguments of ``House.create``;
    a tqdm progress bar is displayed while iterating.
    """
    progress = tqdm(data_lst, desc="Populating House...")
    for fields in progress:
        House.create(**fields)
def _create_interv_type(data_lst: list)-> None:
    """Insert one Intervention_type row per dict of ``data_lst``.

    Each dict is unpacked as keyword arguments of
    ``Intervention_type.create``; a tqdm progress bar is displayed while
    iterating.
    """
    progress = tqdm(data_lst, desc="Populating Intervention_type...")
    for fields in progress:
        Intervention_type.create(**fields)
def _create_diplo_type(data_lst: list)-> None:
    """Insert one Diplo_type row per dict of ``data_lst``.

    Each dict is unpacked as keyword arguments of ``Diplo_type.create``;
    a tqdm progress bar is displayed while iterating.
    """
    progress = tqdm(data_lst, desc="Populating Diplo_type...")
    for fields in progress:
        Diplo_type.create(**fields)
def _create_produc_place(folder: str) -> None:
    """Populate Production_place from the XML files found in ``folder``.

    For every ``*.xml`` file, the text of the first
    ``placeName[@type='production_place']`` element is collected; one
    Production_place row is then created per distinct text.

    The text of the *first* matching element is used because that is the
    value ``_create_acte`` later looks up in this table.

    :param folder: path of a directory containing the XML files.
    """
    placenames = set()
    for filename in os.listdir(folder):
        if not filename.endswith(".xml"):
            continue
        soup = make_soup(os.path.join(folder, filename))
        # search for
        # //body//div[@type='front']/docdate/placeName[@type='production_place']
        place_tag = soup.find('placeName', {'type': 'production_place'})
        if place_tag is None:
            # find() returns None when nothing matches; iterating over it
            # would raise TypeError, so report the file and move on.
            print("!! no production place found in " + filename)
            continue
        # Use the element's text: iterating over the tag itself would
        # yield its child nodes, not its textual content.
        placenames.add(place_tag.text)
    production_places = [{"placename": name} for name in placenames]
    for data in tqdm(production_places, desc="Populating Place..."):
        Production_place.create(**data)
def _create_doc(folder: str) -> None:
    """Populate Document from the XML files found in ``folder``.

    One Document row is created per distinct (repository, collection id)
    pair, where the repository is the text of ``<repository>`` and the
    collection id is the text of the first ``idno[@n='1']`` inside
    ``<msIdentifier>``.

    :param folder: path of a directory containing the XML files.
    :raises IndexError: when a file has no ``idno[@n='1']`` or when a
        repository has no Institution row with the same ``full_label``.
    """
    # 1/ collect the distinct (repository, collection id) pairs.
    # Tuples are used instead of a joined string so that values containing
    # a separator or a line break cannot be split incorrectly afterwards.
    doc_keys = set()
    for filename in os.listdir(folder):
        if not filename.endswith(".xml"):
            continue
        soup = make_soup(os.path.join(folder, filename))
        repository = soup.repository.text  # //sourceDesc//msIdentifier/repository
        # //sourceDesc//msIdentifier/idno[@n='1'] is always the
        # archive box or manuscript collection id
        # (presumably idno[@n='2'] is the doc id inside the box or the
        # page number inside a manuscript)
        collection = soup.msIdentifier.find_all("idno", {"n": "1"})[0].text
        doc_keys.add((repository, collection))
    # 2/ make the data list
    infos_doc = []
    for repository, collection in doc_keys:
        # query on table institution to get the corresponding institution key
        inst_ids = [row.id_institution for row in Institution.select().where(
            Institution.full_label == repository)]
        if not inst_ids:
            raise IndexError(
                "institution " + repr(repository)
                + " not found in table Institution")
        infos_doc.append({
            "inst_doc": inst_ids[0],
            "collection_doc": collection,
        })
    # 3/ create the table
    for data in tqdm(infos_doc, desc="Populating Document..."):
        Document.create(**data)
def __find_transcribers(folder: str) -> t.Set[str]:
    """Return the distinct transcriber names found in ``folder``.

    For every ``*.xml`` file (visited in sorted order), the text of the
    ``<name>`` child of each ``<respStmt>`` under ``fileDesc/titleStmt``
    is collected.

    :param folder: path of a directory containing the XML files.
    :return: set of name strings (empty when no file matches).
    """
    transcribers = set()
    for filename in sorted(os.listdir(folder)):
        if not filename.endswith(".xml"):
            continue
        soup = make_soup(os.path.join(folder, filename))
        title_stmt = soup.fileDesc.titleStmt
        for resp_stmt in title_stmt.find_all("respStmt"):
            transcribers.add(resp_stmt.find("name").text)
    return transcribers
def _create_agent(name_lst: list)-> None:
    """Insert one Agent row per dict of ``name_lst``.

    Each dict is unpacked as keyword arguments of ``Agent.create``;
    a tqdm progress bar is displayed while iterating.
    """
    progress = tqdm(name_lst, desc="Populating Agent...")
    for fields in progress:
        Agent.create(**fields)
def _create_acte(folder: str)-> None:
    """Populate Acte from the XML files found in ``folder``.

    Files are visited in sorted order and numbered from 1 (``num_acte``).
    For each file the function reads the TEI ``xml:id``, the ``docDate``
    (``when`` attribute and text), the abstract, the optional
    ``idno[@n='2']`` reference ("NS" when absent), the production place,
    the ``idno[@n='1']`` collection id and the ``type``/``subtype`` of the
    first ``body/div``; foreign keys are resolved by querying
    Production_place, Document, Diplo_type and State.

    An IndexError is raised when one of those lookups returns no row.
    """
    xml_files = [name for name in sorted(os.listdir(folder))
                 if name.endswith(".xml")]
    rows = []
    for position, xml_name in enumerate(xml_files, start=1):
        soup = make_soup(os.path.join(folder, xml_name))

        # --- values read straight from the XML ---
        xml_id = soup.TEI["xml:id"]
        doc_date = soup.msItem.docDate
        when = doc_date["when"]
        date_text = doc_date.text
        abstract = soup.abstract.p.text
        references = soup.msIdentifier.find_all("idno", {"n": "2"})
        reference = references[0].text if len(references) > 0 else "NS"
        place_name = soup.find_all(
            "placeName", {"type": "production_place"})[0].text
        collection_tag = soup.msIdentifier.find_all("idno", {"n": "1"})[0]
        first_div = soup.body.div
        diplo_label = first_div["subtype"]
        state_label = first_div["type"]

        # --- foreign keys resolved against already populated tables ---
        place_ids = [rec.id_place for rec in Production_place.select().where(
            Production_place.placename == place_name)]
        document_ids = [rec.id_document for rec in Document.select().where(
            Document.collection_doc == collection_tag.text)]
        diplo_ids = [rec.id_diplo_type for rec in Diplo_type.select().where(
            Diplo_type.diplo_label == diplo_label)]
        state_ids = [rec.id_state for rec in State.select().where(
            State.state_label == state_label)]

        rows.append({
            "num_acte": position,
            "filename": xml_id,
            "date_time": when,
            "date": date_text,
            "prod_place_acte": place_ids[0],
            "analysis": abstract,
            "doc_acte": document_ids[0],
            "ref_acte": reference,
            "state_doc": state_ids[0],
            "diplo_type_acte": diplo_ids[0]
        })
    for row in tqdm(rows, desc="Populating Actes..."):
        Acte.create(**row)
def _create_transcribed_by(folder: str)-> None:
    """Populate Transcribed_by from the XML files found in ``folder``.

    For each ``*.xml`` file, the Acte whose ``filename`` equals the file
    name without ".xml" is linked to every Agent whose ``agent_name``
    equals the ``<name>`` of a ``<respStmt>`` under ``fileDesc/titleStmt``.

    An IndexError is raised when the acte or an agent is not found.
    """
    links = []
    for xml_name in os.listdir(folder):
        if not xml_name.endswith(".xml"):
            continue
        soup = make_soup(os.path.join(folder, xml_name))
        stem = xml_name.replace(".xml", "")
        acte_ids = [rec.id_acte for rec in Acte.select().where(
            Acte.filename == stem)]
        title_stmt = soup.fileDesc.titleStmt
        for resp_stmt in title_stmt.find_all("respStmt"):
            transcriber_name = resp_stmt.find("name").text
            agent_ids = [rec.id_agent for rec in Agent.select().where(
                Agent.agent_name == transcriber_name)]
            links.append({"transcr_acte": acte_ids[0],
                          "transcr_agent": agent_ids[0]})
    for link in tqdm(links, desc="Populating Transcribed_by..."):
        Transcribed_by.create(**link)
def __find_indiv(folder: str, role: str) -> t.Set[str]:
    """Return the distinct person names listed under ``role`` in ``folder``.

    For every ``*.xml`` file, each ``<person>`` inside a
    ``listPerson[@type=role]`` of ``<sourceDesc>`` contributes its text,
    with line breaks removed.

    :param folder: path of a directory containing the XML files.
    :param role: value of the ``type`` attribute of ``<listPerson>``.
    :return: set of name strings (empty when nothing matches).
    """
    names = set()
    for filename in os.listdir(folder):
        if not filename.endswith(".xml"):
            continue
        soup = make_soup(os.path.join(folder, filename))
        for list_person in soup.sourceDesc.find_all("listPerson", {"type": role}):
            for person in list_person.find_all("person"):
                names.add(person.text.replace("\n", ""))
    return names
def __csv_indiv_infos(indiv_type):
    """Return the rows of static/csv/actors.csv whose second column equals
    ``indiv_type``.

    The file is read as ';'-delimited CSV and its first line (header) is
    skipped. Blank or one-column lines are ignored instead of raising
    IndexError.

    :param indiv_type: value compared with the second column of each row.
    :return: list of rows, each row being a list of strings.
    """
    csv_path = os.path.join(APPPATH, "static", "csv", "actors.csv")
    # newline="" lets the csv module handle line endings itself, as its
    # documentation recommends for files passed to csv.reader.
    with open(csv_path, 'r', encoding="utf-8", newline="") as opening:
        actors_csv = csv.reader(opening, delimiter=";")
        next(actors_csv, None)  # skip the header line
        return [row for row in actors_csv
                if len(row) > 1 and row[1] == indiv_type]
def _create_indiv(list_csv):
    """Insert one Individual row per entry of ``list_csv``.

    Each entry is indexed positionally: 0 is the name, 1 the role, 2 a
    house label resolved to its key through a query on House, and 3 to 5
    are stored as date1, date2 and date3.

    An IndexError is raised when a house label matches no House row.
    """
    individuals = []
    for actor in list_csv:
        house_ids = [rec.id_house for rec in House.select().where(
            House.house_label == actor[2])]
        individuals.append({
            "name_indiv": actor[0],
            "role_indiv": actor[1],
            "house_indiv": house_ids[0],
            "date1": actor[3],
            "date2": actor[4],
            "date3": actor[5],
        })
    for fields in tqdm(individuals, desc="Populating Individual..."):
        Individual.create(**fields)
def __grape_indiv(list_person, role: str):
    """Print ``<name> == <id_indiv>`` for every person of ``list_person``.

    ``list_person`` is an iterable of elements exposing ``find_all``; each
    ``<person>`` text (line breaks removed) other than "None" is looked up
    in Individual by ``name_indiv``. ``role`` is not used by the body.

    An IndexError is raised when a name matches no Individual row.
    """
    for persons in list_person:
        for person_tag in persons.find_all("person"):
            name = person_tag.text.replace("\n", "")
            if name == "None":
                continue
            indiv_ids = [rec.id_indiv for rec in Individual.select().where(
                Individual.name_indiv == name)]
            print(name, "==", indiv_ids[0])
def _involved_rows_for_role(soup, role: str, acte_q: list, interv_q: list) -> list:
    """Build the Involved_in rows of one acte for one ``listPerson`` type.

    :param soup: parsed XML of the acte.
    :param role: ``type`` attribute of the ``<listPerson>`` to read; it is
        also the word shown in parentheses in the "not found" message.
    :param acte_q: ids of the matching Acte rows (the first one is used).
    :param interv_q: ids of the matching Intervention_type rows (the first
        one is used).
    :return: list of dicts ready for ``Involved_in.create``.
    :raises IndexError: when a person is found but ``acte_q`` or
        ``interv_q`` is empty.
    """
    rows = []
    for persons in soup.sourceDesc.find_all("listPerson", {"type": role}):
        for person_tag in persons.find_all("person"):
            person_text = person_tag.text.replace("\n", "")
            if person_text == "None":
                continue
            indiv_q = [rec.id_indiv for rec in Individual.select().where(
                Individual.name_indiv == person_text)]
            if not indiv_q:
                # Unknown names are reported and skipped, not fatal.
                print("!! name " + person_text + " (" + role
                      + ") not found in /app/static/csv/actors.csv")
                continue
            rows.append({"involved_in_acte": acte_q[0],
                         "involved_in_prince": indiv_q[0],
                         "invol_in_interv": interv_q[0]})
    return rows


def _create_involved_in(folder: str):
    """Populate Involved_in from the XML files found in ``folder``.

    For each ``*.xml`` file, the Acte whose ``filename`` equals the file
    name without ".xml" is linked to the individuals listed in
    ``listPerson[@type='prince']`` (intervention "producer") and then in
    ``listPerson[@type='signatory']`` (intervention "signatory").
    Names absent from Individual are reported on stdout and skipped.

    :param folder: path of a directory containing the XML files.
    """
    # (listPerson type, intervention label), in processing order.
    role_interventions = (("prince", "producer"), ("signatory", "signatory"))
    # The intervention ids do not depend on the file or the person:
    # query them once instead of once per person.
    interv_ids = {
        label: [rec.id_intev for rec in Intervention_type.select().where(
            Intervention_type.interv_label == label)]
        for _, label in role_interventions
    }
    princes_actes = []
    for filename in os.listdir(folder):
        if not filename.endswith(".xml"):
            continue
        acte_q = [rec.id_acte for rec in Acte.select().where(
            Acte.filename == filename.replace(".xml", ""))]
        soup = make_soup(os.path.join(folder, filename))
        for role, label in role_interventions:
            princes_actes.extend(
                _involved_rows_for_role(soup, role, acte_q, interv_ids[label]))
    for data in tqdm(princes_actes, desc="Populating involved_in..."):
        Involved_in.create(**data)
@db_cli.command()
def init() -> None:
    """Initialization of the database"""
    # Every table is dropped, then re-created, before being populated.
    tables = (Institution, State, House, Intervention_type,
              Production_place, Diplo_type, Document, Agent, Acte,
              Transcribed_by, Individual, Involved_in)
    print("Dropping existing DB...")
    db.drop_tables(list(tables))
    print("Re-creating schema...")
    db.create_tables(list(tables))

    # Reference tables filled from the in-code data lists.
    _create_institution(institution)
    _create_state(state)
    _create_house(houses)
    _create_interv_type(interventions)
    _create_diplo_type(diplomatic_type)

    # Individuals come from actors.csv ("secret" rows first, then "prince").
    actors = __csv_indiv_infos("secret") + __csv_indiv_infos("prince")
    _create_indiv(actors)
    known_names = {actor[0] for actor in actors}

    # if new houses were to be added, princes_houses need to be completed
    princes_houses = ["Berry", "Bourbon", "Anjou"]
    house_folders = [
        (house, os.path.join(APPPATH, "static", "xml", house))
        for house in princes_houses
    ]

    # First pass: places, documents and actes of each house.
    agents_names = []
    for house, xml_folder in house_folders:
        print("\n\n**** HOUSE ", house, " ****")
        # collect the transcribers names of this house
        agents_names.extend(__find_transcribers(xml_folder))
        # report the signatories that are missing from actors.csv
        for name in __find_indiv(xml_folder, "signatory"):
            if name in known_names:
                continue
            print("!! name " + name + " not found in /app/static/csv/actors.csv")
        # create tables
        _create_produc_place(xml_folder)
        _create_doc(xml_folder)
        _create_acte(xml_folder)

    # table Agent: one row per distinct transcriber over all houses
    _create_agent([{"agent_name": agent} for agent in set(agents_names)])

    # Second pass: link tables, which need Acte, Agent and Individual rows.
    for house, xml_folder in house_folders:
        print("\n\n**** INVOLVED IN ", house, " ****")
        _create_transcribed_by(xml_folder)
        _create_involved_in(xml_folder)