# actes-princiers/app/cmd/db.py

import csv
import os
import re
import typing as t

from bs4 import BeautifulSoup
from flask.cli import AppGroup
from lxml import etree
from tqdm import tqdm

from app.app import APPPATH, db
from app.data_actes import diplomatic_type, institution, state, houses, interventions
from app.modeles import Institution, State, House, Intervention_type, Production_place, Diplo_type, Document, Agent, Acte, Transcribed_by, Individual, Involved_in


# Flask CLI command group named "db"; the commands of this module
# (see `init`) are registered on it with `@db_cli.command()`.
db_cli = AppGroup("db")


def _nonempty(s: str) -> t.Optional[str]:
    """Return ``s`` unchanged when it is truthy, ``None`` otherwise."""
    return s if s else None


def _capitalize_first(s: t.Optional[str]) -> t.Optional[str]:
    """Upper-case the first character of ``s`` and leave the rest untouched.

    Returns ``None`` when ``s`` is ``None`` or empty.
    """
    if not s:
        return None
    head, tail = s[0], s[1:]
    return head.upper() + tail


def make_soup(file):
    """Parse the UTF-8 XML file at path ``file`` and return its BeautifulSoup tree."""
    with open(file, mode='r', encoding="utf-8") as handle:
        return BeautifulSoup(handle, 'xml')


def _create_institution(data_lst: list)-> None:
    """Insert one Institution row per mapping of ``data_lst`` (with a progress bar)."""
    progress = tqdm(data_lst, desc="Populating Institution...")
    for row in progress:
        Institution.create(**row)

def _create_state(data_lst: list)-> None:
    """Insert one State row per mapping of ``data_lst`` (with a progress bar)."""
    progress = tqdm(data_lst, desc="Populating State...")
    for row in progress:
        State.create(**row)

def _create_house(data_lst: list)-> None:
    """Insert one House row per mapping of ``data_lst`` (with a progress bar)."""
    progress = tqdm(data_lst, desc="Populating House...")
    for row in progress:
        House.create(**row)

def _create_interv_type(data_lst: list)-> None:
    """Insert one Intervention_type row per mapping of ``data_lst`` (with a progress bar)."""
    progress = tqdm(data_lst, desc="Populating Intervention_type...")
    for row in progress:
        Intervention_type.create(**row)

def _create_diplo_type(data_lst: list)-> None:
    """Insert one Diplo_type row per mapping of ``data_lst`` (with a progress bar)."""
    progress = tqdm(data_lst, desc="Populating Diplo_type...")
    for row in progress:
        Diplo_type.create(**row)

def _create_produc_place(folder: str)-> None:
    """Populate table Production_place from the XML files of ``folder``.

    For every ``*.xml`` file, the text of the first
    ``<placeName type="production_place">`` element is collected; one row is
    then created per distinct place name.

    :param folder: path of the directory holding the XML files to read
    """
    place_names = set()
    for acte in os.listdir(folder):
        if not acte.endswith(".xml"):
            continue
        soup = make_soup(os.path.join(folder, acte))
        # search for
        # //body//div[@type='front']/docdate/placeName[@type='production_place']
        # find() returns a single tag (or None), never a list of tags: read
        # its text directly instead of iterating over its child nodes. This is
        # the value (.text of the first match) that _create_acte looks up.
        place = soup.find('placeName', {'type': 'production_place'})
        if place is None:
            print("!! no production place found in " + acte)
            continue
        place_names.add(place.text)
    for name in tqdm(place_names, desc="Populating Place..."):
        Production_place.create(placename=name)

def _create_doc(folder: str)-> None:
    """Populate table Document from the XML files of ``folder``.

    A document is identified by the pair (repository, collection id) read
    from each ``*.xml`` file; one row is created per distinct pair.

    :param folder: path of the directory holding the XML files to read
    :raises IndexError: if a repository matches no ``Institution.full_label``
    """
    # 1/ collect the distinct (repository, collection id) pairs
    details_doc = set()
    for acte in os.listdir(folder):
        if not acte.endswith(".xml"):
            continue
        soup = make_soup(os.path.join(folder, acte))
        inst_doc = soup.repository.text  # //sourceDesc//msIdentifier/repository
        # //sourceDesc//msIdentifier/idno[@n='1'] is always the
        # archive box or manuscript collection id
        # (//sourceDesc//msIdentifier/idno[@n='2'] is the doc id inside
        # the box or the page number inside a manuscript)
        nb_doc_1 = soup.msIdentifier.find_all("idno", {"n": "1"})[0].text
        # keep the pair as a tuple: joining both values in one string and
        # splitting it back with a regex breaks when a value spans several
        # lines ('.' does not match a newline)
        details_doc.add((inst_doc, nb_doc_1))
    # 2/ make the data list
    infos_doc = []
    for doc_archives, doc_cote in details_doc:
        # query on table institution with <doc_archives> to get the
        # corresponding institution key
        inst_query = [inst.id_institution for inst in Institution.select().where(
            Institution.full_label == doc_archives)]
        if not inst_query:
            raise IndexError(
                "no row of table Institution has full_label {!r}".format(doc_archives))
        infos_doc.append({
            "inst_doc": inst_query[0],
            "collection_doc": doc_cote,
        })
    # 3/ create the table
    for data in tqdm(infos_doc, desc="Populating Document..."):
        Document.create(**data)

def __find_transcribers(folder: str)-> t.Set[str]:
    """Collect the transcriber names found in the XML files of ``folder``.

    Names are read from ``//fileDesc/titleStmt/respStmt/name`` of every
    ``*.xml`` file.

    :param folder: path of the directory holding the XML files to read
    :return: the set of distinct transcriber names
    """
    transcribers = set()
    for acte in sorted(os.listdir(folder)):
        if not acte.endswith(".xml"):
            continue
        soup = make_soup(os.path.join(folder, acte))
        for resp in soup.fileDesc.titleStmt.find_all("respStmt"):
            # find("name") looks for a child <name> element
            transcribers.add(resp.find("name").text)
    return transcribers

def _create_agent(name_lst: list)-> None:
    """Insert one Agent row per mapping of ``name_lst`` (with a progress bar)."""
    progress = tqdm(name_lst, desc="Populating Agent...")
    for row in progress:
        Agent.create(**row)

def _create_acte(folder: str)-> None:
    """Populate table Acte from the XML files of ``folder``.

    The ``*.xml`` files are processed in sorted name order; each one yields
    one row whose ``num_acte`` is its 1-based rank in that order.

    :param folder: path of the directory holding the XML files to read
    :raises IndexError: if the production place, document, diplomatic type
        or state read from a file matches no row of its table (the message
        names the file and the value)
    """

    def _first_key(keys, table, value, filename):
        """Return keys[0], or raise an IndexError naming what was not found."""
        if not keys:
            raise IndexError("{}: no row of table {} matches {!r}".format(
                filename, table, value))
        return keys[0]

    actes = []
    xml_files = [name for name in sorted(os.listdir(folder))
                 if name.endswith(".xml")]
    # NOTE(review): the numbering restarts at 1 on every call, so actes of
    # different folders share the same num_acte values -- confirm intended.
    for counter, acte in enumerate(xml_files, start=1):
        soup = make_soup(os.path.join(folder, acte))

        # 1.1/ Get all data from XML. counter is the id (= num_acte)
        numb = soup.TEI["xml:id"]  # /TEI[@xml:id] is always the acte's ID
        date_time = soup.msItem.docDate["when"]  # YYYY-MM-DD or YYYY-MM date
        date = soup.msItem.docDate.text  # verbose date
        analyse = soup.abstract.p.text  # acte's short analysis
        # //sourceDesc//msIdentifier/idno[@n='2'] is the doc id inside the
        # archive box or the page number inside a manuscript (see _create_doc).
        # It may be missing from the file: fall back to "NS" instead of
        # failing on an empty result list.
        ref = soup.msIdentifier.find_all("idno", {"n": "2"})
        ref_acte = ref[0].text if ref else "NS"
        prod_place = soup.find_all("placeName", {"type": "production_place"})[0].text
        # //sourceDesc//msIdentifier/idno[@n='1'] is always the
        # archive box or manuscript collection id
        doc_cote = soup.msIdentifier.find_all("idno", {"n": "1"})[0].text
        type_diplo = soup.body.div["subtype"]
        diplo_state = soup.body.div["type"]

        # 1.2/ For some data, we need to make queries to get foreign keys
        place_query = [row.id_place for row in Production_place.select().where(
            Production_place.placename == prod_place)]
        # NOTE(review): the document is looked up by collection id only,
        # whereas _create_doc distinguishes documents by (institution,
        # collection id) -- confirm collection ids are unique across
        # institutions.
        doc_query = [row.id_document for row in Document.select().where(
            Document.collection_doc == doc_cote)]
        diplo_query = [row.id_diplo_type for row in Diplo_type.select().where(
            Diplo_type.diplo_label == type_diplo)]
        state_query = [row.id_state for row in State.select().where(
            State.state_label == diplo_state)]

        # 2/ Make the data list
        actes.append({
            "num_acte": counter,
            "filename": numb,
            "date_time": date_time,
            "date": date,
            "prod_place_acte": _first_key(
                place_query, "Production_place", prod_place, acte),
            "analysis": analyse,
            "doc_acte": _first_key(doc_query, "Document", doc_cote, acte),
            "ref_acte": ref_acte,
            "state_doc": _first_key(state_query, "State", diplo_state, acte),
            "diplo_type_acte": _first_key(
                diplo_query, "Diplo_type", type_diplo, acte),
        })
    # 3/ create the table
    for data in tqdm(actes, desc="Populating Actes..."):
        Acte.create(**data)

def _create_transcribed_by(folder: str)-> None:
    """Populate table Transcribed_by from the XML files of ``folder``.

    For every ``*.xml`` file, one row is created per
    ``//fileDesc/titleStmt/respStmt``, linking the acte (found by file name)
    to the agent (found by the ``<name>`` of the respStmt).

    :param folder: path of the directory holding the XML files to read
    :raises IndexError: if the acte or an agent is not found in its table
    """
    suffix = ".xml"
    transcribed = []
    for acte in os.listdir(folder):
        if not acte.endswith(suffix):
            continue
        soup = make_soup(os.path.join(folder, acte))
        # 1.1/ Acte's id.
        # the file name without its extension is the acte's id, so we make a
        # query on Acte to get the primary key. Only the trailing extension
        # is stripped: str.replace would also remove a ".xml" occurring
        # elsewhere in the name.
        acte_name = acte[:-len(suffix)]
        acte_q = [row.id_acte for row in Acte.select().where(
            Acte.filename == acte_name)]
        # 1.2/ Agent's name (transcriber).
        # transcriber name is in //fileDesc/titleStmt/respStmt/name
        titlestmt = soup.fileDesc.titleStmt
        for item in titlestmt.find_all("respStmt"):
            agent_name = item.find("name").text
            # query on table Agent to get the primary key
            agent_q = [row.id_agent for row in Agent.select().where(
                Agent.agent_name == agent_name)]
            if not acte_q:
                raise IndexError(
                    "{}: no row of table Acte has filename {!r}".format(
                        acte, acte_name))
            if not agent_q:
                raise IndexError(
                    "{}: no row of table Agent has agent_name {!r}".format(
                        acte, agent_name))
            # 2/ Make the data list with both acte's and agent's keys
            transcribed.append({"transcr_acte": acte_q[0],
                "transcr_agent": agent_q[0]})
    # 3/ Create the table
    for data in tqdm(transcribed, desc="Populating Transcribed_by..."):
        Transcribed_by.create(**data)


def __find_indiv(folder: str, role: str)-> t.Set[str]:
    """Collect the person names listed under ``role`` in the XML files of ``folder``.

    For every ``*.xml`` file, the text of each ``<person>`` found in a
    ``<listPerson type=role>`` of ``<sourceDesc>`` is kept, newlines removed.

    :param folder: path of the directory holding the XML files to read
    :param role: value of the ``type`` attribute of the ``<listPerson>``
        elements to read
    :return: the set of distinct person texts
    """
    indivs = set()
    for acte in os.listdir(folder):
        if not acte.endswith(".xml"):
            continue
        soup = make_soup(os.path.join(folder, acte))
        for list_person in soup.sourceDesc.find_all("listPerson", {"type": role}):
            for person in list_person.find_all("person"):
                indivs.add(person.text.replace("\n", ""))
    return indivs

def __csv_indiv_infos(indiv_type):
    """Return the rows of ``static/csv/actors.csv`` whose second column is ``indiv_type``.

    The file is read as ';'-delimited CSV and its first line is skipped.

    :param indiv_type: value expected in the second column of a row
    :return: list of the matching rows, each one a list of strings
    """
    csv_path = os.path.join(APPPATH, "static", "csv", "actors.csv")
    # newline="" lets the csv module handle line endings itself, as the csv
    # documentation recommends for files given to csv.reader
    with open(csv_path, 'r', encoding="utf-8", newline="") as opening:
        actors_csv = csv.reader(opening, delimiter=";")
        next(actors_csv, None)  # skip the first line
        # the length check skips blank lines, which csv.reader returns as
        # empty lists and would otherwise fail on row[1]
        return [row for row in actors_csv
                if len(row) > 1 and row[1] == indiv_type]

def _create_indiv(list_csv):
    """Insert one Individual row per row of ``list_csv``.

    Each row is indexed as (name, role, house label, date1, date2, date3);
    the house label is resolved to the key of the matching House row.
    """
    individuals = []
    for actor in list_csv:
        name, role = actor[0], actor[1]
        house_keys = [house.id_house for house in House.select().where(
            House.house_label == actor[2])]
        individuals.append({
            "name_indiv": name,
            "role_indiv": role,
            "house_indiv": house_keys[0],
            "date1": actor[3],
            "date2": actor[4],
            "date3": actor[5],
        })
    progress = tqdm(individuals, desc="Populating Individual...")
    for row in progress:
        Individual.create(**row)

def __grape_indiv(list_person, role: str):
    """Print every person name of ``list_person`` next to its Individual key.

    ``list_person`` is iterated and each item searched for ``<person>`` tags;
    persons whose text is "None" are skipped. ``role`` is not used.
    """
    for group in list_person:
        for tag in group.find_all("person"):
            label = tag.text.replace("\n", "")
            if label == "None":
                continue
            keys = [indiv.id_indiv for indiv in Individual.select().where(
                Individual.name_indiv == label)]
            print(label, "==", keys[0])

def _create_involved_in(folder: str):
    """Populate table Involved_in from the XML files of ``folder``.

    For every ``*.xml`` file, each ``<person>`` listed in a
    ``<listPerson type="prince">`` of ``<sourceDesc>`` is linked to the acte
    with the "producer" intervention, then each one listed in a
    ``<listPerson type="signatory">`` with the "signatory" intervention.
    Persons whose text is "None" are ignored; persons missing from table
    Individual are reported on stdout and skipped.

    :param folder: path of the directory holding the XML files to read
    :raises IndexError: if the acte or the intervention type is not found
    """
    suffix = ".xml"
    # (listPerson type, intervention label), handled in this order per file
    roles = (("prince", "producer"), ("signatory", "signatory"))
    # the intervention keys depend neither on the file nor on the person:
    # query them once instead of once per person
    # NOTE(review): `id_intev` is kept as spelled in the original code --
    # confirm it matches the field name of Intervention_type.
    interv_keys = {}
    for _, interv_label in roles:
        interv_keys[interv_label] = [
            row.id_intev for row in Intervention_type.select().where(
                Intervention_type.interv_label == interv_label)]

    princes_actes = []
    for acte in os.listdir(folder):
        if not acte.endswith(suffix):
            continue
        # strip only the trailing extension (str.replace would also remove a
        # ".xml" occurring elsewhere in the file name)
        acte_q = [row.id_acte for row in Acte.select().where(
            Acte.filename == acte[:-len(suffix)])]
        soup = make_soup(os.path.join(folder, acte))
        for list_type, interv_label in roles:
            interv_q = interv_keys[interv_label]
            for persons in soup.sourceDesc.find_all("listPerson", {"type": list_type}):
                for person_tag in persons.find_all("person"):
                    person_text = person_tag.text.replace("\n", "")
                    if person_text == "None":
                        continue
                    prince_q = [row.id_indiv for row in Individual.select().where(
                        Individual.name_indiv == person_text)]
                    if not prince_q:
                        print("!! name " + person_text + " (" + list_type
                              + ") not found in /app/static/csv/actors.csv")
                        continue
                    princes_actes.append({"involved_in_acte": acte_q[0],
                        "involved_in_prince": prince_q[0],
                        "invol_in_interv": interv_q[0]})
    for data in tqdm(princes_actes, desc="Populating involved_in..."):
        Involved_in.create(**data)


@db_cli.command()
def init() -> None:
    """Drop, re-create and populate every table of the database.

    Static tables and individuals are loaded first, then, for each house
    folder under ``static/xml``, places, documents and actes; agents come
    next and finally the transcriber and involvement links.
    """
    tables = (Institution, State, House, Intervention_type,
        Production_place, Diplo_type, Document, Agent, Acte, Transcribed_by,
        Individual, Involved_in)
    print("Dropping existing DB...")
    db.drop_tables(list(tables))
    print("Re-creating schema...")
    db.create_tables(list(tables))

    _create_institution(institution)
    _create_state(state)
    _create_house(houses)
    _create_interv_type(interventions)
    _create_diplo_type(diplomatic_type)
    actors = __csv_indiv_infos("secret") + __csv_indiv_infos("prince")
    _create_indiv(actors)

    # if new houses were to be added, princes_houses need to be completed
    princes_houses = ["Berry", "Bourbon", "Anjou"]
    # one XML folder per house
    house_folders = [
        (house, os.path.join(APPPATH, "static", "xml", house))
        for house in princes_houses]
    names_in_csv = {actor[0] for actor in actors}
    agents_names = []

    for prince_house, xml_folder in house_folders:
        print("\n\n**** HOUSE ", prince_house, " ****")

        # transcribers found in this house feed table Agent later on
        agents_names.extend(__find_transcribers(xml_folder))

        # report the signatories that need to be added to actors.csv
        for name in __find_indiv(xml_folder, "signatory"):
            if name not in names_in_csv:
                print("!! name " + name + " not found in /app/static/csv/actors.csv")

        # create tables
        _create_produc_place(xml_folder)
        _create_doc(xml_folder)
        _create_acte(xml_folder)

    # table Agent
    _create_agent([{"agent_name": agent} for agent in set(agents_names)])

    # link tables: filled once actes and agents all exist
    for prince_house, xml_folder in house_folders:
        print("\n\n**** INVOLVED IN ", prince_house, " ****")
        _create_transcribed_by(xml_folder)
        _create_involved_in(xml_folder)