|
|
|
@ -88,7 +88,7 @@ def _create_doc(folder: str)-> None:
|
|
|
|
inst_doc = soup.repository.text # //sourceDesc//msIdentifier/repository
|
|
|
|
inst_doc = soup.repository.text # //sourceDesc//msIdentifier/repository
|
|
|
|
# //sourceDesc//msIdentifier/idno[@n='1'] is always the
|
|
|
|
# //sourceDesc//msIdentifier/idno[@n='1'] is always the
|
|
|
|
# archive box or manuscript collection id
|
|
|
|
# archive box or manuscript collection id
|
|
|
|
# (//sourceDesc//msIdentifier/idno[@n='1'] is the doc id inside
|
|
|
|
# (//sourceDesc//msIdentifier/idno[@n='2'] is the doc id inside
|
|
|
|
# the box or the page number inside a manuscript)
|
|
|
|
# the box or the page number inside a manuscript)
|
|
|
|
nb_doc_1 = soup.msIdentifier.find_all("idno", {"n": "1"})[0].text
|
|
|
|
nb_doc_1 = soup.msIdentifier.find_all("idno", {"n": "1"})[0].text
|
|
|
|
details_doc.append(inst_doc + " == " + nb_doc_1)
|
|
|
|
details_doc.append(inst_doc + " == " + nb_doc_1)
|
|
|
|
@ -124,25 +124,36 @@ def _create_agent(name_lst: list)-> None:
|
|
|
|
Agent.create(**data)
|
|
|
|
Agent.create(**data)
|
|
|
|
|
|
|
|
|
|
|
|
def _create_acte(folder: str)-> None:
|
|
|
|
def _create_acte(folder: str)-> None:
|
|
|
|
|
|
|
|
"""create table acte"""
|
|
|
|
actes = []
|
|
|
|
actes = []
|
|
|
|
counter = 0
|
|
|
|
counter = 0
|
|
|
|
for acte in sorted(os.listdir(folder)):
|
|
|
|
for acte in sorted(os.listdir(folder)):
|
|
|
|
if acte.endswith(".xml"):
|
|
|
|
if acte.endswith(".xml"):
|
|
|
|
counter += 1
|
|
|
|
counter += 1
|
|
|
|
soup = make_soup(os.path.join(folder, acte))
|
|
|
|
soup = make_soup(os.path.join(folder, acte))
|
|
|
|
numb = soup.TEI["xml:id"]
|
|
|
|
|
|
|
|
date_time = soup.msItem.docDate["when"]
|
|
|
|
# 1.1/ Get all data from XML (9). counter is the id (= num_acte)
|
|
|
|
date = soup.msItem.docDate.text
|
|
|
|
numb = soup.TEI["xml:id"] # /TEI[@xml:id] is always the acte's ID
|
|
|
|
analyse = soup.abstract.p.text
|
|
|
|
date_time = soup.msItem.docDate["when"] # YYYY-MM-DD or YYYY-MM date
|
|
|
|
|
|
|
|
date = soup.msItem.docDate.text # verbose date
|
|
|
|
|
|
|
|
analyse = soup.abstract.p.text # acte's short analysis
|
|
|
|
ref = soup.msIdentifier.find_all("idno", {"n": "2"})
|
|
|
|
ref = soup.msIdentifier.find_all("idno", {"n": "2"})
|
|
|
|
if len(ref) > 0:
|
|
|
|
# //sourceDesc//msIdentifier/idno[@n='2'] is the doc id inside the
|
|
|
|
|
|
|
|
# archive box or the page number inside a manuscript (see _create_doc)
|
|
|
|
|
|
|
|
# warning: this idno may be missing (the analysis may not have been written yet),
|
|
|
|
|
|
|
|
# which would raise an IndexError (list index out of range) on ref[0]. Hence:
|
|
|
|
|
|
|
|
if len(ref) > 0: # there is an analysis
|
|
|
|
ref_acte = ref[0].text
|
|
|
|
ref_acte = ref[0].text
|
|
|
|
else:
|
|
|
|
else: # there is no analysis
|
|
|
|
ref_acte = "NS"
|
|
|
|
ref_acte = "NS"
|
|
|
|
prod_place = soup.find_all("placeName", {"type": "production_place"})[0].text
|
|
|
|
prod_place = soup.find_all("placeName", {"type": "production_place"})[0].text
|
|
|
|
|
|
|
|
# //sourceDesc//msIdentifier/idno[@n='1'] is always the
|
|
|
|
|
|
|
|
# archive box or manuscript collection id
|
|
|
|
doc = soup.msIdentifier.find_all("idno", {"n": "1"})[0]
|
|
|
|
doc = soup.msIdentifier.find_all("idno", {"n": "1"})[0]
|
|
|
|
type_diplo = soup.body.div["subtype"]
|
|
|
|
type_diplo = soup.body.div["subtype"]
|
|
|
|
diplo_state = soup.body.div["type"]
|
|
|
|
diplo_state = soup.body.div["type"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 1.2/ For some data, we need to make queries to get foreign keys
|
|
|
|
place_query = [t.id_place for t in Production_place.select().where(
|
|
|
|
place_query = [t.id_place for t in Production_place.select().where(
|
|
|
|
Production_place.placename == prod_place)]
|
|
|
|
Production_place.placename == prod_place)]
|
|
|
|
doc_query = [t.id_document for t in Document.select().where(
|
|
|
|
doc_query = [t.id_document for t in Document.select().where(
|
|
|
|
@ -151,6 +162,8 @@ def _create_acte(folder: str)-> None:
|
|
|
|
Diplo_type.diplo_label == type_diplo)]
|
|
|
|
Diplo_type.diplo_label == type_diplo)]
|
|
|
|
state_query = [t.id_state for t in State.select().where(
|
|
|
|
state_query = [t.id_state for t in State.select().where(
|
|
|
|
State.state_label == diplo_state)]
|
|
|
|
State.state_label == diplo_state)]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 2/ Make the data list
|
|
|
|
actes.append({
|
|
|
|
actes.append({
|
|
|
|
"num_acte": counter,
|
|
|
|
"num_acte": counter,
|
|
|
|
"filename": numb,
|
|
|
|
"filename": numb,
|
|
|
|
@ -163,6 +176,7 @@ def _create_acte(folder: str)-> None:
|
|
|
|
"state_doc": state_query[0],
|
|
|
|
"state_doc": state_query[0],
|
|
|
|
"diplo_type_acte": diplo_query[0]
|
|
|
|
"diplo_type_acte": diplo_query[0]
|
|
|
|
})
|
|
|
|
})
|
|
|
|
|
|
|
|
# 3/ create the table
|
|
|
|
for data in tqdm(actes, desc="Populating Actes..."):
|
|
|
|
for data in tqdm(actes, desc="Populating Actes..."):
|
|
|
|
Acte.create(**data)
|
|
|
|
Acte.create(**data)
|
|
|
|
|
|
|
|
|
|
|
|
|