|
|
|
|
@ -3,7 +3,7 @@ import json
|
|
|
|
|
from typing import Dict, Any
|
|
|
|
|
from pathlib import Path
|
|
|
|
|
from abc import ABC, abstractmethod
|
|
|
|
|
|
|
|
|
|
from unidecode import unidecode # to remove accents in the urls
|
|
|
|
|
|
|
|
|
|
from lxml import etree
|
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
@ -121,6 +121,31 @@ class BsXMLDataSet(XMLDataSet):
|
|
|
|
|
prince_code = "_".join(cut[1:3])
|
|
|
|
|
return prince_code
|
|
|
|
|
|
|
|
|
|
def make_acte_url_from_filestem(self, house, prince_name, filestem):
    """Build the site-relative URL of an acte from its file stem.

    URL sample: /acte/Bourbon/Louis_ii/1367_04_26a

    Args:
        house: House name, e.g. "bourbon". It is normalized with
            ``str.capitalize`` (first letter upper-cased, rest lower-cased).
        prince_name: Full prince name, e.g. "Louis II de Bourbon". Only the
            text before the first space is kept, then accents are removed
            with ``unidecode``.
        filestem: File stem such as "anj_isa_i_1441_08_05a". Its third
            "_"-separated field ("i") is used as the prince number and
            everything after that field as the time item.

    Returns:
        The URL path "/acte/<House>/<Name>_<number>/<timeitem>".

    Raises:
        IndexError: If ``filestem`` has fewer than three "_"-separated
            fields.
    """
    # bourbon -> Bourbon
    house = house.capitalize()

    # "Louis II de Bourbon" -> "Louis"
    first_name = prince_name.split(" ")[0]

    # "anj_isa_i" -> "i"
    prince_number = filestem.split("_")[2]

    # Strip accents so the name is safe to use as a URL segment.
    first_name = unidecode(first_name)

    # Isabelle + i -> Isabelle_i
    prince_segment = first_name + "_" + prince_number

    # "anj_isa_i_1441_08_05a" -> "1441_08_05a"
    timeitem = self.make_timeitem_from_filestem(filestem)

    # final url
    return "/" + "/".join(["acte", house, prince_segment, timeitem])
|
|
|
|
|
|
|
|
|
|
def make_timeitem_from_filestem(self, filestem: str) -> str:
    """Return the date part of a file stem.

    The first three "_"-separated fields of the stem are dropped and the
    remaining fields are joined back with "_":

        "anj_isa_i_1441_08_05a" -> "1441_08_05a"

    Args:
        filestem: File stem made of "_"-separated fields.

    Returns:
        Everything after the third field, or an empty string when the stem
        has three fields or fewer.
    """
    fields = filestem.split("_")
    # Slicing past the end yields an empty list, so short stems give "".
    return "_".join(fields[3:])
|
|
|
|
|
|
|
|
|
|
def transform(self):
|
|
|
|
|
#soup = make_soup(os.path.join(folder, acte))
|
|
|
|
|
# 1.1/ Get all data from XML (9). counter is the id (= numb_acte)
|
|
|
|
|
@ -202,24 +227,24 @@ class BsXMLDataSet(XMLDataSet):
|
|
|
|
|
image = self.soup.find("graphic")
|
|
|
|
|
if image is not None:
|
|
|
|
|
image = image.get('url')
|
|
|
|
|
prince_name = self.find_prince_name()
|
|
|
|
|
#self._filepath
|
|
|
|
|
# data/01_raw/xml/Berry/bry_je_i_1405_05_04a.xml
|
|
|
|
|
house = self.filepath.split("/")[3]
|
|
|
|
|
return {
|
|
|
|
|
# "num_acte": counter,
|
|
|
|
|
"prince_name": self.find_prince_name(),
|
|
|
|
|
"prince_name": prince_name,
|
|
|
|
|
"prince_code": self.extract_prince_code_from_filestem(numb),
|
|
|
|
|
"filename": numb,
|
|
|
|
|
"date_time": date_time,
|
|
|
|
|
"date": date,
|
|
|
|
|
# "prod_place_acte": place_query[0],
|
|
|
|
|
"analysis": analyse,
|
|
|
|
|
# "doc_acte": doc_query[0],
|
|
|
|
|
"ref_acte": ref_acte,
|
|
|
|
|
"transcribers": self.find_transcribers(),
|
|
|
|
|
"place": place,
|
|
|
|
|
"diplo_type": type_diplo,
|
|
|
|
|
"diplo_state": diplo_state,
|
|
|
|
|
"image": image
|
|
|
|
|
# "state_doc": state_query[0],
|
|
|
|
|
# "diplo_type_acte": diplo_query[0]
|
|
|
|
|
"image": image,
|
|
|
|
|
"url": self.make_acte_url_from_filestem(house, prince_name, numb)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
class DataSetCollection(AbstractDataSet):
|
|
|
|
|
|