add url in the acte db

develop
gwen 2 years ago
parent a48585dde9
commit 1e6f84ca89

@ -54,6 +54,7 @@ class Acte(Document):
diplo_state = StringField(required=False) # sample: "diplo_state": "Original", diplo_state = StringField(required=False) # sample: "diplo_state": "Original",
diplo_type = StringField(required=False) # sample: "diplo_type": "Lettres_patentes", diplo_type = StringField(required=False) # sample: "diplo_type": "Lettres_patentes",
image = StringField(required=False) image = StringField(required=False)
url = StringField(required=True)
def db_connect(storage_ip, db_name, mongodb_admin, mongodb_password): def db_connect(storage_ip, db_name, mongodb_admin, mongodb_password):
#mongodb://%s:%s@149.202.41.75:27017' % (username, password) #mongodb://%s:%s@149.202.41.75:27017' % (username, password)

@ -3,7 +3,7 @@ import json
from typing import Dict, Any from typing import Dict, Any
from pathlib import Path from pathlib import Path
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from unidecode import unidecode # to remove accents in the urls
from lxml import etree from lxml import etree
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
@ -121,6 +121,31 @@ class BsXMLDataSet(XMLDataSet):
prince_code = "_".join(cut[1:3]) prince_code = "_".join(cut[1:3])
return prince_code return prince_code
def make_acte_url_from_filestem(self, house, prince_name, filestem):
    """
    Build the url of an acte from its house, prince name and file stem.

    url sample : /acte/Bourbon/Louis_ii/1367_04_26a

    :param house: house name, e.g. "bourbon"
    :param prince_name: full prince name, e.g. "Louis II de Bourbon"
    :param filestem: file stem, e.g. "anj_isa_i_1441_08_05a"
    :return: url of the form /acte/<house>/<first name>_<number>/<timeitem>
    :raises IndexError: if filestem has fewer than three "_"-separated fields
    """
    # bourbon -> Bourbon (capitalize() also lowercases the remaining letters)
    house = house.capitalize()
    # "Louis II de Bourbon" -> "Louis"
    # Split on any run of whitespace: with split(" "), a leading blank gave
    # an empty first name and an embedded line break or tab stayed in the url.
    name_words = prince_name.split()
    first_name = name_words[0] if name_words else ""
    # "anj_isa_i" -> "i"
    prince_number = filestem.split("_")[2]
    # remove accents so they do not end up in the url
    first_name = unidecode(first_name)
    # Isabelle + i -> Isabelle_i
    prince_segment = first_name + "_" + prince_number
    timeitem = self.make_timeitem_from_filestem(filestem)
    # final url
    return "/" + "/".join(['acte', house, prince_segment, timeitem])
def make_timeitem_from_filestem(self, filestem):
    """
    Return the part of a file stem that follows its three leading
    "_"-separated fields; an empty string when there is nothing after them.

    "anj_isa_i_1441_08_05a" -> "1441_08_05a"
    """
    # drop the three leading fields ("anj", "isa", "i" in the sample above)
    remaining_fields = filestem.split("_")[3:]
    return "_".join(remaining_fields)
def transform(self): def transform(self):
#soup = make_soup(os.path.join(folder, acte)) #soup = make_soup(os.path.join(folder, acte))
# 1.1/ Get all data from XML (9). counter is the id (= numb_acte) # 1.1/ Get all data from XML (9). counter is the id (= numb_acte)
@ -202,24 +227,24 @@ class BsXMLDataSet(XMLDataSet):
image = self.soup.find("graphic") image = self.soup.find("graphic")
if image is not None: if image is not None:
image = image.get('url') image = image.get('url')
prince_name = self.find_prince_name()
#self._filepath
# data/01_raw/xml/Berry/bry_je_i_1405_05_04a.xml
house = self.filepath.split("/")[3]
return { return {
# "num_acte": counter, "prince_name": prince_name,
"prince_name": self.find_prince_name(),
"prince_code": self.extract_prince_code_from_filestem(numb), "prince_code": self.extract_prince_code_from_filestem(numb),
"filename": numb, "filename": numb,
"date_time": date_time, "date_time": date_time,
"date": date, "date": date,
# "prod_place_acte": place_query[0],
"analysis": analyse, "analysis": analyse,
# "doc_acte": doc_query[0],
"ref_acte": ref_acte, "ref_acte": ref_acte,
"transcribers": self.find_transcribers(), "transcribers": self.find_transcribers(),
"place": place, "place": place,
"diplo_type": type_diplo, "diplo_type": type_diplo,
"diplo_state": diplo_state, "diplo_state": diplo_state,
"image": image "image": image,
# "state_doc": state_query[0], "url": self.make_acte_url_from_filestem(house, prince_name, numb)
# "diplo_type_acte": diplo_query[0]
} }
class DataSetCollection(AbstractDataSet): class DataSetCollection(AbstractDataSet):

@ -11,3 +11,4 @@ nbstripout~=0.4
pymongo~=4.5.0 pymongo~=4.5.0
mongoengine~=0.27.0 mongoengine~=0.27.0
folium~=0.14.0 folium~=0.14.0
Unidecode~=1.3.7

Loading…
Cancel
Save