add url in the acte db

develop
gwen 2 years ago
parent a48585dde9
commit 1e6f84ca89

@ -54,6 +54,7 @@ class Acte(Document):
diplo_state = StringField(required=False) # sample: "diplo_state": "Original",
diplo_type = StringField(required=False) # sample: "diplo_type": "Lettres_patentes",
image = StringField(required=False)
# NOTE(review): required=True -- presumably documents stored before this field existed must be backfilled with a url, otherwise they would fail validation; confirm.
url = StringField(required=True) # sample (presumably): "url": "/acte/Bourbon/Louis_ii/1367_04_26a",
def db_connect(storage_ip, db_name, mongodb_admin, mongodb_password):
#mongodb://%s:%s@149.202.41.75:27017' % (username, password)

@ -3,7 +3,7 @@ import json
from typing import Dict, Any
from pathlib import Path
from abc import ABC, abstractmethod
from unidecode import unidecode # to remove accents in the urls
from lxml import etree
from bs4 import BeautifulSoup
@ -121,6 +121,31 @@ class BsXMLDataSet(XMLDataSet):
prince_code = "_".join(cut[1:3])
return prince_code
def make_acte_url_from_filestem(self, house, prince_name, filestem):
    """
    Build the site-relative url of an acte.

    Shape: /acte/<House>/<Firstname>_<prince number>/<timeitem>
    url sample : /acte/Bourbon/Louis_ii/1367_04_26a

    :param house: house name, e.g. "bourbon" (capitalized here)
    :param prince_name: full prince name, e.g. "Louis II de Bourbon"
    :param filestem: file stem, e.g. "anj_isa_i_1441_08_05a"
    :return: the url as a string starting with "/"
    :raises IndexError: when filestem has fewer than three "_"-separated tokens
    """
    # bourbon -> Bourbon
    house_segment = house.capitalize()
    # keep only the first word: "Louis II de Bourbon" -> "Louis"
    first_word = prince_name.split(" ")[0]
    # third token of the filestem: "anj_isa_i_..." -> "i"
    number_token = filestem.split("_")[2]
    # strip accents, then glue name and number: Isabelle + i -> Isabelle_i
    prince_segment = f"{unidecode(first_word)}_{number_token}"
    segments = (
        "acte",
        house_segment,
        prince_segment,
        self.make_timeitem_from_filestem(filestem),
    )
    # final url
    return "/" + "/".join(segments)
def make_timeitem_from_filestem(self, filestem):
    """
    Return the filestem without its first three "_"-separated tokens.

    "anj_isa_i_1441_08_05a" -> "1441_08_05a"

    An empty string is returned when the filestem has three tokens or fewer.
    """
    # Split off at most three leading tokens; whatever follows stays intact.
    pieces = filestem.split("_", 3)
    if len(pieces) < 4:
        return ""
    return pieces[3]
def transform(self):
#soup = make_soup(os.path.join(folder, acte))
# 1.1/ Get all data from XML (9). counter is the id (= numb_acte)
@ -198,28 +223,28 @@ class BsXMLDataSet(XMLDataSet):
latitude = latitude,
longitude = longitude
)
# nakala
# nakala
image = self.soup.find("graphic")
if image is not None:
image = image.get('url')
prince_name = self.find_prince_name()
#self._filepath
# data/01_raw/xml/Berry/bry_je_i_1405_05_04a.xml
house = self.filepath.split("/")[3]
return {
# "num_acte": counter,
"prince_name": self.find_prince_name(),
"prince_name": prince_name,
"prince_code": self.extract_prince_code_from_filestem(numb),
"filename": numb,
"date_time": date_time,
"date": date,
# "prod_place_acte": place_query[0],
"analysis": analyse,
# "doc_acte": doc_query[0],
"ref_acte": ref_acte,
"transcribers": self.find_transcribers(),
"place": place,
"diplo_type": type_diplo,
"diplo_state": diplo_state,
"image": image
# "state_doc": state_query[0],
# "diplo_type_acte": diplo_query[0]
"image": image,
"url": self.make_acte_url_from_filestem(house, prince_name, numb)
}
class DataSetCollection(AbstractDataSet):

@ -11,3 +11,4 @@ nbstripout~=0.4
pymongo~=4.5.0
mongoengine~=0.27.0
folium~=0.14.0
Unidecode~=1.3.7

Loading…
Cancel
Save