diff --git a/actes-princiers/src/actes_princiers/pipelines/populate_mongo/nodes.py b/actes-princiers/src/actes_princiers/pipelines/populate_mongo/nodes.py index 619120a..d5cb665 100755 --- a/actes-princiers/src/actes_princiers/pipelines/populate_mongo/nodes.py +++ b/actes-princiers/src/actes_princiers/pipelines/populate_mongo/nodes.py @@ -54,6 +54,7 @@ class Acte(Document): diplo_state = StringField(required=False) # sample: "diplo_state": "Original", diplo_type = StringField(required=False) # sample: "diplo_type": "Lettres_patentes", image = StringField(required=False) + url = StringField(required=True) def db_connect(storage_ip, db_name, mongodb_admin, mongodb_password): #mongodb://%s:%s@149.202.41.75:27017' % (username, password) diff --git a/actes-princiers/src/actesdataset.py b/actes-princiers/src/actesdataset.py index 357be1d..0cd8f09 100644 --- a/actes-princiers/src/actesdataset.py +++ b/actes-princiers/src/actesdataset.py @@ -3,7 +3,7 @@ import json from typing import Dict, Any from pathlib import Path from abc import ABC, abstractmethod - +from unidecode import unidecode # to remove accents in the urls from lxml import etree from bs4 import BeautifulSoup @@ -121,6 +121,31 @@ class BsXMLDataSet(XMLDataSet): prince_code = "_".join(cut[1:3]) return prince_code + def make_acte_url_from_filestem(self, house, prince_name, filestem): + """ + url sample : /acte/Bourbon/Louis_ii/1367_04_26a + + """ + # bourbon -> Bourbon + house = house.capitalize() + # "Louis II de Bourbon" -> "Louis" + prince_name = prince_name.split(" ")[0] + # "anj_isa_i" -> "i" + prince_number = filestem.split("_")[2] + prince_name = unidecode(prince_name) + # Isabelle + i -> Isabelle_i + prince_name = prince_name + "_" + prince_number + timeitem = self.make_timeitem_from_filestem(filestem) + # final url + return "/" + "/".join(['acte', house, prince_name, timeitem]) + + def make_timeitem_from_filestem(self, filestem): + """ + "anj_isa_i_1441_08_05a" -> "1441_08_05a" + """ + trs_fname = filestem.split('_') + return "_".join(trs_fname[3:]) + def transform(self): #soup = make_soup(os.path.join(folder, acte)) # 1.1/ Get all data from XML (9). counter is the id (= numb_acte) @@ -198,28 +223,28 @@ class BsXMLDataSet(XMLDataSet): latitude = latitude, longitude = longitude ) - # nakala + # nakala image = self.soup.find("graphic") if image is not None: image = image.get('url') + prince_name = self.find_prince_name() + #self._filepath + # data/01_raw/xml/Berry/bry_je_i_1405_05_04a.xml + house = self.filepath.split("/")[3] return { -# "num_acte": counter, - "prince_name": self.find_prince_name(), + "prince_name": prince_name, "prince_code": self.extract_prince_code_from_filestem(numb), "filename": numb, "date_time": date_time, "date": date, -# "prod_place_acte": place_query[0], "analysis": analyse, -# "doc_acte": doc_query[0], "ref_acte": ref_acte, "transcribers": self.find_transcribers(), "place": place, "diplo_type": type_diplo, "diplo_state": diplo_state, - "image": image -# "state_doc": state_query[0], -# "diplo_type_acte": diplo_query[0] + "image": image, + "url": self.make_acte_url_from_filestem(house, prince_name, numb) } class DataSetCollection(AbstractDataSet): diff --git a/actes-princiers/src/requirements.txt b/actes-princiers/src/requirements.txt index 832f671..ee0a313 100644 --- a/actes-princiers/src/requirements.txt +++ b/actes-princiers/src/requirements.txt @@ -11,3 +11,4 @@ nbstripout~=0.4 pymongo~=4.5.0 mongoengine~=0.27.0 folium~=0.14.0 +Unidecode~=1.3.7