From 7a6a911d30ee410ea79b1bca8dabdfb697c1de8a Mon Sep 17 00:00:00 2001 From: gwen Date: Tue, 3 Oct 2023 15:12:16 +0200 Subject: [PATCH] add word extracts in the search results --- app/helper.py | 24 ++++++++++++++++++++---- app/templates/plainsearch.html | 4 ++-- requirements.txt | 2 ++ 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/app/helper.py b/app/helper.py index 90ac0a8..205c241 100644 --- a/app/helper.py +++ b/app/helper.py @@ -1,10 +1,11 @@ -"""helper functions (is it really usefull?) +"""helper functions -TODO: maybe all these calculations are to be put in the db storage """ +from bs4 import BeautifulSoup from .dbinit import inverted_prince_bigram + def make_timeitem_from_filename(filename): """ "anj_isa_i_1441_08_05a" -> "1441_08_05a" @@ -35,21 +36,36 @@ def plaintext_response(search, actecol, prince_bigram): 'filename': '$filename', "place": "$place.name", "diplo_state": "$diplo_state", - "diplo_type": "$diplo_type" + "diplo_type": "$diplo_type", + "xmlcontent": "$xmlcontent" } } }] + results = list(actecol.aggregate(query)) transformed_query = [pr['_id'] for pr in results] invert_prince_bigram = {val: key for key, val in prince_bigram.items()} - # constructing the dateitem for trs in transformed_query: trs['house'] = trs['house'].capitalize() trs['dateitem'] = make_timeitem_from_filename(trs['filename']) bigram, number = trs['prince_code'].split('_') long_prince_bigram = inverted_prince_bigram(bigram) + '_' + number trs['prince_url'] = prince_name=long_prince_bigram.capitalize() + + # find the word in the text + cleantext = BeautifulSoup(trs["xmlcontent"], "lxml").text + cleantext = cleantext.lower() + searchtext = search.lower() + begin = cleantext.find(searchtext) + #end = cleantext.find(searchtext, begin, len(cleantext)) + strframe = 35 + begin_mark = begin - strframe + if begin_mark < 0: + begin_mark = 0 + end_mark = begin + len(searchtext) + strframe + trs['show_text'] = cleantext[begin_mark:end_mark] + return transformed_query diff 
--git a/app/templates/plainsearch.html b/app/templates/plainsearch.html index cd7e709..43195bd 100644 --- a/app/templates/plainsearch.html +++ b/app/templates/plainsearch.html @@ -6,7 +6,7 @@

Résulat de la recherche

-

Résulat de la recherche sur "{{ search }}" :

+

{{ actes | count }} résultats pour la recherche sur "{{ search }}" :

{% for acte in actes %}
@@ -14,7 +14,7 @@

-

{{ acte['prince_name'] }}, {{ acte['date'] }}.

+

{{ acte['prince_name'] }}, {{ acte['date'] }}, extrait : "... {{ acte['show_text'] }} ..."

{% if acte['place'] != 'NS' %} {{ acte['place'] }} diff --git a/requirements.txt b/requirements.txt index 903d0e1..e141833 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,3 +22,5 @@ zipp==3.9.0 pyyaml==6.0.1 Unidecode==1.3.6 folium==0.14.0 +beautifulsoup4==4.12.2 +lxml==4.9.3