|
|
|
@ -1,10 +1,11 @@
|
|
|
|
"""helper functions (is it really usefull?)
|
|
|
|
"""helper functions
|
|
|
|
|
|
|
|
|
|
|
|
TODO: maybe all these calculations are to be put in the db storage
|
|
|
|
|
|
|
|
"""
|
|
|
|
"""
|
|
|
|
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
|
|
|
|
|
|
from .dbinit import inverted_prince_bigram
|
|
|
|
from .dbinit import inverted_prince_bigram
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def make_timeitem_from_filename(filename):
|
|
|
|
def make_timeitem_from_filename(filename):
|
|
|
|
"""
|
|
|
|
"""
|
|
|
|
"anj_isa_i_1441_08_05a" -> "1441_08_05a"
|
|
|
|
"anj_isa_i_1441_08_05a" -> "1441_08_05a"
|
|
|
|
@ -35,21 +36,36 @@ def plaintext_response(search, actecol, prince_bigram):
|
|
|
|
'filename': '$filename',
|
|
|
|
'filename': '$filename',
|
|
|
|
"place": "$place.name",
|
|
|
|
"place": "$place.name",
|
|
|
|
"diplo_state": "$diplo_state",
|
|
|
|
"diplo_state": "$diplo_state",
|
|
|
|
"diplo_type": "$diplo_type"
|
|
|
|
"diplo_type": "$diplo_type",
|
|
|
|
|
|
|
|
"xmlcontent": "$xmlcontent"
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}]
|
|
|
|
}]
|
|
|
|
|
|
|
|
|
|
|
|
results = list(actecol.aggregate(query))
|
|
|
|
results = list(actecol.aggregate(query))
|
|
|
|
transformed_query = [pr['_id'] for pr in results]
|
|
|
|
transformed_query = [pr['_id'] for pr in results]
|
|
|
|
invert_prince_bigram = {val: key for key, val in prince_bigram.items()}
|
|
|
|
invert_prince_bigram = {val: key for key, val in prince_bigram.items()}
|
|
|
|
|
|
|
|
|
|
|
|
# constructing the dateitem
|
|
|
|
|
|
|
|
for trs in transformed_query:
|
|
|
|
for trs in transformed_query:
|
|
|
|
trs['house'] = trs['house'].capitalize()
|
|
|
|
trs['house'] = trs['house'].capitalize()
|
|
|
|
trs['dateitem'] = make_timeitem_from_filename(trs['filename'])
|
|
|
|
trs['dateitem'] = make_timeitem_from_filename(trs['filename'])
|
|
|
|
bigram, number = trs['prince_code'].split('_')
|
|
|
|
bigram, number = trs['prince_code'].split('_')
|
|
|
|
long_prince_bigram = inverted_prince_bigram(bigram) + '_' + number
|
|
|
|
long_prince_bigram = inverted_prince_bigram(bigram) + '_' + number
|
|
|
|
trs['prince_url'] = prince_name=long_prince_bigram.capitalize()
|
|
|
|
trs['prince_url'] = prince_name=long_prince_bigram.capitalize()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# find the word in the text
|
|
|
|
|
|
|
|
cleantext = BeautifulSoup(trs["xmlcontent"], "lxml").text
|
|
|
|
|
|
|
|
cleantext = cleantext.lower()
|
|
|
|
|
|
|
|
searchtext = search.lower()
|
|
|
|
|
|
|
|
begin = cleantext.find(searchtext)
|
|
|
|
|
|
|
|
#end = cleantext.find(searchtext, begin, len(cleantext))
|
|
|
|
|
|
|
|
strframe = 35
|
|
|
|
|
|
|
|
begin_mark = begin - strframe
|
|
|
|
|
|
|
|
if begin_mark < 0:
|
|
|
|
|
|
|
|
begin_mark = 0
|
|
|
|
|
|
|
|
end_mark = begin + len(searchtext) + strframe
|
|
|
|
|
|
|
|
trs['show_text'] = cleantext[begin_mark:end_mark]
|
|
|
|
|
|
|
|
|
|
|
|
return transformed_query
|
|
|
|
return transformed_query
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|