Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -219,11 +219,12 @@ with col_about:
|
|
219 |
with st.expander("About"):
|
220 |
st.markdown(
|
221 |
"""
|
|
|
222 |
**This app is a prototype for testing purposes.**
|
223 |
The intended use is to explore AI-generated answers using publicly available project data from the German International Cooperation Society (GIZ) as of 23rd February 2025.
|
224 |
**Please do NOT enter sensitive or personal information.**
|
225 |
Note: The generated answers are AI-generated and may be wrong or misleading.
|
226 |
-
""")
|
227 |
|
228 |
###########################################
|
229 |
# Query input and budget slider (Change 9)
|
@@ -313,173 +314,177 @@ with col5:
|
|
313 |
# Checkbox for exact matches
|
314 |
show_exact_matches = st.checkbox("Show only exact matches", value=False)
|
315 |
|
316 |
-
|
317 |
-
|
318 |
-
|
319 |
-
|
320 |
-
|
321 |
-
|
322 |
-
|
323 |
-
|
324 |
-
|
325 |
-
|
326 |
-
|
327 |
-
|
328 |
-
|
329 |
-
|
330 |
-
|
331 |
-
|
332 |
-
|
333 |
-
|
334 |
-
return f"€{int(float(value)):,}"
|
335 |
-
except (ValueError, TypeError):
|
336 |
-
return value
|
337 |
-
|
338 |
-
###########################################
|
339 |
-
# Display Results (Lexical and Semantic)
|
340 |
-
###########################################
|
341 |
-
# --- Lexical Results Branch ---
|
342 |
-
if show_exact_matches:
|
343 |
-
st.write("Showing **Top 15 Lexical Search results**")
|
344 |
-
query_substring = var.strip().lower()
|
345 |
-
lexical_substring_filtered = [r for r in lexical_all if query_substring in r.payload["page_content"].lower()]
|
346 |
-
filtered_lexical = filter_results(lexical_substring_filtered, country_filter, region_filter, end_year_range, crs_filter, min_budget)
|
347 |
filtered_lexical_no_dupe = remove_duplicates(filtered_lexical)
|
348 |
-
|
349 |
-
|
350 |
-
|
351 |
-
|
352 |
-
|
353 |
-
|
354 |
-
|
355 |
-
|
356 |
-
|
357 |
-
|
358 |
-
|
359 |
-
|
360 |
-
|
361 |
-
|
362 |
-
|
363 |
-
|
364 |
-
|
365 |
-
|
366 |
-
|
367 |
-
|
368 |
-
|
369 |
-
|
370 |
-
|
371 |
-
|
372 |
-
|
373 |
-
remainder_text = " ".join(words[preview_word_count:])
|
374 |
-
st.markdown(highlight_query(preview_text, var), unsafe_allow_html=True)
|
375 |
-
# Create two columns: left for "Show more" (remainder text) and right for additional details.
|
376 |
-
col_left, col_right = st.columns(2)
|
377 |
-
with col_left:
|
378 |
-
if remainder_text:
|
379 |
-
with st.expander("Show more"):
|
380 |
-
st.write(remainder_text)
|
381 |
-
with col_right:
|
382 |
-
# Format additional text with line breaks using <br>
|
383 |
-
start_year = metadata.get('start_year', None)
|
384 |
-
end_year = metadata.get('end_year', None)
|
385 |
-
start_year_str = extract_year(start_year) if start_year else "Unknown"
|
386 |
-
end_year_str = extract_year(end_year) if end_year else "Unknown"
|
387 |
-
total_project = metadata.get('total_project', "Unknown")
|
388 |
-
total_volume = metadata.get('total_volume', "Unknown")
|
389 |
-
formatted_project_budget = format_currency(total_project)
|
390 |
-
formatted_total_volume = format_currency(total_volume)
|
391 |
-
try:
|
392 |
-
c_list = json.loads(metadata.get('countries', "[]").replace("'", '"'))
|
393 |
-
except json.JSONDecodeError:
|
394 |
-
c_list = []
|
395 |
-
matched_countries = []
|
396 |
-
for code in c_list:
|
397 |
-
if len(code) == 2:
|
398 |
-
resolved_name = get_country_name(code.upper(), region_df)
|
399 |
-
if resolved_name.upper() != code.upper():
|
400 |
-
matched_countries.append(resolved_name)
|
401 |
-
crs_key = metadata.get("crs_key", "").strip()
|
402 |
-
new_crs_value = lookup_crs_value(crs_key)
|
403 |
-
crs_combined = f"{crs_key}: {new_crs_value}" if crs_key else "Unknown"
|
404 |
-
client_name = metadata.get('client', 'Unknown Client')
|
405 |
-
contact = metadata.get("contact", "").strip()
|
406 |
-
additional_text = (
|
407 |
-
f"Objective: **{objective}**<br>"
|
408 |
-
f"Commissioned by **{client_name}**<br>"
|
409 |
-
f"Projekt duration **{start_year_str}-{end_year_str}**<br>"
|
410 |
-
f"Budget: Project: **{formatted_project_budget}**, Total volume: **{formatted_total_volume}**<br>"
|
411 |
-
f"Country: **{', '.join(matched_countries)}**<br>"
|
412 |
-
f"Sector: **{crs_combined}**"
|
413 |
-
)
|
414 |
-
if contact and contact.lower() != "[email protected]":
|
415 |
-
additional_text += f"<br>Contact: **{contact}**"
|
416 |
-
st.markdown(additional_text, unsafe_allow_html=True)
|
417 |
st.divider()
|
418 |
-
|
419 |
-
|
420 |
-
|
421 |
-
|
422 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
423 |
else:
|
424 |
-
|
425 |
-
|
426 |
-
|
427 |
-
|
428 |
-
|
429 |
-
|
430 |
-
|
431 |
-
|
432 |
-
|
433 |
-
|
434 |
-
|
435 |
-
|
436 |
-
|
437 |
-
|
438 |
-
|
439 |
-
|
440 |
-
|
441 |
-
|
442 |
-
|
443 |
-
|
444 |
-
|
445 |
-
|
446 |
-
|
447 |
-
|
448 |
-
|
449 |
-
|
450 |
-
|
451 |
-
|
452 |
-
|
453 |
-
|
454 |
-
|
455 |
-
|
456 |
-
|
457 |
-
|
458 |
-
|
459 |
-
|
460 |
-
|
461 |
-
|
462 |
-
|
463 |
-
|
464 |
-
|
465 |
-
|
466 |
-
|
467 |
-
|
468 |
-
|
469 |
-
|
470 |
-
|
471 |
-
|
472 |
-
|
473 |
-
|
474 |
-
|
475 |
-
|
476 |
-
|
477 |
-
|
478 |
-
|
479 |
-
|
480 |
-
|
481 |
-
|
482 |
-
|
483 |
-
|
484 |
-
|
485 |
-
|
|
|
|
|
|
|
|
219 |
with st.expander("About"):
|
220 |
st.markdown(
|
221 |
"""
|
222 |
+
ℹ️ **About:**
|
223 |
**This app is a prototype for testing purposes.**
|
224 |
The intended use is to explore AI-generated answers using publicly available project data from the German International Cooperation Society (GIZ) as of 23rd February 2025.
|
225 |
**Please do NOT enter sensitive or personal information.**
|
226 |
Note: The generated answers are AI-generated and may be wrong or misleading.
|
227 |
+
""", unsafe_allow_html=True)
|
228 |
|
229 |
###########################################
|
230 |
# Query input and budget slider (Change 9)
|
|
|
314 |
# Checkbox for exact matches
|
315 |
show_exact_matches = st.checkbox("Show only exact matches", value=False)
|
316 |
|
317 |
+
if not var.strip():
|
318 |
+
st.info("Please enter a query to see results.")
|
319 |
+
else:
|
320 |
+
|
321 |
+
###########################################
|
322 |
+
# Run the search and apply filters
|
323 |
+
###########################################
|
324 |
+
results = hybrid_search(client, var, collection_name, limit=500)
|
325 |
+
semantic_all = results[0]
|
326 |
+
lexical_all = results[1]
|
327 |
+
semantic_all = [r for r in semantic_all if len(r.payload["page_content"]) >= 5]
|
328 |
+
lexical_all = [r for r in lexical_all if len(r.payload["page_content"]) >= 5]
|
329 |
+
semantic_thresholded = [r for r in semantic_all if r.score >= 0.0]
|
330 |
+
|
331 |
+
# Pass the budget filter (min_budget) into filter_results
|
332 |
+
filtered_semantic = filter_results(semantic_thresholded, country_filter, region_filter, end_year_range, crs_filter, min_budget)
|
333 |
+
filtered_lexical = filter_results(lexical_all, country_filter, region_filter, end_year_range, crs_filter, min_budget)
|
334 |
+
filtered_semantic_no_dupe = remove_duplicates(filtered_semantic)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
335 |
filtered_lexical_no_dupe = remove_duplicates(filtered_lexical)
|
336 |
+
|
337 |
+
def format_currency(value):
|
338 |
+
try:
|
339 |
+
return f"€{int(float(value)):,}"
|
340 |
+
except (ValueError, TypeError):
|
341 |
+
return value
|
342 |
+
|
343 |
+
###########################################
|
344 |
+
# Display Results (Lexical and Semantic)
|
345 |
+
###########################################
|
346 |
+
# --- Lexical Results Branch ---
|
347 |
+
if show_exact_matches:
|
348 |
+
st.write("Showing **Top 15 Lexical Search results**")
|
349 |
+
query_substring = var.strip().lower()
|
350 |
+
lexical_substring_filtered = [r for r in lexical_all if query_substring in r.payload["page_content"].lower()]
|
351 |
+
filtered_lexical = filter_results(lexical_substring_filtered, country_filter, region_filter, end_year_range, crs_filter, min_budget)
|
352 |
+
filtered_lexical_no_dupe = remove_duplicates(filtered_lexical)
|
353 |
+
if not filtered_lexical_no_dupe:
|
354 |
+
st.write('No exact matches, consider unchecking "Show only exact matches"')
|
355 |
+
else:
|
356 |
+
top_results = filtered_lexical_no_dupe[:10]
|
357 |
+
rag_answer = get_rag_answer(var, top_results)
|
358 |
+
# Use the query as heading; increase size and center it.
|
359 |
+
st.markdown(f"<h2 style='text-align:center; font-size:1.5em;'>{var}</h2>", unsafe_allow_html=True)
|
360 |
+
st.write(rag_answer)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
361 |
st.divider()
|
362 |
+
for res in top_results:
|
363 |
+
metadata = res.payload.get('metadata', {})
|
364 |
+
if "title" not in metadata:
|
365 |
+
metadata["title"] = compute_title(metadata)
|
366 |
+
# Highlight query matches in title (rendered with HTML)
|
367 |
+
title_html = highlight_query(metadata["title"], var) if var.strip() else metadata["title"]
|
368 |
+
st.markdown(f"#### {title_html}", unsafe_allow_html=True)
|
369 |
+
# Build snippet from objectives and description
|
370 |
+
objective = metadata.get("objective", "None")
|
371 |
+
desc_en = metadata.get("description.en", "").strip()
|
372 |
+
desc_de = metadata.get("description.de", "").strip()
|
373 |
+
description = desc_en if desc_en != "" else desc_de
|
374 |
+
full_snippet = f"{description}"
|
375 |
+
words = full_snippet.split()
|
376 |
+
preview_word_count = 90
|
377 |
+
preview_text = " ".join(words[:preview_word_count])
|
378 |
+
remainder_text = " ".join(words[preview_word_count:])
|
379 |
+
# Create two columns: left for full description and right for additional details.
|
380 |
+
col_left, col_right = st.columns(2)
|
381 |
+
with col_left:
|
382 |
+
# Combine preview and remainder into one full description block.
|
383 |
+
full_description = preview_text + (" " + remainder_text if remainder_text else "")
|
384 |
+
st.markdown(highlight_query(full_description, var), unsafe_allow_html=True)
|
385 |
+
|
386 |
+
with col_right:
|
387 |
+
# Format additional text with line breaks using <br>
|
388 |
+
start_year = metadata.get('start_year', None)
|
389 |
+
end_year = metadata.get('end_year', None)
|
390 |
+
start_year_str = extract_year(start_year) if start_year else "Unknown"
|
391 |
+
end_year_str = extract_year(end_year) if end_year else "Unknown"
|
392 |
+
total_project = metadata.get('total_project', "Unknown")
|
393 |
+
total_volume = metadata.get('total_volume', "Unknown")
|
394 |
+
formatted_project_budget = format_currency(total_project)
|
395 |
+
formatted_total_volume = format_currency(total_volume)
|
396 |
+
try:
|
397 |
+
c_list = json.loads(metadata.get('countries', "[]").replace("'", '"'))
|
398 |
+
except json.JSONDecodeError:
|
399 |
+
c_list = []
|
400 |
+
matched_countries = []
|
401 |
+
for code in c_list:
|
402 |
+
if len(code) == 2:
|
403 |
+
resolved_name = get_country_name(code.upper(), region_df)
|
404 |
+
if resolved_name.upper() != code.upper():
|
405 |
+
matched_countries.append(resolved_name)
|
406 |
+
crs_key = metadata.get("crs_key", "").strip()
|
407 |
+
new_crs_value = lookup_crs_value(crs_key)
|
408 |
+
crs_combined = f"{crs_key}: {new_crs_value}" if crs_key else "Unknown"
|
409 |
+
client_name = metadata.get('client', 'Unknown Client')
|
410 |
+
contact = metadata.get("contact", "").strip()
|
411 |
+
additional_text = (
|
412 |
+
f"Objective: **{objective}**<br>"
|
413 |
+
f"Commissioned by **{client_name}**<br>"
|
414 |
+
f"Projekt duration **{start_year_str}-{end_year_str}**<br>"
|
415 |
+
f"Budget: Project: **{formatted_project_budget}**, Total volume: **{formatted_total_volume}**<br>"
|
416 |
+
f"Country: **{', '.join(matched_countries)}**<br>"
|
417 |
+
f"Sector: **{crs_combined}**"
|
418 |
+
)
|
419 |
+
if contact and contact.lower() != "[email protected]":
|
420 |
+
additional_text += f"<br>Contact: **{contact}**"
|
421 |
+
st.markdown(additional_text, unsafe_allow_html=True)
|
422 |
+
st.divider()
|
423 |
+
|
424 |
+
# --- Semantic Results Branch ---
|
425 |
else:
|
426 |
+
if not filtered_semantic_no_dupe:
|
427 |
+
st.write("No relevant results found.")
|
428 |
+
else:
|
429 |
+
top_results = filtered_semantic_no_dupe[:10]
|
430 |
+
rag_answer = get_rag_answer(var, top_results)
|
431 |
+
st.markdown(f"<h2 style='text-align:center; font-size:2.5em;'>{var}</h2>", unsafe_allow_html=True)
|
432 |
+
st.write(rag_answer)
|
433 |
+
st.divider()
|
434 |
+
st.write("Showing **Top 15 Semantic Search results**")
|
435 |
+
for res in top_results:
|
436 |
+
metadata = res.payload.get('metadata', {})
|
437 |
+
if "title" not in metadata:
|
438 |
+
metadata["title"] = compute_title(metadata)
|
439 |
+
st.markdown(f"#### {metadata['title']}")
|
440 |
+
objective = metadata.get("objective", "")
|
441 |
+
desc_en = metadata.get("description.en", "").strip()
|
442 |
+
desc_de = metadata.get("description.de", "").strip()
|
443 |
+
description = desc_en if desc_en != "" else desc_de
|
444 |
+
full_snippet = f"{description}"
|
445 |
+
words = full_snippet.split()
|
446 |
+
preview_word_count = 90
|
447 |
+
preview_text = " ".join(words[:preview_word_count])
|
448 |
+
remainder_text = " ".join(words[preview_word_count:])
|
449 |
+
# Create two columns: left for full description (preview + remainder) and right for additional details.
|
450 |
+
col_left, col_right = st.columns(2)
|
451 |
+
with col_left:
|
452 |
+
# Combine preview and remainder into one text block.
|
453 |
+
full_description = preview_text + (" " + remainder_text if remainder_text else "")
|
454 |
+
st.markdown(full_description)
|
455 |
+
with col_right:
|
456 |
+
start_year = metadata.get('start_year', None)
|
457 |
+
end_year = metadata.get('end_year', None)
|
458 |
+
start_year_str = extract_year(start_year) if start_year else "Unknown"
|
459 |
+
end_year_str = extract_year(end_year) if end_year else "Unknown"
|
460 |
+
total_project = metadata.get('total_project', "Unknown")
|
461 |
+
total_volume = metadata.get('total_volume', "Unknown")
|
462 |
+
formatted_project_budget = format_currency(total_project)
|
463 |
+
formatted_total_volume = format_currency(total_volume)
|
464 |
+
try:
|
465 |
+
c_list = json.loads(metadata.get('countries', "[]").replace("'", '"'))
|
466 |
+
except json.JSONDecodeError:
|
467 |
+
c_list = []
|
468 |
+
matched_countries = []
|
469 |
+
for code in c_list:
|
470 |
+
if len(code) == 2:
|
471 |
+
resolved_name = get_country_name(code.upper(), region_df)
|
472 |
+
if resolved_name.upper() != code.upper():
|
473 |
+
matched_countries.append(resolved_name)
|
474 |
+
crs_key = metadata.get("crs_key", "").strip()
|
475 |
+
new_crs_value = lookup_crs_value(crs_key)
|
476 |
+
crs_combined = f"{crs_key}: {new_crs_value}" if crs_key else "Unknown"
|
477 |
+
client_name = metadata.get('client', 'Unknown Client')
|
478 |
+
contact = metadata.get("contact", "").strip()
|
479 |
+
additional_text = (
|
480 |
+
f"Objective: **{objective}**<br>"
|
481 |
+
f"Commissioned by **{client_name}**<br>"
|
482 |
+
f"Projekt duration **{start_year_str}-{end_year_str}**<br>"
|
483 |
+
f"Budget: Project: **{formatted_project_budget}**, Total volume: **{formatted_total_volume}**<br>"
|
484 |
+
f"Country: **{', '.join(matched_countries)}**<br>"
|
485 |
+
f"Sector: **{crs_combined}**"
|
486 |
+
)
|
487 |
+
if contact and contact.lower() != "[email protected]":
|
488 |
+
additional_text += f"<br>Contact: **{contact}**"
|
489 |
+
st.markdown(additional_text, unsafe_allow_html=True)
|
490 |
+
st.divider()
|