annikwag committed on
Commit
fdfd226
·
verified ·
1 Parent(s): 4def7e8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +173 -168
app.py CHANGED
@@ -219,11 +219,12 @@ with col_about:
219
  with st.expander("About"):
220
  st.markdown(
221
  """
 
222
  **This app is a prototype for testing purposes.**
223
  The intended use is to explore AI-generated answers using publicly available project data from the German International Cooperation Society (GIZ) as of 23rd February 2025.
224
  **Please do NOT enter sensitive or personal information.**
225
  Note: The generated answers are AI-generated and may be wrong or misleading.
226
- """)
227
 
228
  ###########################################
229
  # Query input and budget slider (Change 9)
@@ -313,173 +314,177 @@ with col5:
313
  # Checkbox for exact matches
314
  show_exact_matches = st.checkbox("Show only exact matches", value=False)
315
 
316
- ###########################################
317
- # Run the search and apply filters
318
- ###########################################
319
- results = hybrid_search(client, var, collection_name, limit=500)
320
- semantic_all = results[0]
321
- lexical_all = results[1]
322
- semantic_all = [r for r in semantic_all if len(r.payload["page_content"]) >= 5]
323
- lexical_all = [r for r in lexical_all if len(r.payload["page_content"]) >= 5]
324
- semantic_thresholded = [r for r in semantic_all if r.score >= 0.0]
325
-
326
- # Pass the budget filter (min_budget) into filter_results
327
- filtered_semantic = filter_results(semantic_thresholded, country_filter, region_filter, end_year_range, crs_filter, min_budget)
328
- filtered_lexical = filter_results(lexical_all, country_filter, region_filter, end_year_range, crs_filter, min_budget)
329
- filtered_semantic_no_dupe = remove_duplicates(filtered_semantic)
330
- filtered_lexical_no_dupe = remove_duplicates(filtered_lexical)
331
-
332
- def format_currency(value):
333
- try:
334
- return f"€{int(float(value)):,}"
335
- except (ValueError, TypeError):
336
- return value
337
-
338
- ###########################################
339
- # Display Results (Lexical and Semantic)
340
- ###########################################
341
- # --- Lexical Results Branch ---
342
- if show_exact_matches:
343
- st.write("Showing **Top 15 Lexical Search results**")
344
- query_substring = var.strip().lower()
345
- lexical_substring_filtered = [r for r in lexical_all if query_substring in r.payload["page_content"].lower()]
346
- filtered_lexical = filter_results(lexical_substring_filtered, country_filter, region_filter, end_year_range, crs_filter, min_budget)
347
  filtered_lexical_no_dupe = remove_duplicates(filtered_lexical)
348
- if not filtered_lexical_no_dupe:
349
- st.write('No exact matches, consider unchecking "Show only exact matches"')
350
- else:
351
- top_results = filtered_lexical_no_dupe[:10]
352
- rag_answer = get_rag_answer(var, top_results)
353
- # Use the query as heading; increase size and center it.
354
- st.markdown(f"<h2 style='text-align:center; font-size:1.5em;'>{var}</h2>", unsafe_allow_html=True)
355
- st.write(rag_answer)
356
- st.divider()
357
- for res in top_results:
358
- metadata = res.payload.get('metadata', {})
359
- if "title" not in metadata:
360
- metadata["title"] = compute_title(metadata)
361
- # Highlight query matches in title (rendered with HTML)
362
- title_html = highlight_query(metadata["title"], var) if var.strip() else metadata["title"]
363
- st.markdown(f"#### {title_html}", unsafe_allow_html=True)
364
- # Build snippet from objectives and description
365
- objective = metadata.get("objective", "None")
366
- desc_en = metadata.get("description.en", "").strip()
367
- desc_de = metadata.get("description.de", "").strip()
368
- description = desc_en if desc_en != "" else desc_de
369
- full_snippet = f"{description}"
370
- words = full_snippet.split()
371
- preview_word_count = 90
372
- preview_text = " ".join(words[:preview_word_count])
373
- remainder_text = " ".join(words[preview_word_count:])
374
- st.markdown(highlight_query(preview_text, var), unsafe_allow_html=True)
375
- # Create two columns: left for "Show more" (remainder text) and right for additional details.
376
- col_left, col_right = st.columns(2)
377
- with col_left:
378
- if remainder_text:
379
- with st.expander("Show more"):
380
- st.write(remainder_text)
381
- with col_right:
382
- # Format additional text with line breaks using <br>
383
- start_year = metadata.get('start_year', None)
384
- end_year = metadata.get('end_year', None)
385
- start_year_str = extract_year(start_year) if start_year else "Unknown"
386
- end_year_str = extract_year(end_year) if end_year else "Unknown"
387
- total_project = metadata.get('total_project', "Unknown")
388
- total_volume = metadata.get('total_volume', "Unknown")
389
- formatted_project_budget = format_currency(total_project)
390
- formatted_total_volume = format_currency(total_volume)
391
- try:
392
- c_list = json.loads(metadata.get('countries', "[]").replace("'", '"'))
393
- except json.JSONDecodeError:
394
- c_list = []
395
- matched_countries = []
396
- for code in c_list:
397
- if len(code) == 2:
398
- resolved_name = get_country_name(code.upper(), region_df)
399
- if resolved_name.upper() != code.upper():
400
- matched_countries.append(resolved_name)
401
- crs_key = metadata.get("crs_key", "").strip()
402
- new_crs_value = lookup_crs_value(crs_key)
403
- crs_combined = f"{crs_key}: {new_crs_value}" if crs_key else "Unknown"
404
- client_name = metadata.get('client', 'Unknown Client')
405
- contact = metadata.get("contact", "").strip()
406
- additional_text = (
407
- f"Objective: **{objective}**<br>"
408
- f"Commissioned by **{client_name}**<br>"
409
- f"Projekt duration **{start_year_str}-{end_year_str}**<br>"
410
- f"Budget: Project: **{formatted_project_budget}**, Total volume: **{formatted_total_volume}**<br>"
411
- f"Country: **{', '.join(matched_countries)}**<br>"
412
- f"Sector: **{crs_combined}**"
413
- )
414
- if contact and contact.lower() != "[email protected]":
415
- additional_text += f"<br>Contact: **{contact}**"
416
- st.markdown(additional_text, unsafe_allow_html=True)
417
  st.divider()
418
-
419
- # --- Semantic Results Branch ---
420
- else:
421
- if not filtered_semantic_no_dupe:
422
- st.write("No relevant results found.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
423
  else:
424
- top_results = filtered_semantic_no_dupe[:10]
425
- rag_answer = get_rag_answer(var, top_results)
426
- st.markdown(f"<h2 style='text-align:center; font-size:2.5em;'>{var}</h2>", unsafe_allow_html=True)
427
- st.write(rag_answer)
428
- st.divider()
429
- st.write("Showing **Top 15 Semantic Search results**")
430
- for res in top_results:
431
- metadata = res.payload.get('metadata', {})
432
- if "title" not in metadata:
433
- metadata["title"] = compute_title(metadata)
434
- st.markdown(f"#### {metadata['title']}")
435
- objective = metadata.get("objective", "")
436
- desc_en = metadata.get("description.en", "").strip()
437
- desc_de = metadata.get("description.de", "").strip()
438
- description = desc_en if desc_en != "" else desc_de
439
- full_snippet = f"{description}"
440
- words = full_snippet.split()
441
- preview_word_count = 90
442
- preview_text = " ".join(words[:preview_word_count])
443
- remainder_text = " ".join(words[preview_word_count:])
444
- st.write(preview_text)
445
- col_left, col_right = st.columns(2)
446
- with col_left:
447
- if remainder_text:
448
- with st.expander("Show more"):
449
- st.write(remainder_text)
450
- with col_right:
451
- start_year = metadata.get('start_year', None)
452
- end_year = metadata.get('end_year', None)
453
- start_year_str = extract_year(start_year) if start_year else "Unknown"
454
- end_year_str = extract_year(end_year) if end_year else "Unknown"
455
- total_project = metadata.get('total_project', "Unknown")
456
- total_volume = metadata.get('total_volume', "Unknown")
457
- formatted_project_budget = format_currency(total_project)
458
- formatted_total_volume = format_currency(total_volume)
459
- try:
460
- c_list = json.loads(metadata.get('countries', "[]").replace("'", '"'))
461
- except json.JSONDecodeError:
462
- c_list = []
463
- matched_countries = []
464
- for code in c_list:
465
- if len(code) == 2:
466
- resolved_name = get_country_name(code.upper(), region_df)
467
- if resolved_name.upper() != code.upper():
468
- matched_countries.append(resolved_name)
469
- crs_key = metadata.get("crs_key", "").strip()
470
- new_crs_value = lookup_crs_value(crs_key)
471
- crs_combined = f"{crs_key}: {new_crs_value}" if crs_key else "Unknown"
472
- client_name = metadata.get('client', 'Unknown Client')
473
- contact = metadata.get("contact", "").strip()
474
- additional_text = (
475
- f"Objective: **{objective}**<br>"
476
- f"Commissioned by **{client_name}**<br>"
477
- f"Projekt duration **{start_year_str}-{end_year_str}**<br>"
478
- f"Budget: Project: **{formatted_project_budget}**, Total volume: **{formatted_total_volume}**<br>"
479
- f"Country: **{', '.join(matched_countries)}**<br>"
480
- f"Sector: **{crs_combined}**"
481
- )
482
- if contact and contact.lower() != "[email protected]":
483
- additional_text += f"<br>Contact: **{contact}**"
484
- st.markdown(additional_text, unsafe_allow_html=True)
485
- st.divider()
 
 
 
 
219
  with st.expander("About"):
220
  st.markdown(
221
  """
222
+ ℹ️ **About:**
223
  **This app is a prototype for testing purposes.**
224
  The intended use is to explore AI-generated answers using publicly available project data from the German International Cooperation Society (GIZ) as of 23rd February 2025.
225
  **Please do NOT enter sensitive or personal information.**
226
  Note: The generated answers are AI-generated and may be wrong or misleading.
227
+ """, unsafe_allow_html=True)
228
 
229
  ###########################################
230
  # Query input and budget slider (Change 9)
 
314
  # Checkbox for exact matches
315
  show_exact_matches = st.checkbox("Show only exact matches", value=False)
316
 
317
+ if not var.strip():
318
+ st.info("Please enter a query to see results.")
319
+ else:
320
+
321
+ ###########################################
322
+ # Run the search and apply filters
323
+ ###########################################
324
+ results = hybrid_search(client, var, collection_name, limit=500)
325
+ semantic_all = results[0]
326
+ lexical_all = results[1]
327
+ semantic_all = [r for r in semantic_all if len(r.payload["page_content"]) >= 5]
328
+ lexical_all = [r for r in lexical_all if len(r.payload["page_content"]) >= 5]
329
+ semantic_thresholded = [r for r in semantic_all if r.score >= 0.0]
330
+
331
+ # Pass the budget filter (min_budget) into filter_results
332
+ filtered_semantic = filter_results(semantic_thresholded, country_filter, region_filter, end_year_range, crs_filter, min_budget)
333
+ filtered_lexical = filter_results(lexical_all, country_filter, region_filter, end_year_range, crs_filter, min_budget)
334
+ filtered_semantic_no_dupe = remove_duplicates(filtered_semantic)
 
 
 
 
 
 
 
 
 
 
 
 
 
335
  filtered_lexical_no_dupe = remove_duplicates(filtered_lexical)
336
+
337
+ def format_currency(value):
338
+ try:
339
+ return f"€{int(float(value)):,}"
340
+ except (ValueError, TypeError):
341
+ return value
342
+
343
+ ###########################################
344
+ # Display Results (Lexical and Semantic)
345
+ ###########################################
346
+ # --- Lexical Results Branch ---
347
+ if show_exact_matches:
348
+ st.write("Showing **Top 15 Lexical Search results**")
349
+ query_substring = var.strip().lower()
350
+ lexical_substring_filtered = [r for r in lexical_all if query_substring in r.payload["page_content"].lower()]
351
+ filtered_lexical = filter_results(lexical_substring_filtered, country_filter, region_filter, end_year_range, crs_filter, min_budget)
352
+ filtered_lexical_no_dupe = remove_duplicates(filtered_lexical)
353
+ if not filtered_lexical_no_dupe:
354
+ st.write('No exact matches, consider unchecking "Show only exact matches"')
355
+ else:
356
+ top_results = filtered_lexical_no_dupe[:10]
357
+ rag_answer = get_rag_answer(var, top_results)
358
+ # Use the query as heading; increase size and center it.
359
+ st.markdown(f"<h2 style='text-align:center; font-size:1.5em;'>{var}</h2>", unsafe_allow_html=True)
360
+ st.write(rag_answer)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
361
  st.divider()
362
+ for res in top_results:
363
+ metadata = res.payload.get('metadata', {})
364
+ if "title" not in metadata:
365
+ metadata["title"] = compute_title(metadata)
366
+ # Highlight query matches in title (rendered with HTML)
367
+ title_html = highlight_query(metadata["title"], var) if var.strip() else metadata["title"]
368
+ st.markdown(f"#### {title_html}", unsafe_allow_html=True)
369
+ # Build snippet from objectives and description
370
+ objective = metadata.get("objective", "None")
371
+ desc_en = metadata.get("description.en", "").strip()
372
+ desc_de = metadata.get("description.de", "").strip()
373
+ description = desc_en if desc_en != "" else desc_de
374
+ full_snippet = f"{description}"
375
+ words = full_snippet.split()
376
+ preview_word_count = 90
377
+ preview_text = " ".join(words[:preview_word_count])
378
+ remainder_text = " ".join(words[preview_word_count:])
379
+ # Create two columns: left for full description and right for additional details.
380
+ col_left, col_right = st.columns(2)
381
+ with col_left:
382
+ # Combine preview and remainder into one full description block.
383
+ full_description = preview_text + (" " + remainder_text if remainder_text else "")
384
+ st.markdown(highlight_query(full_description, var), unsafe_allow_html=True)
385
+
386
+ with col_right:
387
+ # Format additional text with line breaks using <br>
388
+ start_year = metadata.get('start_year', None)
389
+ end_year = metadata.get('end_year', None)
390
+ start_year_str = extract_year(start_year) if start_year else "Unknown"
391
+ end_year_str = extract_year(end_year) if end_year else "Unknown"
392
+ total_project = metadata.get('total_project', "Unknown")
393
+ total_volume = metadata.get('total_volume', "Unknown")
394
+ formatted_project_budget = format_currency(total_project)
395
+ formatted_total_volume = format_currency(total_volume)
396
+ try:
397
+ c_list = json.loads(metadata.get('countries', "[]").replace("'", '"'))
398
+ except json.JSONDecodeError:
399
+ c_list = []
400
+ matched_countries = []
401
+ for code in c_list:
402
+ if len(code) == 2:
403
+ resolved_name = get_country_name(code.upper(), region_df)
404
+ if resolved_name.upper() != code.upper():
405
+ matched_countries.append(resolved_name)
406
+ crs_key = metadata.get("crs_key", "").strip()
407
+ new_crs_value = lookup_crs_value(crs_key)
408
+ crs_combined = f"{crs_key}: {new_crs_value}" if crs_key else "Unknown"
409
+ client_name = metadata.get('client', 'Unknown Client')
410
+ contact = metadata.get("contact", "").strip()
411
+ additional_text = (
412
+ f"Objective: **{objective}**<br>"
413
+ f"Commissioned by **{client_name}**<br>"
414
+ f"Projekt duration **{start_year_str}-{end_year_str}**<br>"
415
+ f"Budget: Project: **{formatted_project_budget}**, Total volume: **{formatted_total_volume}**<br>"
416
+ f"Country: **{', '.join(matched_countries)}**<br>"
417
+ f"Sector: **{crs_combined}**"
418
+ )
419
+ if contact and contact.lower() != "[email protected]":
420
+ additional_text += f"<br>Contact: **{contact}**"
421
+ st.markdown(additional_text, unsafe_allow_html=True)
422
+ st.divider()
423
+
424
+ # --- Semantic Results Branch ---
425
  else:
426
+ if not filtered_semantic_no_dupe:
427
+ st.write("No relevant results found.")
428
+ else:
429
+ top_results = filtered_semantic_no_dupe[:10]
430
+ rag_answer = get_rag_answer(var, top_results)
431
+ st.markdown(f"<h2 style='text-align:center; font-size:2.5em;'>{var}</h2>", unsafe_allow_html=True)
432
+ st.write(rag_answer)
433
+ st.divider()
434
+ st.write("Showing **Top 15 Semantic Search results**")
435
+ for res in top_results:
436
+ metadata = res.payload.get('metadata', {})
437
+ if "title" not in metadata:
438
+ metadata["title"] = compute_title(metadata)
439
+ st.markdown(f"#### {metadata['title']}")
440
+ objective = metadata.get("objective", "")
441
+ desc_en = metadata.get("description.en", "").strip()
442
+ desc_de = metadata.get("description.de", "").strip()
443
+ description = desc_en if desc_en != "" else desc_de
444
+ full_snippet = f"{description}"
445
+ words = full_snippet.split()
446
+ preview_word_count = 90
447
+ preview_text = " ".join(words[:preview_word_count])
448
+ remainder_text = " ".join(words[preview_word_count:])
449
+ # Create two columns: left for full description (preview + remainder) and right for additional details.
450
+ col_left, col_right = st.columns(2)
451
+ with col_left:
452
+ # Combine preview and remainder into one text block.
453
+ full_description = preview_text + (" " + remainder_text if remainder_text else "")
454
+ st.markdown(full_description)
455
+ with col_right:
456
+ start_year = metadata.get('start_year', None)
457
+ end_year = metadata.get('end_year', None)
458
+ start_year_str = extract_year(start_year) if start_year else "Unknown"
459
+ end_year_str = extract_year(end_year) if end_year else "Unknown"
460
+ total_project = metadata.get('total_project', "Unknown")
461
+ total_volume = metadata.get('total_volume', "Unknown")
462
+ formatted_project_budget = format_currency(total_project)
463
+ formatted_total_volume = format_currency(total_volume)
464
+ try:
465
+ c_list = json.loads(metadata.get('countries', "[]").replace("'", '"'))
466
+ except json.JSONDecodeError:
467
+ c_list = []
468
+ matched_countries = []
469
+ for code in c_list:
470
+ if len(code) == 2:
471
+ resolved_name = get_country_name(code.upper(), region_df)
472
+ if resolved_name.upper() != code.upper():
473
+ matched_countries.append(resolved_name)
474
+ crs_key = metadata.get("crs_key", "").strip()
475
+ new_crs_value = lookup_crs_value(crs_key)
476
+ crs_combined = f"{crs_key}: {new_crs_value}" if crs_key else "Unknown"
477
+ client_name = metadata.get('client', 'Unknown Client')
478
+ contact = metadata.get("contact", "").strip()
479
+ additional_text = (
480
+ f"Objective: **{objective}**<br>"
481
+ f"Commissioned by **{client_name}**<br>"
482
+ f"Projekt duration **{start_year_str}-{end_year_str}**<br>"
483
+ f"Budget: Project: **{formatted_project_budget}**, Total volume: **{formatted_total_volume}**<br>"
484
+ f"Country: **{', '.join(matched_countries)}**<br>"
485
+ f"Sector: **{crs_combined}**"
486
+ )
487
+ if contact and contact.lower() != "[email protected]":
488
+ additional_text += f"<br>Contact: **{contact}**"
489
+ st.markdown(additional_text, unsafe_allow_html=True)
490
+ st.divider()