Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -7,31 +7,81 @@ from appStore.prep_utils import create_documents, get_client
|
|
7 |
from appStore.embed import hybrid_embed_chunks
|
8 |
from appStore.search import hybrid_search
|
9 |
from appStore.region_utils import load_region_data, get_country_name, get_regions
|
10 |
-
from appStore.tfidf_extraction import extract_top_keywords
|
11 |
from torch import cuda
|
12 |
import json
|
13 |
from datetime import datetime
|
14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
|
17 |
-
#
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
|
|
|
|
26 |
|
|
|
|
|
|
|
27 |
def get_rag_answer(query, top_results):
|
28 |
-
context
|
29 |
-
|
30 |
-
#
|
31 |
-
|
|
|
32 |
prompt = (
|
33 |
"You are a project portfolio adviser at the development cooperation GIZ. "
|
34 |
-
"Using the following context, answer the question in
|
|
|
35 |
"Only output the final answer below, without repeating the context or question.\n\n"
|
36 |
f"Context:\n{context}\n\n"
|
37 |
f"Question: {query}\n\n"
|
@@ -52,24 +102,9 @@ def get_rag_answer(query, top_results):
|
|
52 |
else:
|
53 |
return f"Error in generating answer: {response.text}"
|
54 |
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
if len(s) > 5:
|
59 |
-
return s[:4] + "." + s[4:-1] + "." + s[-1]
|
60 |
-
return s
|
61 |
-
|
62 |
-
# Helper: Compute title from metadata using name.en (or name.de if empty)
|
63 |
-
def compute_title(metadata):
|
64 |
-
name_en = metadata.get("name.en", "").strip()
|
65 |
-
name_de = metadata.get("name.de", "").strip()
|
66 |
-
base = name_en if name_en else name_de
|
67 |
-
pid = metadata.get("id", "")
|
68 |
-
if base and pid:
|
69 |
-
return f"{base} [{format_project_id(pid)}]"
|
70 |
-
return base or "No Title"
|
71 |
-
|
72 |
-
# Helper: Get CRS filter options from all documents
|
73 |
@st.cache_data
|
74 |
def get_crs_options(_client, collection_name):
|
75 |
results = hybrid_search(_client, "", collection_name)
|
@@ -77,20 +112,23 @@ def get_crs_options(_client, collection_name):
|
|
77 |
crs_set = set()
|
78 |
for res in all_results:
|
79 |
metadata = res.payload.get('metadata', {})
|
80 |
-
crs_value = metadata.get("crs_value", "").strip()
|
81 |
crs_key = metadata.get("crs_key", "").strip()
|
82 |
-
if crs_key
|
83 |
-
|
84 |
-
|
85 |
-
crs_int = int(float(crs_key))
|
86 |
-
except:
|
87 |
-
crs_int = crs_key
|
88 |
-
crs_combined = f"{crs_int}: {crs_value}"
|
89 |
crs_set.add(crs_combined)
|
90 |
return sorted(crs_set)
|
91 |
|
92 |
-
|
93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
filtered = []
|
95 |
for r in results:
|
96 |
metadata = r.payload.get('metadata', {})
|
@@ -118,18 +156,21 @@ def filter_results(results, country_filter, region_filter, end_year_range, crs_f
|
|
118 |
countries_in_region = c_list
|
119 |
|
120 |
crs_key = metadata.get("crs_key", "").strip()
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
crs_combined = f"{crs_key}: {crs_int}" if (crs_key or crs_key) else ""
|
127 |
-
# Only enforce CRS filter if result has a CRS value.
|
128 |
if crs_filter != "All/Not allocated" and crs_combined:
|
129 |
if crs_filter != crs_combined:
|
130 |
continue
|
131 |
|
132 |
-
#
|
|
|
|
|
|
|
|
|
|
|
133 |
year_ok = True if end_year_val == 0 else (end_year_range[0] <= end_year_val <= end_year_range[1])
|
134 |
|
135 |
if ((country_filter == "All/Not allocated" or (selected_iso_code and selected_iso_code in c_list))
|
@@ -138,19 +179,44 @@ def filter_results(results, country_filter, region_filter, end_year_range, crs_f
|
|
138 |
filtered.append(r)
|
139 |
return filtered
|
140 |
|
141 |
-
|
|
|
|
|
142 |
device = 'cuda' if cuda.is_available() else 'cpu'
|
143 |
|
|
|
|
|
|
|
144 |
st.set_page_config(page_title="SEARCH IATI", layout='wide')
|
145 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
146 |
var = st.text_input("Enter Search Question")
|
|
|
147 |
|
148 |
-
|
149 |
-
# Load
|
|
|
150 |
region_lookup_path = "docStore/regions_lookup.csv"
|
151 |
region_df = load_region_data(region_lookup_path)
|
152 |
|
153 |
-
|
|
|
|
|
|
|
154 |
# The steps below need to be performed only once and should then be commented out to avoid unnecessary compute overrun.
|
155 |
##### First we process and create the chunks for the relevant data source
|
156 |
#chunks = process_giz_worldwide()
|
@@ -160,7 +226,9 @@ region_df = load_region_data(region_lookup_path)
|
|
160 |
collection_name = "giz_worldwide"
|
161 |
#hybrid_embed_chunks(docs=temp_doc, collection_name=collection_name, del_if_exists=True)
|
162 |
|
163 |
-
|
|
|
|
|
164 |
client = get_client()
|
165 |
print(client.get_collections())
|
166 |
max_end_year = get_max_end_year(client, collection_name)
|
@@ -192,11 +260,10 @@ client = get_client()
|
|
192 |
country_name_mapping, iso_code_to_sub_region = get_country_name_and_region_mapping(client, collection_name, region_df)
|
193 |
unique_country_names = sorted(country_name_mapping.keys())
|
194 |
|
195 |
-
# Layout
|
196 |
col1, col2, col3, col4 = st.columns([1, 1, 1, 1])
|
197 |
with col1:
|
198 |
region_filter = st.selectbox("Region", ["All/Not allocated"] + sorted(unique_sub_regions))
|
199 |
-
# Compute filtered_country_names based on region_filter:
|
200 |
if region_filter == "All/Not allocated":
|
201 |
filtered_country_names = unique_country_names
|
202 |
else:
|
@@ -214,19 +281,19 @@ with col4:
|
|
214 |
# Checkbox for exact matches
|
215 |
show_exact_matches = st.checkbox("Show only exact matches", value=False)
|
216 |
|
217 |
-
|
|
|
|
|
218 |
results = hybrid_search(client, var, collection_name, limit=500)
|
219 |
semantic_all = results[0]
|
220 |
lexical_all = results[1]
|
221 |
-
|
222 |
semantic_all = [r for r in semantic_all if len(r.payload["page_content"]) >= 5]
|
223 |
lexical_all = [r for r in lexical_all if len(r.payload["page_content"]) >= 5]
|
224 |
-
|
225 |
semantic_thresholded = [r for r in semantic_all if r.score >= 0.0]
|
226 |
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
filtered_semantic_no_dupe = remove_duplicates(filtered_semantic)
|
231 |
filtered_lexical_no_dupe = remove_duplicates(filtered_lexical)
|
232 |
|
@@ -236,193 +303,149 @@ def format_currency(value):
|
|
236 |
except (ValueError, TypeError):
|
237 |
return value
|
238 |
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
###############################
|
245 |
-
# Display Lexical Results Branch
|
246 |
-
###############################
|
247 |
if show_exact_matches:
|
248 |
-
st.write(
|
249 |
query_substring = var.strip().lower()
|
250 |
lexical_substring_filtered = [r for r in lexical_all if query_substring in r.payload["page_content"].lower()]
|
251 |
-
filtered_lexical = filter_results(lexical_substring_filtered, country_filter, region_filter, end_year_range, crs_filter)
|
252 |
filtered_lexical_no_dupe = remove_duplicates(filtered_lexical)
|
253 |
if not filtered_lexical_no_dupe:
|
254 |
st.write('No exact matches, consider unchecking "Show only exact matches"')
|
255 |
else:
|
256 |
top_results = filtered_lexical_no_dupe[:10]
|
257 |
rag_answer = get_rag_answer(var, top_results)
|
258 |
-
|
|
|
259 |
st.write(rag_answer)
|
260 |
st.divider()
|
261 |
for res in top_results:
|
262 |
metadata = res.payload.get('metadata', {})
|
263 |
if "title" not in metadata:
|
264 |
metadata["title"] = compute_title(metadata)
|
265 |
-
# Highlight query in
|
266 |
-
|
267 |
-
|
268 |
-
|
269 |
-
)
|
270 |
-
proj_id = metadata.get('id', 'Unknown')
|
271 |
-
st.markdown(f"#### {metadata['title']}")
|
272 |
-
countries = metadata.get('countries')
|
273 |
-
client_name = metadata.get('client', 'Unknown Client')
|
274 |
-
start_year = metadata.get('start_year', None)
|
275 |
-
end_year = metadata.get('end_year', None)
|
276 |
-
total_volume = metadata.get('total_volume', "Unknown")
|
277 |
-
total_project = metadata.get('total_project', "Unknown")
|
278 |
objectives = metadata.get("objectives", "")
|
279 |
-
|
280 |
-
|
281 |
-
description =
|
282 |
full_snippet = f"{objectives} {description}"
|
283 |
words = full_snippet.split()
|
284 |
preview_word_count = 90
|
285 |
preview_text = " ".join(words[:preview_word_count])
|
286 |
remainder_text = " ".join(words[preview_word_count:])
|
287 |
-
|
288 |
-
|
289 |
-
|
290 |
-
|
291 |
-
|
292 |
-
|
293 |
-
|
294 |
-
|
295 |
-
|
296 |
-
|
297 |
-
|
298 |
-
|
299 |
-
|
300 |
-
|
301 |
-
|
302 |
-
|
303 |
-
|
304 |
-
|
305 |
-
|
306 |
-
|
307 |
-
|
308 |
-
|
309 |
-
|
310 |
-
|
311 |
-
|
312 |
-
|
313 |
-
|
314 |
-
|
315 |
-
|
316 |
-
|
317 |
-
|
318 |
-
|
319 |
-
|
320 |
-
|
321 |
-
|
322 |
-
|
323 |
-
|
324 |
-
|
325 |
-
|
326 |
-
|
327 |
-
|
328 |
-
|
329 |
-
f"Budget: Project: **{formatted_project_budget}**, Total volume: **{formatted_total_volume}**\n"
|
330 |
-
f"Country: **{', '.join(matched_countries)}**\n"
|
331 |
-
f"Sector: **{crs_combined}**"
|
332 |
-
)
|
333 |
-
#contact = metadata.get("contact", "").strip()
|
334 |
-
#if contact and contact.lower() != "[email protected]":
|
335 |
-
# additional_text += f" | Contact: **{contact}**"
|
336 |
-
|
337 |
-
st.markdown(additional_text)
|
338 |
st.divider()
|
339 |
|
340 |
-
|
341 |
-
# Display Semantic Results Branch
|
342 |
-
###############################
|
343 |
else:
|
344 |
if not filtered_semantic_no_dupe:
|
345 |
st.write("No relevant results found.")
|
346 |
else:
|
347 |
top_results = filtered_semantic_no_dupe[:10]
|
348 |
rag_answer = get_rag_answer(var, top_results)
|
349 |
-
st.markdown(f"
|
350 |
st.write(rag_answer)
|
351 |
st.divider()
|
352 |
-
st.write(
|
353 |
for res in top_results:
|
354 |
metadata = res.payload.get('metadata', {})
|
355 |
if "title" not in metadata:
|
356 |
metadata["title"] = compute_title(metadata)
|
357 |
-
|
358 |
-
st.markdown(f"#### {display_title}")
|
359 |
-
countries = metadata.get('countries')
|
360 |
-
client_name = metadata.get('client', 'Unknown Client')
|
361 |
-
start_year = metadata.get('start_year', None)
|
362 |
-
end_year = metadata.get('end_year', None)
|
363 |
-
total_volume = metadata.get('total_volume', "Unknown")
|
364 |
-
total_project = metadata.get('total_project', "Unknown")
|
365 |
objectives = metadata.get("objectives", "")
|
366 |
-
|
367 |
-
|
368 |
-
description =
|
369 |
full_snippet = f"{objectives} {description}"
|
370 |
words = full_snippet.split()
|
371 |
preview_word_count = 90
|
372 |
preview_text = " ".join(words[:preview_word_count])
|
373 |
remainder_text = " ".join(words[preview_word_count:])
|
374 |
st.write(preview_text)
|
375 |
-
|
376 |
-
|
377 |
-
|
378 |
-
|
379 |
-
|
380 |
-
|
381 |
-
|
382 |
-
|
383 |
-
|
384 |
-
|
385 |
-
|
386 |
-
|
387 |
-
|
388 |
-
|
389 |
-
|
390 |
-
|
391 |
-
|
392 |
-
|
393 |
-
|
394 |
-
|
395 |
-
|
396 |
-
|
397 |
-
|
398 |
-
|
399 |
-
|
400 |
-
|
401 |
-
|
402 |
-
|
403 |
-
|
404 |
-
|
405 |
-
|
406 |
-
|
407 |
-
|
408 |
-
|
409 |
-
|
410 |
-
|
411 |
-
|
412 |
-
|
413 |
-
|
414 |
-
|
415 |
-
f"Country: **{', '.join(matched_countries)}**\n"
|
416 |
-
f"Sector: **{crs_combined}**"
|
417 |
-
)
|
418 |
-
#contact = metadata.get("contact", "").strip()
|
419 |
-
#if contact and contact.lower() != "[email protected]":
|
420 |
-
# additional_text += f" | Contact: **{contact}**"
|
421 |
-
|
422 |
-
st.markdown(additional_text)
|
423 |
-
st.divider()
|
424 |
-
# for i in results:
|
425 |
-
# st.subheader(str(i.metadata['id'])+":"+str(i.metadata['title_main']))
|
426 |
-
# st.caption(f"Status:{str(i.metadata['status'])}, Country:{str(i.metadata['country_name'])}")
|
427 |
-
# st.write(i.page_content)
|
428 |
-
# st.divider()
|
|
|
7 |
from appStore.embed import hybrid_embed_chunks
|
8 |
from appStore.search import hybrid_search
|
9 |
from appStore.region_utils import load_region_data, get_country_name, get_regions
|
10 |
+
#from appStore.tfidf_extraction import extract_top_keywords # TF-IDF part commented out
|
11 |
from torch import cuda
|
12 |
import json
|
13 |
from datetime import datetime
|
14 |
|
15 |
+
###########################################
|
16 |
+
# Helper functions for data processing
|
17 |
+
###########################################
|
18 |
+
|
19 |
+
# New helper: Truncate a text to a given (approximate) token count.
|
20 |
+
def truncate_to_tokens(text, max_tokens):
    """Cap ``text`` at approximately ``max_tokens`` tokens.

    Tokens are approximated by whitespace-separated words. When the text is
    within the limit it is returned unchanged (original whitespace intact);
    otherwise the first ``max_tokens`` words are re-joined with single spaces.

    Args:
        text: The string to truncate.
        max_tokens: Maximum number of words to keep. Values below zero are
            treated as zero.

    Returns:
        The original text, or its first ``max_tokens`` words joined by spaces.
    """
    tokens = text.split()  # simple approximation: one word ~ one token
    # A negative limit would otherwise slice from the end (tokens[:-1]) and
    # silently keep almost the whole text.
    limit = max(max_tokens, 0)
    if len(tokens) > limit:
        return " ".join(tokens[:limit])
    return text
|
25 |
+
|
26 |
+
# Build a context string for a single result using title, objectives and description.
|
27 |
+
def build_context_for_result(res):
    """Build the RAG context string for a single search result.

    The context is three lines: title, objectives and description. The
    English description is preferred; the German one is used when the English
    one is empty.

    Args:
        res: A search hit exposing ``payload`` as a dict with an optional
            ``'metadata'`` dict.

    Returns:
        ``"<title>\\n<objectives>\\n<description>"``.
    """
    metadata = res.payload.get('metadata', {})
    # Only compute the fallback title when none is stored; passing it as the
    # default of dict.get() would evaluate it for every result.
    if "title" in metadata:
        title = metadata["title"]
    else:
        title = compute_title(metadata)
    # `or ""` guards against fields that are present but null in the payload.
    objectives = metadata.get("objectives") or ""
    desc_en = (metadata.get("description.en") or "").strip()
    desc_de = (metadata.get("description.de") or "").strip()
    description = desc_en if desc_en != "" else desc_de
    return f"{title}\n{objectives}\n{description}"
|
37 |
+
|
38 |
+
# Updated highlight: return HTML that makes the matched query red and bold.
|
39 |
+
def highlight_query(text, query):
    """Wrap every case-insensitive occurrence of ``query`` in red, bold HTML.

    Args:
        text: The text to search in.
        query: The literal search string (not a regex). Surrounding
            whitespace is ignored.

    Returns:
        ``text`` with each match wrapped in a ``<span>``; ``text`` unchanged
        when the query is empty/blank or the text is empty.
    """
    needle = query.strip() if query else ""
    # An empty pattern matches at every position and would insert an empty
    # <span> between all characters, so bail out early.
    if not needle or not text:
        return text
    pattern = re.compile(re.escape(needle), re.IGNORECASE)
    # NOTE(review): `text` is inserted into HTML unescaped; callers render it
    # with unsafe_allow_html=True, so markup present in the data is rendered.
    return pattern.sub(lambda m: f"<span style='color:red; font-weight:bold;'>{m.group(0)}</span>", text)
|
42 |
+
|
43 |
+
# Helper: Format project id (e.g., "201940485" -> "2019.4048.5")
|
44 |
+
def format_project_id(pid):
    """Format a project id for display, e.g. ``"201940485"`` -> ``"2019.4048.5"``.

    Ids longer than five characters are split into the first four characters,
    the middle part and the final character, joined by dots. Shorter ids are
    returned as-is.

    Args:
        pid: The project id as a string or number. Integer-valued floats and
            strings with a trailing ``".0"`` are normalised first so they do
            not produce output such as ``"2019.40485..0"``.

    Returns:
        The formatted id string.
    """
    if isinstance(pid, float) and pid.is_integer():
        pid = int(pid)
    s = str(pid).strip()
    if s.endswith(".0") and s[:-2].isdigit():
        s = s[:-2]
    if len(s) > 5:
        return f"{s[:4]}.{s[4:-1]}.{s[-1]}"
    return s
|
49 |
|
50 |
+
# Helper: Compute title from metadata using name.en (or name.de if empty)
|
51 |
+
def compute_title(metadata):
    """Derive a display title from project metadata.

    Uses ``name.en`` when it is non-empty, otherwise ``name.de``. When both a
    name and an ``id`` are available the formatted project id is appended in
    square brackets.

    Args:
        metadata: The metadata dict of a search result.

    Returns:
        ``"<name> [<formatted id>]"``, just ``"<name>"`` when there is no id,
        or ``"No Title"`` when neither name is available.
    """
    # `or ""` guards against name fields that are present but null, which
    # would otherwise raise AttributeError on .strip().
    name_en = str(metadata.get("name.en") or "").strip()
    name_de = str(metadata.get("name.de") or "").strip()
    base = name_en or name_de
    pid = metadata.get("id", "")
    if base and pid:
        return f"{base} [{format_project_id(pid)}]"
    return base or "No Title"
|
59 |
|
60 |
+
# Load CRS lookup CSV and define a lookup function.
|
61 |
+
crs_lookup = pd.read_csv("docStore/crs5_code.csv") # Assumes columns: "code" and "new_crs_value"
|
62 |
+
def _normalize_crs_code(value):
    """Return a canonical string for a CRS code: 11110, "11110.0" and " 11110 " all become "11110"."""
    text = str(value).strip()
    try:
        number = float(text)
    except ValueError:
        return text
    # is_integer() is False for NaN/inf, so those fall through as plain text.
    return str(int(number)) if number.is_integer() else text


def lookup_crs_value(crs_key, lookup_df=None):
    """Look up the ``new_crs_value`` for a CRS code in the lookup table.

    Codes are compared in a normalised string form. The metadata stores the
    key as a string while ``pd.read_csv`` typically infers a numeric dtype
    for the ``code`` column, so a raw ``==`` comparison would never match.

    Args:
        crs_key: The CRS code from the result metadata.
        lookup_df: Optional DataFrame with ``code`` and ``new_crs_value``
            columns. Defaults to the module-level ``crs_lookup`` table.

    Returns:
        The first matching value as a string (integer-valued numbers lose
        their decimals), or ``""`` when the key is empty or not found.
    """
    table = crs_lookup if lookup_df is None else lookup_df
    wanted = _normalize_crs_code(crs_key)
    if not wanted:
        return ""
    codes = table["code"].map(_normalize_crs_code)
    matches = table.loc[codes == wanted, "new_crs_value"]
    if matches.empty:
        return ""
    value = matches.iloc[0]
    # Convert to integer (drop decimals) and then to string; fall back to the
    # raw value when it is not numeric.
    try:
        return str(int(float(value)))
    except (TypeError, ValueError, OverflowError):
        return str(value)
|
71 |
|
72 |
+
###########################################
|
73 |
+
# RAG Answer function (Change 1 & 2 & 3)
|
74 |
+
###########################################
|
75 |
def get_rag_answer(query, top_results):
|
76 |
+
# Build context from each top result using title, objectives, and description.
|
77 |
+
context = "\n\n".join([build_context_for_result(res) for res in top_results])
|
78 |
+
# Truncate context to 11500 tokens (approximation)
|
79 |
+
context = truncate_to_tokens(context, 11500)
|
80 |
+
# Improved prompt with role instruction and formatting instruction.
|
81 |
prompt = (
|
82 |
"You are a project portfolio adviser at the development cooperation GIZ. "
|
83 |
+
"Using the following context, answer the question in English precisely. "
|
84 |
+
"Ensure that any project title mentioned in your answer is wrapped in ** (markdown bold). "
|
85 |
"Only output the final answer below, without repeating the context or question.\n\n"
|
86 |
f"Context:\n{context}\n\n"
|
87 |
f"Question: {query}\n\n"
|
|
|
102 |
else:
|
103 |
return f"Error in generating answer: {response.text}"
|
104 |
|
105 |
+
###########################################
|
106 |
+
# CRS Options using lookup (Change 7)
|
107 |
+
###########################################
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
108 |
@st.cache_data
|
109 |
def get_crs_options(_client, collection_name):
|
110 |
results = hybrid_search(_client, "", collection_name)
|
|
|
112 |
crs_set = set()
|
113 |
for res in all_results:
|
114 |
metadata = res.payload.get('metadata', {})
|
|
|
115 |
crs_key = metadata.get("crs_key", "").strip()
|
116 |
+
if crs_key:
|
117 |
+
new_value = lookup_crs_value(crs_key)
|
118 |
+
crs_combined = f"{crs_key}: {new_value}"
|
|
|
|
|
|
|
|
|
119 |
crs_set.add(crs_combined)
|
120 |
return sorted(crs_set)
|
121 |
|
122 |
+
###########################################
|
123 |
+
# Revised filter_results with budget filtering (Change 7 & 9)
|
124 |
+
###########################################
|
125 |
+
def parse_budget(value):
    """Parse a budget value into a float, treating unusable input as ``0.0``.

    Args:
        value: The raw budget (number, numeric string, or anything else).

    Returns:
        The value as a float; ``0.0`` when it cannot be parsed or is NaN.
    """
    import math  # function-scope import; the file's import block is outside this block

    try:
        parsed = float(value)
    except (TypeError, ValueError):
        return 0.0
    # NaN compares False against every threshold, so a NaN budget would slip
    # through any minimum-budget filter; treat it like a missing value.
    if math.isnan(parsed):
        return 0.0
    return parsed
|
130 |
+
|
131 |
+
def filter_results(results, country_filter, region_filter, end_year_range, crs_filter, budget_filter):
|
132 |
filtered = []
|
133 |
for r in results:
|
134 |
metadata = r.payload.get('metadata', {})
|
|
|
156 |
countries_in_region = c_list
|
157 |
|
158 |
crs_key = metadata.get("crs_key", "").strip()
|
159 |
+
# Use lookup value instead of stored crs_value.
|
160 |
+
new_crs_value = lookup_crs_value(crs_key)
|
161 |
+
crs_combined = f"{crs_key}: {new_crs_value}" if crs_key else ""
|
162 |
+
|
163 |
+
# Enforce CRS filter only if specified.
|
|
|
|
|
164 |
if crs_filter != "All/Not allocated" and crs_combined:
|
165 |
if crs_filter != crs_combined:
|
166 |
continue
|
167 |
|
168 |
+
# Budget filtering: parse total_project value.
|
169 |
+
budget_value = parse_budget(metadata.get('total_project', "0"))
|
170 |
+
# Only keep results with budget >= budget_filter (in million euros, so multiply by 1e6)
|
171 |
+
if budget_value < (budget_filter * 1e6):
|
172 |
+
continue
|
173 |
+
|
174 |
year_ok = True if end_year_val == 0 else (end_year_range[0] <= end_year_val <= end_year_range[1])
|
175 |
|
176 |
if ((country_filter == "All/Not allocated" or (selected_iso_code and selected_iso_code in c_list))
|
|
|
179 |
filtered.append(r)
|
180 |
return filtered
|
181 |
|
182 |
+
###########################################
|
183 |
+
# Get device
|
184 |
+
###########################################
|
185 |
device = 'cuda' if cuda.is_available() else 'cpu'
|
186 |
|
187 |
+
###########################################
|
188 |
+
# App heading and About button (Change 5 & 6)
|
189 |
+
###########################################
|
190 |
st.set_page_config(page_title="SEARCH IATI", layout='wide')
|
191 |
+
col_title, col_about = st.columns([8,2])
|
192 |
+
with col_title:
|
193 |
+
st.markdown("<h1 style='text-align:center;'>GIZ Project Database (PROTOTYPE)</h1>", unsafe_allow_html=True)
|
194 |
+
with col_about:
|
195 |
+
with st.expander("About"):
|
196 |
+
st.markdown(
|
197 |
+
"""
|
198 |
+
**This app is a prototype for testing purposes.**
|
199 |
+
The intended use is to explore AI-generated answers using publicly available project data from the German International Cooperation Society (GIZ) as of 23rd February 2025.
|
200 |
+
**Please do NOT enter sensitive or personal information.**
|
201 |
+
Note: The generated answers are AI-generated and may be wrong or misleading.
|
202 |
+
""")
|
203 |
+
|
204 |
+
###########################################
|
205 |
+
# Query input and budget slider (Change 9)
|
206 |
+
###########################################
|
207 |
var = st.text_input("Enter Search Question")
|
208 |
+
min_budget = st.slider("Minimum Project Budget (Million €)", min_value=0.0, max_value=100.0, value=0.0)
|
209 |
|
210 |
+
###########################################
|
211 |
+
# Load region lookup CSV
|
212 |
+
###########################################
|
213 |
region_lookup_path = "docStore/regions_lookup.csv"
|
214 |
region_df = load_region_data(region_lookup_path)
|
215 |
|
216 |
+
|
217 |
+
###########################################
|
218 |
+
# Create the embeddings collection and save
|
219 |
+
###########################################
|
220 |
# The steps below need to be performed only once and should then be commented out to avoid unnecessary compute overrun.
|
221 |
##### First we process and create the chunks for the relevant data source
|
222 |
#chunks = process_giz_worldwide()
|
|
|
226 |
collection_name = "giz_worldwide"
|
227 |
#hybrid_embed_chunks(docs=temp_doc, collection_name=collection_name, del_if_exists=True)
|
228 |
|
229 |
+
###########################################
|
230 |
+
# Hybrid Search and Filters Setup
|
231 |
+
###########################################
|
232 |
client = get_client()
|
233 |
print(client.get_collections())
|
234 |
max_end_year = get_max_end_year(client, collection_name)
|
|
|
260 |
country_name_mapping, iso_code_to_sub_region = get_country_name_and_region_mapping(client, collection_name, region_df)
|
261 |
unique_country_names = sorted(country_name_mapping.keys())
|
262 |
|
263 |
+
# Layout filter columns
|
264 |
col1, col2, col3, col4 = st.columns([1, 1, 1, 1])
|
265 |
with col1:
|
266 |
region_filter = st.selectbox("Region", ["All/Not allocated"] + sorted(unique_sub_regions))
|
|
|
267 |
if region_filter == "All/Not allocated":
|
268 |
filtered_country_names = unique_country_names
|
269 |
else:
|
|
|
281 |
# Checkbox for exact matches
|
282 |
show_exact_matches = st.checkbox("Show only exact matches", value=False)
|
283 |
|
284 |
+
###########################################
|
285 |
+
# Run the search and apply filters
|
286 |
+
###########################################
|
287 |
results = hybrid_search(client, var, collection_name, limit=500)
|
288 |
semantic_all = results[0]
|
289 |
lexical_all = results[1]
|
|
|
290 |
semantic_all = [r for r in semantic_all if len(r.payload["page_content"]) >= 5]
|
291 |
lexical_all = [r for r in lexical_all if len(r.payload["page_content"]) >= 5]
|
|
|
292 |
semantic_thresholded = [r for r in semantic_all if r.score >= 0.0]
|
293 |
|
294 |
+
# Pass the budget filter (min_budget) into filter_results
|
295 |
+
filtered_semantic = filter_results(semantic_thresholded, country_filter, region_filter, end_year_range, crs_filter, min_budget)
|
296 |
+
filtered_lexical = filter_results(lexical_all, country_filter, region_filter, end_year_range, crs_filter, min_budget)
|
297 |
filtered_semantic_no_dupe = remove_duplicates(filtered_semantic)
|
298 |
filtered_lexical_no_dupe = remove_duplicates(filtered_lexical)
|
299 |
|
|
|
303 |
except (ValueError, TypeError):
|
304 |
return value
|
305 |
|
306 |
+
###########################################
|
307 |
+
# Display Results (Lexical and Semantic)
|
308 |
+
###########################################
|
309 |
+
# --- Lexical Results Branch ---
|
|
|
|
|
|
|
|
|
310 |
# Number of hits passed to the RAG prompt and rendered on the page. The
# captions below are derived from it so they cannot drift from the slice.
TOP_RESULTS_COUNT = 10


def _render_query_heading(query):
    """Render the query as a large, centered heading."""
    import html  # function-scope import; the file's import block is outside this block

    # The query is user input rendered with unsafe_allow_html=True, so it is
    # escaped before being interpolated into the markup.
    st.markdown(
        f"<h2 style='text-align:center; font-size:2.5em;'>Query: {html.escape(query)}</h2>",
        unsafe_allow_html=True,
    )


def _split_snippet(metadata, preview_word_count=90):
    """Return (preview, remainder) of the objectives + description text, split after ``preview_word_count`` words."""
    objectives = metadata.get("objectives", "")
    desc_en = metadata.get("description.en", "").strip()
    desc_de = metadata.get("description.de", "").strip()
    # Use description.en if available; otherwise use description.de.
    description = desc_en if desc_en != "" else desc_de
    words = f"{objectives} {description}".split()
    return " ".join(words[:preview_word_count]), " ".join(words[preview_word_count:])


def _build_details_text(metadata):
    """Build the <br>-separated details shown next to a result (client, duration, budget, country, sector, contact)."""
    start_year = metadata.get('start_year', None)
    end_year = metadata.get('end_year', None)
    start_year_str = extract_year(start_year) if start_year else "Unknown"
    end_year_str = extract_year(end_year) if end_year else "Unknown"
    formatted_project_budget = format_currency(metadata.get('total_project', "Unknown"))
    formatted_total_volume = format_currency(metadata.get('total_volume', "Unknown"))
    # 'countries' is stored as a Python-style list string; swap the quotes so
    # it can be parsed as JSON.
    try:
        c_list = json.loads(metadata.get('countries', "[]").replace("'", '"'))
    except json.JSONDecodeError:
        c_list = []
    matched_countries = []
    for code in c_list:
        if len(code) == 2:
            resolved_name = get_country_name(code.upper(), region_df)
            # Keep only codes that resolved to a name different from the code itself.
            if resolved_name.upper() != code.upper():
                matched_countries.append(resolved_name)
    crs_key = metadata.get("crs_key", "").strip()
    new_crs_value = lookup_crs_value(crs_key)
    crs_combined = f"{crs_key}: {new_crs_value}" if crs_key else "Unknown"
    client_name = metadata.get('client', 'Unknown Client')
    contact = metadata.get("contact", "").strip()
    additional_text = (
        f"Commissioned by **{client_name}**<br>"
        f"Projekt duration **{start_year_str}-{end_year_str}**<br>"
        f"Budget: Project: **{formatted_project_budget}**, Total volume: **{formatted_total_volume}**<br>"
        f"Country: **{', '.join(matched_countries)}**<br>"
        f"Sector: **{crs_combined}**"
    )
    # NOTE(review): the literal below looks like an e-mail address replaced by
    # an obfuscation placeholder during page extraction; confirm the intended
    # address against the real file.
    if contact and contact.lower() != "[email protected]":
        additional_text += f"<br>Contact: **{contact}**"
    return additional_text


def _render_result(res, query, highlight):
    """Render one search hit: title, preview, "Show more" expander and details.

    When ``highlight`` is true (lexical branch) query matches in the title and
    preview are highlighted via HTML; otherwise plain markdown is used.
    """
    metadata = res.payload.get('metadata', {})
    if "title" not in metadata:
        metadata["title"] = compute_title(metadata)
    preview_text, remainder_text = _split_snippet(metadata)
    if highlight:
        # Skip highlighting for a blank query: an empty pattern would match
        # between every character.
        has_query = bool(query.strip())
        title_html = highlight_query(metadata["title"], query) if has_query else metadata["title"]
        st.markdown(f"#### {title_html}", unsafe_allow_html=True)
        preview_html = highlight_query(preview_text, query) if has_query else preview_text
        st.markdown(preview_html, unsafe_allow_html=True)
    else:
        st.markdown(f"#### {metadata['title']}")
        st.write(preview_text)
    # Two columns: left for "Show more" (remainder text), right for details.
    col_left, col_right = st.columns(2)
    with col_left:
        if remainder_text:
            with st.expander("Show more"):
                st.write(remainder_text)
    with col_right:
        st.markdown(_build_details_text(metadata), unsafe_allow_html=True)
    st.divider()


if show_exact_matches:
    st.write(f"Showing **Top {TOP_RESULTS_COUNT} Lexical Search results**")
    query_substring = var.strip().lower()
    lexical_substring_filtered = [r for r in lexical_all if query_substring in r.payload["page_content"].lower()]
    filtered_lexical = filter_results(lexical_substring_filtered, country_filter, region_filter, end_year_range, crs_filter, min_budget)
    filtered_lexical_no_dupe = remove_duplicates(filtered_lexical)
    if not filtered_lexical_no_dupe:
        st.write('No exact matches, consider unchecking "Show only exact matches"')
    else:
        top_results = filtered_lexical_no_dupe[:TOP_RESULTS_COUNT]
        rag_answer = get_rag_answer(var, top_results)
        # Use the query as heading; increase size and center it.
        _render_query_heading(var)
        st.write(rag_answer)
        st.divider()
        for res in top_results:
            _render_result(res, var, highlight=True)
else:
    if not filtered_semantic_no_dupe:
        st.write("No relevant results found.")
    else:
        top_results = filtered_semantic_no_dupe[:TOP_RESULTS_COUNT]
        rag_answer = get_rag_answer(var, top_results)
        _render_query_heading(var)
        st.write(rag_answer)
        st.divider()
        st.write(f"Showing **Top {TOP_RESULTS_COUNT} Semantic Search results**")
        for res in top_results:
            _render_result(res, var, highlight=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|