annikwag committed on
Commit 755183b (verified)
1 Parent(s): a2b28e5

Update app.py

Files changed (1)
  1. app.py +88 -156
app.py CHANGED
@@ -10,11 +10,10 @@ from torch import cuda
10
  import json
11
  from datetime import datetime
12
 
13
- # get the device to be used eithe gpu or cpu
14
  device = 'cuda' if cuda.is_available() else 'cpu'
15
 
16
-
17
- st.set_page_config(page_title="SEARCH IATI",layout='wide')
18
  st.title("GIZ Project Database (PROTOTYPE)")
19
  var = st.text_input("Enter Search Query")
20
 
@@ -23,14 +22,11 @@ region_lookup_path = "docStore/regions_lookup.csv"
23
  region_df = load_region_data(region_lookup_path)
24
 
25
  #################### Create the embeddings collection and save ######################
26
- # the steps below need to be performed only once and then commented out any unnecssary compute over-run
27
- ##### First we process and create the chunks for relvant data source
28
- #chunks = process_giz_worldwide()
29
- ##### Convert to langchain documents
30
- #temp_doc = create_documents(chunks,'chunks')
31
- ##### Embed and store docs, check if collection exist then you need to update the collection
32
  collection_name = "giz_worldwide"
33
- #hybrid_embed_chunks(docs=temp_doc, collection_name=collection_name, del_if_exists=True)
34
 
35
  ################### Hybrid Search ######################################################
36
  client = get_client()
@@ -81,11 +77,11 @@ col1, col2, col3, col4 = st.columns([1, 1, 1, 4])
81
 
82
  # Region filter
83
  with col1:
84
- region_filter = st.selectbox("Region", ["All/Not allocated"] + sorted(unique_sub_regions)) # Display region names
85
 
86
  # Dynamically filter countries based on selected region
87
  if region_filter == "All/Not allocated":
88
- filtered_country_names = unique_country_names # Show all countries if no region is selected
89
  else:
90
  filtered_country_names = [
91
  name for name, code in country_name_mapping.items() if iso_code_to_sub_region.get(code) == region_filter
@@ -93,14 +89,12 @@ else:
93
 
94
  # Country filter
95
  with col2:
96
- country_filter = st.selectbox("Country", ["All/Not allocated"] + filtered_country_names) # Display filtered country names
97
 
98
- # # Year range slider # ToDo add end_year filter again
99
  # with col3:
100
  # current_year = datetime.now().year
101
  # default_start_year = current_year - 5
102
-
103
- # # 3) The max_value is now the actual max end_year from collection
104
  # end_year_range = st.slider(
105
  # "Project End Year",
106
  # min_value=2010,
@@ -111,7 +105,7 @@ with col2:
111
  # Checkbox to control whether to show only exact matches
112
  show_exact_matches = st.checkbox("Show only exact matches", value=False)
113
 
114
- def filter_results(results, country_filter, region_filter): ## , end_year_range ToDo add end_year filter again
115
  filtered = []
116
  for r in results:
117
  metadata = r.payload.get('metadata', {})
@@ -142,135 +136,97 @@ def filter_results(results, country_filter, region_filter): ## , end_year_range
142
  else:
143
  countries_in_region = c_list
144
 
145
- # Filtering
146
  if (
147
  (country_filter == "All/Not allocated" or selected_iso_code in c_list)
148
  and (region_filter == "All/Not allocated" or countries_in_region)
149
- # and (end_year_range[0] <= end_year_val <= end_year_range[1]) # ToDo add end_year filter again
150
  ):
151
  filtered.append(r)
152
  return filtered
153
 
154
  # Run the search
155
-
156
- # 1) Adjust limit so we get more than 15 results
157
- results = hybrid_search(client, var, collection_name, limit=500) # e.g., 100 or 200
158
-
159
- # results is a tuple: (semantic_results, lexical_results)
160
  semantic_all = results[0]
161
  lexical_all = results[1]
162
 
163
- # 2) Filter out content < 20 chars (as intermediate fix to problem that e.g. super short paragraphs with few chars get high similarity score)
164
- semantic_all = [
165
- r for r in semantic_all if len(r.payload["page_content"]) >= 5
166
- ]
167
- lexical_all = [
168
- r for r in lexical_all if len(r.payload["page_content"]) >= 5
169
- ]
170
 
171
- # 2) Apply a threshold to SEMANTIC results (score >= 0.4)
172
  semantic_thresholded = [r for r in semantic_all if r.score >= 0.0]
173
 
174
- # 2) Filter the entire sets
175
- filtered_semantic = filter_results(semantic_thresholded, country_filter, region_filter) ## , end_year_range ToDo add end_year filter again
176
- filtered_lexical = filter_results(lexical_all, country_filter, region_filter)## , end_year_range ToDo add end_year filter again
177
 
178
- filtered_semantic_no_dupe = remove_duplicates(filtered_semantic) # ToDo remove duplicates again?
179
  filtered_lexical_no_dupe = remove_duplicates(filtered_lexical)
180
181
 
182
- # 3) Retrieve top 15 *after* filtering
183
- # Check user preference
184
  if show_exact_matches:
185
- # 1) Display heading
186
  st.write(f"Showing **Top 15 Lexical Search results** for query: {var}")
187
 
188
- # 2) Do a simple substring check (case-insensitive)
189
- # We'll create a new list lexical_substring_filtered
190
  query_substring = var.strip().lower()
191
  lexical_substring_filtered = []
192
  for r in lexical_all:
193
- # page_content in lowercase
194
- page_text_lower = r.payload["page_content"].lower()
195
- # Keep this result only if the query substring is found
196
- if query_substring in page_text_lower:
197
  lexical_substring_filtered.append(r)
198
 
199
- # 3) Now apply your region/country/year filter on that new list
200
- filtered_lexical = filter_results(
201
- lexical_substring_filtered, country_filter, region_filter
202
- ) ## , end_year_range ToDo add end_year filter again
203
-
204
- # 4) Remove duplicates
205
  filtered_lexical_no_dupe = remove_duplicates(filtered_lexical)
206
 
207
- # 5) If empty after substring + filters + dedupe, show a custom message
208
  if not filtered_lexical_no_dupe:
209
  st.write('No exact matches, consider unchecking "Show only exact matches"')
210
  else:
211
- # 6) Display the first 15 matching results
212
  for res in filtered_lexical_no_dupe[:15]:
213
- project_name = res.payload['metadata'].get('project_name', 'Project Link')
214
- url = res.payload['metadata'].get('url', '#')
215
- st.markdown(f"#### [{project_name}]({url})")
216
-
217
- # Snippet logic (80 words)
218
  full_text = res.payload['page_content']
219
- words = full_text.split()
220
- preview_word_count = 200
221
- preview_text = " ".join(words[:preview_word_count])
222
- remainder_text = " ".join(words[preview_word_count:])
223
- st.write(preview_text + ("..." if remainder_text else ""))
224
-
225
- # Keywords
226
  top_keywords = extract_top_keywords(full_text, top_n=5)
227
  if top_keywords:
228
  st.markdown(f"_{' 路 '.join(top_keywords)}_")
229
 
230
- # Metadata
231
- metadata = res.payload.get('metadata', {})
232
- countries = metadata.get('countries', "[]")
233
  client_name = metadata.get('client', 'Unknown Client')
234
  start_year = metadata.get('start_year', None)
235
  end_year = metadata.get('end_year', None)
236
  total_volume = metadata.get('total_volume', "Unknown")
237
  total_project = metadata.get('total_project', "Unknown")
238
- id = metadata.get('id', "Unknown")
239
 
240
-
241
- try:
242
- c_list = json.loads(countries.replace("'", '"'))
243
- except json.JSONDecodeError:
244
- c_list = []
245
-
246
- # Only keep country names if the region lookup (get_country_name)
247
- # returns something different than the raw code.
248
- matched_countries = []
249
- for code in c_list:
250
- if len(code) == 2:
251
- resolved_name = get_country_name(code.upper(), region_df)
252
- # If get_country_name didn't find a match,
253
- # it typically just returns the same code (like "XX").
254
- # We'll consider "successfully looked up" if
255
- # resolved_name != code.upper().
256
- if resolved_name.upper() != code.upper():
257
- matched_countries.append(resolved_name)
258
-
259
- # Format the year range
260
  start_year_str = f"{int(round(float(start_year)))}" if start_year else "Unknown"
261
  end_year_str = f"{int(round(float(end_year)))}" if end_year else "Unknown"
262
-
263
- if matched_countries:
264
- additional_text = (
265
- f"**{', '.join(matched_countries)}**, commissioned by **{client_name}**, "
266
- f"**{start_year_str}-{end_year_str}**, project ID: {id}, project budget: {total_project}, total volumne: {total_volume}"
267
- )
268
- else:
269
- additional_text = (
270
- f"Commissioned by **{client_name}**, **{start_year_str}-{end_year_str}**, project ID: {id}, project budget: {total_project}, total volumne: {total_volume}"
271
- )
272
-
273
-
274
  st.markdown(additional_text)
275
  st.divider()
276
 
@@ -280,75 +236,51 @@ else:
280
  if not filtered_semantic_no_dupe:
281
  st.write("No relevant results found.")
282
  else:
283
- # Show the top 15 from filtered_semantic
284
  for res in filtered_semantic_no_dupe[:15]:
285
- project_name = res.payload['metadata'].get('project_name', 'Project Link')
286
- url = res.payload['metadata'].get('url', '#')
287
- st.markdown(f"#### [{project_name}]({url})")
288
-
289
- # Snippet logic
290
- full_text = res.payload['page_content']
291
- words = full_text.split()
292
- preview_word_count = 10
293
- preview_text = " ".join(words[:preview_word_count])
294
- remainder_text = " ".join(words[preview_word_count:])
295
- st.write(preview_text + ("..." if remainder_text else ""))
 
 
 
296
 
297
  # Keywords
 
298
  top_keywords = extract_top_keywords(full_text, top_n=5)
299
  if top_keywords:
300
  st.markdown(f"_{' 路 '.join(top_keywords)}_")
301
 
302
- # Metadata
303
- metadata = res.payload.get('metadata', {})
304
- countries = metadata.get('countries', "[]")
305
  client_name = metadata.get('client', 'Unknown Client')
306
  start_year = metadata.get('start_year', None)
307
  end_year = metadata.get('end_year', None)
308
  total_volume = metadata.get('total_volume', "Unknown")
309
  total_project = metadata.get('total_project', "Unknown")
310
- id = metadata.get('id', "Unknown")
311
-
312
- try:
313
- c_list = json.loads(countries.replace("'", '"'))
314
- except json.JSONDecodeError:
315
- c_list = []
316
-
317
- # Only keep country names if the region lookup (get_country_name)
318
- # returns something different than the raw code.
319
- matched_countries = []
320
- for code in c_list:
321
- if len(code) == 2:
322
- resolved_name = get_country_name(code.upper(), region_df)
323
- # If get_country_name didn't find a match,
324
- # it typically just returns the same code (like "XX").
325
- # We'll consider "successfully looked up" if
326
- # resolved_name != code.upper().
327
- if resolved_name.upper() != code.upper():
328
- matched_countries.append(resolved_name)
329
-
330
- # Format the year range
331
  start_year_str = extract_year(start_year) if start_year else "Unknown"
332
  end_year_str = extract_year(end_year) if end_year else "Unknown"
333
-
334
- # Build the final string
335
- if matched_countries:
336
- additional_text = (
337
- f"**{', '.join(matched_countries)}**, commissioned by **{client_name}**, "
338
- f"**{start_year_str}-{end_year_str}**, project ID: {id}, project budget: {total_project}, total volumne: {total_volume}"
339
- )
340
- else:
341
- additional_text = (
342
- f"Commissioned by **{client_name}**, **{start_year_str}-{end_year_str}**, project ID: {id}, project budget: {total_project}, total volumne: {total_volume}"
343
- )
344
-
345
-
346
  st.markdown(additional_text)
347
  st.divider()
348
 
349
-
350
- # for i in results:
351
- # st.subheader(str(i.metadata['id'])+":"+str(i.metadata['title_main']))
352
- # st.caption(f"Status:{str(i.metadata['status'])}, Country:{str(i.metadata['country_name'])}")
353
- # st.write(i.page_content)
354
- # st.divider()

The same hunks, shown from the new-file side:
10
  import json
11
  from datetime import datetime
12
 
13
+ # select the device to use: GPU if available, otherwise CPU
14
  device = 'cuda' if cuda.is_available() else 'cpu'
15
 
16
+ st.set_page_config(page_title="SEARCH IATI", layout='wide')
 
17
  st.title("GIZ Project Database (PROTOTYPE)")
18
  var = st.text_input("Enter Search Query")
19
 
 
22
  region_df = load_region_data(region_lookup_path)
23
 
24
  #################### Create the embeddings collection and save ######################
25
+ # Uncomment these lines to process and embed your data only once.
26
+ # chunks = process_giz_worldwide()
27
+ # temp_doc = create_documents(chunks, 'chunks')
 
 
 
28
  collection_name = "giz_worldwide"
29
+ # hybrid_embed_chunks(docs=temp_doc, collection_name=collection_name, del_if_exists=True)
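The indexing block above is meant to run only once; one way to avoid commenting and uncommenting it is to guard it on whether the collection already exists. A minimal sketch, assuming get_client() returns a qdrant_client.QdrantClient and the helper functions keep the signatures shown in this diff:

```python
# Sketch only: run the expensive chunk/embed pipeline just when the
# collection is missing (helpers are app.py's own, shown above).
client = get_client()  # assumed to return qdrant_client.QdrantClient
existing = {c.name for c in client.get_collections().collections}
if collection_name not in existing:
    chunks = process_giz_worldwide()
    temp_doc = create_documents(chunks, 'chunks')
    hybrid_embed_chunks(docs=temp_doc, collection_name=collection_name, del_if_exists=True)
```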
30
 
31
  ################### Hybrid Search ######################################################
32
  client = get_client()
 
77
 
78
  # Region filter
79
  with col1:
80
+ region_filter = st.selectbox("Region", ["All/Not allocated"] + sorted(unique_sub_regions))
81
 
82
  # Dynamically filter countries based on selected region
83
  if region_filter == "All/Not allocated":
84
+ filtered_country_names = unique_country_names
85
  else:
86
  filtered_country_names = [
87
  name for name, code in country_name_mapping.items() if iso_code_to_sub_region.get(code) == region_filter
 
89
 
90
  # Country filter
91
  with col2:
92
+ country_filter = st.selectbox("Country", ["All/Not allocated"] + filtered_country_names)
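For context, the lookups these widgets rely on (unique_sub_regions, unique_country_names, country_name_mapping, iso_code_to_sub_region) are built elsewhere in app.py from regions_lookup.csv. A rough sketch of how they might be derived; the column names are assumptions:

```python
import pandas as pd

# Hypothetical reconstruction; the real load_region_data() lives outside this diff
# and the column names ('iso_code', 'country_name', 'sub_region') are assumptions.
region_df = pd.read_csv("docStore/regions_lookup.csv")
iso_code_to_sub_region = dict(zip(region_df["iso_code"], region_df["sub_region"]))
country_name_mapping = dict(zip(region_df["country_name"], region_df["iso_code"]))
unique_sub_regions = region_df["sub_region"].dropna().unique().tolist()
unique_country_names = sorted(country_name_mapping)
```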
93
 
94
+ # ToDo: Add year filter later if needed (currently commented out)
95
  # with col3:
96
  # current_year = datetime.now().year
97
  # default_start_year = current_year - 5
 
 
98
  # end_year_range = st.slider(
99
  # "Project End Year",
100
  # min_value=2010,
 
105
  # Checkbox to control whether to show only exact matches
106
  show_exact_matches = st.checkbox("Show only exact matches", value=False)
107
 
108
+ def filter_results(results, country_filter, region_filter):
109
  filtered = []
110
  for r in results:
111
  metadata = r.payload.get('metadata', {})
 
136
  else:
137
  countries_in_region = c_list
138
 
 
139
  if (
140
  (country_filter == "All/Not allocated" or selected_iso_code in c_list)
141
  and (region_filter == "All/Not allocated" or countries_in_region)
 
142
  ):
143
  filtered.append(r)
144
  return filtered
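Since filter_results is split across several hunks, here is the whole function pieced together as a sketch; the handling of the 'countries' metadata between the visible hunks, and the origin of selected_iso_code and iso_code_to_sub_region, are assumptions:

```python
import json

def filter_results(results, country_filter, region_filter):
    # selected_iso_code and iso_code_to_sub_region are assumed to be module-level
    # values derived from the selectboxes and region lookup elsewhere in app.py.
    filtered = []
    for r in results:
        metadata = r.payload.get('metadata', {})
        countries = metadata.get('countries', "[]")
        try:
            c_list = json.loads(countries.replace("'", '"'))  # e.g. "['DE', 'KE']"
        except json.JSONDecodeError:
            c_list = []
        if region_filter != "All/Not allocated":
            countries_in_region = [c for c in c_list
                                   if iso_code_to_sub_region.get(c) == region_filter]
        else:
            countries_in_region = c_list
        if (
            (country_filter == "All/Not allocated" or selected_iso_code in c_list)
            and (region_filter == "All/Not allocated" or countries_in_region)
        ):
            filtered.append(r)
    return filtered
```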
145
 
146
  # Run the search
147
+ results = hybrid_search(client, var, collection_name, limit=500)
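hybrid_search returns a (semantic_results, lexical_results) tuple of scored Qdrant points. An optional guard against empty queries, not part of this commit, could sit just before this call:

```python
# Optional hardening (assumption, not in the commit): skip searching until
# the user has actually typed a query.
if not var.strip():
    st.info("Enter a search query above to see results.")
    st.stop()

results = hybrid_search(client, var, collection_name, limit=500)
semantic_all, lexical_all = results  # same as results[0] / results[1] below
```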
 
 
 
 
148
  semantic_all = results[0]
149
  lexical_all = results[1]
150
 
151
+ # Filter out very short content
152
+ semantic_all = [r for r in semantic_all if len(r.payload["page_content"]) >= 5]
153
+ lexical_all = [r for r in lexical_all if len(r.payload["page_content"]) >= 5]
 
 
 
 
154
 
155
+ # Apply a threshold to SEMANTIC results (score >= 0.0)
156
  semantic_thresholded = [r for r in semantic_all if r.score >= 0.0]
157
 
158
+ filtered_semantic = filter_results(semantic_thresholded, country_filter, region_filter)
159
+ filtered_lexical = filter_results(lexical_all, country_filter, region_filter)
 
160
 
161
+ filtered_semantic_no_dupe = remove_duplicates(filtered_semantic)
162
  filtered_lexical_no_dupe = remove_duplicates(filtered_lexical)
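remove_duplicates is imported from the app's utilities and is not shown in this diff; a hypothetical stand-in that de-duplicates on the project id, falling back to the text, would look like:

```python
# Hypothetical stand-in for the real helper; the de-duplication key is an assumption.
def remove_duplicates(results):
    seen = set()
    unique = []
    for r in results:
        key = r.payload.get('metadata', {}).get('id') or r.payload['page_content']
        if key not in seen:
            seen.add(key)
            unique.append(r)
    return unique
```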
163
 
164
+ # Define a helper function to format currency values
165
+ def format_currency(value):
166
+ try:
167
+ # Convert to float then int for formatting (assumes whole numbers)
168
+ return f"鈧瑊int(float(value)):,}"
169
+ except (ValueError, TypeError):
170
+ return value
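A quick check of the new helper with made-up inputs:

```python
print(format_currency("2500000.0"))  # -> €2,500,000
print(format_currency("Unknown"))    # -> Unknown (returned unchanged)
print(format_currency(None))         # -> None (returned unchanged)
```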
171
 
172
+ # Display Results
 
173
  if show_exact_matches:
 
174
  st.write(f"Showing **Top 15 Lexical Search results** for query: {var}")
175
 
 
 
176
  query_substring = var.strip().lower()
177
  lexical_substring_filtered = []
178
  for r in lexical_all:
179
+ if query_substring in r.payload["page_content"].lower():
 
 
 
180
  lexical_substring_filtered.append(r)
181
 
182
+ filtered_lexical = filter_results(lexical_substring_filtered, country_filter, region_filter)
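The exact-match test is a plain case-insensitive substring check; if whole-word matching were ever wanted instead, a regex variant (an alternative, not what this commit does) would be:

```python
import re

# Whole-word variant of the substring check above (optional alternative).
pattern = re.compile(rf"\b{re.escape(query_substring)}\b", re.IGNORECASE)
lexical_substring_filtered = [
    r for r in lexical_all if pattern.search(r.payload["page_content"])
]
```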
 
 
 
 
 
183
  filtered_lexical_no_dupe = remove_duplicates(filtered_lexical)
184
 
 
185
  if not filtered_lexical_no_dupe:
186
  st.write('No exact matches, consider unchecking "Show only exact matches"')
187
  else:
 
188
  for res in filtered_lexical_no_dupe[:15]:
189
+ metadata = res.payload.get('metadata', {})
190
+ # Get title and id; do not format as a link.
191
+ project_name = metadata.get('project_name', 'Project Link')
192
+ proj_id = metadata.get('id', 'Unknown')
193
+ st.markdown(f"#### {project_name} [{proj_id}]")
194
+
195
+ # Build snippet from objectives and descriptions.
196
+ objectives = metadata.get("objectives", "")
197
+ desc_de = metadata.get("description.de", "")
198
+ desc_en = metadata.get("description.en", "")
199
+ description = desc_de if desc_de else desc_en
200
+ full_snippet = f"Objective: {objectives} Description: {description}"
201
+ preview_limit = 400 # preview limit in characters
202
+ preview_snippet = full_snippet if len(full_snippet) <= preview_limit else full_snippet[:preview_limit] + "..."
203
+ # Using HTML to add a tooltip with the full snippet text.
204
+ st.markdown(f'<span title="{full_snippet}">{preview_snippet}</span>', unsafe_allow_html=True)
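Because the snippet is injected as raw HTML, quotes or angle brackets in the metadata could break the span or its tooltip. An optional hardening, not part of this commit, is to escape the text first:

```python
import html

# Escape metadata-derived text before embedding it in the title attribute.
safe_full = html.escape(full_snippet, quote=True)
safe_preview = html.escape(preview_snippet, quote=True)
st.markdown(f'<span title="{safe_full}">{safe_preview}</span>', unsafe_allow_html=True)
```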
205
+
206
+ # Keywords remain the same.
207
  full_text = res.payload['page_content']
208
  top_keywords = extract_top_keywords(full_text, top_n=5)
209
  if top_keywords:
210
  st.markdown(f"_{' 路 '.join(top_keywords)}_")
211
 
212
+ # Metadata: get client, duration and budget details.
 
 
213
  client_name = metadata.get('client', 'Unknown Client')
214
  start_year = metadata.get('start_year', None)
215
  end_year = metadata.get('end_year', None)
216
  total_volume = metadata.get('total_volume', "Unknown")
217
  total_project = metadata.get('total_project', "Unknown")
 
218
219
  start_year_str = f"{int(round(float(start_year)))}" if start_year else "Unknown"
220
  end_year_str = f"{int(round(float(end_year)))}" if end_year else "Unknown"
221
+
222
+ formatted_project_budget = format_currency(total_project)
223
+ formatted_total_volume = format_currency(total_volume)
224
+
225
+ additional_text = (
226
+ f"Commissioned by **{client_name}**\n"
227
+ f"Projekt duration **{start_year_str}-{end_year_str}**\n"
228
+ f"Budget: Project: **{formatted_project_budget}**, total volume: **{formatted_total_volume}**"
229
+ )
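Note that Markdown collapses a bare \n, so the three fields above will render on one line; if separate lines are intended, hard line breaks (two trailing spaces before \n) do the job:

```python
# Variant with Markdown hard line breaks ("  \n") so each field renders on its own line.
additional_text = (
    f"Commissioned by **{client_name}**  \n"
    f"Project duration **{start_year_str}-{end_year_str}**  \n"
    f"Budget: Project: **{formatted_project_budget}**, total volume: **{formatted_total_volume}**"
)
```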
 
 
 
230
  st.markdown(additional_text)
231
  st.divider()
232
 
 
236
  if not filtered_semantic_no_dupe:
237
  st.write("No relevant results found.")
238
  else:
 
239
  for res in filtered_semantic_no_dupe[:15]:
240
+ metadata = res.payload.get('metadata', {})
241
+ project_name = metadata.get('project_name', 'Project Link')
242
+ proj_id = metadata.get('id', 'Unknown')
243
+ st.markdown(f"#### {project_name} [{proj_id}]")
244
+
245
+ # Build snippet from objectives and descriptions.
246
+ objectives = metadata.get("objectives", "")
247
+ desc_de = metadata.get("description.de", "")
248
+ desc_en = metadata.get("description.en", "")
249
+ description = desc_de if desc_de else desc_en
250
+ full_snippet = f"Objective: {objectives} Description: {description}"
251
+ preview_limit = 400
252
+ preview_snippet = full_snippet if len(full_snippet) <= preview_limit else full_snippet[:preview_limit] + "..."
253
+ st.markdown(f'<span title="{full_snippet}">{preview_snippet}</span>', unsafe_allow_html=True)
254
 
255
  # Keywords
256
+ full_text = res.payload['page_content']
257
  top_keywords = extract_top_keywords(full_text, top_n=5)
258
  if top_keywords:
259
  st.markdown(f"_{' 路 '.join(top_keywords)}_")
260
 
 
 
 
261
  client_name = metadata.get('client', 'Unknown Client')
262
  start_year = metadata.get('start_year', None)
263
  end_year = metadata.get('end_year', None)
264
  total_volume = metadata.get('total_volume', "Unknown")
265
  total_project = metadata.get('total_project', "Unknown")
266
+
267
  start_year_str = extract_year(start_year) if start_year else "Unknown"
268
  end_year_str = extract_year(end_year) if end_year else "Unknown"
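extract_year comes from the app's utilities and is not shown here; a hypothetical stand-in that pulls a four-digit year out of whatever the metadata holds:

```python
import re

# Hypothetical stand-in for the real helper.
def extract_year(value):
    match = re.search(r"(19|20)\d{2}", str(value))
    return match.group(0) if match else "Unknown"
```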
269
+
270
+ formatted_project_budget = format_currency(total_project)
271
+ formatted_total_volume = format_currency(total_volume)
272
+
273
+ additional_text = (
274
+ f"Commissioned by **{client_name}**\n"
275
+ f"Projekt duration **{start_year_str}-{end_year_str}**\n"
276
+ f"Budget: Project: **{formatted_project_budget}**, total volume: **{formatted_total_volume}**"
277
+ )
 
 
 
 
278
  st.markdown(additional_text)
279
  st.divider()
280
 
281
+ # Uncomment the following lines if you need to debug by listing raw results.
282
+ # for i in results:
283
+ # st.subheader(str(i.metadata['id'])+":"+str(i.metadata['title_main']))
284
+ # st.caption(f"Status:{str(i.metadata['status'])}, Country:{str(i.metadata['country_name'])}")
285
+ # st.write(i.page_content)
286
+ # st.divider()