Update app.py
app.py
CHANGED
@@ -64,6 +64,83 @@ def get_rag_answer(query, top_results):
 else:
     return f"Error in generating answer: {response.text}"
 
+
+#######
+# Helper function: Format project id (e.g., "201940485" -> "2019.4048.5")
+def format_project_id(pid):
+    s = str(pid)
+    if len(s) > 5:
+        return s[:4] + "." + s[4:-1] + "." + s[-1]
+    return s
+
+
+# Helper function: Compute title from metadata using name.en (or name.de if empty)
+def compute_title(metadata):
+    name_en = metadata.get("name.en", "").strip()
+    name_de = metadata.get("name.de", "").strip()
+    base = name_en if name_en else name_de
+    pid = metadata.get("id", "")
+    if base and pid:
+        return f"{base} [{format_project_id(pid)}]"
+    return base or "No Title"
+
+# Helper function: Get CRS filter options from all documents in the collection
+@st.cache_data
+def get_crs_options(client, collection_name):
+    results = hybrid_search(client, "", collection_name)
+    all_results = results[0] + results[1]
+    crs_set = set()
+    for res in all_results:
+        metadata = res.payload.get('metadata', {})
+        crs_key = metadata.get("crs_key", "").strip()
+        crs_value = metadata.get("crs_value", "").strip()
+        if crs_key or crs_value:
+            crs_combined = f"{crs_key}: {crs_value}"
+            crs_set.add(crs_combined)
+    return sorted(crs_set)
+
+# Update filter_results to also filter by crs_combined.
+def filter_results(results, country_filter, region_filter, end_year_range, crs_filter):
+    filtered = []
+    for r in results:
+        metadata = r.payload.get('metadata', {})
+        countries = metadata.get('countries', "[]")
+        year_str = metadata.get('end_year')
+        if year_str:
+            extracted = extract_year(year_str)
+            try:
+                end_year_val = int(extracted) if extracted != "Unknown" else 0
+            except ValueError:
+                end_year_val = 0
+        else:
+            end_year_val = 0
+
+        try:
+            c_list = json.loads(countries.replace("'", '"'))
+            c_list = [code.upper() for code in c_list if len(code) == 2]
+        except json.JSONDecodeError:
+            c_list = []
+
+        selected_iso_code = country_name_mapping.get(country_filter, None)
+        if region_filter != "All/Not allocated":
+            countries_in_region = [code for code in c_list if iso_code_to_sub_region.get(code) == region_filter]
+        else:
+            countries_in_region = c_list
+
+        # Filter by CRS: compute crs_combined and compare to the selected filter.
+        crs_key = metadata.get("crs_key", "").strip()
+        crs_value = metadata.get("crs_value", "").strip()
+        crs_combined = f"{crs_key}: {crs_value}" if (crs_key or crs_value) else ""
+
+        if crs_filter != "All/Not allocated" and crs_filter != crs_combined:
+            continue
+
+        if ((country_filter == "All/Not allocated" or selected_iso_code in c_list)
+                and (region_filter == "All/Not allocated" or countries_in_region)
+                and (end_year_range[0] <= end_year_val <= end_year_range[1])):
+            filtered.append(r)
+    return filtered
+
 #######
 
 # get the device to be used either gpu or cpu
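
A quick sanity sketch of the two new helpers (hypothetical inputs; not part of the commit):

    # format_project_id splits a long id as 4 digits, middle block, check digit
    assert format_project_id("201940485") == "2019.4048.5"
    assert format_project_id("12345") == "12345"  # short ids pass through unchanged

    # compute_title prefers name.en, falls back to name.de, appends the formatted id
    meta = {"name.en": "Water Supply", "name.de": "", "id": "201940485"}
    assert compute_title(meta) == "Water Supply [2019.4048.5]"
    assert compute_title({}) == "No Title"
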
@@ -72,7 +149,7 @@ device = 'cuda' if cuda.is_available() else 'cpu'
 
 st.set_page_config(page_title="SEARCH IATI",layout='wide')
 st.title("GIZ Project Database (PROTOTYPE)")
-var = st.text_input("Enter Search
+var = st.text_input("Enter Search Question")
 
 # Load the region lookup CSV
 region_lookup_path = "docStore/regions_lookup.csv"
@@ -132,80 +209,24 @@ client = get_client()
 country_name_mapping, iso_code_to_sub_region = get_country_name_and_region_mapping(client, collection_name, region_df)
 unique_country_names = sorted(country_name_mapping.keys()) # List of country names
 
-# Layout filters in columns
+# Layout filters in columns: add a new filter for CRS in col4.
 col1, col2, col3, col4 = st.columns([1, 1, 1, 4])
-
-# Region filter
 with col1:
-    region_filter = st.selectbox("Region", ["All/Not allocated"] + sorted(unique_sub_regions))
-
-# Dynamically filter countries based on selected region
-if region_filter == "All/Not allocated":
-    filtered_country_names = unique_country_names # Show all countries if no region is selected
-else:
-    filtered_country_names = [
-        name for name, code in country_name_mapping.items() if iso_code_to_sub_region.get(code) == region_filter
-    ]
-
-# Country filter
+    region_filter = st.selectbox("Region", ["All/Not allocated"] + sorted(unique_sub_regions))
 with col2:
-    country_filter = st.selectbox("Country", ["All/Not allocated"] + filtered_country_names)
-
-# Year range slider # ToDo add end_year filter again
+    country_filter = st.selectbox("Country", ["All/Not allocated"] + filtered_country_names if (filtered_country_names := unique_country_names) else unique_country_names)
 with col3:
     current_year = datetime.now().year
-    default_start_year = current_year -
-    end_year_range = st.slider(
-        min_value=2010,
-        max_value=max_end_year,
-        value=(default_start_year, max_end_year),
-    )
+    default_start_year = current_year - 4
+    end_year_range = st.slider("Project End Year", min_value=2010, max_value=max_end_year, value=(default_start_year, max_end_year))
+with col4:
+    crs_options = ["All/Not allocated"] + get_crs_options(client, collection_name)
+    crs_filter = st.selectbox("CRS", crs_options)
 
 # Checkbox to control whether to show only exact matches
 show_exact_matches = st.checkbox("Show only exact matches", value=False)
 
-def filter_results(results, country_filter, region_filter, end_year_range): ## ToDo add end_year filter again
-    filtered = []
-    for r in results:
-        metadata = r.payload.get('metadata', {})
-        countries = metadata.get('countries', "[]")
-        year_str = metadata.get('end_year')
-        if year_str:
-            extracted = extract_year(year_str)
-            try:
-                end_year_val = int(extracted) if extracted != "Unknown" else 0
-            except ValueError:
-                end_year_val = 0
-        else:
-            end_year_val = 0
 
-        # Convert countries to a list
-        try:
-            c_list = json.loads(countries.replace("'", '"'))
-            c_list = [code.upper() for code in c_list if len(code) == 2]
-        except json.JSONDecodeError:
-            c_list = []
-
-        # Translate selected country name to iso2
-        selected_iso_code = country_name_mapping.get(country_filter, None)
-
-        # Check if any country in the metadata matches the selected region
-        if region_filter != "All/Not allocated":
-            countries_in_region = [code for code in c_list if iso_code_to_sub_region.get(code) == region_filter]
-        else:
-            countries_in_region = c_list
-
-        # Filtering
-        if (
-            (country_filter == "All/Not allocated" or selected_iso_code in c_list)
-            and (region_filter == "All/Not allocated" or countries_in_region)
-            and (end_year_range[0] <= end_year_val <= end_year_range[1]) # ToDo add end_year filter again
-        ):
-            filtered.append(r)
-    return filtered
 
 # Run the search
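
The CRS filter works by rebuilding a combined "key: value" string per result and comparing it to the selectbox choice; a minimal standalone sketch of that matching step (the metadata values here are hypothetical):

    metadata = {"crs_key": "14030", "crs_value": "Basic drinking water supply"}
    crs_key = metadata.get("crs_key", "").strip()
    crs_value = metadata.get("crs_value", "").strip()
    crs_combined = f"{crs_key}: {crs_value}" if (crs_key or crs_value) else ""

    crs_filter = "14030: Basic drinking water supply"
    # A result is kept when no CRS filter is set or when the strings match exactly
    keep = crs_filter == "All/Not allocated" or crs_filter == crs_combined
    assert keep
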
@@ -227,10 +248,9 @@ lexical_all = [
 # 2) Apply a threshold to SEMANTIC results (score >= 0.4)
 semantic_thresholded = [r for r in semantic_all if r.score >= 0.0]
 
-# 2) Filter the entire sets
-filtered_semantic = filter_results(semantic_thresholded, country_filter, region_filter, end_year_range) ## ToDo add end_year filter again
-filtered_lexical = filter_results(lexical_all, country_filter, region_filter, end_year_range) ## ToDo add end_year filter again
+filtered_semantic = filter_results(semantic_thresholded, country_filter, region_filter, end_year_range, crs_filter)
+filtered_lexical = filter_results(lexical_all, country_filter, region_filter, end_year_range, crs_filter)
 filtered_semantic_no_dupe = remove_duplicates(filtered_semantic) # ToDo remove duplicates again?
 filtered_lexical_no_dupe = remove_duplicates(filtered_lexical)
@@ -241,33 +261,21 @@ def format_currency(value):
         return f"€{int(float(value)):,}"
     except (ValueError, TypeError):
         return value
-
-#
+
+# Helper function to highlight query matches (case-insensitive)
+def highlight_query(text, query):
+    pattern = re.compile(re.escape(query), re.IGNORECASE)
+    return pattern.sub(lambda m: f"**{m.group(0)}**", text)
+
+###############################
+# Display Lexical Results Branch
+###############################
 if show_exact_matches:
-    # 1) Display heading
     st.write(f"Showing **Top 15 Lexical Search results** for query: {var}")
-
-    # 2) Do a simple substring check (case-insensitive)
-    # We'll create a new list lexical_substring_filtered
     query_substring = var.strip().lower()
-    lexical_substring_filtered = []
-    for r in lexical_all:
-        # page_content in lowercase
-        page_text_lower = r.payload["page_content"].lower()
-        # Keep this result only if the query substring is found
-        if query_substring in page_text_lower:
-            lexical_substring_filtered.append(r)
-
-    # 3) Now apply your region/country/year filter on that new list
-    filtered_lexical = filter_results(
-        lexical_substring_filtered, country_filter, region_filter, end_year_range
-    ) ## ToDo add end_year filter again
-
-    # 4) Remove duplicates
+    lexical_substring_filtered = [r for r in lexical_all if query_substring in r.payload["page_content"].lower()]
+    filtered_lexical = filter_results(lexical_substring_filtered, country_filter, region_filter, end_year_range, crs_filter)
     filtered_lexical_no_dupe = remove_duplicates(filtered_lexical)
-
-    # 5) If empty after substring + filters + dedupe, show a custom message
     if not filtered_lexical_no_dupe:
         st.write('No exact matches, consider unchecking "Show only exact matches"')
     else:
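
highlight_query does a regex-escaped, case-insensitive substitution that wraps each match in markdown bold; a small self-contained usage sketch (the example strings are invented):

    import re

    def highlight_query(text, query):
        pattern = re.compile(re.escape(query), re.IGNORECASE)
        return pattern.sub(lambda m: f"**{m.group(0)}**", text)

    print(highlight_query("Water management in urban areas", "water"))
    # -> **Water** management in urban areas
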
@@ -277,21 +285,16 @@ if show_exact_matches:
         st.write(rag_answer)
         st.divider()
         for res in top_results:
-            # Metadata
             metadata = res.payload.get('metadata', {})
-            total_project = metadata.get('total_project', "Unknown")
-            id = metadata.get('id', "Unknown")
-            project_name = res.payload['metadata'].get('project_name', 'Project Link')
+            # Compute new title if not already set
+            if "title" not in metadata:
+                metadata["title"] = compute_title(metadata)
+            # Use new title instead of project_name and highlight query if present
+            display_title = highlight_query(metadata["title"], var) if var.strip() else metadata["title"]
             proj_id = metadata.get('id', 'Unknown')
-            st.markdown(f"#### {project_name} [{proj_id}]")
-
-            # Build snippet from objectives and descriptions.
+            st.markdown(f"#### {display_title} [{proj_id}]")
+
+            # Build snippet with potential highlighting
             objectives = metadata.get("objectives", "")
             desc_de = metadata.get("description.de", "")
             desc_en = metadata.get("description.en", "")
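
Putting the title pipeline together: compute_title builds the display string and highlight_query bolds the query term. A sketch assuming both helpers from the hunks above are in scope (names and ids here are invented):

    meta = {"name.en": "Urban water management", "id": "202112345"}
    title = compute_title(meta)                # "Urban water management [2021.1234.5]"
    display = highlight_query(title, "water")  # "Urban **water** management [2021.1234.5]"
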
@@ -301,21 +304,23 @@ if show_exact_matches:
             preview_word_count = 200
             preview_text = " ".join(words[:preview_word_count])
             remainder_text = " ".join(words[preview_word_count:])
+            preview_text = highlight_query(preview_text, var) if var.strip() else preview_text
+            st.write(preview_text)
+            if remainder_text:
+                with st.expander("Show more"):
+                    st.write(remainder_text)
 
             # Keywords
             full_text = res.payload['page_content']
             top_keywords = extract_top_keywords(full_text, top_n=5)
             if top_keywords:
                 st.markdown(f"_{' · '.join(top_keywords)}_")
 
+            # Country info
             try:
-                c_list = json.loads(countries.replace("'", '"'))
+                c_list = json.loads(metadata.get('countries', "[]").replace("'", '"'))
             except json.JSONDecodeError:
                 c_list = []
-
-            # Only keep country names if the region lookup returns a different value.
             matched_countries = []
             for code in c_list:
                 if len(code) == 2:
@@ -323,71 +328,34 @@ if show_exact_matches:
                     if resolved_name.upper() != code.upper():
                         matched_countries.append(resolved_name)
 
-            formatted_total_volume = format_currency(total_volume)
-
-            # Build the final string including a new row for countries.
-            if matched_countries:
-                additional_text = (
-                    f"**{', '.join(matched_countries)}**, commissioned by **{client_name}**\n"
-                    f"Projekt duration **{start_year_str}-{end_year_str}**\n"
-                    f"Budget: Project: **{formatted_project_budget}**, Total volume: **{formatted_total_volume}**\n"
-                    f"Country: **{', '.join(matched_countries)}**"
-                )
-            else:
-                additional_text = (
-                    f"Commissioned by **{client_name}**\n"
-                    f"Projekt duration **{start_year_str}-{end_year_str}**\n"
-                    f"Budget: Project: **{formatted_project_budget}**, Total volume: **{formatted_total_volume}**\n"
-                    f"Country: **{', '.join(c_list) if c_list else 'Unknown'}**"
-                )
+            additional_text = f"Country: **{', '.join(matched_countries) if matched_countries else 'Unknown'}**"
+            # Add contact info if available and not [email protected]
+            contact = metadata.get("contact", "").strip()
+            if contact and contact.lower() != "[email protected]":
+                additional_text += f" | Contact: **{contact}**"
             st.markdown(additional_text)
             st.divider()
 
+###############################
+# Display Semantic Results Branch
+###############################
 else:
     st.write(f"Showing **Top 15 Semantic Search results** for query: {var}")
     if not filtered_semantic_no_dupe:
         st.write("No relevant results found.")
     else:
-        # Get the top 15 results for the RAG context
         top_results = filtered_semantic_no_dupe[:5]
-
-        # Call the RAG function to generate an answer
         rag_answer = get_rag_answer(var, top_results)
-
-        # Display the generated answer at the top of the page
         st.markdown("### Generated Answer")
         st.write(rag_answer)
         st.divider()
-
-        # Now list each individual search result below
         for res in top_results:
-            # Metadata
             metadata = res.payload.get('metadata', {})
-            total_project = metadata.get('total_project', "Unknown")
-            id = metadata.get('id', "Unknown")
-            project_name = res.payload['metadata'].get('project_name', 'Project Link')
-            proj_id = metadata.get('id', 'Unknown')
-            st.markdown(f"#### {project_name} [{proj_id}]")
-
-            # Snippet logic (80 words)
-            # Build snippet from objectives and descriptions.
+            if "title" not in metadata:
+                metadata["title"] = compute_title(metadata)
+            display_title = metadata["title"]
+            st.markdown(f"#### {display_title} [{metadata.get('id', 'Unknown')}]")
+
             objectives = metadata.get("objectives", "")
             desc_de = metadata.get("description.de", "")
             desc_en = metadata.get("description.en", "")
@@ -397,19 +365,19 @@ else:
             preview_word_count = 200
             preview_text = " ".join(words[:preview_word_count])
             remainder_text = " ".join(words[preview_word_count:])
-            st.write(preview_text)
+            st.write(preview_text)
+            if remainder_text:
+                with st.expander("Show more"):
+                    st.write(remainder_text)
+
+            top_keywords = extract_top_keywords(res.payload['page_content'], top_n=5)
             if top_keywords:
                 st.markdown(f"_{' · '.join(top_keywords)}_")
 
             try:
-                c_list = json.loads(countries.replace("'", '"'))
+                c_list = json.loads(metadata.get('countries', "[]").replace("'", '"'))
             except json.JSONDecodeError:
                 c_list = []
-
             matched_countries = []
             for code in c_list:
                 if len(code) == 2:
@@ -417,40 +385,13 @@ else:
                     if resolved_name.upper() != code.upper():
                         matched_countries.append(resolved_name)
 
-            formatted_project_budget = format_currency(total_project)
-            formatted_total_volume = format_currency(total_volume)
-
-            # Build the final string
-            if matched_countries:
-                additional_text = (
-                    f"**{', '.join(matched_countries)}**, commissioned by **{client_name}**\n"
-                    f"Projekt duration **{start_year_str}-{end_year_str}**\n"
-                    f"Budget: Project: **{formatted_project_budget}**, Total volume: **{formatted_total_volume}**\n"
-                    f"Country: **{', '.join(matched_countries)}**"
-                )
-            else:
-                additional_text = (
-                    f"Commissioned by **{client_name}**\n"
-                    f"Projekt duration **{start_year_str}-{end_year_str}**\n"
-                    f"Budget: Project: **{formatted_project_budget}**, Total volume: **{formatted_total_volume}**\n"
-                    f"Country: **{', '.join(c_list) if c_list else 'Unknown'}**"
-                )
-
+            additional_text = f"Country: **{', '.join(matched_countries) if matched_countries else 'Unknown'}**"
+            contact = metadata.get("contact", "").strip()
+            if contact and contact.lower() != "[email protected]":
+                additional_text += f" | Contact: **{contact}**"
             st.markdown(additional_text)
             st.divider()
 
 # for i in results:
 #     st.subheader(str(i.metadata['id'])+":"+str(i.metadata['title_main']))
 #     st.caption(f"Status:{str(i.metadata['status'])}, Country:{str(i.metadata['country_name'])}")