from datetime import datetime, timezone

import pandas as pd

from appStore.prep_utils import create_chunks
from appStore.search import hybrid_search

path_to_data = "./docStore/"

def convert_to_date(val):
    """Convert an epoch timestamp in milliseconds (numeric or string) to a
    'YYYY-MM-DD' date string; pass through values already in that format."""
    try:
        # If val is a string, first check whether it represents a numeric value.
        if isinstance(val, str):
            val_str = val.strip()
            try:
                # Try converting the string to a float (i.e. an epoch in string form).
                num = float(val_str)
                return datetime.fromtimestamp(num / 1000, tz=timezone.utc).strftime("%Y-%m-%d")
            except ValueError:
                # Not a numeric string; assume it is already a date string in
                # "YYYY-MM-DD" format and validate it.
                datetime.strptime(val_str, "%Y-%m-%d")
                return val_str
        elif isinstance(val, (int, float)):
            return datetime.fromtimestamp(val / 1000, tz=timezone.utc).strftime("%Y-%m-%d")
        else:
            return "Unknown"
    except Exception:
        return "Unknown"

def process_iati():
    """
    Read the IATI CSV exports, merge them into a single project table,
    and split each project text into chunks for embedding.
    """
    orgas_df = pd.read_csv(f"{path_to_data}iati_files/project_orgas.csv")
    region_df = pd.read_csv(f"{path_to_data}iati_files/project_region.csv")
    sector_df = pd.read_csv(f"{path_to_data}iati_files/project_sector.csv")
    status_df = pd.read_csv(f"{path_to_data}iati_files/project_status.csv")
    texts_df = pd.read_csv(f"{path_to_data}iati_files/project_texts.csv")

    projects_df = pd.merge(orgas_df, region_df, on='iati_id', how='inner')
    projects_df = pd.merge(projects_df, sector_df, on='iati_id', how='inner')
    projects_df = pd.merge(projects_df, status_df, on='iati_id', how='inner')
    projects_df = pd.merge(projects_df, texts_df, on='iati_id', how='inner')

    # Keep only BMZ-related projects.
    projects_df = projects_df[projects_df.client.str.contains('bmz')].reset_index(drop=True)
    projects_df.drop(columns=['orga_abbreviation', 'client',
                              'orga_full_name', 'country',
                              'country_flag', 'crs_5_code', 'crs_3_code', 'country_code_list',
                              'sgd_pred_code', 'crs_5_name', 'crs_3_name', 'sgd_pred_str'],
                     inplace=True)

    # Join title and description with a space so words at the boundary don't merge.
    projects_df['text_size'] = projects_df.apply(
        lambda x: len((x['title_main'] + ' ' + x['description_main']).split()), axis=1)
    projects_df['chunks'] = projects_df.apply(
        lambda x: create_chunks(x['title_main'] + ' ' + x['description_main']), axis=1)
    # One row per chunk.
    projects_df = projects_df.explode(column=['chunks'], ignore_index=True)
    projects_df['source'] = 'IATI'
    projects_df.rename(columns={'iati_id': 'id', 'iati_orga_id': 'org'}, inplace=True)
    return projects_df

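# Usage sketch (assumes the CSV files exist under ./docStore/iati_files/ with
# the columns referenced above; the result carries one row per chunk):
#
#   iati_df = process_iati()
#   iati_df[['id', 'org', 'chunks', 'source']].head()
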
def convert_crs_value(x):
    """Normalise a CRS value: strip a trailing '.0' and return an int where possible."""
    if pd.isnull(x):
        return x
    # Convert to string and remove a trailing '.0' if present.
    x_str = str(x).strip()
    if x_str.endswith(".0"):
        x_str = x_str[:-2]
    try:
        return int(x_str)
    except ValueError:
        return x

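# Expected behaviour, for illustration (the code below uses a made-up CRS-like value):
#
#   convert_crs_value(11110.0)      -> 11110
#   convert_crs_value("11110.0")    -> 11110
#   convert_crs_value("not-a-code") -> "not-a-code"
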
def process_giz_worldwide():
    """
    Read the new giz_worldwide file and prepare the data for embedding.

    Adjustments made:
    - Reads the file 'giz_worldwide_api_download_23_02_2025.json'
    - Renames 'name.en' to 'project_name'
    - Uses the 'merged_text' column for embedding the whole text (no chunking)
    - Creates an empty 'url' column (since the new dataset has no URL)
    - Renames 'duration.project.start' to 'start_year' and 'duration.project.end' to 'end_year'
    - Converts 'crs_key' to an integer-like string (dropping any trailing '.0') if present
    """
    # Read the new JSON file.
    giz_df = pd.read_json(f'{path_to_data}giz_worldwide/giz_worldwide_api_download_23_02_2025.json')
    # Reset the index for safety.
    giz_df = giz_df.reset_index(drop=True)
    # Rename columns per the new dataset requirements.
    giz_df = giz_df.rename(columns={
        'name.en': 'project_name',
        'duration.project.start': 'start_year',
        'duration.project.end': 'end_year'
    })
    giz_df['end_year'] = giz_df['end_year'].apply(convert_to_date)
    # Create an empty 'url' column, as the new dataset has no URL.
    giz_df['url'] = ''

    # Normalise the 'crs_key' column: convert to float, then int, then string.
    if 'crs_key' in giz_df.columns:
        def clean_crs_key(x):
            x_str = str(x).strip()
            if not x_str:
                return x_str
            try:
                # Convert to float, then to int, then back to string.
                return str(int(float(x_str)))
            except Exception:
                # Fallback: strip a trailing ".0" if present.
                return x_str[:-2] if x_str.endswith(".0") else x_str
        giz_df['crs_key'] = giz_df['crs_key'].apply(clean_crs_key)

    # Compute text_size from merged_text and assign the full text to 'chunks'.
    giz_df['text_size'] = giz_df['merged_text'].apply(
        lambda text: len(text.split()) if isinstance(text, str) else 0)
    # Use the full merged_text for embedding (no chunking).
    giz_df['chunks'] = giz_df['merged_text']
    giz_df['source'] = 'GIZ_WORLDWIDE'
    return giz_df

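# Usage sketch (assumes the JSON download above is present and contains the
# 'merged_text' and 'crs_key' columns):
#
#   giz_df = process_giz_worldwide()
#   giz_df[['project_name', 'start_year', 'end_year', 'source']].head()
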
# Previous implementation (website scrape with chunking), kept for reference:
# def process_giz_worldwide():
#     """
#     this will read the giz_worldwide files and create the chunks
#     """
#     giz_df = pd.read_json(f'{path_to_data}giz_worldwide/data_giz_website.json')
#     giz_df = giz_df.rename(columns={'content': 'project_description'})
#     # Sample random rows for quick embeddings (seed set for reproducibility)
#     giz_df = giz_df.sample(n=5, random_state=42)
#     giz_df['text_size'] = giz_df.apply(lambda x: len((x['project_name'] + x['project_description']).split()), axis=1)
#     giz_df['chunks'] = giz_df.apply(lambda x: create_chunks(x['project_name'] + x['project_description']), axis=1)
#     print("initial df length:", len(giz_df))
#     giz_df = giz_df.explode(column=['chunks'], ignore_index=True)
#     print("new df length:", len(giz_df))
#     print(giz_df.columns)
#     # giz_df.drop(columns=['filename', 'url', 'name', 'mail',
#     #                      'language', 'start_year', 'end_year', 'poli_trager'], inplace=True)
#     giz_df['source'] = 'GIZ_WORLDWIDE'
#     return giz_df

def remove_duplicates(results_list):
    """
    Return a new list of results with duplicates removed,
    based on the 'id' field in each result's metadata.
    """
    unique_results = []
    seen_ids = set()
    for r in results_list:
        # Safely get the project id from the payload metadata.
        uid = r.payload['metadata'].get('id', None)
        if uid not in seen_ids:
            seen_ids.add(uid)
            unique_results.append(r)
    return unique_results

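# Minimal sketch of the dedup behaviour. FakeResult is a stand-in for the
# search-result objects returned by the vector store (only `.payload` is used):
#
#   class FakeResult:
#       def __init__(self, pid):
#           self.payload = {'metadata': {'id': pid}}
#
#   hits = [FakeResult("a"), FakeResult("a"), FakeResult("b")]
#   len(remove_duplicates(hits))  # -> 2
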
def extract_year(date_str):
    """Extract the year from a 'YYYY-MM-DD' string, or 'Unknown' on failure."""
    try:
        return str(datetime.strptime(date_str, "%Y-%m-%d").year)
    except Exception:
        return "Unknown"

def get_max_end_year(_client, collection_name):
    """
    Return the maximum 'end_year' in the entire collection
    so we can set the slider's max_value dynamically.
    """
    # For safety, fetch a large pool of items.
    all_res = hybrid_search(_client, "", collection_name, limit=2000)
    # hybrid_search returns two result lists; pool them before scanning.
    big_list = all_res[0] + all_res[1]
    years = []
    for r in big_list:
        metadata = r.payload.get('metadata', {})
        year_str = metadata.get('end_year', None)
        if year_str:
            try:
                years.append(float(year_str))
            except ValueError:
                pass
    if not years:
        # Fallback if no valid end years were found.
        return 2030
    return int(max(years))

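# Usage sketch (hypothetical: assumes a Streamlit app and an already-populated
# collection named "giz_worldwide"; the client can be anything hybrid_search
# accepts, e.g. a Qdrant client, given the `.payload` access above):
#
#   import streamlit as st
#   max_year = get_max_end_year(client, "giz_worldwide")
#   st.slider("End year", min_value=2010, max_value=max_year, value=max_year)
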
# Helper: safe formatting of project IDs.
def safe_format_project_id(pid):
    """
    Safely format a project ID:
    - If the ID is a float ending with ".0", remove the suffix.
    - If the value is "nan" (case-insensitive) or empty, return an empty string.
    - Otherwise, format it in the typical GIZ format if it has enough digits.
    """
    s = str(pid)
    # Remove a trailing ".0" if present.
    if s.endswith(".0"):
        s = s[:-2]
    # If the value is 'nan' or empty after stripping, return an empty string.
    if s.lower() == "nan" or s.strip() == "":
        return ""
    # Format as XXXX.XXXX.X when the string has enough digits.
    if len(s) > 5:
        return s[:4] + "." + s[4:-1] + "." + s[-1]
    return s

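# Examples of the formatting behaviour (IDs below are invented for illustration;
# the target shape is the typical GIZ pattern XXXX.XXXX.X):
#
#   safe_format_project_id(201440849.0)  -> "2014.4084.9"
#   safe_format_project_id("201440849")  -> "2014.4084.9"
#   safe_format_project_id(float("nan")) -> ""
#   safe_format_project_id("123")        -> "123"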