import pandas as pd
from datetime import datetime

from appStore.prep_utils import create_chunks
from appStore.search import hybrid_search

path_to_data = "./docStore/"

def convert_to_date(val):
    try:
        # If val is a string, first check whether it represents a numeric value.
        if isinstance(val, str):
            val_str = val.strip()
            try:
                # A numeric string is treated as an epoch timestamp in milliseconds.
                num = float(val_str)
                return datetime.utcfromtimestamp(num / 1000).strftime("%Y-%m-%d")
            except ValueError:
                # Not numeric; validate that it is already a "YYYY-MM-DD" date string.
                datetime.strptime(val_str, "%Y-%m-%d")
                return val_str
        elif isinstance(val, (int, float)):
            # Numeric values are epoch timestamps in milliseconds.
            return datetime.utcfromtimestamp(val / 1000).strftime("%Y-%m-%d")
        else:
            return "Unknown"
    except Exception:
        return "Unknown"


def process_iati():
    """
    this will read the iati files and create the chunks
    """
    orgas_df = pd.read_csv(f"{path_to_data}iati_files/project_orgas.csv")
    region_df = pd.read_csv(f"{path_to_data}iati_files/project_region.csv")
    sector_df = pd.read_csv(f"{path_to_data}iati_files/project_sector.csv")
    status_df = pd.read_csv(f"{path_to_data}iati_files/project_status.csv")
    texts_df = pd.read_csv(f"{path_to_data}iati_files/project_texts.csv")

    projects_df = pd.merge(orgas_df, region_df, on='iati_id', how='inner')
    projects_df = pd.merge(projects_df, sector_df, on='iati_id', how='inner')
    projects_df = pd.merge(projects_df, status_df, on='iati_id', how='inner')
    projects_df = pd.merge(projects_df, texts_df, on='iati_id', how='inner')
    projects_df = projects_df[projects_df.client.str.contains('bmz')].reset_index(drop=True)

    projects_df.drop(columns=['orga_abbreviation', 'client',
       'orga_full_name', 'country',
       'country_flag', 'crs_5_code', 'crs_3_code', 'country_code_list',
       'sgd_pred_code', 'crs_5_name', 'crs_3_name', 'sgd_pred_str'], inplace=True)
    projects_df['text_size'] = projects_df.apply(lambda x: len((x['title_main'] + x['description_main']).split()), axis=1)
    projects_df['chunks'] = projects_df.apply(lambda x: create_chunks(x['title_main'] + x['description_main']), axis=1)
    projects_df = projects_df.explode(column=['chunks'], ignore_index=True)
    projects_df['source'] = 'IATI'
    projects_df.rename(columns={'iati_id': 'id', 'iati_orga_id': 'org'}, inplace=True)

    return projects_df
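
# Usage sketch: process_iati() returns one row per chunk, with 'iati_id' and
# 'iati_orga_id' renamed to 'id' and 'org' and a constant 'source' of 'IATI':
#   projects_df = process_iati()
#   projects_df[['id', 'org', 'chunks', 'source']].head()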


def convert_crs_value(x):
    if pd.isnull(x):
        return x
    # Convert to string and remove trailing '.0' if present.
    x_str = str(x).strip()
    if x_str.endswith(".0"):
        x_str = x_str[:-2]
    try:
        return int(x_str)
    except ValueError:
        return x
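
# Illustrative behaviour of convert_crs_value (example values only):
#   convert_crs_value(110.0)   -> 110
#   convert_crs_value("110.0") -> 110
#   convert_crs_value("abc")   -> "abc"  (left unchanged)
#   convert_crs_value(None)    -> None   (pd.isnull short-circuits)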

def process_giz_worldwide():
    """
    Read the new giz_worldwide file and prepare the data for embedding.
    Adjustments made:
      - Reads the file 'giz_worldwide_api_download_23_02_2025.json'
      - Renames 'name.en' to 'project_name'
      - Renames 'duration.project.start' to 'start_year' and 'duration.project.end' to 'end_year'
      - Converts 'end_year' to a "YYYY-MM-DD" date string via convert_to_date
      - Creates an empty 'url' column (the new dataset provides no URLs)
      - Cleans 'crs_key' values to plain integer strings (dropping any trailing '.0')
      - Uses the full 'merged_text' column for embedding (no chunking)
    """
    # Read the new JSON file
    giz_df = pd.read_json(f'{path_to_data}giz_worldwide/giz_worldwide_api_download_23_02_2025.json')
    
    # Reset index for safety
    giz_df = giz_df.reset_index(drop=True)
    
    # Rename columns per new dataset requirements
    giz_df = giz_df.rename(columns={
        'name.en': 'project_name',
        'duration.project.start': 'start_year',
        'duration.project.end': 'end_year'
    })
    
    giz_df['end_year'] = giz_df['end_year'].apply(convert_to_date)
    
    # Create an empty 'url' column (the new dataset provides no URLs)
    giz_df['url'] = ''
    
    # Clean the 'crs_key' column: convert values like "110.0" to "110".
    if 'crs_key' in giz_df.columns:
        def clean_crs_key(x):
            x_str = str(x).strip()
            if not x_str:
                return x_str
            try:
                # Convert to float, then to int, then back to string.
                return str(int(float(x_str)))
            except Exception:
                # Fallback: strip a single trailing ".0" if present.
                return x_str[:-2] if x_str.endswith('.0') else x_str
        giz_df['crs_key'] = giz_df['crs_key'].apply(clean_crs_key)

    # Compute text_size based on merged_text and assign full text to the 'chunks' column
    giz_df['text_size'] = giz_df['merged_text'].apply(lambda text: len(text.split()) if isinstance(text, str) else 0)
    
    # Use the full merged_text for embedding (no chunking)
    giz_df['chunks'] = giz_df['merged_text']
    
    giz_df['source'] = 'GIZ_WORLDWIDE'
    return giz_df
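
# Usage sketch (file and column names as assumed in the function above):
#   giz_df = process_giz_worldwide()
#   giz_df[['project_name', 'start_year', 'end_year', 'chunks', 'source']].head()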



# def process_giz_worldwide():
#     """
#     this will read the giz_worldwide files and create the chunks
#     """
#     giz_df = pd.read_json(f'{path_to_data}giz_worldwide/data_giz_website.json')
#     giz_df = giz_df.rename(columns={'content':'project_description'})
#     # Sample random rows for quick embeddings (seed set for reproducibility)
#     giz_df = giz_df.sample(n=5, random_state=42)    
#     giz_df['text_size'] = giz_df.apply(lambda x: len((x['project_name'] + x['project_description']).split()), axis=1)
#     giz_df['chunks'] = giz_df.apply(lambda x:create_chunks(x['project_name'] + x['project_description']),axis=1)
#     print("initial df length:",len(giz_df))
#     giz_df = giz_df.explode(column=['chunks'], ignore_index=True)
#     print("new df length:",len(giz_df))
#     print(giz_df.columns)
#     #giz_df.drop(columns = ['filename', 'url', 'name', 'mail', 
#     #                    'language', 'start_year', 'end_year','poli_trager'], inplace=True)
#     giz_df['source'] = 'GIZ_WORLDWIDE'
#     return giz_df

def remove_duplicates(results_list):
    """
    Return a new list of results with duplicates removed,
    based on the 'id' field in each result's metadata.
    """
    unique_results = []
    seen_ids = set()

    for r in results_list:
        # Safely get the project ID from metadata
        result_id = r.payload['metadata'].get('id', None)
        if result_id not in seen_ids:
            seen_ids.add(result_id)
            unique_results.append(r)

    return unique_results
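
# Note: results whose metadata lacks an 'id' all share the key None, so at
# most one of them survives deduplication.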

def extract_year(date_str):
    try:
        return str(datetime.strptime(date_str, "%Y-%m-%d").year)
    except Exception:
        return "Unknown"


def get_max_end_year(_client, collection_name):
    """
    Return the maximum 'end_year' in the entire collection
    so we can set the slider's max_value dynamically.
    """
    # For safety, get a large pool of items
    all_res = hybrid_search(_client, "", collection_name, limit=2000)
    big_list = all_res[0] + all_res[1]

    years = []
    for r in big_list:
        metadata = r.payload.get('metadata', {})
        year_str = metadata.get('end_year', None)
        if year_str:
            # 'end_year' may be a raw number or a "YYYY-MM-DD" string
            # (see convert_to_date), so try both parses.
            try:
                years.append(float(year_str))
            except ValueError:
                year = extract_year(str(year_str))
                if year != "Unknown":
                    years.append(float(year))

    if not years:
        # Fall back if no valid end years were found
        return 2030
    return int(max(years))
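
# Usage sketch (names are placeholders; the slider lives in the calling app):
#   max_year = get_max_end_year(client, collection_name)
#   # e.g. st.slider("End year", 2010, max_year) in the Streamlit UI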

# Helper: safe formatting of project IDs
def safe_format_project_id(pid):
    """
    Safely format a project ID:
      - If the ID is a float ending with ".0", remove it.
      - If the value is "nan" (case insensitive) or empty, return an empty string.
      - Otherwise, format it in the typical GIZ format if it has enough digits.
    """
    s = str(pid)
    # Remove trailing ".0" if present
    if s.endswith(".0"):
        s = s[:-2]
    # If the value is 'nan' or empty after stripping, return empty string
    if s.lower() == "nan" or s.strip() == "":
        return ""
    # Format if the string has enough digits
    if len(s) > 5:
        return s[:4] + "." + s[4:-1] + "." + s[-1]
    return s
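
# Illustrative formatting (example IDs only):
#   safe_format_project_id("201821015.0") -> "2018.2101.5"
#   safe_format_project_id("nan")         -> ""
#   safe_format_project_id("12345")       -> "12345"  (too short to format)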