TEST-GIZ-Project-Search

Sleeping

annikwag committed on Feb 26

Commit

28c8e4b

verified ·

1 Parent(s): 21fcb5a

Update appStore/prep_data.py

Files changed (1) hide show

appStore/prep_data.py CHANGED Viewed

@@ -71,7 +71,10 @@ def process_giz_worldwide():
     giz_df = pd.read_json(f'{path_to_data}giz_worldwide/giz_worldwide_api_download_23_02_2025.json')
     # Sample random rows for quick embeddings (seed set for reproducibility)
-    giz_df = giz_df.sample(n=30, random_state=42)
     # Rename columns per new dataset requirements
     giz_df = giz_df.rename(columns={
@@ -89,8 +92,8 @@ def process_giz_worldwide():
     # Compute text_size based on merged_text and assign full text to the 'chunks' column
     giz_df['text_size'] = giz_df['merged_text'].apply(lambda text: len(text.split()) if isinstance(text, str) else 0)
-    # Use the full merged_text instead of creating chunks.
-    # If your downstream code expects a list of texts, use:
     # giz_df['chunks'] = giz_df['merged_text'].apply(lambda text: [text] if isinstance(text, str) else [])
     giz_df['chunks'] = giz_df['merged_text']
@@ -98,6 +101,7 @@ def process_giz_worldwide():
     return giz_df
 # def process_giz_worldwide():
 #     """
 #     this will read the giz_worldwide files and create the chunks

     giz_df = pd.read_json(f'{path_to_data}giz_worldwide/giz_worldwide_api_download_23_02_2025.json')
     # Sample random rows for quick embeddings (seed set for reproducibility)
+    giz_df = giz_df.sample(n=30, random_state=42)
+    # Reset the index so that create_documents can iterate using integer indices
+    giz_df = giz_df.reset_index(drop=True)
     # Rename columns per new dataset requirements
     giz_df = giz_df.rename(columns={
     # Compute text_size based on merged_text and assign full text to the 'chunks' column
     giz_df['text_size'] = giz_df['merged_text'].apply(lambda text: len(text.split()) if isinstance(text, str) else 0)
+    # Use the full merged_text for embedding (no chunking).
+    # If downstream code expects a list, you could instead wrap it in a list:
     # giz_df['chunks'] = giz_df['merged_text'].apply(lambda text: [text] if isinstance(text, str) else [])
     giz_df['chunks'] = giz_df['merged_text']
     return giz_df
 # def process_giz_worldwide():
 #     """
 #     this will read the giz_worldwide files and create the chunks