annikwag committed on
Commit
28c8e4b
·
verified ·
1 Parent(s): 21fcb5a

Update appStore/prep_data.py

Browse files
Files changed (1) hide show
  1. appStore/prep_data.py +7 -3
appStore/prep_data.py CHANGED
@@ -71,7 +71,10 @@ def process_giz_worldwide():
71
  giz_df = pd.read_json(f'{path_to_data}giz_worldwide/giz_worldwide_api_download_23_02_2025.json')
72
 
73
  # Sample random rows for quick embeddings (seed set for reproducibility)
74
- giz_df = giz_df.sample(n=30, random_state=42)
 
 
 
75
 
76
  # Rename columns per new dataset requirements
77
  giz_df = giz_df.rename(columns={
@@ -89,8 +92,8 @@ def process_giz_worldwide():
89
  # Compute text_size based on merged_text and assign full text to the 'chunks' column
90
  giz_df['text_size'] = giz_df['merged_text'].apply(lambda text: len(text.split()) if isinstance(text, str) else 0)
91
 
92
- # Use the full merged_text instead of creating chunks.
93
- # If your downstream code expects a list of texts, use:
94
  # giz_df['chunks'] = giz_df['merged_text'].apply(lambda text: [text] if isinstance(text, str) else [])
95
  giz_df['chunks'] = giz_df['merged_text']
96
 
@@ -98,6 +101,7 @@ def process_giz_worldwide():
98
  return giz_df
99
 
100
 
 
101
  # def process_giz_worldwide():
102
  # """
103
  # this will read the giz_worldwide files and create the chunks
 
71
  giz_df = pd.read_json(f'{path_to_data}giz_worldwide/giz_worldwide_api_download_23_02_2025.json')
72
 
73
  # Sample random rows for quick embeddings (seed set for reproducibility)
74
+ giz_df = giz_df.sample(n=30, random_state=42)
75
+
76
+ # Reset the index so that create_documents can iterate using integer indices
77
+ giz_df = giz_df.reset_index(drop=True)
78
 
79
  # Rename columns per new dataset requirements
80
  giz_df = giz_df.rename(columns={
 
92
  # Compute text_size based on merged_text and assign full text to the 'chunks' column
93
  giz_df['text_size'] = giz_df['merged_text'].apply(lambda text: len(text.split()) if isinstance(text, str) else 0)
94
 
95
+ # Use the full merged_text for embedding (no chunking).
96
+ # If downstream code expects a list, you could instead wrap it in a list:
97
  # giz_df['chunks'] = giz_df['merged_text'].apply(lambda text: [text] if isinstance(text, str) else [])
98
  giz_df['chunks'] = giz_df['merged_text']
99
 
 
101
  return giz_df
102
 
103
 
104
+
105
  # def process_giz_worldwide():
106
  # """
107
  # this will read the giz_worldwide files and create the chunks