Spaces:
Sleeping
Sleeping
Update appStore/prep_data.py
Browse files
- appStore/prep_data.py +7 -3
appStore/prep_data.py
CHANGED
@@ -71,7 +71,10 @@ def process_giz_worldwide():
|
|
71 |
giz_df = pd.read_json(f'{path_to_data}giz_worldwide/giz_worldwide_api_download_23_02_2025.json')
|
72 |
|
73 |
# Sample random rows for quick embeddings (seed set for reproducibility)
|
74 |
-
giz_df = giz_df.sample(n=30, random_state=42)
|
|
|
|
|
|
|
75 |
|
76 |
# Rename columns per new dataset requirements
|
77 |
giz_df = giz_df.rename(columns={
|
@@ -89,8 +92,8 @@ def process_giz_worldwide():
|
|
89 |
# Compute text_size based on merged_text and assign full text to the 'chunks' column
|
90 |
giz_df['text_size'] = giz_df['merged_text'].apply(lambda text: len(text.split()) if isinstance(text, str) else 0)
|
91 |
|
92 |
-
# Use the full merged_text
|
93 |
-
# If
|
94 |
# giz_df['chunks'] = giz_df['merged_text'].apply(lambda text: [text] if isinstance(text, str) else [])
|
95 |
giz_df['chunks'] = giz_df['merged_text']
|
96 |
|
@@ -98,6 +101,7 @@ def process_giz_worldwide():
|
|
98 |
return giz_df
|
99 |
|
100 |
|
|
|
101 |
# def process_giz_worldwide():
|
102 |
# """
|
103 |
# this will read the giz_worldwide files and create the chunks
|
|
|
71 |
giz_df = pd.read_json(f'{path_to_data}giz_worldwide/giz_worldwide_api_download_23_02_2025.json')
|
72 |
|
73 |
# Sample random rows for quick embeddings (seed set for reproducibility)
|
74 |
+
giz_df = giz_df.sample(n=30, random_state=42)
|
75 |
+
|
76 |
+
# Reset the index so that create_documents can iterate using integer indices
|
77 |
+
giz_df = giz_df.reset_index(drop=True)
|
78 |
|
79 |
# Rename columns per new dataset requirements
|
80 |
giz_df = giz_df.rename(columns={
|
|
|
92 |
# Compute text_size based on merged_text and assign full text to the 'chunks' column
|
93 |
giz_df['text_size'] = giz_df['merged_text'].apply(lambda text: len(text.split()) if isinstance(text, str) else 0)
|
94 |
|
95 |
+
# Use the full merged_text for embedding (no chunking).
|
96 |
+
# If downstream code expects a list, you could instead wrap it in a list:
|
97 |
# giz_df['chunks'] = giz_df['merged_text'].apply(lambda text: [text] if isinstance(text, str) else [])
|
98 |
giz_df['chunks'] = giz_df['merged_text']
|
99 |
|
|
|
101 |
return giz_df
|
102 |
|
103 |
|
104 |
+
|
105 |
# def process_giz_worldwide():
|
106 |
# """
|
107 |
# this will read the giz_worldwide files and create the chunks
|