# Page-header residue captured with this file ("Spaces: Sleeping"); not part of the script.
import json
from typing import Any, Dict, Iterable, Mapping

from datasets import load_dataset
def _build_qa_records(dataset: Iterable[Mapping[str, Any]]) -> Dict[str, Dict[str, Any]]:
    """Convert dataset rows into a mapping of Q/A records.

    Keys are the 1-based positions of the rows in *dataset*, as strings.
    A row whose "Question" or "Final answer" field is missing (``None``) is
    skipped, so its position simply does not appear among the keys.
    """
    records: Dict[str, Dict[str, Any]] = {}
    for position, row in enumerate(dataset, start=1):
        question = row.get("Question")
        final_answer = row.get("Final answer")
        if question is None or final_answer is None:
            continue
        records[str(position)] = {
            "Q": question,
            "A": final_answer,
            "status": False,  # every record starts out with status False
        }
    return records


def download_and_process_gaia_level1(output_filename: str = "GAIA_level1.json") -> None:
    """Download GAIA level 1 Q&A pairs and save them as a JSON file.

    Loads the "validation" split of the "2023_level1" configuration of
    ``gaia-benchmark/GAIA``, keeps each row's question and final answer,
    adds a ``"status": False`` field to every record, and writes the
    resulting mapping to *output_filename* as UTF-8, indented JSON.

    Args:
        output_filename: Path of the JSON file to write.

    Returns:
        None. If the dataset cannot be loaded, an error is printed and the
        function returns without creating the output file.
    """
    print("Attempting to download GAIA level 1 dataset...")
    try:
        # Only the load call sits inside the try: it is the step expected to fail
        # (network, missing package, ...).
        # trust_remote_code=True is passed because it might be required for some
        # datasets; it allows the dataset repository's loading code to run locally.
        dataset = load_dataset(
            "gaia-benchmark/GAIA", "2023_level1", split="validation", trust_remote_code=True)
    except Exception as e:
        # Deliberately broad: this is the script's top-level boundary and any
        # load failure is reported the same way.
        # NOTE(review): the dataset may also require an authenticated Hub login - confirm.
        print(f"Error downloading the dataset: {e}")
        print("Please ensure you have an internet connection and the 'datasets' library is installed (`pip install datasets`).")
        return
    else:
        print(f"Successfully downloaded {len(dataset)} Q&A pairs.")

    print("Processing dataset...")
    processed_data = _build_qa_records(dataset)

    print(f"Saving processed data to {output_filename}...")
    # ensure_ascii=False keeps non-ASCII text readable in the output file.
    with open(output_filename, 'w', encoding='utf-8') as f:
        json.dump(processed_data, f, indent=4, ensure_ascii=False)
    print("Done.")
if __name__ == "__main__":
    # Executed as a script: run the download with the default output filename.
    download_and_process_gaia_level1()