# Final_Assignment_Template / download_gaia.py
# Uploaded by: TzurVaich
# Commit message: Get GAIA validation set
# Commit: e6036f2
import json
from datasets import load_dataset
def _extract_qa_records(dataset):
    """Convert dataset rows into the output mapping.

    Each row is read with ``row.get("Question")`` and
    ``row.get("Final answer")``. Rows where either value is ``None`` are
    left out of the result.

    Args:
        dataset: Iterable of mapping-like rows (anything with ``.get``).

    Returns:
        A ``(records, skipped)`` tuple. ``records`` maps the row's 1-based
        position in ``dataset`` (as a string) to
        ``{"Q": ..., "A": ..., "status": False}``. ``skipped`` is the number
        of rows that were left out.
    """
    records = {}
    skipped = 0
    # Keys are the 1-based position in the dataset, so a skipped row leaves
    # a gap in the numbering rather than shifting later entries.
    for position, row in enumerate(dataset, start=1):
        question = row.get("Question")
        final_answer = row.get("Final answer")
        if question is None or final_answer is None:
            skipped += 1
            continue
        records[str(position)] = {
            "Q": question,
            "A": final_answer,
            "status": False,  # Initialize status to False
        }
    return records, skipped


def download_and_process_gaia_level1(output_filename="GAIA_level1.json"):
    """Download GAIA level 1 Q&A pairs and save them as a JSON file.

    Loads the ``validation`` split of the ``2023_level1`` configuration of
    ``gaia-benchmark/GAIA``, keeps every row that has both a question and a
    final answer, and writes the result to ``output_filename`` as UTF-8 JSON
    shaped like::

        {"1": {"Q": "...", "A": "...", "status": false}, ...}

    Progress and errors are reported with ``print``. If loading the dataset
    fails, an error message is printed and nothing is written.

    Args:
        output_filename: Path of the JSON file to create or overwrite.

    Returns:
        None.
    """
    print("Attempting to download GAIA level 1 dataset...")
    try:
        # Only the load call is guarded, so a failure in later steps is not
        # misreported as a download error.
        # NOTE(review): trust_remote_code=True lets the dataset repository's
        # loading script run locally -- confirm it is still needed with the
        # installed `datasets` version.
        dataset = load_dataset(
            "gaia-benchmark/GAIA",
            "2023_level1",
            split="validation",
            trust_remote_code=True,
        )
    except Exception as e:
        # Broad on purpose: this is the script's top-level boundary and the
        # loader can fail in many ways (network, auth, missing package).
        print(f"Error downloading the dataset: {e}")
        print("Please ensure you have an internet connection and the 'datasets' library is installed (`pip install datasets`).")
        return
    print(f"Successfully downloaded {len(dataset)} Q&A pairs.")

    print("Processing dataset...")
    processed_data, skipped = _extract_qa_records(dataset)
    if skipped:
        # Make dropped rows visible instead of discarding them silently.
        print(f"Skipped {skipped} item(s) missing a question or final answer.")

    print(f"Saving processed data to {output_filename}...")
    with open(output_filename, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII text readable in the file.
        json.dump(processed_data, f, indent=4, ensure_ascii=False)
    print("Done.")
# Script entry point: run the download only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    download_and_process_gaia_level1()