import json

try:
    from datasets import load_dataset
except ImportError:
    # Keep this module importable when the optional third-party dependency is
    # missing; the problem is reported (with the install hint) when a download
    # is actually attempted instead of crashing at import time.
    load_dataset = None


def _build_records(dataset):
    """Convert an iterable of GAIA items into the keyed Q/A/status mapping.

    Each item is read with ``.get("Question")`` and ``.get("Final answer")``.
    Items where either value is ``None`` are skipped. Keys are the item's
    1-based position in ``dataset`` rendered as a string, so skipped items
    leave gaps in the numbering.
    """
    records = {}
    for position, item in enumerate(dataset, start=1):
        question = item.get("Question")
        final_answer = item.get("Final answer")
        if question is None or final_answer is None:
            continue
        records[str(position)] = {
            "Q": question,
            "A": final_answer,
            "status": False,  # Initialize status to False
        }
    return records


def download_and_process_gaia_level1(output_filename: str = "GAIA_level1.json") -> None:
    """Download GAIA level 1, reshape it, and save it as JSON.

    Loads the ``"2023_level1"`` configuration (``"validation"`` split) of
    ``gaia-benchmark/GAIA``, converts it to a mapping of
    ``{"<n>": {"Q": ..., "A": ..., "status": False}}`` and writes that mapping
    to ``output_filename`` as UTF-8 JSON with a 4-space indent.

    Args:
        output_filename: Path of the JSON file to create or overwrite.

    Returns:
        None. If the dataset cannot be loaded (library missing, download
        failure, ...) the error is printed and nothing is written.

    Raises:
        OSError: If ``output_filename`` cannot be opened for writing.
    """
    print("Attempting to download GAIA level 1 dataset...")
    try:
        if load_dataset is None:
            # Route the missing dependency through the same handler so the
            # user sees the install hint below.
            raise ImportError("the 'datasets' library could not be imported")
        # "2023_level1" is the dataset configuration; "validation" is the split.
        # NOTE(review): trust_remote_code=True allows the dataset repository's
        # loading script to run locally -- confirm this is still required and
        # acceptable for this source.
        dataset = load_dataset(
            "gaia-benchmark/GAIA",
            "2023_level1",
            split="validation",
            trust_remote_code=True,
        )
        print(f"Successfully downloaded {len(dataset)} Q&A pairs.")
    except Exception as e:
        # Top-level boundary of a script: report the failure and stop quietly.
        print(f"Error downloading the dataset: {e}")
        print("Please ensure you have an internet connection and the 'datasets' library is installed (`pip install datasets`).")
        return

    print("Processing dataset...")
    processed_data = _build_records(dataset)

    print(f"Saving processed data to {output_filename}...")
    with open(output_filename, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(processed_data, f, indent=4, ensure_ascii=False)
    print("Done.")


if __name__ == "__main__":
    download_and_process_gaia_level1()