File size: 1,605 Bytes
e6036f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import json
from datasets import load_dataset

def download_and_process_gaia_level1(output_filename="GAIA_level1.json"):
    """Fetch GAIA level 1 and write it out as a numbered Q&A JSON file.

    Loads the ``validation`` split of the ``2023_level1`` configuration of
    ``gaia-benchmark/GAIA`` and keeps every record that has both a
    ``"Question"`` and a ``"Final answer"`` value. The result is written to
    ``output_filename`` as a JSON object keyed by the record's 1-based
    position in the split (as a string); records that are skipped leave a
    gap in the numbering. Each entry has the form
    ``{"Q": ..., "A": ..., "status": False}``.

    If loading raises an exception, the error is printed together with a
    troubleshooting hint and the function returns without writing a file.

    Args:
        output_filename: Path of the JSON file to create or overwrite.

    Returns:
        None, on success as well as on a failed download.
    """
    print("Attempting to download GAIA level 1 dataset...")


    try:
        # "2023_level1" is the dataset configuration; "validation" is the
        # split taken from it. trust_remote_code=True is passed in case the
        # dataset needs its own loading code to run.
        gaia = load_dataset(
            "gaia-benchmark/GAIA",
            "2023_level1",
            split="validation",
            trust_remote_code=True,
        )
        print(f"Successfully downloaded {len(gaia)} Q&A pairs.")
    except Exception as err:
        # Best-effort script: report the problem and bail out quietly.
        print(f"Error downloading the dataset: {err}")
        print("Please ensure you have an internet connection and the 'datasets' library is installed (`pip install datasets`).")
        return

    print("Processing dataset...")
    processed_data = {}
    for position, record in enumerate(gaia, start=1):
        question = record.get("Question")
        final_answer = record.get("Final answer")

        # Incomplete records are dropped; their position number is not reused.
        if question is None or final_answer is None:
            continue

        entry = {"Q": question, "A": final_answer}
        entry["status"] = False  # every pair starts out with status False
        processed_data[str(position)] = entry

    print(f"Saving processed data to {output_filename}...")
    with open(output_filename, "w", encoding="utf-8") as out_file:
        # ensure_ascii=False keeps non-ASCII text readable in the file.
        json.dump(processed_data, out_file, indent=4, ensure_ascii=False)
    print("Done.")


# Run the download only when executed as a script, not on import;
# the result is written to the default output file "GAIA_level1.json".
if __name__ == "__main__":
    download_and_process_gaia_level1()