File size: 7,637 Bytes
e62e166
 
 
 
 
 
ec4f8ef
e62e166
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ec4f8ef
e62e166
ec4f8ef
 
e62e166
 
 
 
ec4f8ef
 
 
 
e62e166
 
 
 
 
 
 
 
 
 
 
 
ec4f8ef
e62e166
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ec4f8ef
e62e166
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
# test_agents.py
import json
import time
import datetime
import os
from dotenv import load_dotenv
from app import BasicAgent, GAIA_LEVEL1_VALIDATION_FILES_PATH

# Guarded import of BasicAgent: report a readable hint and stop with exit
# status 1 when the app module cannot be imported.
# NOTE(review): the unguarded `from app import BasicAgent, ...` earlier in this
# file raises first if app.py is missing, so this guard only takes effect if
# that import is removed or itself guarded.
try:
    from app import BasicAgent
except ImportError as e:
    print(f"Error importing BasicAgent from app.py: {e}")
    print("Please ensure app.py is in the same directory or accessible in the Python path.")
    # `exit()` is a helper injected by the `site` module for interactive use
    # and is not guaranteed to exist (e.g. under `python -S`); raising
    # SystemExit directly yields the same exit status without that dependency.
    raise SystemExit(1)

# --- Define Question-Answer Pairs ---
# Note: The 'A' part is just for reference here; the agent will generate its own answer.
# Smoke-test questions mapped to a reference answer. Only the questions are
# sent to the agent by run_test_questions(); the answers are for human
# comparison. Insertion order below is the order the questions are asked in.
QA_PAIRS = dict((
    ("What is the capital of France?", "Paris"),
    ("Who wrote 'Hamlet'?", "William Shakespeare"),
    ("What is the formula for water?", "H2O"),
    (
        "How does photosynthesis work?",
        "Plants use sunlight, water, and carbon dioxide to create their own food.",
    ),
    # The agent is expected to look up current data for this one
    ("What is the current population of Earth?", "Approximately 8 billion"),
))


def eval_GAIA(json_file_path="GAIA_level1_status.json", question_ids=("27",)):
    """
    Evaluate not-yet-correct GAIA level 1 questions with the BasicAgent.

    Loads the status JSON (a mapping of question id -> record with the keys
    "Q", "A", "status" and optionally "file_name"), asks the agent every
    selected question whose status is not already True, marks exact matches
    (after whitespace stripping) as correct, appends every miss or agent error
    to a timestamped ``Response_*.log`` file, checkpoints the data to a
    ``*_tmp`` sibling file after each question, and finally writes the updated
    data back to ``json_file_path``.

    Args:
        json_file_path (str): The path to the GAIA status JSON file.
                              Defaults to "GAIA_level1_status.json".
        question_ids: Iterable of question ids to evaluate (a single string
                      is accepted too). Defaults to ("27",), which matches the
                      previously hard-coded selection. Pass None to evaluate
                      every question that is not yet marked correct.

    Returns:
        None. Load failures and agent initialisation failures are reported on
        stdout and end the run early without touching the status file.
    """
    print(f"--- Starting GAIA Evaluation from {json_file_path} ---")

    # Derive the checkpoint path from the extension instead of a substring
    # replace, so it can never collide with the input path itself.
    path_root, path_ext = os.path.splitext(json_file_path)
    tmp_json_file_path = f"{path_root}_tmp{path_ext}"

    # 1. Load GAIA data
    try:
        with open(json_file_path, 'r', encoding='utf-8') as f:
            gaia_data = json.load(f)
        print(
            f"Successfully loaded {len(gaia_data)} questions from {json_file_path}.")
    except FileNotFoundError:
        print(f"Error: JSON file not found at {json_file_path}")
        return
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from {json_file_path}")
        return
    except Exception as e:
        print(f"An unexpected error occurred loading the JSON file: {e}")
        return

    if not isinstance(gaia_data, dict):
        print(
            f"Error: Expected a JSON object mapping question IDs to records in {json_file_path}")
        return

    # 2. Work out which questions are in scope
    if question_ids is None:
        selected_ids = None
    else:
        if isinstance(question_ids, str):
            # A bare string would otherwise be iterated character by character.
            question_ids = (question_ids,)
        # JSON object keys are always strings, so normalise e.g. 27 -> "27".
        selected_ids = {str(q_id) for q_id in question_ids}

    total_questions = len(gaia_data)
    initially_correct = sum(1 for item in gaia_data.values()
                            if item.get("status") is True)
    print(f"Found {initially_correct} questions already marked as correct.")

    pending = []
    for q_num, data in gaia_data.items():
        if selected_ids is not None and q_num not in selected_ids:
            continue
        if data.get("status") is True:
            # Already counted in initially_correct; must not be counted again.
            print(f"Skipping question {q_num}: Status is already True.")
            continue
        # Anything that is not explicitly True (False, None, missing) is pending.
        pending.append((q_num, data))

    questions_to_process = len(pending)
    if questions_to_process == 0:
        print("No questions with status=false found to process.")
    else:
        print(f"Attempting to answer {questions_to_process} questions...")

    # 3. Initialize Agent (only when there is work for it) and the log file name
    agent = None
    if pending:
        try:
            agent = BasicAgent()
        except Exception as e:
            print(f"Error initializing BasicAgent: {e}")
            print("Evaluation cannot proceed.")
            return

    log_filename = f"Response_{datetime.datetime.now().strftime('%Y%m%d_%H%M')}.log"
    print(f"Incorrect answers will be logged to: {log_filename}")

    processed_count = 0
    correct_count = 0  # questions newly answered correctly by the agent
    start_time = time.time()

    # 4. Process questions
    for q_num, data in pending:
        question = data.get("Q")
        correct_answer = data.get("A")
        if question is None or correct_answer is None:
            print(f"Skipping question {q_num}: Missing 'Q' or 'A'.")
            continue

        processed_count += 1
        file_name = data.get("file_name")
        if file_name:
            # Attachments are resolved relative to the validation-files folder.
            file_name = os.path.join(GAIA_LEVEL1_VALIDATION_FILES_PATH, file_name)

        elapsed_time = time.time() - start_time
        print(
            f"\nProcessing question {processed_count}/{questions_to_process} (ID: {q_num}) | Elapsed: {elapsed_time:.2f}s")
        print(f"Q: {question[:100]}...")  # Print first 100 chars

        try:
            agent_response = agent(question, file_name)
        except Exception as e:
            # Best effort: one failing question must not abort the whole run.
            print(f"Error processing question {q_num} with agent: {e}")
            _append_log_entry(log_filename, q_num, question,
                              correct_answer, f"ERROR - {e}")
        else:
            print(f"Agent A: {agent_response}")
            print(f"Correct A: {correct_answer}")

            # Exact, case-sensitive comparison after trimming whitespace
            if str(agent_response).strip() == str(correct_answer).strip():
                print(f"Result for Q {q_num}: CORRECT")
                gaia_data[q_num]["status"] = True
                correct_count += 1
            else:
                print(f"Result for Q {q_num}: INCORRECT")
                _append_log_entry(log_filename, q_num, question,
                                  correct_answer, agent_response)

        # Checkpoint after every question so progress survives a crash
        with open(tmp_json_file_path, 'w', encoding='utf-8') as f:
            json.dump(gaia_data, f, indent=4, ensure_ascii=False)

    end_time = time.time()
    total_time = end_time - start_time

    # 5. Summary
    print("\n--- Evaluation Summary ---")
    print(f"Processed {processed_count} questions with status=false.")
    print(f"Correct answers provided by agent: {correct_count}")
    final_correct_count = initially_correct + correct_count
    print(
        f"Total correct answers (initial + agent): {final_correct_count}/{total_questions}")
    print(f"Total evaluation time: {total_time:.2f} seconds")

    # 6. Save updated data
    try:
        with open(json_file_path, 'w', encoding='utf-8') as f:
            json.dump(gaia_data, f, indent=4, ensure_ascii=False)
        print(f"Successfully saved updated data to {json_file_path}")
    except Exception as e:
        print(f"Error saving updated data to {json_file_path}: {e}")

    print("--- GAIA Evaluation Finished ---")


def _append_log_entry(log_filename, q_num, question, correct_answer, agent_text):
    """Append one question record (question, expected answer, agent output) to the log."""
    with open(log_filename, 'a', encoding='utf-8') as log_f:
        log_f.write(f"*question number {q_num} *\n")
        log_f.write(f"Q: {question}\n")
        log_f.write(f"A: {correct_answer}\n")
        log_f.write(f"Agent: {agent_text}\n")
        log_f.write("<END>\n\n")


def run_test_questions():
    """Ask a fresh BasicAgent every question in QA_PAIRS and print its replies.

    The reference answers stored in QA_PAIRS are not compared against the
    agent output; this is a manual smoke run. Environment variables are not
    loaded here, so the caller has to prepare them before calling.
    """
    print("--- Starting Agent Test ---")

    smoke_agent = BasicAgent()

    # Iterating the dict yields its keys, i.e. the question texts.
    for prompt in QA_PAIRS:
        print("\n--- Testing Question ---")
        print(f"Q: {prompt}")
        reply = smoke_agent(prompt)  # the agent instance is callable
        print(f"Agent A: {reply}")


if __name__ == "__main__":
    # Load environment variables through python-dotenv before anything needs them.
    load_dotenv()

    hf_token = os.getenv('HF_TOKEN')
    if not hf_token:
        # Unset or empty token: warn, but still attempt the run.
        print("Warning: HF_TOKEN environment variable not found. Agent might fail.")

    # run_test_questions() is the alternative entry point for a quick smoke
    # run; the default action is the GAIA evaluation.
    eval_GAIA()