Spaces:
Sleeping
Sleeping
File size: 7,637 Bytes
e62e166 ec4f8ef e62e166 ec4f8ef e62e166 ec4f8ef e62e166 ec4f8ef e62e166 ec4f8ef e62e166 ec4f8ef e62e166 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 |
# test_agents.py
import json
import time
import datetime
import os
from dotenv import load_dotenv
from app import BasicAgent, GAIA_LEVEL1_VALIDATION_FILES_PATH
# Import the BasicAgent from your app module, failing with a readable hint
# instead of a bare traceback when app.py cannot be found.
# NOTE(review): the unconditional `from app import BasicAgent, ...` in the
# import block above runs first, so a missing app.py raises there before this
# guard is ever reached - confirm which of the two imports should stay.
try:
    from app import BasicAgent
except ImportError as e:
    print(f"Error importing BasicAgent from app.py: {e}")
    print("Please ensure app.py is in the same directory or accessible in the Python path.")
    # Raise SystemExit directly: the bare `exit()` helper is installed by the
    # `site` module for interactive sessions and is not guaranteed to exist
    # (e.g. under `python -S`), whereas SystemExit is always available.
    raise SystemExit(1)
# --- Reference question/answer pairs for quick manual checks ---
# Keys are the prompts; values are reference answers kept only so a human can
# compare them with whatever the agent produces on its own.
QA_PAIRS = {
    "What is the capital of France?": "Paris",
    "Who wrote 'Hamlet'?": "William Shakespeare",
    "What is the formula for water?": "H2O",
    "How does photosynthesis work?": "Plants use sunlight, water, and carbon dioxide to create their own food.",
    # The agent is expected to look up current data for this one.
    "What is the current population of Earth?": "Approximately 8 billion",
}
def eval_GAIA(json_file_path="GAIA_level1_status.json", question_ids=("27",)):
    """
    Evaluate selected GAIA level 1 questions with BasicAgent and record results.

    Loads the question table from a JSON file, asks the agent every selected
    question whose "status" is False, marks answers that match the reference
    exactly (after stripping surrounding whitespace) as solved, appends
    misses and agent errors to a timestamped log file, and finally writes the
    updated table back to the JSON file. Progress and a summary are printed.

    Args:
        json_file_path (str): The path to the GAIA status JSON file.
            Defaults to "GAIA_level1_status.json".
        question_ids (iterable of str, str, or None): IDs (top-level JSON
            keys) of the questions to evaluate. Defaults to ("27",), the
            single question this function previously hard-coded. A bare
            string is treated as one ID. Pass None to evaluate every
            question in the file.

    Returns:
        None. The function returns early (after printing an error) when the
        JSON file cannot be loaded, is not a JSON object, or the agent
        cannot be constructed.
    """
    print(f"--- Starting GAIA Evaluation from {json_file_path} ---")

    # Checkpoint file written after each processed question. splitext keeps
    # the checkpoint distinct from the input even when the path has no
    # ".json" suffix (str.replace would have returned the input path itself).
    path_root, path_ext = os.path.splitext(json_file_path)
    tmp_json_file_path = f"{path_root}_tmp{path_ext}"

    # 2. Load GAIA data
    try:
        with open(json_file_path, 'r', encoding='utf-8') as f:
            gaia_data = json.load(f)
    except FileNotFoundError:
        print(f"Error: JSON file not found at {json_file_path}")
        return
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from {json_file_path}")
        return
    except Exception as e:
        print(f"An unexpected error occurred loading the JSON file: {e}")
        return

    # Everything below relies on dict methods (.values(), .items()).
    if not isinstance(gaia_data, dict):
        print(f"Error: Expected a JSON object keyed by question ID in {json_file_path}")
        return
    print(
        f"Successfully loaded {len(gaia_data)} questions from {json_file_path}.")

    # 3. Initialize Agent, Log file, and Tracking variables
    try:
        agent = BasicAgent()
    except Exception as e:
        print(f"Error initializing BasicAgent: {e}")
        print("Evaluation cannot proceed.")
        return

    log_filename = f"Response_{datetime.datetime.now().strftime('%Y%m%d_%H%M')}.log"
    print(f"Incorrect answers will be logged to: {log_filename}")

    total_questions = len(gaia_data)
    processed_count = 0
    correct_count = 0  # Answers newly confirmed correct during this run only.
    initially_correct = sum(1 for item in gaia_data.values()
                            if item.get("status") is True)
    questions_to_process = total_questions - initially_correct
    print(f"Found {initially_correct} questions already marked as correct.")
    if questions_to_process == 0:
        # The file is still re-saved at the end, for consistency.
        print("No questions with status=false found to process.")
    else:
        print(f"Attempting to answer {questions_to_process} questions...")

    # Normalise the selection; None means "no filtering".
    if question_ids is None:
        selected_ids = None
    elif isinstance(question_ids, str):
        selected_ids = {question_ids}
    else:
        selected_ids = set(question_ids)

    start_time = time.time()

    # 4. Process questions
    for q_num, data in gaia_data.items():
        if selected_ids is not None and q_num not in selected_ids:
            continue

        if data.get("status") is not False:
            # Already-solved questions are part of `initially_correct`, so
            # they must not be added to `correct_count` as well.
            print(f"Skipping question {q_num}: Status is already True.")
            continue

        processed_count += 1
        question = data.get("Q")
        correct_answer = data.get("A")
        if question is None or correct_answer is None:
            print(f"Skipping question {q_num}: Missing 'Q' or 'A'.")
            continue

        # Attachments, when present, are resolved against the validation
        # files directory imported from app.
        file_name = data.get("file_name")
        if file_name:
            file_name = os.path.join(GAIA_LEVEL1_VALIDATION_FILES_PATH, file_name)

        elapsed_time = time.time() - start_time
        print(
            f"\nProcessing question {processed_count}/{questions_to_process} (ID: {q_num}) | Elapsed: {elapsed_time:.2f}s")
        print(f"Q: {question[:100]}...")  # Print first 100 chars

        try:
            agent_response = agent(question, file_name)
        except Exception as e:
            # A failing agent call must not abort the whole run; record it.
            print(f"Error processing question {q_num} with agent: {e}")
            _append_log_entry(log_filename, q_num, question, correct_answer,
                              f"ERROR - {e}")
        else:
            print(f"Agent A: {agent_response}")
            print(f"Correct A: {correct_answer}")
            # Exact, case-sensitive match after stripping whitespace.
            if str(agent_response).strip() == str(correct_answer).strip():
                print(f"Result for Q {q_num}: CORRECT")
                gaia_data[q_num]["status"] = True
                correct_count += 1
            else:
                print(f"Result for Q {q_num}: INCORRECT")
                _append_log_entry(log_filename, q_num, question,
                                  correct_answer, agent_response)

        # Checkpoint progress so an interrupted run does not lose results.
        with open(tmp_json_file_path, 'w', encoding='utf-8') as f:
            json.dump(gaia_data, f, indent=4, ensure_ascii=False)

    total_time = time.time() - start_time

    # 5. Summary
    print("\n--- Evaluation Summary ---")
    print(f"Processed {processed_count} questions with status=false.")
    print(f"Correct answers provided by agent: {correct_count}")
    final_correct_count = initially_correct + correct_count
    print(
        f"Total correct answers (initial + agent): {final_correct_count}/{total_questions}")
    print(f"Total evaluation time: {total_time:.2f} seconds")

    # 6. Save updated data
    try:
        with open(json_file_path, 'w', encoding='utf-8') as f:
            json.dump(gaia_data, f, indent=4, ensure_ascii=False)
        print(f"Successfully saved updated data to {json_file_path}")
    except Exception as e:
        print(f"Error saving updated data to {json_file_path}: {e}")
    print("--- GAIA Evaluation Finished ---")


def _append_log_entry(log_filename, q_num, question, correct_answer, agent_text):
    """Append one question record (ID, Q, reference A, agent output) to the log file."""
    with open(log_filename, 'a', encoding='utf-8') as log_f:
        log_f.write(f"*question number {q_num} *\n")
        log_f.write(f"Q: {question}\n")
        log_f.write(f"A: {correct_answer}\n")
        log_f.write(f"Agent: {agent_text}\n")
        log_f.write("<END>\n\n")
def run_test_questions():
    """Create a BasicAgent, ask it every question in QA_PAIRS, and print each reply."""
    print("--- Starting Agent Test ---")
    # This function does not load a .env file itself; any environment
    # variables BasicAgent needs must already be set when it is called.
    qa_agent = BasicAgent()
    for prompt in QA_PAIRS:
        print("\n--- Testing Question ---")
        print(f"Q: {prompt}")
        reply = qa_agent(prompt)  # The agent instance is callable.
        print(f"Agent A: {reply}")
if __name__ == "__main__":
    # Read variables from a local .env file before anything inspects them.
    load_dotenv()
    hf_token = os.getenv('HF_TOKEN')
    if not hf_token:
        print("Warning: HF_TOKEN environment variable not found. Agent might fail.")
    # run_test_questions() can be called here instead for a quick check
    # against QA_PAIRS.
    eval_GAIA()
|