Final_Assignment_Template / test_agents.py
TzurVaich's picture
Work in progress
ec4f8ef
# test_agents.py
import json
import time
import datetime
import os
from dotenv import load_dotenv
# Import the agent class and the GAIA attachment directory from the app module.
# Both names are imported inside the guard: an unguarded `from app import ...`
# placed before the `try` would raise first and make the friendly message
# below unreachable.
try:
    from app import BasicAgent, GAIA_LEVEL1_VALIDATION_FILES_PATH
except ImportError as e:
    print(f"Error importing BasicAgent from app.py: {e}")
    print("Please ensure app.py is in the same directory or accessible in the Python path.")
    # Raise SystemExit directly: the `exit` helper is injected by the `site`
    # module and is not guaranteed to exist (e.g. when run with `python -S`).
    raise SystemExit(1) from e
# --- Smoke-test prompts ---
# Each key is a prompt handed to the agent. Each value is a reference answer
# kept for the human reader only; the agent produces its own answer.
QA_PAIRS = {
    "What is the capital of France?": "Paris",
    "Who wrote 'Hamlet'?": "William Shakespeare",
    "What is the formula for water?": "H2O",
    "How does photosynthesis work?": (
        "Plants use sunlight, water, and carbon dioxide to create their own food."
    ),
    # The agent is expected to look up current data for this one.
    "What is the current population of Earth?": "Approximately 8 billion",
}
def _append_log_entry(log_filename, q_num, question, correct_answer, agent_text):
    """Append one question record to the plain-text log of failed answers."""
    with open(log_filename, 'a', encoding='utf-8') as log_f:
        log_f.write(f"*question number {q_num} *\n")
        log_f.write(f"Q: {question}\n")
        log_f.write(f"A: {correct_answer}\n")
        log_f.write(f"Agent: {agent_text}\n")
        log_f.write("<END>\n\n")


def _save_gaia_data(path, gaia_data):
    """Write the question/status mapping to *path* as indented UTF-8 JSON."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(gaia_data, f, indent=4, ensure_ascii=False)


def eval_GAIA(json_file_path="GAIA_level1_status.json", question_ids=("27",)):
    """
    Evaluate pending GAIA level 1 questions with the BasicAgent.

    Loads the question/status mapping from a JSON file, runs the agent on the
    selected questions whose "status" is False, logs incorrect answers and
    agent errors to a timestamped log file, flips "status" to True for exact
    (whitespace-stripped) matches, and writes the updated mapping back.

    Args:
        json_file_path (str): The path to the GAIA status JSON file.
            Defaults to "GAIA_level1_status.json".
        question_ids: Iterable of question ids (the JSON keys) to evaluate,
            or None to evaluate every pending question. A single string is
            treated as one id. Defaults to ("27",), the one question that the
            previous hard-coded filter selected.

    Returns:
        None. Results are reported on stdout and persisted to disk.
    """
    print(f"--- Starting GAIA Evaluation from {json_file_path} ---")
    # Progress snapshots go to "<name>_tmp<ext>". splitext only touches the
    # real extension, so the snapshot can never alias the status file itself.
    path_root, path_ext = os.path.splitext(json_file_path)
    tmp_json_file_path = f"{path_root}_tmp{path_ext}"

    # 2. Load GAIA data
    try:
        with open(json_file_path, 'r', encoding='utf-8') as f:
            gaia_data = json.load(f)
        print(
            f"Successfully loaded {len(gaia_data)} questions from {json_file_path}.")
    except FileNotFoundError:
        print(f"Error: JSON file not found at {json_file_path}")
        return
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from {json_file_path}")
        return
    except Exception as e:
        print(f"An unexpected error occurred loading the JSON file: {e}")
        return

    # Everything below relies on a mapping of question id -> record.
    if not isinstance(gaia_data, dict):
        print(f"Error: Expected a JSON object keyed by question id in {json_file_path}")
        return

    # 3. Initialize Agent, Log file, and Tracking variables
    try:
        agent = BasicAgent()
    except Exception as e:
        print(f"Error initializing BasicAgent: {e}")
        print("Evaluation cannot proceed.")
        return

    log_filename = f"Response_{datetime.datetime.now().strftime('%Y%m%d_%H%M')}.log"
    print(f"Incorrect answers will be logged to: {log_filename}")

    # Resolve the selection once; JSON object keys are always strings.
    if isinstance(question_ids, str):
        question_ids = (question_ids,)
    if question_ids is None:
        selected = gaia_data
    else:
        wanted_ids = {str(q_id) for q_id in question_ids}
        selected = {q_num: data for q_num, data in gaia_data.items()
                    if q_num in wanted_ids}

    total_questions = len(gaia_data)
    processed_count = 0
    correct_count = 0  # answers newly marked correct during this run only
    initially_correct = sum(1 for item in gaia_data.values()
                            if item.get("status") is True)
    questions_to_process = sum(1 for item in selected.values()
                               if item.get("status") is False)

    print(f"Found {initially_correct} questions already marked as correct.")
    if questions_to_process == 0:
        # Nothing to run; the file is still re-saved below for consistency.
        print("No questions with status=false found to process.")
    else:
        print(f"Attempting to answer {questions_to_process} questions...")

    start_time = time.time()

    # 4. Process questions
    for q_num, data in selected.items():
        if data.get("status") is not False:
            # Do not count this as an agent success: questions that are
            # already correct are included in initially_correct.
            print(f"Skipping question {q_num}: Status is already True.")
            continue

        question = data.get("Q")
        correct_answer = data.get("A")
        if question is None or correct_answer is None:
            print(f"Skipping question {q_num}: Missing 'Q' or 'A'.")
            continue

        processed_count += 1
        file_name = data.get("file_name")
        if file_name:
            file_name = os.path.join(GAIA_LEVEL1_VALIDATION_FILES_PATH, file_name)

        elapsed_time = time.time() - start_time
        print(
            f"\nProcessing question {processed_count}/{questions_to_process} (ID: {q_num}) | Elapsed: {elapsed_time:.2f}s")
        print(f"Q: {question[:100]}...")  # Print first 100 chars

        try:
            agent_response = agent(question, file_name)
        except Exception as e:
            print(f"Error processing question {q_num} with agent: {e}")
            _append_log_entry(log_filename, q_num, question, correct_answer,
                              f"ERROR - {e}")
        else:
            print(f"Agent A: {agent_response}")
            print(f"Correct A: {correct_answer}")
            # Exact, case-sensitive comparison after trimming whitespace.
            if str(agent_response).strip() == str(correct_answer).strip():
                print(f"Result for Q {q_num}: CORRECT")
                gaia_data[q_num]["status"] = True
                correct_count += 1
            else:
                print(f"Result for Q {q_num}: INCORRECT")
                _append_log_entry(log_filename, q_num, question,
                                  correct_answer, agent_response)

        # Snapshot progress after every processed question so an interrupted
        # run does not lose the statuses gathered so far.
        _save_gaia_data(tmp_json_file_path, gaia_data)

    end_time = time.time()
    total_time = end_time - start_time

    # 5. Summary
    print("\n--- Evaluation Summary ---")
    print(f"Processed {processed_count} questions with status=false.")
    print(f"Correct answers provided by agent: {correct_count}")
    final_correct_count = initially_correct + correct_count
    print(
        f"Total correct answers (initial + agent): {final_correct_count}/{total_questions}")
    print(f"Total evaluation time: {total_time:.2f} seconds")

    # 6. Save updated data
    try:
        _save_gaia_data(json_file_path, gaia_data)
        print(f"Successfully saved updated data to {json_file_path}")
    except Exception as e:
        print(f"Error saving updated data to {json_file_path}: {e}")
    print("--- GAIA Evaluation Finished ---")
def run_test_questions():
    """Create a BasicAgent and print its reply to every prompt in QA_PAIRS.

    The reference answers stored in QA_PAIRS are not consulted; this is a
    manual smoke test whose output is meant to be read by a person.
    Environment variables are not loaded here (the __main__ guard calls
    load_dotenv() before anything else runs).
    """
    print("--- Starting Agent Test ---")
    agent = BasicAgent()
    for prompt in QA_PAIRS:
        print("\n--- Testing Question ---")
        print(f"Q: {prompt}")
        reply = agent(prompt)  # the agent instance is callable
        print(f"Agent A: {reply}")
if __name__ == "__main__":
    # Load variables from a .env file before anything reads the environment.
    load_dotenv()
    hf_token = os.getenv('HF_TOKEN')
    if not hf_token:
        # Covers both an unset and an empty HF_TOKEN.
        print("Warning: HF_TOKEN environment variable not found. Agent might fail.")
    # run_test_questions() is the alternative smoke-test entry point; it is
    # currently disabled in favour of the GAIA evaluation.
    eval_GAIA()