Final_Assignment_Template

Sleeping

App Files Files Community

Final_Assignment_Template / test_agents.py

TzurVaich

Work in progress

ec4f8ef 8 days ago

raw

history blame contribute delete

7.64 kB

	# test_agents.py
	import json
	import time
	import datetime
	import os
	from dotenv import load_dotenv
	from app import BasicAgent, GAIA_LEVEL1_VALIDATION_FILES_PATH

	# Import the BasicAgent from the app module, exiting with a readable message
	# if it cannot be imported.
	# NOTE(review): `app` is also imported without a guard earlier in this file,
	# so an ImportError would surface there first; this guard only takes effect
	# if that earlier import is removed or guarded as well.
	try:
	    from app import BasicAgent
	except ImportError as e:
	    print(f"Error importing BasicAgent from app.py: {e}")
	    print("Please ensure app.py is in the same directory or accessible in the Python path.")
	    # The bare `exit` helper is injected by the `site` module for interactive
	    # use and is not guaranteed to exist (e.g. `python -S`, frozen apps);
	    # raising SystemExit works everywhere and yields the same exit status 1.
	    raise SystemExit(1)

	# --- Smoke-test questions with reference answers ---
	# The values are reference answers for a human reader only; they are never
	# compared against what the agent produces.
	QA_PAIRS = {
	    "What is the capital of France?": "Paris",
	    "Who wrote 'Hamlet'?": "William Shakespeare",
	    "What is the formula for water?": "H2O",
	    "How does photosynthesis work?": (
	        "Plants use sunlight, water, and carbon dioxide "
	        "to create their own food."
	    ),
	    # The agent is expected to look up current data for this one.
	    "What is the current population of Earth?": "Approximately 8 billion",
	}


	def _append_log_entry(log_filename, q_num, question, correct_answer, agent_text):
	    """Append one question record to the response log (best effort).

	    A failure to write the log is reported on stdout instead of aborting the
	    evaluation run.
	    """
	    try:
	        with open(log_filename, 'a', encoding='utf-8') as log_f:
	            log_f.write(f"question number {q_num} \n")
	            log_f.write(f"Q: {question}\n")
	            log_f.write(f"A: {correct_answer}\n")
	            log_f.write(f"Agent: {agent_text}\n")
	            log_f.write("<END>\n\n")
	    except OSError as e:
	        print(f"Warning: could not write to log file {log_filename}: {e}")


	def _write_json(path, data):
	    """Write *data* to *path* as indented JSON, keeping non-ASCII characters."""
	    with open(path, 'w', encoding='utf-8') as f:
	        json.dump(data, f, indent=4, ensure_ascii=False)


	def eval_GAIA(json_file_path="GAIA_level1_status.json", agent=None,
	        question_ids=("27",)):
	    """
	    Evaluate selected GAIA level 1 questions and record which were answered
	    correctly.

	    The status file is a JSON object mapping question IDs to entries with the
	    keys "Q" (question), "A" (expected answer), "status" (True once answered
	    correctly) and optionally "file_name" (attachment, resolved relative to
	    GAIA_LEVEL1_VALIDATION_FILES_PATH). Every selected entry whose status is
	    not True is passed to the agent; the stripped string forms of the agent's
	    reply and the expected answer are compared, and a match sets the entry's
	    status to True. Wrong answers and agent errors are appended to a
	    timestamped "Response_<YYYYmmdd_HHMM>.log" file in the working directory.
	    After each evaluated question the data is checkpointed to
	    "<name>_tmp<ext>" next to the status file, and at the end the status
	    file itself is rewritten.

	    Args:
	        json_file_path (str): The path to the GAIA status JSON file.
	            Defaults to "GAIA_level1_status.json".
	        agent: Callable invoked as ``agent(question, file_path_or_None)``.
	            When None (the default) a ``BasicAgent()`` is created.
	        question_ids: Iterable of question IDs (or a single ID string) to
	            evaluate. Defaults to ("27",), the single question the original
	            script was restricted to. Pass None to evaluate every question.

	    Returns:
	        None. All results are reported on stdout and through the files
	        described above. Load or agent-initialisation failures are printed
	        and end the run early.
	    """
	    print(f"--- Starting GAIA Evaluation from {json_file_path} ---")

	    # Derive the checkpoint path from the extension only, so a path without
	    # ".json" (or with ".json" inside a directory name) cannot alias or
	    # mangle the real status file.
	    path_root, path_ext = os.path.splitext(json_file_path)
	    tmp_json_file_path = f"{path_root}_tmp{path_ext}"

	    # 1. Load GAIA data
	    try:
	        with open(json_file_path, 'r', encoding='utf-8') as f:
	            gaia_data = json.load(f)
	    except FileNotFoundError:
	        print(f"Error: JSON file not found at {json_file_path}")
	        return
	    except json.JSONDecodeError:
	        print(f"Error: Could not decode JSON from {json_file_path}")
	        return
	    except (OSError, ValueError) as e:
	        print(f"An unexpected error occurred loading the JSON file: {e}")
	        return

	    # Everything below relies on dict-of-dict access; reject other shapes
	    # up front instead of failing later with an AttributeError.
	    if not isinstance(gaia_data, dict) or not all(
	            isinstance(entry, dict) for entry in gaia_data.values()):
	        print(
	            f"Error: {json_file_path} must contain a JSON object mapping "
	            "question IDs to question entries.")
	        return
	    print(
	        f"Successfully loaded {len(gaia_data)} questions from {json_file_path}.")

	    # 2. Initialize Agent, Log file, and Tracking variables
	    if agent is None:
	        # The agent constructor is project code that may fail in arbitrary
	        # ways, hence the deliberately broad catch at this boundary.
	        try:
	            agent = BasicAgent()
	        except Exception as e:
	            print(f"Error initializing BasicAgent: {e}")
	            print("Evaluation cannot proceed.")
	            return

	    log_filename = f"Response_{datetime.datetime.now().strftime('%Y%m%d_%H%M')}.log"
	    print(f"Incorrect answers will be logged to: {log_filename}")

	    if question_ids is None:
	        selected_ids = None  # no filter: consider every question
	    elif isinstance(question_ids, str):
	        selected_ids = {question_ids}  # a bare string is one ID, not characters
	    else:
	        selected_ids = {str(q_id) for q_id in question_ids}

	    total_questions = len(gaia_data)
	    processed_count = 0
	    correct_count = 0
	    initially_correct = sum(
	        1 for item in gaia_data.values() if item.get("status") is True)
	    # Only selected questions that are not yet correct will be attempted.
	    questions_to_process = sum(
	        1 for q_num, item in gaia_data.items()
	        if (selected_ids is None or q_num in selected_ids)
	        and item.get("status") is not True)

	    print(f"Found {initially_correct} questions already marked as correct.")
	    if questions_to_process == 0:
	        # The file is still re-saved at the end for consistent formatting.
	        print("No questions with status=false found to process.")
	    else:
	        print(f"Attempting to answer {questions_to_process} questions...")

	    start_time = time.time()

	    # 3. Process questions
	    for q_num, data in gaia_data.items():
	        if selected_ids is not None and q_num not in selected_ids:
	            continue

	        if data.get("status") is True:
	            # Already included in initially_correct; do not count it again.
	            print(f"Skipping question {q_num}: Status is already True.")
	            continue

	        question = data.get("Q")
	        correct_answer = data.get("A")
	        if question is None or correct_answer is None:
	            print(f"Skipping question {q_num}: Missing 'Q' or 'A'.")
	            continue

	        # Count only questions that are actually handed to the agent.
	        processed_count += 1

	        file_name = data.get("file_name")
	        if file_name:
	            file_name = os.path.join(GAIA_LEVEL1_VALIDATION_FILES_PATH, file_name)

	        elapsed_time = time.time() - start_time
	        print(
	            f"\nProcessing question {processed_count}/{questions_to_process} (ID: {q_num}) | Elapsed: {elapsed_time:.2f}s")
	        print(f"Q: {str(question)[:100]}...")  # Print first 100 chars

	        try:
	            agent_response = agent(question, file_name)
	        except Exception as e:
	            # The agent is arbitrary external code: record the failure and
	            # move on to the next question instead of aborting the run.
	            print(f"Error processing question {q_num} with agent: {e}")
	            _append_log_entry(
	                log_filename, q_num, question, correct_answer, f"ERROR - {e}")
	        else:
	            print(f"Agent A: {agent_response}")
	            print(f"Correct A: {correct_answer}")

	            # Exact, case-sensitive comparison after stripping surrounding
	            # whitespace from both sides.
	            if str(agent_response).strip() == str(correct_answer).strip():
	                print(f"Result for Q {q_num}: CORRECT")
	                data["status"] = True
	                correct_count += 1
	            else:
	                print(f"Result for Q {q_num}: INCORRECT")
	                _append_log_entry(
	                    log_filename, q_num, question, correct_answer, agent_response)

	        # Checkpoint progress so an interrupted run does not lose results.
	        _write_json(tmp_json_file_path, gaia_data)

	    total_time = time.time() - start_time

	    # 4. Summary
	    print("\n--- Evaluation Summary ---")
	    print(f"Processed {processed_count} questions with status=false.")
	    print(f"Correct answers provided by agent: {correct_count}")
	    final_correct_count = initially_correct + correct_count
	    print(
	        f"Total correct answers (initial + agent): {final_correct_count}/{total_questions}")
	    print(f"Total evaluation time: {total_time:.2f} seconds")

	    # 5. Save updated data
	    try:
	        _write_json(json_file_path, gaia_data)
	        print(f"Successfully saved updated data to {json_file_path}")
	    except (OSError, TypeError, ValueError) as e:
	        print(f"Error saving updated data to {json_file_path}: {e}")

	    print("--- GAIA Evaluation Finished ---")


	def run_test_questions():
	    """Run a freshly created BasicAgent over every question in QA_PAIRS.

	    Each question and the agent's reply are printed to stdout. The reference
	    answers stored in QA_PAIRS are not consulted, and no .env loading is
	    performed here.
	    """
	    print("--- Starting Agent Test ---")

	    basic_agent = BasicAgent()

	    for prompt in QA_PAIRS:
	        print("\n--- Testing Question ---")
	        print(f"Q: {prompt}")
	        reply = basic_agent(prompt)  # the agent instance is callable
	        print(f"Agent A: {reply}")


	if __name__ == "__main__":
	    # Script entry point: load environment settings, warn when the Hugging
	    # Face token is absent, then start the GAIA evaluation with its defaults.
	    load_dotenv()

	    hf_token = os.getenv('HF_TOKEN')
	    if not hf_token:
	        print("Warning: HF_TOKEN environment variable not found. Agent might fail.")

	    # run_test_questions() is the alternative smoke test and is not invoked here.
	    eval_GAIA()