Spaces:
Sleeping
Sleeping
# test_agents.py
import json | |
import time | |
import datetime | |
import os | |
from dotenv import load_dotenv | |
# Import the agent class and the GAIA attachment directory from the app module.
# Both names are imported inside the guard: an unguarded import placed before
# the try block would raise first and the friendly message below would never
# be shown.
try:
    from app import BasicAgent, GAIA_LEVEL1_VALIDATION_FILES_PATH
except ImportError as e:
    print(f"Error importing BasicAgent from app.py: {e}")
    print("Please ensure app.py is in the same directory or accessible in the Python path.")
    # SystemExit is always available; the bare exit() helper is injected by the
    # site module and is not guaranteed to exist in every interpreter setup.
    raise SystemExit(1)
# --- Question/answer fixtures ---
# The keys are the prompts; the values are reference answers kept for the
# reader. The agent produces its own answer for each prompt.
QA_PAIRS = {
    "What is the capital of France?": "Paris",
    "Who wrote 'Hamlet'?": "William Shakespeare",
    "What is the formula for water?": "H2O",
    "How does photosynthesis work?": (
        "Plants use sunlight, water, and carbon dioxide "
        "to create their own food."
    ),
    # Answering this one requires up-to-date data.
    "What is the current population of Earth?": "Approximately 8 billion",
}
def _append_log_entry(log_filename, q_num, question, correct_answer, agent_text):
    """Append one question record to the run log, terminated by an <END> marker."""
    with open(log_filename, 'a', encoding='utf-8') as log_f:
        log_f.write(f"*question number {q_num} *\n")
        log_f.write(f"Q: {question}\n")
        log_f.write(f"A: {correct_answer}\n")
        log_f.write(f"Agent: {agent_text}\n")
        log_f.write("<END>\n\n")


def eval_GAIA(json_file_path="GAIA_level1_status.json", question_ids=("27",),
              agent=None):
    """
    Evaluate pending GAIA level 1 questions and record which ones are solved.

    Loads the status JSON (a mapping of question ID to an entry with "Q", "A",
    "status" and optionally "file_name"), asks the agent every selected
    question whose status is False, marks matching answers as solved, logs
    incorrect answers and agent errors to a timestamped "Response_*.log" file
    in the current directory, writes a "*_tmp" checkpoint next to the JSON
    file after each processed question, and finally saves the updated data
    back to ``json_file_path``.

    Args:
        json_file_path (str): The path to the GAIA status JSON file.
            Defaults to "GAIA_level1_status.json".
        question_ids: Iterable of question IDs to evaluate (a single string
            is treated as one ID). Defaults to ("27",), the selection that
            was previously hard-coded. Pass None to evaluate every question.
        agent: Callable invoked as ``agent(question, file_name)``. When None,
            a BasicAgent is constructed.

    Returns:
        None. Results are reported on stdout and persisted to disk.
    """
    print(f"--- Starting GAIA Evaluation from {json_file_path} ---")

    # Build the checkpoint path from the extension only, so a ".json" that
    # appears elsewhere in the path is left alone and a path without that
    # extension never maps the checkpoint onto the source file itself.
    root, ext = os.path.splitext(json_file_path)
    tmp_json_file_path = f"{root}_tmp{ext or '.json'}"

    # 1. Load GAIA data
    try:
        with open(json_file_path, 'r', encoding='utf-8') as f:
            gaia_data = json.load(f)
    except FileNotFoundError:
        print(f"Error: JSON file not found at {json_file_path}")
        return
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from {json_file_path}")
        return
    except Exception as e:
        print(f"An unexpected error occurred loading the JSON file: {e}")
        return
    if not isinstance(gaia_data, dict):
        print(f"Error: Expected a JSON object keyed by question ID in {json_file_path}")
        return
    print(
        f"Successfully loaded {len(gaia_data)} questions from {json_file_path}.")

    # 2. Initialize agent, log file, and tracking variables
    if agent is None:
        try:
            agent = BasicAgent()
        except Exception as e:
            print(f"Error initializing BasicAgent: {e}")
            print("Evaluation cannot proceed.")
            return

    log_filename = f"Response_{datetime.datetime.now().strftime('%Y%m%d_%H%M')}.log"
    print(f"Incorrect answers will be logged to: {log_filename}")

    # JSON object keys are always strings, so normalise the requested IDs.
    if question_ids is None:
        selected = None
    elif isinstance(question_ids, str):
        selected = {question_ids}
    else:
        selected = {str(q_id) for q_id in question_ids}

    selected_items = [
        (q_num, data) for q_num, data in gaia_data.items()
        if selected is None or q_num in selected
    ]

    total_questions = len(gaia_data)
    processed_count = 0
    correct_count = 0  # Questions newly solved by the agent during this run.
    initially_correct = sum(
        1 for item in gaia_data.values() if item.get("status") is True)
    # Only selected questions that are still unsolved will reach the agent.
    questions_to_process = sum(
        1 for _, data in selected_items if data.get("status") is False)

    print(f"Found {initially_correct} questions already marked as correct.")
    if questions_to_process == 0:
        # The file is still saved at the end so its formatting is normalised.
        print("No questions with status=false found to process.")
    else:
        print(f"Attempting to answer {questions_to_process} questions...")

    start_time = time.time()

    # 3. Process questions
    for q_num, data in selected_items:
        status = data.get("status")
        if status is not False:
            # Solved questions are already part of initially_correct, so they
            # must not be added to correct_count a second time.
            if status is True:
                print(f"Skipping question {q_num}: Status is already True.")
            else:
                print(f"Skipping question {q_num}: Status is {status!r}, expected False.")
            continue

        processed_count += 1
        question = data.get("Q")
        correct_answer = data.get("A")
        if question is None or correct_answer is None:
            print(f"Skipping question {q_num}: Missing 'Q' or 'A'.")
            continue

        file_name = data.get("file_name")
        if file_name:
            file_name = os.path.join(GAIA_LEVEL1_VALIDATION_FILES_PATH, file_name)

        elapsed_time = time.time() - start_time
        print(
            f"\nProcessing question {processed_count}/{questions_to_process} (ID: {q_num}) | Elapsed: {elapsed_time:.2f}s")
        print(f"Q: {question[:100]}...")  # Print first 100 chars

        try:
            agent_response = agent(question, file_name)
        except Exception as e:
            # One failing question must not abort the whole run; record it.
            print(f"Error processing question {q_num} with agent: {e}")
            _append_log_entry(log_filename, q_num, question, correct_answer,
                              f"ERROR - {e}")
        else:
            print(f"Agent A: {agent_response}")
            print(f"Correct A: {correct_answer}")
            # Case-sensitive comparison that ignores surrounding whitespace.
            if str(agent_response).strip() == str(correct_answer).strip():
                print(f"Result for Q {q_num}: CORRECT")
                data["status"] = True
                correct_count += 1
            else:
                print(f"Result for Q {q_num}: INCORRECT")
                _append_log_entry(log_filename, q_num, question,
                                  correct_answer, agent_response)

        # Checkpoint after every processed question so progress survives a crash.
        with open(tmp_json_file_path, 'w', encoding='utf-8') as f:
            json.dump(gaia_data, f, indent=4, ensure_ascii=False)

    total_time = time.time() - start_time

    # 4. Summary
    print("\n--- Evaluation Summary ---")
    print(f"Processed {processed_count} questions with status=false.")
    print(f"Correct answers provided by agent: {correct_count}")
    final_correct_count = initially_correct + correct_count
    print(
        f"Total correct answers (initial + agent): {final_correct_count}/{total_questions}")
    print(f"Total evaluation time: {total_time:.2f} seconds")

    # 5. Save updated data
    try:
        with open(json_file_path, 'w', encoding='utf-8') as f:
            json.dump(gaia_data, f, indent=4, ensure_ascii=False)
        print(f"Successfully saved updated data to {json_file_path}")
    except Exception as e:
        print(f"Error saving updated data to {json_file_path}: {e}")
    print("--- GAIA Evaluation Finished ---")
def run_test_questions():
    """Create a BasicAgent, ask it every question in QA_PAIRS, and print each answer."""
    print("--- Starting Agent Test ---")
    # Environment variables are not loaded here; the script entry point calls
    # load_dotenv() before running anything.
    agent = BasicAgent()
    for prompt in QA_PAIRS:
        print("\n--- Testing Question ---")
        print(f"Q: {prompt}")
        reply = agent(prompt)  # The agent instance is callable.
        print(f"Agent A: {reply}")
if __name__ == "__main__":
    # Load environment variables via python-dotenv before checking the token.
    load_dotenv()
    hf_token = os.getenv('HF_TOKEN')
    if not hf_token:
        print("Warning: HF_TOKEN environment variable not found. Agent might fail.")
    # run_test_questions() is available as a lighter smoke test; the GAIA
    # evaluation is what runs by default.
    eval_GAIA()