# test_agents.py
"""Manual test and evaluation harness for ``BasicAgent``.

Provides two entry points:

* ``eval_GAIA`` runs the agent over questions stored in a GAIA status JSON
  file and records which ones it answers correctly.
* ``run_test_questions`` runs the agent over a few hard-coded smoke-test
  questions and prints the answers.
"""
import datetime
import json
import os
import time

from dotenv import load_dotenv

# Import the agent and the GAIA attachment directory from the app module.
# Both names are imported inside the guard so that a missing/broken app.py
# produces the friendly message below instead of a raw traceback.
try:
    from app import BasicAgent, GAIA_LEVEL1_VALIDATION_FILES_PATH
except ImportError as e:
    print(f"Error importing BasicAgent from app.py: {e}")
    print("Please ensure app.py is in the same directory or accessible in the Python path.")
    # SystemExit instead of exit(): exit() is injected by the `site` module
    # and is not guaranteed to exist in every interpreter configuration.
    raise SystemExit(1)

# --- Define Question-Answer Pairs ---
# Note: The 'A' part is just for reference here; the agent will generate its own answer.
QA_PAIRS = {
    "What is the capital of France?": "Paris",
    "Who wrote 'Hamlet'?": "William Shakespeare",
    "What is the formula for water?": "H2O",
    "How does photosynthesis work?": "Plants use sunlight, water, and carbon dioxide to create their own food.",
    # Agent should find current data
    "What is the current population of Earth?": "Approximately 8 billion",
}

# Question IDs evaluated by eval_GAIA() when the caller does not choose any.
# NOTE(review): this preserves the previously hard-coded debug selection;
# pass question_ids=None to evaluate every question in the file.
DEFAULT_QUESTION_IDS = ("27",)


def _write_json(data, path):
    """Serialize *data* to *path* as indented UTF-8 JSON."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4, ensure_ascii=False)


def _append_log_entry(log_filename, q_num, question, correct_answer, agent_text):
    """Append one question/expected-answer/agent-output record to the log."""
    with open(log_filename, 'a', encoding='utf-8') as log_f:
        log_f.write(f"*question number {q_num} *\n")
        log_f.write(f"Q: {question}\n")
        log_f.write(f"A: {correct_answer}\n")
        log_f.write(f"Agent: {agent_text}\n")
        log_f.write("\n\n")


def eval_GAIA(json_file_path="GAIA_level1_status.json",
              question_ids=DEFAULT_QUESTION_IDS):
    """Evaluate unanswered GAIA level 1 questions with ``BasicAgent``.

    Loads the question table from a JSON file, asks the agent every selected
    question whose ``"status"`` is ``False``, marks the entry ``True`` when
    the agent's answer equals the reference answer (compared as stripped
    strings, case-sensitive), appends incorrect answers and agent errors to
    a timestamped ``Response_*.log`` file, and finally writes the updated
    table back to ``json_file_path``. After each processed question the
    table is also written to a sibling ``*_tmp`` file as a checkpoint.

    Each entry of the JSON object is read with the keys ``"Q"`` (question),
    ``"A"`` (reference answer), ``"status"`` and optionally ``"file_name"``
    (an attachment resolved against ``GAIA_LEVEL1_VALIDATION_FILES_PATH``).

    Args:
        json_file_path (str): The path to the GAIA status JSON file.
            Defaults to "GAIA_level1_status.json".
        question_ids: Iterable of question IDs (the JSON object's keys) to
            evaluate, or ``None`` to evaluate every question. Defaults to
            ``DEFAULT_QUESTION_IDS``.

    Returns:
        None. Problems loading the file or creating the agent are reported
        on stdout and end the evaluation early.
    """
    print(f"--- Starting GAIA Evaluation from {json_file_path} ---")

    # Insert "_tmp" before the extension. splitext (rather than
    # str.replace) touches only the final extension and still yields a
    # distinct checkpoint path when the file name has no ".json" suffix.
    path_root, path_ext = os.path.splitext(json_file_path)
    tmp_json_file_path = f"{path_root}_tmp{path_ext}"

    # 2. Load GAIA data
    try:
        with open(json_file_path, 'r', encoding='utf-8') as f:
            gaia_data = json.load(f)
        print(
            f"Successfully loaded {len(gaia_data)} questions from {json_file_path}.")
    except FileNotFoundError:
        print(f"Error: JSON file not found at {json_file_path}")
        return
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from {json_file_path}")
        return
    except Exception as e:
        print(f"An unexpected error occurred loading the JSON file: {e}")
        return

    # 3. Initialize Agent, Log file, and Tracking variables
    try:
        agent = BasicAgent()
    except Exception as e:
        print(f"Error initializing BasicAgent: {e}")
        print("Evaluation cannot proceed.")
        return

    log_filename = f"Response_{datetime.datetime.now().strftime('%Y%m%d_%H%M')}.log"
    print(f"Incorrect answers will be logged to: {log_filename}")

    total_questions = len(gaia_data)
    processed_count = 0
    correct_count = 0  # answers newly produced by the agent in this run
    initially_correct = sum(
        1 for item in gaia_data.values() if item.get("status") is True)
    questions_to_process = total_questions - initially_correct

    print(f"Found {initially_correct} questions already marked as correct.")
    if questions_to_process == 0:
        # Nothing to ask, but the file is still re-saved below for
        # consistent formatting.
        print("No questions with status=false found to process.")
    else:
        print(f"Attempting to answer {questions_to_process} questions...")

    # Started unconditionally so the summary can always compute a duration.
    start_time = time.time()

    # Set for O(1) membership tests; None means "no filtering".
    selected_ids = None if question_ids is None else set(question_ids)

    # 4. Process questions
    for q_num, data in gaia_data.items():
        if selected_ids is not None and q_num not in selected_ids:
            continue

        if data.get("status") is not False:
            # Already-correct entries are counted in initially_correct;
            # they must not be added to correct_count as well.
            print(f"Skipping question {q_num}: Status is already True.")
            continue

        question = data.get("Q")
        correct_answer = data.get("A")
        if question is None or correct_answer is None:
            print(f"Skipping question {q_num}: Missing 'Q' or 'A'.")
            continue

        file_name = data.get("file_name", None)
        if file_name:
            file_name = os.path.join(
                GAIA_LEVEL1_VALIDATION_FILES_PATH, file_name)

        processed_count += 1
        elapsed_time = time.time() - start_time
        print(
            f"\nProcessing question {processed_count}/{questions_to_process} (ID: {q_num}) | Elapsed: {elapsed_time:.2f}s")
        print(f"Q: {question[:100]}...")  # Print first 100 chars

        try:
            agent_response = agent(question, file_name)
        except Exception as e:
            # One failing question must not abort the whole evaluation.
            print(f"Error processing question {q_num} with agent: {e}")
            _append_log_entry(
                log_filename, q_num, question, correct_answer, f"ERROR - {e}")
        else:
            print(f"Agent A: {agent_response}")
            print(f"Correct A: {correct_answer}")

            # Simple comparison: exact, case-sensitive match after
            # stripping surrounding whitespace.
            if str(agent_response).strip() == str(correct_answer).strip():
                print(f"Result for Q {q_num}: CORRECT")
                gaia_data[q_num]["status"] = True
                correct_count += 1
            else:
                print(f"Result for Q {q_num}: INCORRECT")
                _append_log_entry(
                    log_filename, q_num, question, correct_answer,
                    agent_response)

        # Checkpoint after every processed question so progress survives
        # an interrupted run.
        _write_json(gaia_data, tmp_json_file_path)

    end_time = time.time()
    total_time = end_time - start_time

    # 5. Summary
    print("\n--- Evaluation Summary ---")
    print(f"Processed {processed_count} questions with status=false.")
    print(f"Correct answers provided by agent: {correct_count}")
    final_correct_count = initially_correct + correct_count
    print(
        f"Total correct answers (initial + agent): {final_correct_count}/{total_questions}")
    print(f"Total evaluation time: {total_time:.2f} seconds")

    # 6. Save updated data
    try:
        _write_json(gaia_data, json_file_path)
        print(f"Successfully saved updated data to {json_file_path}")
    except Exception as e:
        print(f"Error saving updated data to {json_file_path}: {e}")

    print("--- GAIA Evaluation Finished ---")


def run_test_questions():
    """Instantiate the agent and print its answer to each ``QA_PAIRS`` question.

    The reference answers in ``QA_PAIRS`` are not compared against the
    agent's output; they are only printed by a human reading the results.
    Environment variables are loaded by the ``__main__`` guard, not here.
    """
    print("--- Starting Agent Test ---")

    agent = BasicAgent()

    for question in QA_PAIRS:
        print("\n--- Testing Question ---")
        print(f"Q: {question}")
        answer = agent(question)  # Call the agent instance
        print(f"Agent A: {answer}")


if __name__ == "__main__":
    load_dotenv()
    if not os.getenv('HF_TOKEN'):
        print("Warning: HF_TOKEN environment variable not found. Agent might fail.")
    # Uncomment the entry point you want to run:
    # run_test_questions()
    # eval_GAIA()