# Final_Assignment_Template
#
# Sleeping
#
# File size: 7,637 Bytes

# test_agents.py
import json
import time
import datetime
import os
from dotenv import load_dotenv
from app import BasicAgent, GAIA_LEVEL1_VALIDATION_FILES_PATH

# Import the BasicAgent from your app module, aborting with a readable
# message (instead of a traceback) when app.py cannot be imported.
# NOTE(review): the unguarded `from app import BasicAgent, ...` earlier in this
# file raises first when app.py is missing, so this guard only takes effect
# once that import is protected as well.
try:
    from app import BasicAgent
except ImportError as e:
    print(f"Error importing BasicAgent from app.py: {e}")
    print("Please ensure app.py is in the same directory or accessible in the Python path.")
    # `exit()` is a convenience injected by the `site` module and is not
    # guaranteed to exist (e.g. `python -S`); SystemExit always is.
    raise SystemExit(1)

# --- Smoke-test questions with reference answers ---
# Only the keys (the questions) are handed to the agent. The values are kept
# purely as a human-readable reference; nothing compares them programmatically.
QA_PAIRS = {
    "What is the capital of France?": "Paris",
    "Who wrote 'Hamlet'?": "William Shakespeare",
    "What is the formula for water?": "H2O",
    "How does photosynthesis work?": (
        "Plants use sunlight, water, and carbon dioxide to create their own food."
    ),
    # This one needs up-to-date data, so the agent is expected to look it up.
    "What is the current population of Earth?": "Approximately 8 billion",
}


def _append_log_entry(log_filename, q_num, question, correct_answer, agent_text):
    """Append one question record (question, reference answer, agent output) to the log file."""
    with open(log_filename, 'a', encoding='utf-8') as log_f:
        log_f.write(f"*question number {q_num} *\n")
        log_f.write(f"Q: {question}\n")
        log_f.write(f"A: {correct_answer}\n")
        log_f.write(f"Agent: {agent_text}\n")
        log_f.write("<END>\n\n")


def eval_GAIA(json_file_path="GAIA_level1_status.json", question_ids=("27",), agent=None):
    """
    Evaluate unanswered GAIA level 1 questions with an agent and persist the results.

    Loads a JSON object mapping question ids to entries with the keys "Q",
    "A", "status" and optionally "file_name". Every selected entry whose
    status is False is sent to the agent; the stripped agent response is
    compared (exact, case-sensitive) with the stripped reference answer.
    Matching entries get status True. Mismatches and agent errors are
    appended to a timestamped ``Response_YYYYmmdd_HHMM.log`` file in the
    current working directory. After each processed question the data is
    checkpointed to ``<name>_tmp<ext>`` next to the input file, and at the
    end the input file itself is overwritten with the updated data.

    Args:
        json_file_path (str): The path to the GAIA status JSON file.
                              Defaults to "GAIA_level1_status.json".
        question_ids: Iterable of question ids to evaluate, a single id
                      string, or None to evaluate every question. Defaults
                      to ("27",), which keeps the previous hard-coded
                      single-question behaviour.
        agent: Callable invoked as ``agent(question, file_path_or_None)``.
               When None, a ``BasicAgent()`` is created.

    Returns:
        None. Progress, errors and the summary are printed to stdout.
    """
    print(f"--- Starting GAIA Evaluation from {json_file_path} ---")

    # Derive the checkpoint path from the extension only, so that a ".json"
    # elsewhere in the path is untouched and the checkpoint never collides
    # with the input file.
    path_root, path_ext = os.path.splitext(json_file_path)
    tmp_json_file_path = f"{path_root}_tmp{path_ext}"

    # 1. Load GAIA data
    try:
        with open(json_file_path, 'r', encoding='utf-8') as f:
            gaia_data = json.load(f)
        print(
            f"Successfully loaded {len(gaia_data)} questions from {json_file_path}.")
    except FileNotFoundError:
        print(f"Error: JSON file not found at {json_file_path}")
        return
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from {json_file_path}")
        return
    except Exception as e:
        print(f"An unexpected error occurred loading the JSON file: {e}")
        return

    if not isinstance(gaia_data, dict):
        print(f"Error: Expected a JSON object keyed by question id in {json_file_path}")
        return

    # 2. Initialize Agent, Log file, and Tracking variables
    if agent is None:
        try:
            agent = BasicAgent()
        except Exception as e:
            print(f"Error initializing BasicAgent: {e}")
            print("Evaluation cannot proceed.")
            return

    log_filename = f"Response_{datetime.datetime.now().strftime('%Y%m%d_%H%M')}.log"
    print(f"Incorrect answers will be logged to: {log_filename}")

    # Normalise the filter; JSON object keys are always strings.
    if isinstance(question_ids, str):
        question_ids = (question_ids,)
    selected_ids = None if question_ids is None else {str(q) for q in question_ids}

    total_questions = len(gaia_data)
    processed_count = 0
    correct_count = 0  # answers newly marked correct by the agent in this run
    initially_correct = sum(1 for item in gaia_data.values()
                            if item.get("status") is True)
    questions_to_process = sum(
        1 for q_num, item in gaia_data.items()
        if (selected_ids is None or q_num in selected_ids)
        and item.get("status") is False
    )

    print(f"Found {initially_correct} questions already marked as correct.")
    if questions_to_process == 0:
        # The file is still rewritten below so its formatting stays consistent.
        print("No questions with status=false found to process.")
    else:
        print(f"Attempting to answer {questions_to_process} questions...")

    start_time = time.time()

    # 3. Process questions
    for q_num, data in gaia_data.items():
        if selected_ids is not None and q_num not in selected_ids:
            continue

        if data.get("status") is not False:
            # Already counted in initially_correct; do not count it again.
            print(f"Skipping question {q_num}: Status is already True.")
            continue

        question = data.get("Q")
        correct_answer = data.get("A")
        if question is None or correct_answer is None:
            print(f"Skipping question {q_num}: Missing 'Q' or 'A'.")
            continue

        file_name = data.get("file_name")
        if file_name:
            file_name = os.path.join(GAIA_LEVEL1_VALIDATION_FILES_PATH, file_name)

        processed_count += 1
        elapsed_time = time.time() - start_time
        print(
            f"\nProcessing question {processed_count}/{questions_to_process} (ID: {q_num}) | Elapsed: {elapsed_time:.2f}s")
        print(f"Q: {question[:100]}...")  # Print first 100 chars

        try:
            agent_response = agent(question, file_name)
        except Exception as e:
            # An agent failure on one question must not abort the whole run.
            print(f"Error processing question {q_num} with agent: {e}")
            _append_log_entry(log_filename, q_num, question, correct_answer,
                              f"ERROR - {e}")
        else:
            print(f"Agent A: {agent_response}")
            print(f"Correct A: {correct_answer}")

            # Exact, case-sensitive match after trimming surrounding whitespace.
            if str(agent_response).strip() == str(correct_answer).strip():
                print(f"Result for Q {q_num}: CORRECT")
                data["status"] = True
                correct_count += 1
            else:
                print(f"Result for Q {q_num}: INCORRECT")
                _append_log_entry(log_filename, q_num, question, correct_answer,
                                  agent_response)

        # Checkpoint after every processed question so progress survives a crash.
        with open(tmp_json_file_path, 'w', encoding='utf-8') as f:
            json.dump(gaia_data, f, indent=4, ensure_ascii=False)

    end_time = time.time()
    total_time = end_time - start_time

    # 4. Summary
    print("\n--- Evaluation Summary ---")
    print(f"Processed {processed_count} questions with status=false.")
    print(f"Correct answers provided by agent: {correct_count}")
    final_correct_count = initially_correct + correct_count
    print(
        f"Total correct answers (initial + agent): {final_correct_count}/{total_questions}")
    print(f"Total evaluation time: {total_time:.2f} seconds")

    # 5. Save updated data
    try:
        with open(json_file_path, 'w', encoding='utf-8') as f:
            json.dump(gaia_data, f, indent=4, ensure_ascii=False)
        print(f"Successfully saved updated data to {json_file_path}")
    except Exception as e:
        print(f"Error saving updated data to {json_file_path}: {e}")

    print("--- GAIA Evaluation Finished ---")


def run_test_questions():
    """Create a BasicAgent and print its answer to every question in QA_PAIRS.

    Environment variables are not loaded here; the script entry point is
    responsible for calling load_dotenv() before this function runs.
    """
    print("--- Starting Agent Test ---")

    agent = BasicAgent()

    # Iterating the dict yields its keys, i.e. the question strings.
    for prompt in QA_PAIRS:
        print("\n--- Testing Question ---")
        print(f"Q: {prompt}")
        reply = agent(prompt)  # The agent instance is callable
        print(f"Agent A: {reply}")


if __name__ == "__main__":
    # Pull variables from a local .env file into the process environment
    # before anything reads them.
    load_dotenv()

    # Warn early about a missing (or empty) token; execution continues anyway.
    if not os.getenv('HF_TOKEN'):
        print("Warning: HF_TOKEN environment variable not found. Agent might fail.")

    # The quick smoke test, run_test_questions(), is intentionally not run
    # here; only the GAIA evaluation is executed.
    eval_GAIA()