Spaces:
Running
Running
File size: 20,557 Bytes
298104a d11f3f5 298104a d11f3f5 9f53a53 298104a a9f9e3a 298104a 9f53a53 d93406c 9f53a53 a9f9e3a 9f53a53 d11f3f5 9f53a53 e728ff2 d11f3f5 e728ff2 d11f3f5 e728ff2 d11f3f5 e728ff2 d11f3f5 e728ff2 d11f3f5 e728ff2 d11f3f5 d085539 d11f3f5 c638053 42a9dc7 c638053 42a9dc7 d11f3f5 c638053 d11f3f5 c638053 d11f3f5 298104a 5b9a69f d11f3f5 5b9a69f 298104a d11f3f5 298104a d11f3f5 298104a d11f3f5 298104a d11f3f5 298104a d11f3f5 298104a d11f3f5 298104a d11f3f5 298104a d11f3f5 298104a d11f3f5 73fc334 d11f3f5 298104a d11f3f5 298104a d11f3f5 298104a d11f3f5 298104a d11f3f5 298104a d11f3f5 298104a d11f3f5 298104a d11f3f5 298104a d11f3f5 298104a d11f3f5 298104a d11f3f5 298104a d11f3f5 298104a d11f3f5 298104a d11f3f5 298104a 6b849a6 d11f3f5 298104a d11f3f5 298104a d11f3f5 298104a d11f3f5 d93406c d11f3f5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 |
import os
import pandas as pd
from fastapi import FastAPI, HTTPException, Body
from pydantic import BaseModel, Field
from typing import List, Dict, Any
from datasets import load_dataset, Dataset, DatasetDict
from huggingface_hub import HfApi, hf_hub_download
from datetime import datetime, timezone
import logging
import uvicorn # To run the app
import random # <-- Added import for random choice
# Filter bounds used by load_questions(): a task is kept only when its
# annotated tool count is strictly below tool_threshold AND its annotated
# step count is strictly below step_threshold.
tool_threshold = 3
step_threshold = 5
# --- Configuration ---
# Hugging Face dataset repository that update_huggingface_dataset() loads the
# leaderboard from and pushes the updated leaderboard back to.
HF_DATASET_ID = "agents-course/unit4-students-scores"
# Ensure you have write access to this dataset repository on Hugging Face
# and are logged in via `huggingface-cli login` or have HF_TOKEN env var set.
# Prepare data structures for the API
# Both containers are filled by load_questions():
#   questions_for_api    - one dict per served task (what the GET endpoints return)
#   ground_truth_answers - task_id (as str) -> expected final answer (as str),
#                          read by the /submit endpoint when grading
# NOTE(review): the entries stored in questions_for_api are copies of dataset
# records, so their values are not necessarily strings; Dict[str, str] looks
# narrower than the real data - confirm and widen the annotation if so.
questions_for_api: List[Dict[str, str]] = []
ground_truth_answers: Dict[str, str] = {}
# --- Logging Setup ---
# Root logger at INFO; every function in this module logs through `logger`.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# List of dataset records that passed the tool/step filter; None until
# load_questions() has run.
filtered_dataset=None
def _passes_complexity_filter(record):
    """Return True when *record* is below both the tool and the step threshold.

    Records without 'Annotator Metadata', or whose counts cannot be converted
    to integers, are logged and rejected. Records whose metadata lacks either
    count are rejected silently.
    """
    metadata = record.get('Annotator Metadata')
    if not metadata:
        logger.warning(f"Skipping Task ID: {record.get('task_id', 'N/A')} - Missing 'Annotator Metadata'.")
        return False

    num_tools_str = metadata.get('Number of tools')
    num_steps_str = metadata.get('Number of steps')
    if num_tools_str is None or num_steps_str is None:
        return False

    try:
        num_tools = int(num_tools_str)
        num_steps = int(num_steps_str)
    except (TypeError, ValueError):
        # TypeError covers non-string, non-numeric values (e.g. a list), which
        # would otherwise abort the whole load instead of skipping one record.
        logger.warning(f"Skipping Task ID: {record.get('task_id', 'N/A')} - Could not convert tool/step count to integer: tools='{num_tools_str}', steps='{num_steps_str}'.")
        return False

    return num_tools < tool_threshold and num_steps < step_threshold


def load_questions():
    """Load the GAIA validation split, filter it, and publish it to the module globals.

    Loads config "2023_level1" of "gaia-benchmark/GAIA", keeps the validation
    records accepted by the tool/step filter, and then fills:

    * ``filtered_dataset``     - the accepted records, unmodified;
    * ``questions_for_api``    - one public dict per task, with the answer and
      annotator fields removed and lowercase ``task_id`` / ``question`` keys
      added so the entries satisfy the ``Question`` response model;
    * ``ground_truth_answers`` - ``str(task_id) -> str(final answer)``.

    The two containers are updated in place (their identity is preserved) and
    only after the download succeeded, so a failed reload leaves previously
    loaded questions untouched.

    Raises:
        RuntimeError: if the GAIA dataset cannot be loaded.
    """
    global filtered_dataset

    logger.info("Starting to load and filter GAIA dataset...")
    try:
        dataset = load_dataset("gaia-benchmark/GAIA", "2023_level1", trust_remote_code=True)
    except Exception as e:
        logger.error(f"Failed to load GAIA dataset: {e}", exc_info=True)
        raise RuntimeError("Could not load the primary GAIA dataset.") from e
    logger.info("GAIA dataset loaded.")

    matching = [record for record in dataset['validation'] if _passes_complexity_filter(record)]
    logger.info(f"Found {len(matching)} questions matching the criteria (tools < {tool_threshold}, steps < {step_threshold}).")

    api_questions = []
    answers = {}
    for item in matching:
        task_id = item.get('task_id')
        question_text = item.get('Question')
        final_answer = item.get('Final answer')

        # An empty-string answer is still a valid ground truth, hence `is not None`.
        if not (task_id and question_text and final_answer is not None):
            logger.warning(f"Skipping item due to missing fields (task_id, Question, or Final answer): {item}")
            continue

        public_item = dict(item)
        # Strip everything that reveals the solution. 'Annotator Metadata' is
        # the annotator field this module actually reads from the records; the
        # 'Annotator Annotation' key is kept in the list for compatibility.
        for private_key in ('Final answer', 'Annotator Annotation', 'Annotator Metadata'):
            public_item.pop(private_key, None)
        # The Question response model requires string `task_id` and lowercase
        # `question`; the dataset only provides the capitalised 'Question'.
        public_item['task_id'] = str(task_id)
        public_item['question'] = str(question_text)

        api_questions.append(public_item)
        answers[str(task_id)] = str(final_answer)

    # Publish the new state only now that everything has been built.
    filtered_dataset = matching
    questions_for_api[:] = api_questions
    ground_truth_answers.clear()
    ground_truth_answers.update(answers)

    if not questions_for_api:
        logger.error("CRITICAL: No valid questions loaded after filtering. API endpoints needing questions will fail.")
# --- Pydantic Models for Data Validation ---
class Question(BaseModel):
    # Public shape of one task; used as the response model of the
    # /questions and /random-question endpoints.
    # NOTE(review): the dicts returned by those endpoints must expose
    # lowercase `task_id` and `question` keys to validate against this
    # model - confirm load_questions() provides them.
    task_id: str
    question: str
class AnswerItem(BaseModel):
    # One answer inside a submission: which task, and what the agent replied.
    task_id: str
    submitted_answer: str = Field(
        ...,
        description="The agent's answer for the task_id",
    )
class Submission(BaseModel):
    # Request body of POST /submit.
    username: str = Field(
        ...,
        description="Hugging Face username",
        min_length=1,
    )
    # min_length is only a basic presence check; /submit re-checks the
    # length after stripping whitespace.
    agent_code: str = Field(
        ...,
        description="The Python class code for the agent",
        min_length=10,
    )
    answers: List[AnswerItem] = Field(
        ...,
        description="List of answers submitted by the agent",
    )
class ScoreResponse(BaseModel):
    # Response body of POST /submit.
    username: str
    score: float          # percentage, rounded to two decimals by /submit
    correct_count: int    # answers that matched the ground truth
    total_attempted: int  # /submit fills this with the count of valid task ids
    message: str          # human-readable summary of scoring and leaderboard outcome
    timestamp: str        # ISO-8601 UTC time at which the response was built
class ErrorResponse(BaseModel):
    # Error payload referenced by the endpoints' `responses=` documentation.
    detail: str
# --- FastAPI Application ---
# Single application instance; the startup hook and all route decorators
# below attach to it, and the __main__ guard passes it to uvicorn.
app = FastAPI(
title="Agent Evaluation API",
description="API to fetch questions and submit agent answers for scoring.",
)
# --- Startup Event Handler ---
@app.on_event("startup")
async def startup_event():
    """
    Populate the question store when the FastAPI application starts.

    Any failure raised by load_questions() is logged and swallowed, so the
    application still comes up (with no questions to serve).
    """
    logger.info("Application startup: Loading questions...")
    try:
        load_questions()
    except Exception as exc:
        # Deliberately non-fatal: the server keeps running without questions.
        logger.error(f"CRITICAL ERROR DURING STARTUP while loading questions: {exc}", exc_info=True)
    else:
        if questions_for_api:
            logger.info(f"Successfully loaded {len(questions_for_api)} questions.")
        else:
            logger.error("CRITICAL: No questions were loaded during startup. The /questions and /random-question endpoints might fail.")
# --- Helper Function to interact with HF Dataset ---
def _empty_leaderboard_frame():
    """Return an empty leaderboard DataFrame with username/score/timestamp columns."""
    return pd.DataFrame({'username': pd.Series(dtype='str'),
                         'score': pd.Series(dtype='float'),
                         'timestamp': pd.Series(dtype='str')})


def _load_leaderboard_frame():
    """Return the current leaderboard as a DataFrame.

    An empty frame is returned only when the Hub reports that the repository
    or its parquet file does not exist (new/empty leaderboard) or when the
    loaded dataset has no 'train' split. Every other failure propagates, so
    the caller never mistakes a transient outage for an empty leaderboard and
    overwrites the real data.
    """
    # Local import: these names come from the already-required huggingface_hub
    # package and are only needed here.
    from huggingface_hub.utils import (
        EntryNotFoundError,
        LocalEntryNotFoundError,
        RepositoryNotFoundError,
    )

    try:
        # Probe for the parquet file first so an empty repository is not
        # reported as a dataset loading error.
        # This assumes the default 'train' split in a single parquet shard.
        hf_hub_download(repo_id=HF_DATASET_ID, filename="data/train-00000-of-00001.parquet", repo_type="dataset")
    except LocalEntryNotFoundError:
        # Subclass of EntryNotFoundError raised when the Hub is unreachable
        # and nothing is cached: an outage, not proof the file is absent.
        raise
    except (EntryNotFoundError, RepositoryNotFoundError) as load_error:
        logger.warning(f"Could not load dataset '{HF_DATASET_ID}' or it might be empty/new ({load_error}). Creating structure.")
        return _empty_leaderboard_frame()

    ds_dict = load_dataset(HF_DATASET_ID)
    logger.info("Dataset loaded successfully.")
    if "train" not in ds_dict:
        logger.warning(f"Dataset '{HF_DATASET_ID}' does not contain a 'train' split. Creating one.")
        return _empty_leaderboard_frame()
    return ds_dict['train'].to_pandas()


def update_huggingface_dataset(username: str, score: float):
    """Record *score* for *username* on the leaderboard if it is a new best.

    Loads the leaderboard dataset, and when the user has no entry or the new
    score is strictly higher than their best stored score, replaces the
    user's rows with a single new row and pushes the dataset back to the Hub.

    Args:
        username: Hugging Face username the score belongs to.
        score: Newly achieved score.

    Returns:
        True if the leaderboard was changed and pushed, False if the existing
        score was at least as high and nothing was pushed.

    Raises:
        HTTPException: status 500 for any failure while loading, updating or
            pushing the dataset (the original error is chained as the cause).
    """
    try:
        # 1. Load the dataset
        logger.info(f"Loading dataset '{HF_DATASET_ID}'...")
        df = _load_leaderboard_frame()

        # Ensure the expected columns exist, adding any that are missing.
        for col, dtype in [('username', 'str'), ('score', 'float'), ('timestamp', 'str')]:
            if col not in df.columns:
                logger.warning(f"Column '{col}' not found in dataset. Adding it.")
                df[col] = pd.Series(dtype=dtype)

        # Bad or missing stored scores count as 0.0 so comparisons are well defined.
        df['score'] = pd.to_numeric(df['score'], errors='coerce').fillna(0.0)

        # 2. Compare with the user's best stored score
        existing_entries = df[df['username'] == username]
        if existing_entries.empty:
            logger.info(f"User {username} not found. Adding new entry.")
        else:
            max_existing_score = existing_entries['score'].max()
            if score > max_existing_score:
                logger.info(f"New score {score} is higher than existing max {max_existing_score} for {username}. Updating.")
                # Drop every old row of this user; a single fresh row replaces them.
                df = df[df['username'] != username]
            else:
                logger.info(f"New score {score} is not higher than existing max {max_existing_score} for {username}. No update needed.")
                return False

        current_timestamp = datetime.now(timezone.utc).isoformat()
        new_entry = pd.DataFrame([{'username': username, 'score': score, 'timestamp': current_timestamp}])
        df = pd.concat([df, new_entry], ignore_index=True)

        # 3. Push the updated data back to the Hugging Face Hub
        logger.info(f"Pushing updated dataset to '{HF_DATASET_ID}'...")
        # Normalise dtypes so the pushed schema stays stable.
        df['username'] = df['username'].astype(str)
        df['score'] = df['score'].astype(float)
        df['timestamp'] = df['timestamp'].astype(str)
        updated_ds = DatasetDict({'train': Dataset.from_pandas(df)})
        logger.info(f"Dataset to push: {updated_ds}")  # Log the dataset structure
        updated_ds.push_to_hub(HF_DATASET_ID)  # Token should be picked up from env or login
        logger.info(f"Dataset pushed to '{HF_DATASET_ID}'.")
        return True
    except Exception as e:
        logger.error(f"Error interacting with Hugging Face dataset '{HF_DATASET_ID}': {e}", exc_info=True)
        # Surface the failure to the endpoint handler as a 500.
        raise HTTPException(status_code=500, detail=f"Failed to update Hugging Face dataset: {e}") from e
# --- API Endpoints ---
@app.get("/questions",
         response_model=List[Question],
         summary="Get All Filtered Questions",
         description="Returns the complete list of questions (task_id and question text only) filtered based on criteria.")
async def get_questions():
    """
    Return every loaded question.

    Raises:
        HTTPException: status 404 when no questions are loaded.
    """
    if questions_for_api:
        return questions_for_api

    logger.error("GET /questions requested but no questions are loaded.")
    raise HTTPException(status_code=404, detail="No questions available.")
# --- NEW ENDPOINT ---
@app.get("/random-question",
         response_model=Question,
         summary="Get One Random Question",
         description="Returns a single random question from the available filtered set.",
         responses={
             200: {"description": "A random question."},
             404: {"model": ErrorResponse, "description": "No questions available to choose from."}
         })
async def get_random_question():
    """
    Return one question picked at random from the loaded list.

    Raises:
        HTTPException: status 404 when no questions are loaded.
    """
    if questions_for_api:
        chosen = random.choice(questions_for_api)
        logger.info(f"Returning random question with task_id: {chosen['task_id']}")
        return chosen

    logger.warning("GET /random-question requested but no questions are loaded.")
    raise HTTPException(status_code=404, detail="No questions available to choose from.")
# --- END NEW ENDPOINT ---
@app.post("/submit",
          response_model=ScoreResponse,
          summary="Submit Agent Answers",
          description="Submit answers from an agent, calculate score, and update leaderboard on Hugging Face.",
          responses={
              200: {"description": "Submission successful, score calculated."},
              400: {"model": ErrorResponse, "description": "Invalid input data."},
              404: {"model": ErrorResponse, "description": "Task ID not found in submission or ground truth."},
              500: {"model": ErrorResponse, "description": "Server error (e.g., failed to update dataset)."}
          })
async def submit_answers(submission: Submission = Body(...)):
    """
    Grade an agent submission and record the result on the leaderboard.

    Steps:
    - Reject the request (400) when the agent code is shorter than 10
      characters after stripping, or when no answers are supplied.
    - Grade each answer once per task_id; duplicate ids and ids unknown to
      the ground truth are skipped. Matching ignores case and surrounding
      whitespace.
    - The score is the percentage of ALL known questions answered correctly
      (the denominator is the size of the ground truth, not the number of
      answers sent); it is 0.0 when no valid task id was submitted.
    - Hand the score to update_huggingface_dataset(), which only stores it
      when it beats the user's previous best.

    Raises:
        HTTPException: 400 for invalid input, 500 when the leaderboard
            update fails.
    """
    user = submission.username
    logger.info(f"Received submission from username: {user}")

    # Basic anti-cheat: some agent code must accompany the answers.
    code = submission.agent_code
    if not code or len(code.strip()) < 10:
        logger.warning(f"Submission rejected for {user}: Agent code missing or too short.")
        raise HTTPException(status_code=400, detail="Agent code is required and must be sufficiently long.")

    if not submission.answers:
        logger.warning(f"Submission rejected for {user}: No answers provided.")
        raise HTTPException(status_code=400, detail="No answers provided in the submission.")

    payload_size = len(submission.answers)
    seen_ids = set()
    n_valid = 0    # answers whose task_id exists in the ground truth
    n_correct = 0

    for entry in submission.answers:
        task_id = str(entry.task_id)
        submitted = str(entry.submitted_answer)

        # Only the first answer per task_id is graded.
        if task_id in seen_ids:
            logger.warning(f"Duplicate task_id '{task_id}' in submission from {user}. Skipping.")
            continue
        seen_ids.add(task_id)

        if task_id not in ground_truth_answers:
            logger.warning(f"Task ID '{task_id}' submitted by {user} not found in ground truth list. Skipping this answer.")
            continue

        n_valid += 1
        ground_truth = ground_truth_answers[task_id]
        is_match = submitted.strip().lower() == ground_truth.strip().lower()
        if is_match:
            n_correct += 1
            logger.debug(f"Correct answer for {task_id} from {user}")
        else:
            logger.debug(f"Incorrect answer for {task_id} from {user}. Submitted: '{submitted}', Expected: '{ground_truth}'")

    if n_valid:
        score = round((n_correct / len(ground_truth_answers)) * 100, 2)
        message = f"Score calculated successfully: {n_correct}/{n_valid} correct answers for valid tasks."
        if n_valid < payload_size:
            message += f" ({payload_size - n_valid} submitted answers had invalid or duplicate task IDs)."
        logger.info(f"Score for {user}: {score}% ({n_correct}/{n_valid})")
    else:
        score = 0.0
        message = f"Submission received, but no valid/matching task IDs were found in the {payload_size} answers provided."
        logger.warning(f"No valid answers processed for {user} out of {payload_size} submitted.")

    # Update Hugging Face dataset
    try:
        updated = update_huggingface_dataset(user, score)
    except HTTPException:
        # Already an HTTP error (e.g. the helper's 500): pass it through as is.
        raise
    except Exception as e:
        logger.error(f"Unexpected error during dataset update for {user}: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail="An unexpected error occurred while updating the leaderboard.")

    if updated:
        message += " High score updated on leaderboard."
        logger.info(f"Leaderboard updated for {user}.")
    else:
        message += " Score did not improve previous record, leaderboard not updated."
        logger.info(f"Leaderboard not updated for {user} as score was not higher.")

    return ScoreResponse(
        username=user,
        score=score,
        correct_count=n_correct,
        # Reports the number of valid attempts, not the raw payload size.
        total_attempted=n_valid,
        message=message,
        timestamp=datetime.now(timezone.utc).isoformat(),
    )
# --- Run the application ---
# This part is mainly for local development without Docker.
# Docker uses the CMD instruction in the Dockerfile.
if __name__ == "__main__":
    logger.info("Starting FastAPI server for local development...")
    # Questions are loaded up front so a local run refuses to serve when
    # nothing could be loaded.
    try:
        load_questions()
        if questions_for_api:
            # PORT mirrors the deployed configuration; 8000 is the local default.
            port_setting = os.getenv("PORT", "8000")
            local_port = int(port_setting)
            logger.info(f"Running Uvicorn locally on http://127.0.0.1:{local_port}")
            # Loopback only: this entry point is meant for runs outside Docker.
            uvicorn.run(app, host="127.0.0.1", port=local_port, log_level="info")
        else:
            logger.error("EXITING: Cannot start server without loaded questions.")
    except Exception as exc:
        logger.error(f"Failed to start server: {exc}", exc_info=True)