File size: 3,255 Bytes
193db9d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 |
from typing import Any, Dict, List
import pandas as pd
def _create_confidence_plot_data(results: List[Dict], top_k_mode: bool = False) -> pd.DataFrame:
"""Create a DataFrame for the confidence plot."""
if not top_k_mode:
return pd.DataFrame(
{
"position": [r["position"] for r in results],
"confidence": [r["confidence"] for r in results],
"answer": [r["answer"] for r in results],
}
)
# For top-k mode, extract and plot top answers
return _create_top_k_plot_data(results)
def _create_top_k_plot_data(results: List[Dict]) -> pd.DataFrame:
"""Create plot data for top-k mode."""
# Find top answers across all positions (limited to top 5)
top_answers = set()
for r in results:
for g in r.get("guesses", [])[:3]: # Get top 3 from each position
if g.get("answer"):
top_answers.add(g.get("answer"))
top_answers = list(top_answers)[:5] # Limit to 5 total answers
# Create plot data for each answer
all_data = []
for position_idx, result in enumerate(results):
position = result["position"]
for answer in top_answers:
confidence = 0
for guess in result.get("guesses", []):
if guess.get("answer") == answer:
confidence = guess.get("confidence", 0)
break
all_data.append({"position": position, "confidence": confidence, "answer": answer})
return pd.DataFrame(all_data)
def _create_top_k_dataframe(results: List[Dict]) -> pd.DataFrame:
"""Create a DataFrame for top-k results."""
df_rows = []
for result in results:
position = result["position"]
for i, guess in enumerate(result.get("guesses", [])):
df_rows.append(
{
"position": position,
"answer": guess.get("answer", ""),
"confidence": guess.get("confidence", 0),
"rank": i + 1,
}
)
return pd.DataFrame(df_rows)
def _format_buzz_result(buzzed: bool, results: List[Dict], gold_label: str, top_k_mode: bool) -> tuple[str, str, bool]:
"""Format the result text based on whether the agent buzzed."""
if not buzzed:
return f"Did not buzz. Correct answer was: {gold_label}", "No buzz", False
buzz_position = next(i for i, r in enumerate(results) if r.get("buzz", False))
buzz_result = results[buzz_position]
if top_k_mode:
# For top-k, check if any of the top guesses match
top_answers = [g.get("answer", "").lower() for g in buzz_result.get("guesses", [])]
correct = gold_label.lower() in [a.lower() for a in top_answers]
final_answer = top_answers[0] if top_answers else "No answer"
else:
# For regular mode
final_answer = buzz_result["answer"]
correct = final_answer.lower() == gold_label.lower()
result_text = f"BUZZED at position {buzz_position + 1} with answer: {final_answer}\n"
result_text += f"Correct answer: {gold_label}\n"
result_text += f"Result: {'CORRECT' if correct else 'INCORRECT'}"
return result_text, final_answer, correct
|