File size: 3,255 Bytes
193db9d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
from typing import Any, Dict, List

import pandas as pd


def _create_confidence_plot_data(results: List[Dict], top_k_mode: bool = False) -> pd.DataFrame:
    """Create a DataFrame for the confidence plot."""
    if not top_k_mode:
        return pd.DataFrame(
            {
                "position": [r["position"] for r in results],
                "confidence": [r["confidence"] for r in results],
                "answer": [r["answer"] for r in results],
            }
        )

    # For top-k mode, extract and plot top answers
    return _create_top_k_plot_data(results)


def _create_top_k_plot_data(results: List[Dict]) -> pd.DataFrame:
    """Create plot data for top-k mode."""
    # Find top answers across all positions (limited to top 5)
    top_answers = set()
    for r in results:
        for g in r.get("guesses", [])[:3]:  # Get top 3 from each position
            if g.get("answer"):
                top_answers.add(g.get("answer"))

    top_answers = list(top_answers)[:5]  # Limit to 5 total answers

    # Create plot data for each answer
    all_data = []
    for position_idx, result in enumerate(results):
        position = result["position"]
        for answer in top_answers:
            confidence = 0
            for guess in result.get("guesses", []):
                if guess.get("answer") == answer:
                    confidence = guess.get("confidence", 0)
                    break
            all_data.append({"position": position, "confidence": confidence, "answer": answer})

    return pd.DataFrame(all_data)


def _create_top_k_dataframe(results: List[Dict]) -> pd.DataFrame:
    """Create a DataFrame for top-k results."""
    df_rows = []
    for result in results:
        position = result["position"]
        for i, guess in enumerate(result.get("guesses", [])):
            df_rows.append(
                {
                    "position": position,
                    "answer": guess.get("answer", ""),
                    "confidence": guess.get("confidence", 0),
                    "rank": i + 1,
                }
            )
    return pd.DataFrame(df_rows)


def _format_buzz_result(buzzed: bool, results: List[Dict], gold_label: str, top_k_mode: bool) -> tuple[str, str, bool]:
    """Format the result text based on whether the agent buzzed."""
    if not buzzed:
        return f"Did not buzz. Correct answer was: {gold_label}", "No buzz", False

    buzz_position = next(i for i, r in enumerate(results) if r.get("buzz", False))
    buzz_result = results[buzz_position]

    if top_k_mode:
        # For top-k, check if any of the top guesses match
        top_answers = [g.get("answer", "").lower() for g in buzz_result.get("guesses", [])]
        correct = gold_label.lower() in [a.lower() for a in top_answers]
        final_answer = top_answers[0] if top_answers else "No answer"
    else:
        # For regular mode
        final_answer = buzz_result["answer"]
        correct = final_answer.lower() == gold_label.lower()

    result_text = f"BUZZED at position {buzz_position + 1} with answer: {final_answer}\n"
    result_text += f"Correct answer: {gold_label}\n"
    result_text += f"Result: {'CORRECT' if correct else 'INCORRECT'}"

    return result_text, final_answer, correct