Spaces:

qanta-challenge
/

quizbowl-submission

Running

File size: 19,506 Bytes

# %%
import html
import json
import logging
import re
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


def _make_answer_html(answer: str, clean_answers: list[str] = []) -> str:
    clean_answers = [a for a in clean_answers if len(a.split()) <= 6 and a != answer]
    additional_answers_html = ""
    if clean_answers:
        additional_answers_html = f"<span class='bonus-answer-text'> [or {', '.join(clean_answers)}]</span>"
    return f"""
        <div class='bonus-answer'>
            <span class='bonus-answer-label'>Answer: </span>
            <span class='bonus-answer-text'>{answer}</span>
            {additional_answers_html}
        </div>
    """


def _get_token_classes(confidence, buzz, score) -> str:
    if confidence is None:
        return "token"
    elif not buzz:
        return f"token guess-point buzz-{score}"
    else:
        return f"token guess-point buzz-{score}"


def _create_token_tooltip_html(values) -> str:
    if not values:
        return ""
    confidence = values.get("confidence", 0)
    buzz = values.get("buzz", 0)
    correct = values.get("correct", 0)
    guess = values.get("guess", "")
    guess_tokens = guess.split()
    if len(guess_tokens) > 10:
        k = len(guess_tokens) - 10
        guess = " ".join(guess_tokens[:10]) + f"...[{k} more words]"

    color = "#a3c9a3" if correct else "#ebbec4"  # Light green for correct, light pink for incorrect

    if values.get("logprob", None) is not None:
        prob = np.exp(values["logprob"])
        prob_str = f"<p style='margin: 0 0 4px; color: #000;'> 📈 <b style='color: #000;'>Output Probability:</b> {prob:.3f}</p>"
    else:
        prob_str = ""

    return f"""
        <div class="tooltip card" style="background-color: {color}; border-radius: 8px; padding: 12px; box-shadow: 2px 4px 8px rgba(0, 0, 0, 0.15);">
            <div class="tooltip-content" style="font-family: 'Arial', sans-serif; color: #000;">
                <h4 style="margin: 0 0 8px; color: #000;">💡 Answer</h4>
                <p><code style="font-weight: bold; margin: 0 0 8px; color: #000;">{guess}</code></p>
                <p style="margin: 0 0 4px; color: #000;">📈 <b style="color: #000;">Confidence:</b> {confidence:.2f}</p>
                {prob_str}
                <p style="margin: 0; color: #000;">🔍 <b style="color: #000;">Status:</b> {"✅ Correct" if correct else "❌ Incorrect" if buzz else "🚫 No Buzz"}</p>
            </div>
        </div>
    """


def create_token_html(token: str, values: dict, i: int) -> str:
    """Render one question token as a ``<span>`` with an optional tooltip.

    Args:
        token: Token text to display.
        values: Evaluation data recorded at this token (may be empty).
        i: Token index, used for the element id and ``data-index`` attribute.

    Returns:
        The ``<span>`` markup for the token, with the tooltip card nested
        inside it when ``values`` is non-empty.
    """
    buzzed = values.get("buzz", 0)

    if not re.match(r"\w+", token):
        # Tokens that do not start with a word character never get a marker;
        # their spaces are made non-breaking so they survive HTML rendering.
        shown = token.replace(" ", "&nbsp;")
    elif buzzed:
        shown = f"{token} 🚨"
    elif values:
        shown = f"{token} 💭"
    else:
        shown = token

    classes = _get_token_classes(values.get("confidence", None), buzzed, values.get("correct", 0))
    # Empty when there is no evaluation data for this token.
    tooltip = _create_token_tooltip_html(values)

    return f'<span id="token-{i}" class="{classes}" data-index="{i}">{shown}{tooltip}</span>'


def create_tossup_html(
    tokens: list[str],
    answer_primary: str,
    clean_answers: list[str],
    marker_indices: list[int] | None = None,
    eval_points: list[tuple[int, dict]] | None = None,
) -> str:
    """Create HTML for tokens with hover capability and the answer line.

    Args:
        tokens: Question tokens in reading order.
        answer_primary: Primary answer shown under the question.
        clean_answers: Alternate answers passed on to the answer line.
        marker_indices: Currently unused; kept so existing callers that pass
            it keep working.
        eval_points: ``(token_index, values)`` pairs. Token indices are
            1-based, matching the ids given to the rendered tokens.

    Returns:
        The question card markup, or a small error ``<div>`` if rendering
        fails (the error is logged with its traceback).
    """
    try:
        # ``None`` replaces the former mutable ``[]`` defaults.
        points_by_index = dict(eval_points or [])

        html_tokens = [
            create_token_html(token, points_by_index.get(i, {}), i)
            for i, token in enumerate(tokens, start=1)
        ]

        answer_html = _make_answer_html(answer_primary, clean_answers)
        return f"""
        <div class='bonus-container'>
            <div class='bonus-card'>
                <div class='tossup-question'>
                    {"".join(html_tokens)}
            </div>
                {answer_html}
            </div>
        </div>
        """
    except Exception as e:
        # Best-effort UI helper: surface the failure in the page instead of
        # letting it propagate.
        logging.error(f"Error creating token HTML: {e}", exc_info=True)
        return f"<div class='token-container'>Error creating tokens: {str(e)}</div>"


def create_bonus_html(leadin: str, parts: list[dict]) -> str:
    """Create the HTML card for a bonus question: lead-in plus numbered parts.

    Args:
        leadin: Lead-in text shown above the parts.
        parts: One dict per part with the keys ``part`` (question text),
            ``answer_primary`` and ``clean_answers``.

    Returns:
        The bonus card markup with each part followed by its answer line.
    """
    leadin_html = f"<div class='bonus-leadin'>{leadin}</div>"
    parts_html = []

    for number, part in enumerate(parts, start=1):
        question_text = part["part"]
        answer_html = _make_answer_html(part["answer_primary"], part["clean_answers"])
        parts_html.append(
            f"""
                <div class='bonus-part'>
                    <div class='bonus-part-text'><b>#{number}.</b> {question_text}</div>
                    {answer_html}
                </div>
            """
        )

    return f"""
            <div class='bonus-container'>
                <div class='bonus-card'>
                    {leadin_html}
                    {"".join(parts_html)}
                </div>
            </div>
        """


def create_tossup_confidence_pyplot(
    tokens: list[str],
    run_outputs: list[dict],
    confidence_threshold: float = 0.5,
    prob_threshold: float | None = None,
) -> plt.Figure:
    """Plot confidence and output probability against token position.

    Args:
        tokens: Question tokens; used to label the points where the model
            buzzed.
        run_outputs: One dict per evaluated position with the keys
            ``token_position`` (1-based), ``confidence``, ``buzz``,
            ``correct`` and optionally ``logprob``.
        confidence_threshold: Height of the dashed confidence-threshold line.
        prob_threshold: Height of the dashed probability-threshold line;
            omitted when ``None``.

    Returns:
        The matplotlib figure. Buzz points are circled in green (correct) or
        red (incorrect) and annotated with the token at that position.
    """
    plt.style.use("ggplot")
    fig = plt.figure(figsize=(10, 4), dpi=300)
    ax = fig.add_subplot(111)

    # Every series starts from an extra origin point at (0, 0).
    x = [0] + [o["token_position"] for o in run_outputs]
    y_conf = [0] + [o["confidence"] for o in run_outputs]
    # Keep y_prob the same length as x: a missing logprob becomes NaN (a gap
    # in the line) instead of being dropped, which would make the plot call
    # fail on mismatched lengths.
    y_prob = [0] + [
        np.exp(o["logprob"]) if o.get("logprob") is not None else np.nan
        for o in run_outputs
    ]

    ax.plot(x, y_prob, "o-", color="#f2b150", label="Probability")
    ax.plot(x, y_conf, "o-", color="#4996de", label="Confidence")

    ring_style = {"markerfacecolor": "none", "markersize": 12, "markeredgewidth": 2.5}
    for o in run_outputs:
        if not o["buzz"]:
            continue
        color = "green" if o["correct"] else "red"
        conf = o["confidence"]
        i = o["token_position"]
        ax.plot(i, conf, "o", color=color, **ring_style)
        if o.get("logprob") is not None:
            ax.plot(i, np.exp(o["logprob"]), "o", color=color, **ring_style)
        if 1 <= i <= len(tokens):
            ax.annotate(f"{tokens[i - 1]}", (i, conf), textcoords="offset points", xytext=(0, 10), ha="center")
        else:
            # Skip the label rather than crash (or silently wrap around to a
            # wrong token) when the position does not map to a token.
            logging.warning("1-indexed token index %s is out of bounds for n_tokens: %s", i, len(tokens))

    # Add horizontal dashed line for confidence threshold
    ax.axhline(y=confidence_threshold, color="#9370DB", linestyle="--", xmin=0, xmax=1, label="Confidence Threshold")
    # Add horizontal dashed line for probability threshold if provided
    if prob_threshold is not None:
        ax.axhline(y=prob_threshold, color="#cf5757", linestyle="--", xmin=0, xmax=1, label="Probability Threshold")

    ax.set_title("Buzz Confidence")
    ax.set_xlabel("Token Index")
    ax.set_ylabel("Confidence")
    ax.set_xticks(x)
    ax.set_xticklabels(x)
    ax.legend()
    return fig


def create_scatter_pyplot(token_positions: list[int], scores: list[int]) -> plt.Figure:
    """Scatter (token position, score) pairs, sizing markers by frequency.

    Args:
        token_positions: X values, paired element-wise with ``scores``.
        scores: Y values.

    Returns:
        The matplotlib figure with one marker per distinct pair.
    """
    plt.style.use("ggplot")
    fig = plt.figure(figsize=(11, 5))
    ax = fig.add_subplot(111)

    # Tally identical pairs so coincident points show up as one larger marker.
    pair_counts = Counter(zip(token_positions, scores))
    xs = [pos for pos, _ in pair_counts]
    ys = [score for _, score in pair_counts]
    sizes = [occurrences * 20 for occurrences in pair_counts.values()]

    ax.scatter(xs, ys, color="#4698cf", s=sizes)

    return fig


def create_bonus_confidence_plot(parts: list[dict], model_outputs: list[dict]) -> plt.Figure:
    """Draw one confidence bar per bonus part, coloured by correctness.

    Args:
        parts: Bonus parts; only their count is used (for the x positions).
        model_outputs: One dict per part with ``confidence`` and ``correct``.

    Returns:
        The matplotlib figure; bars are green where ``correct == 1`` and red
        otherwise.
    """
    plt.style.use("ggplot")
    fig = plt.figure(figsize=(10, 6))
    ax = fig.add_subplot(111)

    part_numbers = range(1, len(parts) + 1)
    heights = [out["confidence"] for out in model_outputs]
    correctness = [out["correct"] for out in model_outputs]

    bars = ax.bar(part_numbers, heights, color="#4698cf")

    # Recolour each bar according to whether that part was answered correctly.
    for bar, flag in zip(bars, correctness):
        bar.set_color("green" if flag == 1 else "red")

    ax.set_title("Part Confidence")
    ax.set_xlabel("Part Number")
    ax.set_ylabel("Confidence")
    ax.set_xticks(part_numbers)
    ax.set_xticklabels([f"Part {number}" for number in part_numbers])

    return fig


def update_tossup_plot(highlighted_index: int, state: str) -> "plt.Figure | pd.DataFrame":
    """Update the plot when a token is hovered; add a vertical line on the plot.

    Args:
        highlighted_index: Index of the hovered token; a falsy value means no
            token is highlighted.
        state: JSON string with the keys ``tokens`` and ``values``.

    Returns:
        The confidence figure, with a dotted vertical line at
        ``highlighted_index`` when one is given. An empty ``DataFrame`` is
        returned when the state is empty or incomplete, or when anything
        fails (the error is logged).
    """
    try:
        if not state or state == "{}":
            logging.warning("Empty state provided to update_plot")
            return pd.DataFrame()

        highlighted_index = int(highlighted_index) if highlighted_index else None
        logging.info(f"Update plot triggered with token index: {highlighted_index}")

        data = json.loads(state)
        tokens = data.get("tokens", [])
        values = data.get("values", [])

        if not tokens or not values:
            logging.warning("No tokens or values found in state")
            return pd.DataFrame()

        # The index must not be passed positionally here: the third parameter
        # of the plotting helper is the confidence threshold, not a highlight.
        fig = create_tossup_confidence_pyplot(tokens, values)
        if highlighted_index is not None and fig.axes:
            fig.axes[0].axvline(x=highlighted_index, color="crimson", linestyle=":", linewidth=1.5)
        return fig
    except Exception as e:
        logging.error(f"Error updating plot: {e}", exc_info=True)
        return pd.DataFrame()


def create_tossup_eval_table(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise per-question tossup results as a one-row table of metrics.

    Args:
        df: Per-question results with the columns ``chosen_idx``, ``gap``,
            ``raw_score``, ``expected_score``, ``is_correct`` and ``buzz``.

    Returns:
        A single-row DataFrame of pre-formatted strings:

        - Raw Score / Expected Score: column sum divided by the row count
        - Buzz Precision: ``is_correct`` sum divided by ``buzz`` sum
        - Buzz Position: mean of the non-null ``chosen_idx`` values
        - +ve Gap / -ve Gap: mean of the non-negative / negative ``gap`` values

        A metric with nothing to average (empty frame, no buzzes, no gaps of
        that sign) is reported as ``nan`` instead of raising or producing
        ``inf``.
    """
    nan = float("nan")
    n_rows = len(df)

    positions = df["chosen_idx"].dropna()
    gaps = df["gap"].dropna()
    pos_gaps = gaps.loc[gaps >= 0]
    neg_gaps = gaps.loc[gaps < 0]

    # Guard every denominator so an empty evaluation cannot divide by zero.
    mean_tossup_score = df["raw_score"].sum() / n_rows if n_rows else nan
    expected_score = df["expected_score"].sum() / n_rows if n_rows else nan
    n_buzzes = df["buzz"].sum()
    buzz_precision = df["is_correct"].sum() / n_buzzes if n_buzzes else nan

    mean_position = positions.mean() if len(positions) else nan
    mean_pos_gap = pos_gaps.mean() if len(pos_gaps) else nan
    mean_neg_gap = neg_gaps.mean() if len(neg_gaps) else nan

    return pd.DataFrame(
        [
            {
                "Raw Score": f"{mean_tossup_score:5.1f}",
                "Expected Score": f"{expected_score:5.1f}",
                "Buzz Precision": f"{buzz_precision:5.1%}",
                "Buzz Position": f"{mean_position:5.1f}",
                "+ve Gap": f"{mean_pos_gap:5.1f}",
                "-ve Gap": f"{mean_neg_gap:5.1f}",
            }
        ]
    )


def create_tossup_eval_dashboard(run_indices: list[list[int]], df: pd.DataFrame, *, figsize=(15, 8), title_prefix="") -> plt.Figure:
    """
    Visualise buzzing behaviour with three sub-plots:

    1. Ceiling-accuracy vs. prefix length
    2. Scatter of earliest-correct idx vs. chosen-buzz idx
    3. Frequency distribution of narrative classes (horizontal bars)

    A text-only panel below the plots describes each narrative class.

    Parameters
    ----------
    run_indices : list[list[int]]
        Token positions at which the model was probed, one list per
        question. The sorted union of all positions forms the x-axis of the
        ceiling curve; its maximum is taken, so at least one index is
        required.
    df : pd.DataFrame
        Output of `build_buzz_dataframe` – must contain
        columns: earliest_ok_idx, chosen_idx, cls.
    figsize : tuple, optional
        Figure size passed to `plt.figure`.
    title_prefix : str, optional
        Prepended to each subplot title (useful when comparing models).

    Returns
    -------
    matplotlib.figure.Figure
        The assembled dashboard figure.
    """
    # ------------------------------------------------------------------
    # 0. Prep (variables reused throughout the function)
    # ------------------------------------------------------------------
    # Collect all evaluation indices across questions so we know the
    # x-axis domain and the padding for NaNs.
    eval_indices = np.asarray(sorted({idx for indices in run_indices for idx in indices}))

    # Narrative classes and their colours
    classes = [
        "best-buzz",
        "late-buzz",
        "never-buzzed",
        "premature",
        "hopeless",
    ]
    colors = ["tab:green", "tab:olive", "tab:orange", "tab:red", "tab:gray"]
    palette = dict(zip(classes, colors))

    max_idx = eval_indices.max() * 1.25  # padding for NaN replacement / axis limits

    # ------------------------------------------------------------------
    # 1. Figure / axes layout
    # ------------------------------------------------------------------
    # GridSpec layout → 2 rows × 3 cols.
    #   ┌────────────┬────────────┬────────┐
    #   │ Ceiling    │ Scatter    │  Bars  │  (row 0)
    #   ├────────────┴────────────┴────────┤
    #   │ Descriptions (spans all 3 cols)  │  (row 1)
    #   └──────────────────────────────────┘
    # Having a dedicated row for the narrative-class descriptions avoids
    # overlapping with sub-plots and makes the whole figure more compact.

    plt.style.use("ggplot")
    fig = plt.figure(figsize=figsize)
    gs = fig.add_gridspec(
        nrows=2,
        ncols=3,
        height_ratios=[5, 1],  # extra space for plots vs. descriptions
        width_ratios=[2.2, 2.2, 1],
        hspace=0.2,  # reduced vertical spacing between plots
        wspace=0.2,  # reduced horizontal spacing between plots
        left=0.05,  # reduced left margin
        right=0.95,  # reduced right margin
        top=0.9,  # reduced top margin
        bottom=0.05,  # reduced bottom margin
    )

    ax_ceiling = fig.add_subplot(gs[0, 0])  # Ceiling accuracy curve
    ax_scatter = fig.add_subplot(gs[0, 1])  # Earliest vs. chosen scatter
    ax_bars = fig.add_subplot(gs[0, 2])  # Outcome distribution bars
    ax_desc = fig.add_subplot(gs[1, :])  # Textual descriptions
    ax_desc.axis("off")

    # The overall title is fixed; title_prefix only affects subplot titles.
    fig.suptitle("Buzzing behaviour", fontsize=16, fontweight="bold")

    # ------------------------------------------------------------------
    # 2. Ceiling accuracy curve
    # ------------------------------------------------------------------
    # For each probed index: share of questions whose earliest correct index
    # exists and is at or before that index.
    ceiling = [((df["earliest_ok_idx"].notna()) & (df["earliest_ok_idx"] <= idx)).mean() for idx in eval_indices]
    ax_ceiling.plot(eval_indices, ceiling, marker="o", color="#4698cf")
    ax_ceiling.set_xlabel("Token index shown")
    ax_ceiling.set_ylabel("Proportion of questions correct")
    ax_ceiling.set_ylim(0, 1.01)
    ax_ceiling.set_title(f"{title_prefix}Ceiling accuracy vs. prefix")

    # ------------------------------------------------------------------
    # 3. Earliest-vs-Chosen scatter
    # ------------------------------------------------------------------
    for cls in classes:
        sub = df[df["cls"] == cls]
        if sub.empty:
            continue
        # Missing indices are drawn at the padded maximum (the axis limit).
        x = sub["earliest_ok_idx"].fillna(max_idx)
        y = sub["chosen_idx"].fillna(max_idx)
        # NOTE(review): both `c` and `facecolor` are passed here; check the
        # matplotlib scatter documentation for which one takes precedence.
        ax_scatter.scatter(
            x,
            y,
            label=cls,
            alpha=0.7,
            edgecolor="black",
            linewidth=1,
            marker="o",
            s=90,
            c=palette[cls],
            facecolor="none",
        )

    lim = max_idx
    # Dotted diagonal: points on it have chosen index == earliest correct index.
    ax_scatter.plot([0, lim], [0, lim], linestyle=":", linewidth=1)
    ax_scatter.set_xlim(0, lim)
    ax_scatter.set_ylim(0, lim)
    ax_scatter.set_xlabel("Earliest index with correct answer")
    ax_scatter.set_ylabel("Chosen buzz index")
    ax_scatter.set_title(f"{title_prefix}Earliest vs. chosen index")
    ax_scatter.legend(frameon=False, fontsize="small")

    # ------------------------------------------------------------------
    # 4. Outcome distribution (horizontal bars)
    # ------------------------------------------------------------------
    # Reindex so every class gets a bar (count 0 when absent), in fixed order.
    counts = df["cls"].value_counts().reindex(classes).fillna(0)
    ax_bars.barh(
        counts.index,
        counts.values,
        color=[palette[c] for c in counts.index],
        alpha=0.7,
        edgecolor="black",
        linewidth=1,
    )
    ax_bars.set_xlabel("Number of questions")
    ax_bars.set_title(f"{title_prefix}Outcome distribution")

    # Ensure x-axis shows integer ticks only
    from matplotlib.ticker import MaxNLocator

    ax_bars.xaxis.set_major_locator(MaxNLocator(integer=True))

    # ------------------------------------------------------------------
    # 5. Narrative-class descriptions (bottom panel)
    # ------------------------------------------------------------------
    descriptions = {
        "best-buzz": "Perfect timing. Buzzed at the earliest possible correct position",
        "late-buzz": "Missed opportunity. Buzzed correctly but later than optimal",
        "never-buzzed": "Missed opportunity. Never buzzed despite knowing the answer",
        "premature": "Incorrect buzz. Buzzing at a later position could have been correct",
        "hopeless": "Never knew the answer. No correct answer at any position",
    }

    y_pos = 1.0  # start at top of the description axis

    # One line of text per class, in the class colour, stacked top to bottom.
    for cls, color in zip(classes, colors):
        ax_desc.text(
            0.01,
            y_pos,
            f"■ {cls}: {descriptions[cls]}",
            ha="left",
            va="top",
            color=color,
            fontweight="bold",
            fontsize=11,  # increased font size from 9 to 11
            transform=ax_desc.transAxes,
        )

        y_pos -= 0.25  # increased vertical step inside the axis for more line height

    # ------------------------------------------------------------------
    # 6. Return the final figure
    # ------------------------------------------------------------------
    return fig


# %%


# Create dummy data for testing
def create_dummy_model_outputs(n_entries=10, n_positions=5):
    """Create dummy model outputs for testing."""
    np.random.seed(42)
    dummy_outputs = []

    for _ in range(n_entries):
        run_indices = sorted(np.random.choice(range(10, 50), n_positions, replace=False))
        outputs = []

        for i in range(n_positions):
            # Randomly decide if model will buzz at this position
            will_buzz = np.random.random() > 0.7
            # Randomly decide if answer is correct
            is_correct = np.random.random() > 0.4

            outputs.append(
                {
                    "run_idx": i + 1,
                    "buzz": will_buzz,
                    "correct": 1 if is_correct else 0,
                    "confidence": np.random.random(),
                    "logprob": np.log(np.random.random()),
                    "guess": f"Answer {i + 1}",
                }
            )

        dummy_outputs.append({"run_indices": run_indices, "run_outputs": outputs})

    return dummy_outputs


# dummy_data = create_dummy_model_outputs()
# dummy_df = pd.DataFrame([create_df_entry(entry["run_indices"], entry["outputs"]) for entry in dummy_data])
# dummy_df
# plot_buzz_dashboard(dummy_df, dummy_data[0]["run_indices"])

# %%