# %%
import json
import logging
import re
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.ticker import MaxNLocator

# NOTE(review): the HTML tags inside the string templates of this module were
# missing from the copy under review (only their text content survived). The
# templates below wrap that text in generic <div>/<span>/<p> elements and use
# only values the functions already compute. Reconcile tag names, classes and
# attributes with the project's CSS/JS before relying on the exact markup.


def _make_answer_html(answer: str, clean_answers: list[str] | None = None) -> str:
    """Return an HTML snippet with the primary answer and its short alternates.

    Alternates longer than six words, or identical to ``answer``, are omitted.
    ``clean_answers`` may be ``None`` or empty, in which case only the primary
    answer is shown.
    """
    alternates = [a for a in (clean_answers or []) if len(a.split()) <= 6 and a != answer]
    additional_answers_html = f" [or {', '.join(alternates)}]" if alternates else ""
    return f"""
    <div>Answer: {answer} {additional_answers_html}</div>
    """


def _get_token_classes(confidence, buzz, score) -> str:
    """Return the CSS class string for a token.

    Tokens without a confidence value are plain ``"token"``; every evaluated
    token gets ``"token guess-point buzz-<score>"``. ``buzz`` is accepted for
    interface compatibility: the buzzed and non-buzzed cases have always
    produced the same class string.
    """
    if confidence is None:
        return "token"
    return f"token guess-point buzz-{score}"


def _create_token_tooltip_html(values) -> str:
    """Return tooltip HTML describing the model output at one token.

    Returns an empty string when ``values`` is empty. Guesses longer than ten
    words are truncated with a "[k more words]" suffix.
    """
    if not values:
        return ""

    confidence = values.get("confidence", 0)
    correct = values.get("correct", 0)
    guess = values.get("guess", "")

    guess_tokens = guess.split()
    if len(guess_tokens) > 10:
        k = len(guess_tokens) - 10
        guess = " ".join(guess_tokens[:10]) + f"...[{k} more words]"

    # Light green for correct, light pink for incorrect.
    color = "#a3c9a3" if correct else "#ebbec4"

    logprob = values.get("logprob", None)
    if logprob is not None:
        prob = np.exp(logprob)
        prob_str = f"<p>📈 Output Probability: {prob:.3f}</p>"
    else:
        prob_str = ""

    confidence_str = "n/a" if confidence is None else f"{confidence:.3f}"
    return f"""
    <div style="background-color: {color};">
        <p>💡 Answer: {guess}</p>
        <p>Confidence: {confidence_str}</p>
        {prob_str}
    </div>
    """


def create_token_html(token: str, values: dict, i: int) -> str:
    """Return the HTML for a single question token.

    Parameters
    ----------
    token : str
        The token text.
    values : dict
        Model output at this token (``confidence``, ``buzz``, ``correct``,
        ...); empty when the model was not evaluated here.
    i : int
        1-indexed position of the token in the question.
    """
    confidence = values.get("confidence", None)
    buzz = values.get("buzz", 0)
    correct = values.get("correct", 0)

    if not re.match(r"\w+", token):
        # Tokens that do not start with a word character are shown without a
        # marker emoji. NOTE(review): the replacement target was an
        # indistinguishable space in the reviewed copy; "&nbsp;" is assumed.
        display_token = token.replace(" ", "&nbsp;")
    elif buzz:
        display_token = f"{token} 🚨"
    elif values:
        display_token = f"{token} 💭"
    else:
        display_token = token

    css_class = _get_token_classes(confidence, buzz, correct)
    tooltip_html = _create_token_tooltip_html(values)
    return f'<span class="{css_class}" data-index="{i}">{display_token}{tooltip_html}</span>'


def create_tossup_html(
    tokens: list[str],
    answer_primary: str,
    clean_answers: list[str],
    marker_indices: list[int] | None = None,
    eval_points: list[tuple[int, dict]] | None = None,
) -> str:
    """Create HTML for tokens with hover capability and a header for the answer.

    Parameters
    ----------
    tokens : list[str]
        Question tokens, rendered in order (positions are 1-indexed).
    answer_primary : str
        Primary answer shown below the tokens.
    clean_answers : list[str]
        Alternate acceptable answers.
    marker_indices : list[int], optional
        Accepted for interface compatibility; currently not rendered.
    eval_points : list[tuple[int, dict]], optional
        ``(token_position, model_output)`` pairs.

    Returns
    -------
    str
        The rendered HTML, or an error snippet if rendering fails.
    """
    try:
        ep = dict(eval_points or [])
        tokens_html = "".join(
            create_token_html(token, ep.get(i, {}), i) for i, token in enumerate(tokens, start=1)
        )
        answer_html = _make_answer_html(answer_primary, clean_answers)
        return f"""
        <div>
            <div>{tokens_html}</div>
            {answer_html}
        </div>
        """
    except Exception as e:
        # UI boundary: report the failure in the page instead of propagating.
        logging.exception("Error creating token HTML: %s", e)
        return f"<div>Error creating tokens: {str(e)}</div>"


def create_bonus_html(leadin: str, parts: list[dict]) -> str:
    """Create HTML for a bonus question: the lead-in followed by each part.

    Each part dict must provide ``part`` (question text), ``answer_primary``
    and ``clean_answers``.
    """
    leadin_html = f"<div>{leadin}</div>"

    parts_html = []
    for number, part in enumerate(parts, start=1):
        question_text = part["part"]
        answer_html = _make_answer_html(part["answer_primary"], part["clean_answers"])
        parts_html.append(
            f"""
        <div>
            <div>#{number}. {question_text}</div>
            {answer_html}
        </div>
        """
        )

    joined_parts = "".join(parts_html)
    return f"""
    <div>
        {leadin_html} {joined_parts}
    </div>
    """


def create_tossup_confidence_pyplot(
    tokens: list[str],
    run_outputs: list[dict],
    confidence_threshold: float = 0.5,
    prob_threshold: float | None = None,
) -> plt.Figure:
    """Create a pyplot of confidence and output probability per evaluated token.

    Parameters
    ----------
    tokens : list[str]
        Question tokens; ``token_position`` values are 1-indexed into this list.
    run_outputs : list[dict]
        One dict per evaluation point with ``token_position``, ``confidence``,
        ``buzz``, ``correct`` and an optional ``logprob``.
    confidence_threshold : float
        Drawn as a horizontal dashed line.
    prob_threshold : float, optional
        Drawn as a second horizontal dashed line when provided.
    """
    plt.style.use("ggplot")
    fig = plt.figure(figsize=(10, 4), dpi=300)
    ax = fig.add_subplot(111)

    x = [0] + [o["token_position"] for o in run_outputs]
    y_conf = [0] + [o["confidence"] for o in run_outputs]
    # Missing logprobs become NaN so y_prob stays aligned with x; matplotlib
    # leaves a gap at NaN points instead of raising on a length mismatch.
    y_prob = [0] + [
        np.exp(o["logprob"]) if o.get("logprob") is not None else np.nan for o in run_outputs
    ]

    ax.plot(x, y_prob, "o-", color="#f2b150", label="Probability")
    ax.plot(x, y_conf, "o-", color="#4996de", label="Confidence")

    ring_style = {"markerfacecolor": "none", "markersize": 12, "markeredgewidth": 2.5}
    for o in run_outputs:
        if not o["buzz"]:
            continue
        color = "green" if o["correct"] else "red"
        conf = o["confidence"]
        i = o["token_position"]
        ax.plot(i, conf, "o", color=color, **ring_style)
        if o.get("logprob") is not None:
            ax.plot(i, np.exp(o["logprob"]), "o", color=color, **ring_style)
        if 1 <= i <= len(tokens):
            ax.annotate(f"{tokens[i - 1]}", (i, conf), textcoords="offset points", xytext=(0, 10), ha="center")
        else:
            logging.warning("1-indexed token index %s is out of bounds for n_tokens: %s", i, len(tokens))

    # Horizontal dashed line for the confidence threshold.
    ax.axhline(y=confidence_threshold, color="#9370DB", linestyle="--", xmin=0, xmax=1, label="Confidence Threshold")
    # Horizontal dashed line for the probability threshold, if provided.
    if prob_threshold is not None:
        ax.axhline(y=prob_threshold, color="#cf5757", linestyle="--", xmin=0, xmax=1, label="Probability Threshold")

    ax.set_title("Buzz Confidence")
    ax.set_xlabel("Token Index")
    ax.set_ylabel("Confidence")
    ax.set_xticks(x)
    ax.set_xticklabels(x)
    ax.legend()
    return fig


def create_scatter_pyplot(token_positions: list[int], scores: list[int]) -> plt.Figure:
    """Create a scatter plot of token positions and scores.

    Identical ``(position, score)`` pairs are merged into one marker whose
    area grows with the number of occurrences.
    """
    plt.style.use("ggplot")
    fig = plt.figure(figsize=(11, 5))
    ax = fig.add_subplot(111)

    counts = Counter(zip(token_positions, scores))
    xs = [pos for pos, _ in counts]
    ys = [score for _, score in counts]
    sizes = [count * 20 for count in counts.values()]
    ax.scatter(xs, ys, color="#4698cf", s=sizes)
    return fig


def create_bonus_confidence_plot(parts: list[dict], model_outputs: list[dict]) -> plt.Figure:
    """Create a bar plot of the model's confidence for each bonus part.

    Bars are green where the output's ``correct`` equals 1 and red otherwise.
    ``model_outputs`` must contain one entry per part.
    """
    plt.style.use("ggplot")
    fig = plt.figure(figsize=(10, 6))
    ax = fig.add_subplot(111)

    x = range(1, len(parts) + 1)
    confidences = [output["confidence"] for output in model_outputs]
    bars = ax.bar(x, confidences, color="#4698cf")
    for bar, output in zip(bars, model_outputs):
        bar.set_color("green" if output["correct"] == 1 else "red")

    ax.set_title("Part Confidence")
    ax.set_xlabel("Part Number")
    ax.set_ylabel("Confidence")
    ax.set_xticks(x)
    ax.set_xticklabels([f"Part {i}" for i in x])
    return fig


def update_tossup_plot(highlighted_index: int, state: str) -> plt.Figure | pd.DataFrame:
    """Update the plot when a token is hovered; add a vertical line on the plot.

    Parameters
    ----------
    highlighted_index : int
        Position of the hovered token; ``None`` or ``""`` means no highlight.
    state : str
        JSON object with ``tokens`` and ``values`` (the run outputs).

    Returns
    -------
    plt.Figure | pd.DataFrame
        The confidence figure, or an empty DataFrame when the state is empty
        or the plot cannot be built.
    """
    try:
        if not state or state == "{}":
            logging.warning("Empty state provided to update_plot")
            return pd.DataFrame()

        # Compare against None/"" explicitly so that index 0 is not dropped.
        highlight = None if highlighted_index in (None, "") else int(highlighted_index)
        logging.info("Update plot triggered with token index: %s", highlight)

        data = json.loads(state)
        tokens = data.get("tokens", [])
        values = data.get("values", [])
        if not tokens or not values:
            logging.warning("No tokens or values found in state")
            return pd.DataFrame()

        # The index must not be passed positionally: the third parameter of
        # create_tossup_confidence_pyplot is the confidence threshold.
        fig = create_tossup_confidence_pyplot(tokens, values)
        if highlight is not None:
            fig.axes[0].axvline(x=highlight, color="gray", linestyle=":", linewidth=1.5)
        return fig
    except Exception as e:
        # UI callback boundary: log and fall back to an empty result.
        logging.exception("Error updating plot: %s", e)
        return pd.DataFrame()


def create_tossup_eval_table(df: pd.DataFrame) -> pd.DataFrame:
    """Create a one-row table of aggregated tossup metrics.

    Reads the columns ``chosen_idx``, ``gap``, ``raw_score``,
    ``expected_score``, ``is_correct`` and ``buzz``. Metrics whose denominator
    is zero (no questions, no buzzes) are reported as NaN.
    """
    positions = df["chosen_idx"].dropna()
    gaps = df["gap"].dropna()
    pos_gaps = gaps.loc[gaps >= 0]
    neg_gaps = gaps.loc[gaps < 0]

    nan = float("nan")
    n_questions = len(df)
    n_buzzes = df["buzz"].sum()
    mean_tossup_score = df["raw_score"].sum() / n_questions if n_questions else nan
    expected_score = df["expected_score"].sum() / n_questions if n_questions else nan
    buzz_precision = df["is_correct"].sum() / n_buzzes if n_buzzes else nan

    return pd.DataFrame(
        [
            {
                "Raw Score": f"{mean_tossup_score:5.1f}",
                "Expected Score": f"{expected_score:5.1f}",
                "Buzz Precision": f"{buzz_precision:5.1%}",
                "Buzz Position": f"{positions.mean():5.1f}",
                "+ve Gap": f"{pos_gaps.mean():5.1f}",
                "-ve Gap": f"{neg_gaps.mean():5.1f}",
            }
        ]
    )


def create_tossup_eval_dashboard(
    run_indices: list[list[int]], df: pd.DataFrame, *, figsize=(15, 8), title_prefix=""
) -> plt.Figure:
    """
    Visualise buzzing behaviour with three sub-plots:

    1. Ceiling-accuracy vs. prefix length
    2. Scatter of earliest-correct idx vs. chosen-buzz idx
    3. Frequency distribution of narrative classes (horizontal bars)

    Parameters
    ----------
    run_indices : list[list[int]]
        For each question, the token positions at which the model was probed.
    df : pd.DataFrame
        Must contain the columns ``earliest_ok_idx``, ``chosen_idx`` and ``cls``.
    figsize : tuple, optional
        Figure size passed to ``plt.figure``.
    title_prefix : str, optional
        Prepended to each subplot title (useful when comparing models).

    Returns
    -------
    plt.Figure
        The assembled dashboard figure.
    """
    # ------------------------------------------------------------------
    # 0. Prep (variables reused throughout the function)
    # ------------------------------------------------------------------
    # Collect all evaluation indices across questions so we know the
    # x-axis domain and the padding for NaNs.
    eval_indices = np.asarray(sorted({idx for indices in run_indices for idx in indices}))

    # Narrative classes and their colours
    classes = ["best-buzz", "late-buzz", "never-buzzed", "premature", "hopeless"]
    colors = ["tab:green", "tab:olive", "tab:orange", "tab:red", "tab:gray"]
    palette = dict(zip(classes, colors))

    # Padding for NaN replacement / axis limits; fall back to 1.0 when no
    # evaluation indices were supplied so the axes stay valid.
    max_idx = eval_indices.max() * 1.25 if eval_indices.size else 1.0

    # ------------------------------------------------------------------
    # 1. Figure / axes layout
    # ------------------------------------------------------------------
    # GridSpec layout → 2 rows × 3 cols.
    # ┌────────────┬────────────┬────────┐
    # │  Ceiling   │  Scatter   │  Bars  │  (row 0)
    # ├────────────┴────────────┴────────┤
    # │ Descriptions (spans all 3 cols)  │  (row 1)
    # └──────────────────────────────────┘
    # A dedicated row for the narrative-class descriptions avoids overlap
    # with the sub-plots and keeps the figure compact.
    plt.style.use("ggplot")
    fig = plt.figure(figsize=figsize)
    gs = fig.add_gridspec(
        nrows=2,
        ncols=3,
        height_ratios=[5, 1],  # extra space for plots vs. descriptions
        width_ratios=[2.2, 2.2, 1],
        hspace=0.2,
        wspace=0.2,
        left=0.05,
        right=0.95,
        top=0.9,
        bottom=0.05,
    )
    ax_ceiling = fig.add_subplot(gs[0, 0])  # Ceiling accuracy curve
    ax_scatter = fig.add_subplot(gs[0, 1])  # Earliest vs. chosen scatter
    ax_bars = fig.add_subplot(gs[0, 2])  # Outcome distribution bars
    ax_desc = fig.add_subplot(gs[1, :])  # Textual descriptions
    ax_desc.axis("off")
    fig.suptitle("Buzzing behaviour", fontsize=16, fontweight="bold")

    # ------------------------------------------------------------------
    # 2. Ceiling accuracy curve
    # ------------------------------------------------------------------
    earliest = df["earliest_ok_idx"]
    ceiling = [(earliest.notna() & (earliest <= idx)).mean() for idx in eval_indices]
    ax_ceiling.plot(eval_indices, ceiling, marker="o", color="#4698cf")
    ax_ceiling.set_xlabel("Token index shown")
    ax_ceiling.set_ylabel("Proportion of questions correct")
    ax_ceiling.set_ylim(0, 1.01)
    ax_ceiling.set_title(f"{title_prefix}Ceiling accuracy vs. prefix")

    # ------------------------------------------------------------------
    # 3. Earliest-vs-Chosen scatter
    # ------------------------------------------------------------------
    for cls in classes:
        sub = df[df["cls"] == cls]
        if sub.empty:
            continue
        # `facecolor="none"` used to be passed here together with `c`;
        # it is dropped because `c` determines the marker fill.
        ax_scatter.scatter(
            sub["earliest_ok_idx"].fillna(max_idx),
            sub["chosen_idx"].fillna(max_idx),
            label=cls,
            alpha=0.7,
            edgecolor="black",
            linewidth=1,
            marker="o",
            s=90,
            c=palette[cls],
        )
    lim = max_idx
    ax_scatter.plot([0, lim], [0, lim], linestyle=":", linewidth=1)
    ax_scatter.set_xlim(0, lim)
    ax_scatter.set_ylim(0, lim)
    ax_scatter.set_xlabel("Earliest index with correct answer")
    ax_scatter.set_ylabel("Chosen buzz index")
    ax_scatter.set_title(f"{title_prefix}Earliest vs. chosen index")
    # Only draw a legend when at least one class was plotted.
    if ax_scatter.get_legend_handles_labels()[0]:
        ax_scatter.legend(frameon=False, fontsize="small")

    # ------------------------------------------------------------------
    # 4. Outcome distribution (horizontal bars)
    # ------------------------------------------------------------------
    counts = df["cls"].value_counts().reindex(classes).fillna(0)
    ax_bars.barh(
        counts.index,
        counts.values,
        color=[palette[c] for c in counts.index],
        alpha=0.7,
        edgecolor="black",
        linewidth=1,
    )
    ax_bars.set_xlabel("Number of questions")
    ax_bars.set_title(f"{title_prefix}Outcome distribution")
    # Ensure x-axis shows integer ticks only
    ax_bars.xaxis.set_major_locator(MaxNLocator(integer=True))

    # ------------------------------------------------------------------
    # 5. Narrative-class descriptions (bottom panel)
    # ------------------------------------------------------------------
    descriptions = {
        "best-buzz": "Perfect timing. Buzzed at the earliest possible correct position",
        "late-buzz": "Missed opportunity. Buzzed correctly but later than optimal",
        "never-buzzed": "Missed opportunity. Never buzzed despite knowing the answer",
        "premature": "Incorrect buzz. Buzzing at a later position could have been correct",
        "hopeless": "Never knew the answer. No correct answer at any position",
    }
    y_pos = 1.0  # start at top of the description axis
    for cls, color in palette.items():
        ax_desc.text(
            0.01,
            y_pos,
            f"■ {cls}: {descriptions[cls]}",
            ha="left",
            va="top",
            color=color,
            fontweight="bold",
            fontsize=11,
            transform=ax_desc.transAxes,
        )
        y_pos -= 0.25  # vertical step between description lines

    # ------------------------------------------------------------------
    # 6. Return the final figure
    # ------------------------------------------------------------------
    return fig


# %%
# Create dummy data for testing


def create_dummy_model_outputs(n_entries=10, n_positions=5):
    """Create dummy model outputs for testing.

    Returns a list of ``n_entries`` dicts, each with ``run_indices``
    (``n_positions`` sorted distinct positions drawn from 10..49, so
    ``n_positions`` must not exceed 40) and ``run_outputs`` (one dict per
    position). A local generator seeded with 42 makes the data reproducible
    without reseeding NumPy's global random state.
    """
    rng = np.random.RandomState(42)
    dummy_outputs = []
    for _ in range(n_entries):
        run_indices = sorted(rng.choice(range(10, 50), n_positions, replace=False))
        outputs = []
        for i in range(n_positions):
            # Randomly decide if model will buzz at this position
            will_buzz = rng.random_sample() > 0.7
            # Randomly decide if answer is correct
            is_correct = rng.random_sample() > 0.4
            outputs.append(
                {
                    "run_idx": i + 1,
                    "buzz": will_buzz,
                    "correct": 1 if is_correct else 0,
                    "confidence": rng.random_sample(),
                    "logprob": np.log(rng.random_sample()),
                    "guess": f"Answer {i + 1}",
                }
            )
        dummy_outputs.append({"run_indices": run_indices, "run_outputs": outputs})
    return dummy_outputs


# dummy_data = create_dummy_model_outputs()
# dummy_df = pd.DataFrame([create_df_entry(entry["run_indices"], entry["outputs"]) for entry in dummy_data])
# dummy_df
# plot_buzz_dashboard(dummy_df, dummy_data[0]["run_indices"])

# %%

# 💡 Answer