Maharshi Gor
Updated workflow APIs, code clean up and minor functions for hf pipeline support
f064c62
# %% | |
import json | |
import logging | |
import re | |
from collections import Counter | |
import matplotlib.pyplot as plt | |
import numpy as np | |
import pandas as pd | |
def _make_answer_html(answer: str, clean_answers: list[str] = []) -> str: | |
clean_answers = [a for a in clean_answers if len(a.split()) <= 6 and a != answer] | |
additional_answers_html = "" | |
if clean_answers: | |
additional_answers_html = f"<span class='bonus-answer-text'> [or {', '.join(clean_answers)}]</span>" | |
return f""" | |
<div class='bonus-answer'> | |
<span class='bonus-answer-label'>Answer: </span> | |
<span class='bonus-answer-text'>{answer}</span> | |
{additional_answers_html} | |
</div> | |
""" | |
def _get_token_classes(confidence, buzz, score) -> str: | |
if confidence is None: | |
return "token" | |
elif not buzz: | |
return f"token guess-point buzz-{score}" | |
else: | |
return f"token guess-point buzz-{score}" | |
def _create_token_tooltip_html(values) -> str: | |
if not values: | |
return "" | |
confidence = values.get("confidence", 0) | |
buzz = values.get("buzz", 0) | |
correct = values.get("correct", 0) | |
guess = values.get("guess", "") | |
guess_tokens = guess.split() | |
if len(guess_tokens) > 10: | |
k = len(guess_tokens) - 10 | |
guess = " ".join(guess_tokens[:10]) + f"...[{k} more words]" | |
color = "#a3c9a3" if correct else "#ebbec4" # Light green for correct, light pink for incorrect | |
if values.get("logprob", None) is not None: | |
prob = np.exp(values["logprob"]) | |
prob_str = f"<p style='margin: 0 0 4px; color: #000;'> π <b style='color: #000;'>Output Probability:</b> {prob:.3f}</p>" | |
else: | |
prob_str = "" | |
return f""" | |
<div class="tooltip card" style="background-color: {color}; border-radius: 8px; padding: 12px; box-shadow: 2px 4px 8px rgba(0, 0, 0, 0.15);"> | |
<div class="tooltip-content" style="font-family: 'Arial', sans-serif; color: #000;"> | |
<h4 style="margin: 0 0 8px; color: #000;">π‘ Answer</h4> | |
<p><code style="font-weight: bold; margin: 0 0 8px; color: #000;">{guess}</code></p> | |
<p style="margin: 0 0 4px; color: #000;">π <b style="color: #000;">Confidence:</b> {confidence:.2f}</p> | |
{prob_str} | |
<p style="margin: 0; color: #000;">π <b style="color: #000;">Status:</b> {"β Correct" if correct else "β Incorrect" if buzz else "π« No Buzz"}</p> | |
</div> | |
</div> | |
""" | |
def create_token_html(token: str, values: dict, i: int) -> str:
    """Render one question token as a ``<span>`` with an optional tooltip.

    Args:
        token: The token text to display.
        values: Evaluation data for this token position (may be empty). The
            keys read here are ``confidence``, ``buzz`` and ``correct``.
        i: Token index, used for the element id and ``data-index`` attribute.

    Returns:
        The ``<span>`` markup for the token, with the tooltip card nested
        inside when ``values`` is non-empty.
    """
    confidence = values.get("confidence", None)
    buzz = values.get("buzz", 0)
    correct = values.get("correct", 0)

    if not re.match(r"\w+", token):
        # Tokens that do not start with a word character are shown without a
        # marker glyph. Spaces become non-breaking so the browser does not
        # collapse them (the previous ``replace(" ", " ")`` was a no-op).
        display_token = token.replace(" ", "&nbsp;")
    elif buzz:
        display_token = f"{token} π¨"
    elif values:
        display_token = f"{token} π"
    else:
        display_token = token

    css_class = _get_token_classes(confidence, buzz, correct)

    # The helper returns an empty string when there are no values.
    tooltip_html = _create_token_tooltip_html(values)

    return f'<span id="token-{i}" class="{css_class}" data-index="{i}">{display_token}{tooltip_html}</span>'
def create_tossup_html(
    tokens: list[str],
    answer_primary: str,
    clean_answers: list[str],
    marker_indices: list[int] | None = None,
    eval_points: list[tuple[int, dict]] | None = None,
) -> str:
    """Create HTML for tokens with hover capability and a colored header for the answer.

    Args:
        tokens: Question tokens, rendered in order and numbered from 1.
        answer_primary: Primary answer shown under the question.
        clean_answers: Alternative answers passed on to the answer renderer.
        marker_indices: Accepted for backward compatibility; currently unused.
        eval_points: ``(token_index, values)`` pairs; ``values`` is attached to
            the token with the matching 1-based index.

    Returns:
        The HTML for the question card, or a small error ``<div>`` if anything
        goes wrong while rendering (the error is logged with its traceback).
    """
    try:
        # ``None`` defaults replace the former shared mutable ``[]`` defaults.
        points_by_index = dict(eval_points or [])
        html_tokens = [
            create_token_html(token, points_by_index.get(i, {}), i)
            for i, token in enumerate(tokens, start=1)
        ]
        answer_html = _make_answer_html(answer_primary, clean_answers)
        return f"""
        <div class='bonus-container'>
            <div class='bonus-card'>
                <div class='tossup-question'>
                    {"".join(html_tokens)}
                </div>
                {answer_html}
            </div>
        </div>
        """
    except Exception as e:
        # Best-effort rendering: report the failure in place of the card.
        logging.error(f"Error creating token HTML: {e}", exc_info=True)
        return f"<div class='token-container'>Error creating tokens: {str(e)}</div>"
def create_bonus_html(leadin: str, parts: list[dict]) -> str:
    """Create the HTML card for a bonus question.

    Args:
        leadin: Lead-in text shown above the parts.
        parts: One dict per bonus part. The keys read are ``part`` (question
            text), ``answer_primary`` and ``clean_answers``.

    Returns:
        HTML containing the lead-in followed by each numbered part and its
        answer line.
    """
    leadin_html = f"<div class='bonus-leadin'>{leadin}</div>"

    parts_html = []
    for number, part in enumerate(parts, start=1):
        question_text = part["part"]
        answer_html = _make_answer_html(part["answer_primary"], part["clean_answers"])
        parts_html.append(
            f"""
        <div class='bonus-part'>
            <div class='bonus-part-text'><b>#{number}.</b> {question_text}</div>
            {answer_html}
        </div>
        """
        )

    # The former trailing loop built a ``clean_answers`` list that was never
    # used, and a stray string statement inside the loop had no effect; both
    # were removed.
    return f"""
    <div class='bonus-container'>
        <div class='bonus-card'>
            {leadin_html}
            {"".join(parts_html)}
        </div>
    </div>
    """
def create_tossup_confidence_pyplot(
    tokens: list[str],
    run_outputs: list[dict],
    confidence_threshold: float = 0.5,
    prob_threshold: float | None = None,
) -> plt.Figure:
    """Plot confidence and output probability at each evaluated token position.

    Args:
        tokens: Question tokens; used to label buzz points (positions are
            treated as 1-indexed into this list).
        run_outputs: One dict per evaluation point. The keys read are
            ``token_position``, ``confidence``, ``logprob`` (may be ``None``),
            ``buzz`` and ``correct``.
        confidence_threshold: Height of the dashed confidence-threshold line.
        prob_threshold: Height of the dashed probability-threshold line; the
            line is omitted when ``None``.

    Returns:
        The matplotlib figure. Note that this switches the global pyplot style
        to ``ggplot``.
    """
    plt.style.use("ggplot")
    fig = plt.figure(figsize=(10, 4), dpi=300)
    ax = fig.add_subplot(111)

    # Both series start from an origin point at (0, 0).
    x = [0] + [o["token_position"] for o in run_outputs]
    y_conf = [0] + [o["confidence"] for o in run_outputs]
    # Missing logprobs become NaN so the series stays aligned with ``x``
    # (dropping them made the lengths differ and the plot call fail).
    y_prob = [0] + [np.nan if o["logprob"] is None else np.exp(o["logprob"]) for o in run_outputs]

    ax.plot(x, y_prob, "o-", color="#f2b150", label="Probability")
    ax.plot(x, y_conf, "o-", color="#4996de", label="Confidence")

    # Circle every buzz point: green when correct, red otherwise.
    for o in run_outputs:
        if not o["buzz"]:
            continue
        color = "green" if o["correct"] else "red"
        conf = o["confidence"]
        i = o["token_position"]
        ax.plot(i, conf, "o", color=color, markerfacecolor="none", markersize=12, markeredgewidth=2.5)
        if o["logprob"] is not None:
            prob = np.exp(o["logprob"])
            ax.plot(i, prob, "o", color=color, markerfacecolor="none", markersize=12, markeredgewidth=2.5)
        if 1 <= i <= len(tokens):
            ax.annotate(f"{tokens[i - 1]}", (i, conf), textcoords="offset points", xytext=(0, 10), ha="center")
        else:
            # Skip the label instead of raising IndexError (or silently
            # wrapping around for non-positive indices).
            logging.warning(f"1-indexed token index {i} is out of bounds for n_tokens: {len(tokens)}")

    # Add horizontal dashed line for confidence threshold
    ax.axhline(y=confidence_threshold, color="#9370DB", linestyle="--", xmin=0, xmax=1, label="Confidence Threshold")
    # Add horizontal dashed line for probability threshold if provided
    if prob_threshold is not None:
        ax.axhline(y=prob_threshold, color="#cf5757", linestyle="--", xmin=0, xmax=1, label="Probability Threshold")

    ax.set_title("Buzz Confidence")
    ax.set_xlabel("Token Index")
    ax.set_ylabel("Confidence")
    ax.set_xticks(x)
    ax.set_xticklabels(x)
    ax.legend()
    return fig
def create_scatter_pyplot(token_positions: list[int], scores: list[int]) -> plt.Figure:
    """Create a scatter plot of token positions and scores.

    Identical ``(position, score)`` pairs are merged into a single marker
    whose area grows with the number of occurrences (20 units per pair).
    """
    plt.style.use("ggplot")
    fig = plt.figure(figsize=(11, 5))
    ax = fig.add_subplot(111)

    pair_counts = Counter(zip(token_positions, scores))
    xs = [pos for pos, _ in pair_counts]
    ys = [score for _, score in pair_counts]
    marker_sizes = [n * 20 for n in pair_counts.values()]

    ax.scatter(xs, ys, color="#4698cf", s=marker_sizes)
    return fig
def create_bonus_confidence_plot(parts: list[dict], model_outputs: list[dict]) -> plt.Figure:
    """Create confidence plot for bonus parts.

    Draws one bar per part with height equal to the model's confidence. A bar
    is green when the matching output's ``correct`` value equals 1 and red
    otherwise.
    """
    plt.style.use("ggplot")
    fig = plt.figure(figsize=(10, 6))
    ax = fig.add_subplot(111)

    part_numbers = range(1, len(parts) + 1)
    confidences = [out["confidence"] for out in model_outputs]
    correctness = [out["correct"] for out in model_outputs]

    # Bars start with a neutral colour and are then recoloured by correctness.
    bars = ax.bar(part_numbers, confidences, color="#4698cf")
    for bar, is_correct in zip(bars, correctness):
        bar.set_color("green" if is_correct == 1 else "red")

    ax.set_title("Part Confidence")
    ax.set_xlabel("Part Number")
    ax.set_ylabel("Confidence")
    ax.set_xticks(part_numbers)
    ax.set_xticklabels([f"Part {n}" for n in part_numbers])
    return fig
def update_tossup_plot(highlighted_index: int, state: str) -> "plt.Figure | pd.DataFrame":
    """Update the plot when a token is hovered; add a vertical line on the plot.

    Args:
        highlighted_index: Index of the hovered token. Falsy values (``None``,
            ``0``, ``""``) mean that nothing is highlighted.
        state: JSON string with ``tokens`` and ``values`` entries; ``values``
            is passed to the confidence plot as its run outputs.

    Returns:
        The confidence figure, with a dotted vertical line at
        ``highlighted_index`` when one is given. An empty ``DataFrame`` is
        returned when the state is empty or incomplete, or on any error.
    """
    try:
        if not state or state == "{}":
            logging.warning("Empty state provided to update_plot")
            return pd.DataFrame()

        highlighted_index = int(highlighted_index) if highlighted_index else None
        logging.info(f"Update plot triggered with token index: {highlighted_index}")

        data = json.loads(state)
        tokens = data.get("tokens", [])
        values = data.get("values", [])
        if not tokens or not values:
            logging.warning("No tokens or values found in state")
            return pd.DataFrame()

        # The index used to be passed positionally, which landed in the
        # ``confidence_threshold`` slot. Build the plot with its default
        # threshold and mark the hovered token explicitly instead.
        fig = create_tossup_confidence_pyplot(tokens, values)
        if highlighted_index is not None:
            fig.axes[0].axvline(x=highlighted_index, color="crimson", linestyle=":", linewidth=1.5)
        return fig
    except Exception as e:
        # Best-effort: callers get an empty frame rather than an exception.
        logging.error(f"Error updating plot: {e}")
        return pd.DataFrame()
def create_tossup_eval_table(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise tossup evaluation results as a one-row table of formatted strings.

    Reads the columns ``chosen_idx``, ``gap``, ``raw_score``,
    ``expected_score``, ``is_correct`` and ``buzz`` and reports:

    - Raw Score: sum of ``raw_score`` divided by the number of rows
    - Expected Score: sum of ``expected_score`` divided by the number of rows
    - Buzz Precision: sum of ``is_correct`` divided by sum of ``buzz``
    - Buzz Position: mean of the non-null ``chosen_idx`` values
    - +ve Gap / -ve Gap: mean of the non-null ``gap`` values that are
      ``>= 0`` / ``< 0`` respectively
    """
    buzz_positions = df["chosen_idx"].dropna()
    gap_values = df["gap"].dropna()
    non_negative_gaps = gap_values[gap_values >= 0]
    negative_gaps = gap_values[gap_values < 0]

    n_rows = len(df)
    raw_score_avg = df["raw_score"].sum() / n_rows
    expected_score_avg = df["expected_score"].sum() / n_rows
    precision = df["is_correct"].sum() / df["buzz"].sum()

    summary_row = {
        "Raw Score": f"{raw_score_avg:5.1f}",
        "Expected Score": f"{expected_score_avg:5.1f}",
        "Buzz Precision": f"{precision:5.1%}",
        "Buzz Position": f"{np.mean(buzz_positions):5.1f}",
        "+ve Gap": f"{non_negative_gaps.mean():5.1f}",
        "-ve Gap": f"{negative_gaps.mean():5.1f}",
    }
    return pd.DataFrame([summary_row])
def create_tossup_eval_dashboard(run_indices: list[list[int]], df: pd.DataFrame, *, figsize=(15, 8), title_prefix=""):
    """
    Visualise buzzing behaviour with three sub-plots and a legend panel:

    1. Ceiling-accuracy vs. prefix length
    2. Scatter of earliest-correct idx vs. chosen-buzz idx
    3. Frequency distribution of narrative classes (horizontal bars)
    4. Text descriptions of the narrative classes (bottom row)

    Parameters
    ----------
    run_indices : list[list[int]]
        Token positions at which the model was probed, one list per question.
        They are flattened into a sorted set of unique positions that forms
        the x-axis of the ceiling curve. Must contain at least one index,
        since the maximum position is used for the axis limits.
    df : pd.DataFrame
        Per-question results (the original note says: output of
        `build_buzz_dataframe`); must contain the columns
        earliest_ok_idx, chosen_idx, cls.
    figsize : tuple, optional
        Figure size passed to `plt.figure`.
    title_prefix : str, optional
        Prepended to each subplot title (useful when comparing models).

    Returns
    -------
    The assembled matplotlib figure. Note that the global pyplot style is
    switched to "ggplot".
    """
    # ------------------------------------------------------------------
    # 0. Prep (variables reused throughout the function)
    # ------------------------------------------------------------------
    # Collect all evaluation indices across questions so we know the
    # x-axis domain and the padding for NaNs.
    eval_indices = np.asarray(sorted({idx for indices in run_indices for idx in indices}))
    # Narrative classes and their colours (paired by position)
    classes = [
        "best-buzz",
        "late-buzz",
        "never-buzzed",
        "premature",
        "hopeless",
    ]
    colors = ["tab:green", "tab:olive", "tab:orange", "tab:red", "tab:gray"]
    palette = dict(zip(classes, colors))
    # 25% beyond the largest probed index: used both as the stand-in for
    # missing (NaN) indices in the scatter plot and as its axis limit.
    max_idx = eval_indices.max() * 1.25  # padding for NaN replacement / axis limits
    # ------------------------------------------------------------------
    # 1. Figure / axes layout
    # ------------------------------------------------------------------
    # GridSpec layout: 2 rows x 3 cols.
    # +------------+------------+--------+
    # |  Ceiling   |  Scatter   |  Bars  |   (row 0)
    # +------------+------------+--------+
    # | Descriptions (spans all 3 cols)  |   (row 1)
    # +----------------------------------+
    # Having a dedicated row for the narrative-class descriptions avoids
    # overlapping with sub-plots and makes the whole figure more compact.
    plt.style.use("ggplot")
    fig = plt.figure(figsize=figsize)
    gs = fig.add_gridspec(
        nrows=2,
        ncols=3,
        height_ratios=[5, 1],  # extra space for plots vs. descriptions
        width_ratios=[2.2, 2.2, 1],
        hspace=0.2,  # reduced vertical spacing between plots
        wspace=0.2,  # reduced horizontal spacing between plots
        left=0.05,  # reduced left margin
        right=0.95,  # reduced right margin
        top=0.9,  # reduced top margin
        bottom=0.05,  # reduced bottom margin
    )
    ax_ceiling = fig.add_subplot(gs[0, 0])  # Ceiling accuracy curve
    ax_scatter = fig.add_subplot(gs[0, 1])  # Earliest vs. chosen scatter
    ax_bars = fig.add_subplot(gs[0, 2])  # Outcome distribution bars
    ax_desc = fig.add_subplot(gs[1, :])  # Textual descriptions
    ax_desc.axis("off")
    fig.suptitle("Buzzing behaviour", fontsize=16, fontweight="bold")
    # ------------------------------------------------------------------
    # 2. Ceiling accuracy curve
    # ------------------------------------------------------------------
    # For each probed index: the fraction of rows whose earliest correct
    # index exists and is at or before that index.
    ceiling = [((df["earliest_ok_idx"].notna()) & (df["earliest_ok_idx"] <= idx)).mean() for idx in eval_indices]
    ax_ceiling.plot(eval_indices, ceiling, marker="o", color="#4698cf")
    ax_ceiling.set_xlabel("Token index shown")
    ax_ceiling.set_ylabel("Proportion of questions correct")
    ax_ceiling.set_ylim(0, 1.01)
    ax_ceiling.set_title(f"{title_prefix}Ceiling accuracy vs. prefix")
    # ------------------------------------------------------------------
    # 3. Earliest-vs-Chosen scatter
    # ------------------------------------------------------------------
    # One scatter series per narrative class; classes with no rows are
    # skipped so they do not add empty legend entries.
    for cls in classes:
        sub = df[df["cls"] == cls]
        if sub.empty:
            continue
        # Missing indices are drawn at the padded maximum.
        x = sub["earliest_ok_idx"].fillna(max_idx)
        y = sub["chosen_idx"].fillna(max_idx)
        # NOTE(review): both `c` and `facecolor` are passed here; confirm
        # which of the two matplotlib honours for the marker fill.
        ax_scatter.scatter(
            x,
            y,
            label=cls,
            alpha=0.7,
            edgecolor="black",
            linewidth=1,
            marker="o",
            s=90,
            c=palette[cls],
            facecolor="none",
        )
    lim = max_idx
    # Dotted diagonal: points on it have chosen index == earliest index.
    ax_scatter.plot([0, lim], [0, lim], linestyle=":", linewidth=1)
    ax_scatter.set_xlim(0, lim)
    ax_scatter.set_ylim(0, lim)
    ax_scatter.set_xlabel("Earliest index with correct answer")
    ax_scatter.set_ylabel("Chosen buzz index")
    ax_scatter.set_title(f"{title_prefix}Earliest vs. chosen index")
    ax_scatter.legend(frameon=False, fontsize="small")
    # ------------------------------------------------------------------
    # 4. Outcome distribution (horizontal bars)
    # ------------------------------------------------------------------
    # Reindex so every class appears (with count 0 when absent) in the
    # fixed order defined above.
    counts = df["cls"].value_counts().reindex(classes).fillna(0)
    ax_bars.barh(
        counts.index,
        counts.values,
        color=[palette[c] for c in counts.index],
        alpha=0.7,
        edgecolor="black",
        linewidth=1,
    )
    ax_bars.set_xlabel("Number of questions")
    ax_bars.set_title(f"{title_prefix}Outcome distribution")
    # Ensure x-axis shows integer ticks only
    from matplotlib.ticker import MaxNLocator
    ax_bars.xaxis.set_major_locator(MaxNLocator(integer=True))
    # ------------------------------------------------------------------
    # 5. Narrative-class descriptions (bottom panel)
    # ------------------------------------------------------------------
    descriptions = {
        "best-buzz": "Perfect timing. Buzzed at the earliest possible correct position",
        "late-buzz": "Missed opportunity. Buzzed correctly but later than optimal",
        "never-buzzed": "Missed opportunity. Never buzzed despite knowing the answer",
        "premature": "Incorrect buzz. Buzzing at a later position could have been correct",
        "hopeless": "Never knew the answer. No correct answer at any position",
    }
    y_pos = 1.0  # start at top of the description axis
    # One line of text per class, written in the class colour.
    for cls, color in zip(classes, colors):
        ax_desc.text(
            0.01,
            y_pos,
            f"β {cls}: {descriptions[cls]}",
            ha="left",
            va="top",
            color=color,
            fontweight="bold",
            fontsize=11,  # increased font size from 9 to 11
            transform=ax_desc.transAxes,
        )
        y_pos -= 0.25  # increased vertical step inside the axis for more line height
    # ------------------------------------------------------------------
    # 6. Return the final figure
    # ------------------------------------------------------------------
    return fig
# %% | |
# Create dummy data for testing | |
def create_dummy_model_outputs(n_entries=10, n_positions=5):
    """Create dummy model outputs for testing.

    Re-seeds NumPy's global RNG with 42 on every call, so the result is
    reproducible. Each entry holds ``run_indices`` (``n_positions`` distinct
    sorted positions drawn from 10..49) and ``run_outputs`` (one fake output
    dict per position).
    """
    np.random.seed(42)

    def _fake_output(run_idx):
        # The order of the four draws below must stay fixed: it determines
        # which random number ends up in which field.
        buzzed = np.random.random() > 0.7
        answered_correctly = np.random.random() > 0.4
        confidence = np.random.random()
        logprob = np.log(np.random.random())
        return {
            "run_idx": run_idx,
            "buzz": buzzed,
            "correct": 1 if answered_correctly else 0,
            "confidence": confidence,
            "logprob": logprob,
            "guess": f"Answer {run_idx}",
        }

    entries = []
    for _ in range(n_entries):
        positions = sorted(np.random.choice(range(10, 50), n_positions, replace=False))
        fake_outputs = [_fake_output(k) for k in range(1, n_positions + 1)]
        entries.append({"run_indices": positions, "run_outputs": fake_outputs})
    return entries
# dummy_data = create_dummy_model_outputs() | |
# dummy_df = pd.DataFrame([create_df_entry(entry["run_indices"], entry["outputs"]) for entry in dummy_data]) | |
# dummy_df | |
# plot_buzz_dashboard(dummy_df, dummy_data[0]["run_indices"]) | |
# %% | |