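"""Gradio interface for running and evaluating quizbowl bonus questions.

Provides validation helpers for bonus workflows and the BonusInterface class,
which wires together question display, model runs, evaluation, and submission.
"""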
import json
import logging
from typing import Any

import gradio as gr
import matplotlib.pyplot as plt
import pandas as pd
from datasets import Dataset

from components.model_pipeline.model_pipeline import PipelineInterface, PipelineState
from submission import submit
from workflows.qb.simple_agent import SimpleBonusAgent
from workflows.structs import ModelStep, Workflow

from .plotting import (
    create_scatter_pyplot,
    evaluate_buzz,
    update_plot,
)


def evaluate_bonus_part(prediction: str, clean_answers: list[str]) -> float:
    """Evaluate a single bonus part."""
    return evaluate_buzz(prediction, clean_answers)


def process_bonus_results(results: list[dict]) -> pd.DataFrame:
    """Process results from bonus mode and prepare visualization data."""
    return pd.DataFrame(
        [
            {
                "Part": f"Part {r['part_number']}",
                "Correct?": "✅" if r["score"] == 1 else "❌",
                "Confidence": r["confidence"],
                "Prediction": r["answer"],
                "Explanation": r["explanation"],
            }
            for r in results
        ]
    )
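
# Note: these column names must stay in sync with the `results_table` DataFrame
# component defined in BonusInterface._render_qb_interface below.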


def initialize_eval_interface(example: dict, model_outputs: list[dict]):
    """Initialize the interface with example text."""
    try:
        # Render the leadin followed by each numbered part.
        leadin_html = f"<div class='leadin'>{example['leadin']}</div>"
        parts_html = []
        for i, part in enumerate(example["parts"]):
            parts_html.append(f"<div class='part'><b>Part {i + 1}:</b> {part['part']}</div>")
        html_content = f"{leadin_html}<div class='parts-container'>{''.join(parts_html)}</div>"

        plot_data = create_bonus_confidence_plot(example["parts"], model_outputs)

        # Serialize the parts and outputs; update_plot reads this state later.
        state = json.dumps({"parts": example["parts"], "outputs": model_outputs})

        return html_content, plot_data, state
    except Exception as e:
        logging.error(f"Error initializing interface: {e}", exc_info=True)
        # Return None for the plot so gr.Plot is cleared rather than fed a DataFrame.
        return f"<div>Error initializing interface: {e}</div>", None, "{}"


def create_bonus_confidence_plot(parts: list[dict], model_outputs: list[dict]):
    """Create confidence plot for bonus parts."""
    plt.style.use("ggplot")
    fig = plt.figure(figsize=(10, 6))
    ax = fig.add_subplot(111)

    x = range(1, len(parts) + 1)
    confidences = [output["confidence"] for output in model_outputs]
    scores = [output["score"] for output in model_outputs]

    # Color each bar by correctness: green for a correct part, red otherwise.
    colors = ["green" if score == 1 else "red" for score in scores]
    ax.bar(x, confidences, color=colors)

    ax.set_title("Part Confidence")
    ax.set_xlabel("Part Number")
    ax.set_ylabel("Confidence")
    ax.set_xticks(x)
    ax.set_xticklabels([f"Part {i}" for i in x])

    return fig


def validate_workflow(workflow: Workflow):
    """Validate that a workflow is properly configured for the bonus task."""
    if not workflow.steps:
        raise ValueError("Workflow must have at least one step")

    for step in workflow.steps.values():
        validate_model_step(step)

    input_vars = set(workflow.inputs)
    if "leadin" not in input_vars or "part" not in input_vars:
        raise ValueError("Workflow must have 'leadin' and 'part' as inputs")

    output_vars = set(workflow.outputs)
    if not all(var in output_vars for var in ["answer", "confidence", "explanation"]):
        raise ValueError("Workflow must produce 'answer', 'confidence', and 'explanation' as outputs")


def validate_model_step(model_step: ModelStep):
    """Validate that a model step is properly configured for the bonus task."""
    if not model_step.model or not model_step.provider:
        raise ValueError("Model step must have both model and provider specified")

    if model_step.call_type != "llm":
        raise ValueError("Model step must have call_type 'llm'")

    if model_step.temperature is None:
        raise ValueError("Temperature must be specified for LLM model steps")

    if not (0.0 <= model_step.temperature <= 1.0):
        raise ValueError(f"Temperature must be between 0.0 and 1.0, got {model_step.temperature}")

    input_field_names = {field.name for field in model_step.input_fields}
    if "leadin" not in input_field_names or "part" not in input_field_names:
        raise ValueError("Model step must have 'leadin' and 'part' input fields")

    output_field_names = {field.name for field in model_step.output_fields}
    required_outputs = {"answer", "confidence", "explanation"}
    if not required_outputs.issubset(output_field_names):
        raise ValueError("Model step must have all required output fields: answer, confidence, explanation")

    for field in model_step.output_fields:
        if field.name == "confidence" and field.type != "float":
            raise ValueError("The 'confidence' output field must be of type 'float'")


class BonusInterface:
    """Gradio interface for the Bonus mode."""

    def __init__(self, app: gr.Blocks, dataset: Dataset, model_options: dict, defaults: dict):
        """Initialize the Bonus interface."""
        logging.info(f"Initializing Bonus interface with dataset size: {len(dataset)}")
        self.ds = dataset
        self.model_options = model_options
        self.app = app
        self.defaults = defaults
        # JSON-serialized model outputs, shared with update_plot via the hidden input.
        self.output_state = gr.State(value="{}")
        self.render()

    def _render_model_interface(self, workflow: Workflow, simple: bool = True):
        """Render the model interface."""
        self.pipeline_interface = PipelineInterface(
            workflow,
            simple=simple,
            model_options=list(self.model_options.keys()),
        )
        with gr.Row():
            self.run_btn = gr.Button("Run Bonus", variant="primary")

    def _render_qb_interface(self):
        """Render the quizbowl interface."""
        with gr.Row():
            self.qid_selector = gr.Number(
                label="Question ID", value=1, precision=0, minimum=1, maximum=len(self.ds), show_label=True, scale=0
            )
            self.answer_display = gr.Textbox(
                label="Answers", elem_id="answer-display", elem_classes="answer-box", interactive=False, scale=1
            )
            self.clean_answer_display = gr.Textbox(
                label="Acceptable Answers",
                elem_id="answer-display-2",
                elem_classes="answer-box",
                interactive=False,
                scale=2,
            )

        self.question_display = gr.HTML(label="Question", elem_id="question-display")
        with gr.Row():
            self.confidence_plot = gr.Plot(
                label="Part Confidence",
                format="webp",
            )

        self.results_table = gr.DataFrame(
            label="Model Outputs",
            value=pd.DataFrame(columns=["Part", "Correct?", "Confidence", "Prediction", "Explanation"]),
        )

        with gr.Row():
            self.eval_btn = gr.Button("Evaluate")

        with gr.Accordion("Model Submission", elem_classes="model-submission-accordion", open=True):
            with gr.Row():
                self.model_name_input = gr.Textbox(label="Model Name")
                self.description_input = gr.Textbox(label="Description")
            with gr.Row():
                gr.LoginButton()
                self.submit_btn = gr.Button("Submit")
            self.submit_status = gr.HTML(label="Submission Status")

    def render(self):
        """Create the Gradio interface."""
        self.hidden_input = gr.Textbox(value="", visible=False, elem_id="hidden-index")
        workflow = self.defaults["init_workflow"]

        with gr.Row():
            with gr.Column(scale=1):
                self._render_model_interface(workflow, simple=self.defaults["simple_workflow"])
            with gr.Column(scale=1):
                self._render_qb_interface()

        self._setup_event_listeners()

    def get_new_question_html(self, question_id: int):
        """Get the HTML for a new question."""
        example = self.ds[question_id - 1]
        leadin = example["leadin"]
        parts = example["parts"]

        leadin_html = f"<div class='leadin'>{leadin}</div>"
        parts_html = [f"<div class='part'>{part['part']}</div>" for part in parts]
        parts_html_str = "<br>".join(parts_html)

        html_content = (
            f"<div class='token-container'>{leadin_html}<div class='parts-container'><br>{parts_html_str}</div></div>"
        )

        primary_answers = [f"{i + 1}. {part['answer_primary']}" for i, part in enumerate(parts)]
        clean_answers = []
        for i, part in enumerate(parts):
            # Keep only short acceptable answers (at most six words) to avoid clutter.
            part_answers = [a for a in part["clean_answers"] if len(a.split()) <= 6]
            clean_answers.append(f"{i + 1}. {', '.join(part_answers)}")

        return html_content, "\n".join(primary_answers), "\n".join(clean_answers)

    def get_model_outputs(self, example: dict, pipeline_state: PipelineState):
        """Get the model outputs for a given question ID."""
        outputs = []
        leadin = example["leadin"]
        # The workflow is fixed for the whole question, so build the agent once.
        agent = SimpleBonusAgent(workflow=pipeline_state.workflow)

        for i, part in enumerate(example["parts"]):
            part_output = agent.run(leadin, part["part"])
            part_output["part_number"] = i + 1
            part_output["score"] = evaluate_bonus_part(part_output["answer"], part["clean_answers"])
            outputs.append(part_output)

        return outputs

    def run_bonus(
        self,
        question_id: int,
        pipeline_state: PipelineState,
    ) -> tuple[Any, Any, Any, Any]:
        """Run the agent in bonus mode."""
        try:
            question_id = int(question_id - 1)
            if not self.ds or question_id < 0 or question_id >= len(self.ds):
                return "Invalid question ID or dataset not loaded", None, None, None

            example = self.ds[question_id]
            outputs = self.get_model_outputs(example, pipeline_state)

            html_content, plot_data, output_state = initialize_eval_interface(example, outputs)
            df = process_bonus_results(outputs)

            return (
                html_content,
                gr.update(value=plot_data, label=f"Part Confidence on Question {question_id + 1}"),
                gr.update(value=output_state),
                gr.update(value=df, label=f"Model Outputs for Question {question_id + 1}"),
            )
        except Exception as e:
            import traceback

            error_msg = f"Error: {e}\n{traceback.format_exc()}"
            # Four outputs are wired to this handler, so the error path must match.
            return error_msg, None, None, None

    def evaluate_bonus(self, pipeline_state: PipelineState, progress: gr.Progress = gr.Progress()):
        """Evaluate the bonus questions."""
        try:
            if not self.ds or not self.ds.num_rows:
                logging.error("No dataset loaded")
                return None, None

            total_correct = 0
            total_parts = 0
            part_scores = []
            part_numbers = []

            for example in progress.tqdm(self.ds, desc="Evaluating bonus questions"):
                model_outputs = self.get_model_outputs(example, pipeline_state)

                for output in model_outputs:
                    total_parts += 1
                    if output["score"] == 1:
                        total_correct += 1
                    part_scores.append(output["score"])
                    part_numbers.append(output["part_number"])

            # Guard against an empty dataset producing a zero division.
            accuracy = total_correct / total_parts if total_parts else 0.0
            df = pd.DataFrame(
                [
                    {
                        "Part Accuracy": f"{accuracy:.2%}",
                        "Total Score": f"{total_correct}/{total_parts}",
                        "Questions Evaluated": len(self.ds),
                    }
                ]
            )

            plot_data = create_scatter_pyplot(part_numbers, part_scores)
            return (
                gr.update(value=df, label="Scores on Sample Set"),
                gr.update(value=plot_data, label="Part Scores on Sample Set"),
            )
        except Exception:
            import traceback

            logging.error(f"Error evaluating bonus: {traceback.format_exc()}")
            # Two outputs are wired to this handler, so the error path must match.
            return None, None

    def submit_model(
        self, model_name: str, description: str, pipeline_state: PipelineState, profile: gr.OAuthProfile | None = None
    ):
        """Submit the model output."""
        return submit.submit_model(model_name, description, pipeline_state.workflow, "bonus", profile)

    def _setup_event_listeners(self):
        """Wire up the interactions between the UI components."""
        # Refresh the question display whenever the app loads or the question ID changes.
        gr.on(
            triggers=[self.app.load, self.qid_selector.change],
            fn=self.get_new_question_html,
            inputs=[self.qid_selector],
            outputs=[self.question_display, self.answer_display, self.clean_answer_display],
        )

        # Validate the workflow first; only run the agent if validation succeeds.
        self.run_btn.click(
            self.pipeline_interface.validate_workflow,
            inputs=[self.pipeline_interface.pipeline_state],
            outputs=[self.pipeline_interface.pipeline_state],
        ).success(
            self.run_bonus,
            inputs=[
                self.qid_selector,
                self.pipeline_interface.pipeline_state,
            ],
            outputs=[
                self.question_display,
                self.confidence_plot,
                self.output_state,
                self.results_table,
            ],
        )

        self.eval_btn.click(
            fn=self.evaluate_bonus,
            inputs=[self.pipeline_interface.pipeline_state],
            outputs=[self.results_table, self.confidence_plot],
        )

        self.submit_btn.click(
            fn=self.submit_model,
            inputs=[
                self.model_name_input,
                self.description_input,
                self.pipeline_interface.pipeline_state,
            ],
            outputs=[self.submit_status],
        )

        # Re-render the confidence plot from stored state when the hidden index changes.
        self.hidden_input.change(
            fn=update_plot,
            inputs=[self.hidden_input, self.output_state],
            outputs=[self.confidence_plot],
        )
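

# Usage sketch (illustrative; the hosting app, dataset, and launch call are
# assumptions, not part of this module):
#
#   with gr.Blocks() as app:
#       interface = BonusInterface(app, dataset, model_options, defaults)
#   app.launch()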