Commit f064c62 · Maharshi Gor committed · Parent(s): bdbc03c
Updated workflow APIs, code clean up and minor functions for hf pipeline support
Browse files
- app.py +15 -9
- shared/workflows +1 -1
- src/components/model_pipeline/model_pipeline.py +0 -2
- src/components/model_pipeline/state_manager.py +0 -1
- src/components/quizbowl/bonus.py +48 -59
- src/components/quizbowl/plotting.py +41 -37
- src/components/quizbowl/tossup.py +39 -40
- src/components/quizbowl/utils.py +0 -62
- src/display/custom_css.py +2 -0
- src/envs.py +1 -0
- src/submission/structs.py +1 -1
- src/submission/submit.py +38 -3
app.py
CHANGED
@@ -1,4 +1,5 @@
 import json
+import sys
 
 import datasets
 import gradio as gr
@@ -6,17 +7,20 @@ from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
 from loguru import logger
 
-import
-
+from envs import LOG_LEVEL
+
+# Set the log level to INFO
+logger.remove()
+logger.add(sys.stdout, level=LOG_LEVEL, diagnose=False)
+
 from app_configs import DEFAULT_SELECTIONS, THEME
-from components.
+from components.hf_pipelines import create_hf_pipeline_submission_interface
 from components.quizbowl.bonus import BonusInterface
 from components.quizbowl.tossup import TossupInterface
 from components.typed_dicts import PipelineInterfaceDefaults, TossupInterfaceDefaults
 from display.css_html_js import fonts_header, js_head, leaderboard_css
 from display.custom_css import css_bonus, css_pipeline, css_tossup
 from display.guide import BUILDING_MARKDOWN, QUICKSTART_MARKDOWN
-from display.utils import AutoEvalColumn, fields
 
 # Constants
 from envs import (
@@ -40,9 +44,11 @@ from envs import (
 from hf_datasets_utils import download_dataset_snapshot
 from shared.workflows import factory
 from shared.workflows.configs import AVAILABLE_MODELS
+from shared.workflows.llms import llm_cache
 
 
 def restart_space():
+    llm_cache.sync_to_hf()
     API.restart_space(repo_id=REPO_ID)
 
 
@@ -112,7 +118,7 @@ if __name__ == "__main__":
         with gr.Row():
             with gr.Column(scale=5):
                 gr.Markdown(
-                    f"## 🤖 Welcome to QANTA 2025 Quizbowl Arena!
+                    f"## 🤖 Welcome to QANTA 2025 Quizbowl Arena!     👉 🏆 [Leaderboard]({LEADERBOARD_URL}) 👈"
                     "\n### 🎲 Create, play around, and submit your quizbowl agents."
                     f"<br>📋 [Register]({REGISTRATION_URL}) to participate in our [QANTA 2025 Human-AI Quizbowl Competition]({COMPETITION_URL}).",
                     elem_classes="welcome-text",
@@ -140,10 +146,8 @@ if __name__ == "__main__":
             **DEFAULT_SELECTIONS["bonus"], init_workflow=factory.create_simple_qb_bonus_workflow()
         )
         bonus_interface = BonusInterface(demo, browser_state, bonus_ds, AVAILABLE_MODELS, defaults)
-        # with gr.Tab("
-        #
-        #     gr.Markdown(LEADERBOARD_INTRODUCTION_TEXT)
-        #     create_leaderboard_interface(demo)
+        # with gr.Tab("🤗 HuggingFace Pipelines", elem_id="hf-pipeline-tab", id="hf-pipeline-tab"):
+        #     hf_pipeline_interface = create_hf_pipeline_submission_interface(demo)
         with gr.Tab("❓ Help", id="help"):
             with gr.Row():
                 with gr.Column():
@@ -153,6 +157,8 @@ if __name__ == "__main__":
 
     # Event Listeners
 
+    # This is used to retrieve the pipeline state user was working on before login.
+    # makes things less annoying when progress is lost due to login.
    login_btn.click(
        fn=presave_pipeline_state,
        inputs=[
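For reference, the logging change in app.py amounts to the following minimal, self-contained sketch (standard loguru API only; LOG_LEVEL defaults to "INFO" via the new setting added to src/envs.py in this commit):

```python
import os
import sys

from loguru import logger

# Mirrors the new envs.LOG_LEVEL setting: overridable via the LOG_LEVEL env var.
LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")

# Drop loguru's default handler and re-add stdout at the configured level.
# diagnose=False keeps local variable values out of tracebacks, which is safer
# for a publicly hosted Space.
logger.remove()
logger.add(sys.stdout, level=LOG_LEVEL, diagnose=False)

logger.debug("hidden unless LOG_LEVEL=DEBUG")
logger.info("shown at the default INFO level")
```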
shared/workflows
CHANGED
@@ -1 +1 @@
-Subproject commit
+Subproject commit e5b9e225ca82372ee86f6d340b1523d4574bed3d
src/components/model_pipeline/model_pipeline.py
CHANGED
@@ -13,7 +13,6 @@ from components.model_step.model_step import ModelStepComponent
 from components.structs import ModelStepUIState, PipelineState, PipelineUIState
 from components.utils import make_state
 from shared.workflows.structs import ModelStep, Workflow
-from shared.workflows.validators import WorkflowValidationError, WorkflowValidator
 
 from .state_manager import get_output_panel_state
 
@@ -117,7 +116,6 @@ class PipelineInterface:
             return step_interface
 
         is_multi_step = n_steps > 1
-        # logger.debug(f"Rendering step {position} of {n_steps}")
 
         # Add step controls below
         with gr.Row(elem_classes="step-controls", visible=is_multi_step):
src/components/model_pipeline/state_manager.py
CHANGED
@@ -215,7 +215,6 @@ class PipelineStateManager:
         """Update a workflow from a YAML string."""
         try:
             workflow = self.parse_yaml_workflow(yaml_str, strict=True)
-            logger.debug(f"Validator: {self.validator}")
             self.validator and self.validator(workflow)
             state = self.pipeline_state_cls.from_workflow(workflow)
             return state.model_dump(), not change_state, gr.update(visible=False)
src/components/quizbowl/bonus.py
CHANGED
@@ -15,6 +15,7 @@ from display.formatting import styled_error
 from shared.workflows import factory
 from shared.workflows.metrics import evaluate_prediction
 from shared.workflows.qb_agents import QuizBowlBonusAgent
+from shared.workflows.runners import run_and_eval_bonus_dataset, run_and_evaluate_bonus
 from submission import submit
 
 from . import populate, validation
@@ -28,10 +29,10 @@ def process_bonus_results(results: list[dict]) -> pd.DataFrame:
     return pd.DataFrame(
         [
             {
-                "Part": f"Part {r['
-                "Correct?": "✅" if r["
+                "Part": f"Part {r['number']}",
+                "Correct?": "✅" if r["correct"] == 1 else "❌",
                 "Confidence": r["confidence"],
-                "Prediction": r["
+                "Prediction": r["guess"],
                 "Explanation": r["explanation"],
             }
             for r in results
@@ -39,20 +40,20 @@ def process_bonus_results(results: list[dict]) -> pd.DataFrame:
     )
 
 
-def initialize_eval_interface(example: dict,
+def initialize_eval_interface(example: dict, part_outputs: list[dict], input_vars: list[str]):
     """Initialize the interface with example text."""
     try:
         html_content = create_bonus_html(example["leadin"], example["parts"])
 
         # Create confidence plot data
-        plot_data = create_bonus_confidence_plot(example["parts"],
+        plot_data = create_bonus_confidence_plot(example["parts"], part_outputs)
 
         # Store state
-        state = {"parts": example["parts"], "outputs":
+        state = {"parts": example["parts"], "outputs": part_outputs}
 
         # Preparing step outputs for the model
         step_outputs = {}
-        for i, output in enumerate(
+        for i, output in enumerate(part_outputs):
             key = f"part {i + 1}"
             step_outputs[key] = {k: v for k, v in output["step_outputs"].items() if k not in input_vars}
             if output["logprob"] is not None:
@@ -60,8 +61,9 @@ def initialize_eval_interface(example: dict, model_outputs: list[dict], input_va
 
         return html_content, plot_data, state, step_outputs
     except Exception as e:
-
-
+        error_msg = f"Error initializing interface: {str(e)}"
+        logger.exception(error_msg)
+        return styled_error(error_msg), pd.DataFrame(), {}, {}
 
 
 class BonusInterface:
@@ -79,19 +81,23 @@ class BonusInterface:
         self.render()
 
     # ------------------------------------- LOAD PIPELINE STATE FROM BROWSER STATE -------------------------------------
+    def load_default_workflow(self):
+        workflow = self.defaults["init_workflow"]
+        pipeline_state_dict = PipelineState.from_workflow(workflow).model_dump()
+        return pipeline_state_dict, {}
 
     def load_presaved_pipeline_state(self, browser_state: dict, pipeline_change: bool):
-        logger.debug(f"Loading presaved pipeline state from browser state:\n{json.dumps(browser_state, indent=4)}")
         try:
             state_dict = browser_state["bonus"].get("pipeline_state", {})
-
-
-
+            if state_dict:
+                pipeline_state = PipelineState.model_validate(state_dict)
+                pipeline_state_dict = pipeline_state.model_dump()
+                output_state = browser_state["bonus"].get("output_state", {})
+            else:
+                pipeline_state_dict, output_state = self.load_default_workflow()
         except Exception as e:
             logger.warning(f"Error loading presaved pipeline state: {e}")
-            output_state =
-            workflow = self.defaults["init_workflow"]
-            pipeline_state_dict = PipelineState.from_workflow(workflow).model_dump()
+            pipeline_state_dict, output_state = self.load_default_workflow()
         return browser_state, not pipeline_change, pipeline_state_dict, output_state
 
     # ------------------------------------------ INTERFACE RENDER FUNCTIONS -------------------------------------------
@@ -101,6 +107,7 @@ class BonusInterface:
         self.pipeline_selector = commons.get_pipeline_selector([])
         self.load_btn = gr.Button("⬇️ Import Pipeline", variant="secondary")
         self.import_error_display = gr.HTML(label="Import Error", elem_id="import-error-display", visible=False)
+        logger.info(f"Rendering {self.__class__.__name__} with pipeline state: {pipeline_state}")
         self.pipeline_interface = PipelineInterface(
             self.app,
             pipeline_state.workflow,
@@ -135,7 +142,7 @@ class BonusInterface:
     def render(self):
         """Create the Gradio interface."""
         self.hidden_input = gr.Textbox(value="", visible=False, elem_id="hidden-index")
-        workflow = factory.
+        workflow = factory.create_empty_bonus_workflow()
         pipeline_state = PipelineState.from_workflow(workflow)
 
         with gr.Row():
@@ -195,25 +202,7 @@ class BonusInterface:
             error_msg = styled_error(f"Error loading pipeline: {str(e)}")
             return UNSELECTED_PIPELINE_NAME, gr.skip(), gr.skip(), gr.update(visible=True, value=error_msg)
 
-
-
-    def get_agent_outputs(self, example: dict, pipeline_state: PipelineState):
-        """Get the model outputs for a given question ID."""
-        outputs = []
-        leadin = example["leadin"]
-        agent = QuizBowlBonusAgent(pipeline_state.workflow)
-
-        for i, part in enumerate(example["parts"]):
-            # Run model for each part
-            part_output = agent.run(leadin, part["part"])
-
-            # Add part number and evaluate score
-            part_output["part_number"] = i + 1
-            part_output["score"] = evaluate_prediction(part_output["answer"], part["clean_answers"])
-
-            outputs.append(part_output)
-
-        return outputs
+    # ------------------------------------- Agent Functions -----------------------------------------------------------
 
     def single_run(
         self,
@@ -237,13 +226,14 @@ class BonusInterface:
                 raise gr.Error("Invalid question ID or dataset not loaded")
 
            example = self.ds[question_id]
-
-
+            agent = QuizBowlBonusAgent(pipeline_state.workflow)
+            model_output = run_and_evaluate_bonus(agent, example, return_extras=True)
+            part_outputs = model_output["part_outputs"]
            # Process results and prepare visualization data
            html_content, plot_data, output_state, step_outputs = initialize_eval_interface(
-                example,
+                example, part_outputs, pipeline_state.workflow.inputs
            )
-            df = process_bonus_results(
+            df = process_bonus_results(part_outputs)
 
            return (
                html_content,
@@ -254,7 +244,7 @@ class BonusInterface:
            )
        except Exception as e:
            error_msg = styled_error(create_error_message(e))
-            logger.exception(f"Error running
+            logger.exception(f"Error running bonus: {e}")
            return (
                gr.skip(),
                gr.skip(),
@@ -271,27 +261,26 @@ class BonusInterface:
            if not self.ds or not self.ds.num_rows:
                return "No dataset loaded", None, None
 
-
+            agent = QuizBowlBonusAgent(pipeline_state.workflow)
+            model_outputs = run_and_eval_bonus_dataset(
+                agent, self.ds, num_workers=2, return_extras=True, tqdm_provider=progress.tqdm
+            )
+            n_parts_correct = 0
            total_parts = 0
-
-
-
-
-
-
-
-
-
-                    total_correct += 1
-                part_scores.append(output["score"])
-                part_numbers.append(output["part_number"])
-
-            accuracy = total_correct / total_parts
+            n_questions_correct = 0
+            for model_output in model_outputs:
+                part_outputs = model_output["part_outputs"]
+                n_parts_correct += sum(output["correct"] for output in part_outputs)
+                total_parts += len(part_outputs)
+                n_questions_correct += int(n_parts_correct == len(part_outputs))
+
+            p_accuracy = n_parts_correct / total_parts
+            q_accuracy = n_questions_correct / len(self.ds)
            df = pd.DataFrame(
                [
                    {
-                        "
-                        "
+                        "Question Accuracy": f"{q_accuracy:.2%}",
+                        "Part Accuracy": f"{p_accuracy:.2%}",
                        "Questions Evaluated": len(self.ds),
                    }
                ]
@@ -305,7 +294,7 @@ class BonusInterface:
            )
        except Exception as e:
            error_msg = styled_error(create_error_message(e))
-            logger.exception(f"Error evaluating
+            logger.exception(f"Error evaluating bonus: {e}")
            return gr.skip(), gr.skip(), gr.update(visible=True, value=error_msg)
 
    def submit_model(
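The bonus interface now delegates both single-question runs and dataset-wide evaluation to the shared runners instead of the removed get_agent_outputs helper. A rough usage sketch, with call shapes copied from this diff (the runners' exact signatures live in the shared/workflows submodule and are otherwise assumed):

```python
import gradio as gr

from shared.workflows.qb_agents import QuizBowlBonusAgent
from shared.workflows.runners import run_and_eval_bonus_dataset, run_and_evaluate_bonus


def evaluate_bonus(pipeline_state, example, dataset, progress: gr.Progress):
    agent = QuizBowlBonusAgent(pipeline_state.workflow)

    # Single question: each entry in "part_outputs" carries "correct", "confidence",
    # "guess", and "explanation", which process_bonus_results() turns into a table.
    model_output = run_and_evaluate_bonus(agent, example, return_extras=True)
    part_outputs = model_output["part_outputs"]

    # Whole dataset: parallel runs with a progress-bar provider, as in evaluate().
    model_outputs = run_and_eval_bonus_dataset(
        agent, dataset, num_workers=2, return_extras=True, tqdm_provider=progress.tqdm
    )
    return part_outputs, model_outputs
```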
src/components/quizbowl/plotting.py
CHANGED
@@ -37,14 +37,14 @@ def _create_token_tooltip_html(values) -> str:
         return ""
     confidence = values.get("confidence", 0)
     buzz = values.get("buzz", 0)
-
-
-
-    if len(
-        k = len(
-
+    correct = values.get("correct", 0)
+    guess = values.get("guess", "")
+    guess_tokens = guess.split()
+    if len(guess_tokens) > 10:
+        k = len(guess_tokens) - 10
+        guess = " ".join(guess_tokens[:10]) + f"...[{k} more words]"
 
-    color = "#a3c9a3" if
+    color = "#a3c9a3" if correct else "#ebbec4"  # Light green for correct, light pink for incorrect
 
     if values.get("logprob", None) is not None:
         prob = np.exp(values["logprob"])
@@ -56,10 +56,10 @@ def _create_token_tooltip_html(values) -> str:
     <div class="tooltip card" style="background-color: {color}; border-radius: 8px; padding: 12px; box-shadow: 2px 4px 8px rgba(0, 0, 0, 0.15);">
         <div class="tooltip-content" style="font-family: 'Arial', sans-serif; color: #000;">
             <h4 style="margin: 0 0 8px; color: #000;">💡 Answer</h4>
-            <p><code style="font-weight: bold; margin: 0 0 8px; color: #000;">{
+            <p><code style="font-weight: bold; margin: 0 0 8px; color: #000;">{guess}</code></p>
             <p style="margin: 0 0 4px; color: #000;">📈 <b style="color: #000;">Confidence:</b> {confidence:.2f}</p>
             {prob_str}
-            <p style="margin: 0; color: #000;">🔍 <b style="color: #000;">Status:</b> {"✅ Correct" if
+            <p style="margin: 0; color: #000;">🔍 <b style="color: #000;">Status:</b> {"✅ Correct" if correct else "❌ Incorrect" if buzz else "🚫 No Buzz"}</p>
         </div>
     </div>
     """
@@ -68,14 +68,14 @@ def _create_token_tooltip_html(values) -> str:
 def create_token_html(token: str, values: dict, i: int) -> str:
     confidence = values.get("confidence", None)
     buzz = values.get("buzz", 0)
-
+    correct = values.get("correct", 0)
 
     # Replace non-word characters for proper display in HTML
     display_token = f"{token} 🚨" if buzz else f"{token} 💭" if values else token
     if not re.match(r"\w+", token):
         display_token = token.replace(" ", " ")
 
-    css_class = _get_token_classes(confidence, buzz,
+    css_class = _get_token_classes(confidence, buzz, correct)
     # Add tooltip if we have values for this token
     tooltip_html = _create_token_tooltip_html(values)
 
@@ -98,8 +98,8 @@ def create_tossup_html(
     marker_indices = set(marker_indices)
 
     html_tokens = []
-    for i, token in enumerate(tokens):
-        token_html = create_token_html(token, ep.get(i, {}), i
+    for i, token in enumerate(tokens, start=1):
+        token_html = create_token_html(token, ep.get(i, {}), i)
         html_tokens.append(token_html)
 
     answer_html = _make_answer_html(answer_primary, clean_answers)
@@ -156,7 +156,7 @@ def create_bonus_html(leadin: str, parts: list[dict]) -> str:
 
 def create_tossup_confidence_pyplot(
     tokens: list[str],
-
+    run_outputs: list[dict],
     confidence_threshold: float = 0.5,
     prob_threshold: float | None = None,
 ) -> plt.Figure:
@@ -164,25 +164,26 @@ def create_tossup_confidence_pyplot(
     plt.style.use("ggplot")  # Set theme to grid paper
     fig = plt.figure(figsize=(10, 4), dpi=300)  # Set figure size to 11x5
     ax = fig.add_subplot(111)
-    x = [0] + [
-    y_conf = [0] + [
-
-    y_prob = [0] + [np.exp(v) for v in
+    x = [0] + [o["token_position"] for o in run_outputs]
+    y_conf = [0] + [o["confidence"] for o in run_outputs]
+    logprobs = [o["logprob"] for o in run_outputs if o["logprob"] is not None]
+    y_prob = [0] + [np.exp(v) for v in logprobs]
 
     ax.plot(x, y_prob, "o-", color="#f2b150", label="Probability")
     ax.plot(x, y_conf, "o-", color="#4996de", label="Confidence")
-    for
-        if not
+    for o in run_outputs:
+        if not o["buzz"]:
             continue
-        color = "green" if
-        conf =
-
-
-
-
-
-
-
+        color = "green" if o["correct"] else "red"
+        conf = o["confidence"]
+        i = o["token_position"]
+        ax.plot(i, conf, "o", color=color, markerfacecolor="none", markersize=12, markeredgewidth=2.5)
+        if o["logprob"] is not None:
+            prob = np.exp(o["logprob"])
+            ax.plot(i, prob, "o", color=color, markerfacecolor="none", markersize=12, markeredgewidth=2.5)
+        if i > len(tokens):
+            print(f"1-indexed token index {i} is out of bounds for n_tokens: {len(tokens)}")
+        ax.annotate(f"{tokens[i - 1]}", (i, conf), textcoords="offset points", xytext=(0, 10), ha="center")
 
     # Add horizontal dashed line for confidence threshold
     ax.axhline(y=confidence_threshold, color="#9370DB", linestyle="--", xmin=0, xmax=1, label="Confidence Threshold")
@@ -228,7 +229,7 @@ def create_bonus_confidence_plot(parts: list[dict], model_outputs: list[dict]) -
     # Plot confidence for each part
     x = range(1, len(parts) + 1)
     confidences = [output["confidence"] for output in model_outputs]
-    scores = [output["
+    scores = [output["correct"] for output in model_outputs]
 
     # Plot confidence bars
     bars = ax.bar(x, confidences, color="#4698cf")
@@ -287,13 +288,16 @@ def create_tossup_eval_table(df: pd.DataFrame) -> pd.DataFrame:
     pos_gaps = gaps.loc[gaps >= 0]
     neg_gaps = gaps.loc[gaps < 0]
 
-    mean_tossup_score = df["
+    mean_tossup_score = df["raw_score"].sum() / len(df)
+    expected_score = df["expected_score"].sum() / len(df)
+    buzz_precision = df["is_correct"].sum() / df["buzz"].sum()
 
     return pd.DataFrame(
         [
             {
-                "
-                "
+                "Raw Score": f"{mean_tossup_score:5.1f}",
+                "Expected Score": f"{expected_score:5.1f}",
+                "Buzz Precision": f"{buzz_precision:5.1%}",
                 "Buzz Position": f"{np.mean(positions):5.1f}",
                 "+ve Gap": f"{pos_gaps.mean():5.1f}",
                 "-ve Gap": f"{neg_gaps.mean():5.1f}",
@@ -493,16 +497,16 @@ def create_dummy_model_outputs(n_entries=10, n_positions=5):
 
         outputs.append(
             {
-                "
+                "run_idx": i + 1,
                 "buzz": will_buzz,
-                "
+                "correct": 1 if is_correct else 0,
                 "confidence": np.random.random(),
                 "logprob": np.log(np.random.random()),
-                "
+                "guess": f"Answer {i + 1}",
             }
         )
 
-    dummy_outputs.append({"run_indices": run_indices, "
+    dummy_outputs.append({"run_indices": run_indices, "run_outputs": outputs})
 
     return dummy_outputs
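The plotting helpers now read "correct", "guess", "run_idx", and "token_position" instead of the old "score"/"answer" fields. A single run output consumed by create_tossup_confidence_pyplot and the token tooltip looks roughly like this (field names taken from the diff, values purely illustrative):

```python
import numpy as np

# Illustrative run output matching the fields the updated plotting code expects.
run_output = {
    "run_idx": 3,                    # 1-indexed run number
    "token_position": 42,            # 1-indexed token where this run ends
    "buzz": True,                    # whether the agent buzzed on this run
    "correct": 1,                    # 1 if the guess matched a clean answer, else 0
    "confidence": 0.87,
    "logprob": float(np.log(0.81)),  # may be None if the model exposes no logprob
    "guess": "Isaac Newton",
}
```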
src/components/quizbowl/tossup.py
CHANGED
@@ -16,6 +16,7 @@ from shared.workflows import factory
 from shared.workflows.metrics import evaluate_prediction
 from shared.workflows.metrics.qb_metrics import prepare_tossup_results_df
 from shared.workflows.qb_agents import QuizBowlTossupAgent, TossupResult
+from shared.workflows.runners import run_and_eval_tossup_dataset, run_and_evaluate_tossup
 from submission import submit
 
 from . import populate, validation
@@ -28,9 +29,6 @@ from .plotting import (
 from .utils import create_error_message
 from .validation import UserInputWorkflowValidator
 
-# TODO: Error handling on run tossup and evaluate tossup and show correct messages
-# TODO: ^^ Same for Bonus
-
 
 class ScoredTossupResult(TossupResult):
     """Result of a tossup question with evaluation score and position."""
@@ -44,8 +42,8 @@ def add_model_scores(
 ) -> list[ScoredTossupResult]:
     """Add model scores to the model outputs."""
     for output in run_outputs:
-        output["
-        output["token_position"] = run_indices[output["
+        output["correct"] = evaluate_prediction(output["guess"], clean_answers)
+        output["token_position"] = run_indices[output["run_idx"] - 1]
     return run_outputs
 
 
@@ -58,7 +56,7 @@ def prepare_buzz_evals(
         return [], []
     eval_points = []
     for o in model_outputs:
-        token_position = run_indices[o["
+        token_position = run_indices[o["run_idx"] - 1]
         eval_points.append((token_position, o))
 
     return eval_points
@@ -80,9 +78,11 @@ def initialize_eval_interface(
         eval_points = [(o["token_position"], o) for o in run_outputs]
 
         if not tokens:
-
+            error_msg = "No tokens found in the provided text."
+            logger.exception(error_msg)
+            return styled_error(error_msg), pd.DataFrame(), {}, {}
         html_content = create_tossup_html(tokens, answer, clean_answers, run_indices, eval_points)
-        plot_data = create_tossup_confidence_pyplot(tokens,
+        plot_data = create_tossup_confidence_pyplot(tokens, run_outputs, confidence_threshold, prob_threshold)
 
         # Store tokens, values, and buzzes as JSON for later use
         state = {"tokens": tokens, "values": eval_points}
@@ -91,15 +91,16 @@ def initialize_eval_interface(
         step_outputs = {}
         for output in run_outputs:
             tok_pos = output["token_position"]
-            key = "{pos}:{token}".format(pos=tok_pos
+            key = "{pos}:{token}".format(pos=tok_pos, token=tokens[tok_pos - 1])
             step_outputs[key] = {k: v for k, v in output["step_outputs"].items() if k not in input_vars}
             if output["logprob"] is not None:
                 step_outputs[key]["output_probability"] = float(np.exp(output["logprob"]))
 
         return html_content, plot_data, state, step_outputs
     except Exception as e:
-
-
+        error_msg = f"Error initializing interface: {str(e)}"
+        logger.exception(error_msg)
+        return styled_error(error_msg), pd.DataFrame(), {}, {}
 
 
 def process_tossup_results(results: list[dict]) -> pd.DataFrame:
@@ -108,12 +109,12 @@ def process_tossup_results(results: list[dict]) -> pd.DataFrame:
     for r in results:
         entry = {
             "Token Position": r["token_position"],
-            "Correct?": "✅" if r["
+            "Correct?": "✅" if r["correct"] == 1 else "❌",
             "Confidence": r["confidence"],
         }
         if r["logprob"] is not None:
             entry["Probability"] = f"{np.exp(r['logprob']):.3f}"
-        entry["Prediction"] = r["
+        entry["Prediction"] = r["guess"]
         data.append(entry)
     return pd.DataFrame(data)
 
@@ -141,18 +142,23 @@ class TossupInterface:
 
     # ------------------------------------- LOAD PIPELINE STATE FROM BROWSER STATE ------------------------------------
 
+    def load_default_workflow(self):
+        workflow = self.defaults["init_workflow"]
+        pipeline_state_dict = TossupPipelineState.from_workflow(workflow).model_dump()
+        return pipeline_state_dict, {}
+
     def load_presaved_pipeline_state(self, browser_state: dict, pipeline_change: bool):
-        logger.debug(f"Loading presaved pipeline state from browser state:\n{json.dumps(browser_state, indent=4)}")
         try:
             state_dict = browser_state["tossup"].get("pipeline_state", {})
-
-
-
+            if state_dict:
+                pipeline_state = TossupPipelineState.model_validate(state_dict)
+                pipeline_state_dict = pipeline_state.model_dump()
+                output_state = browser_state["tossup"].get("output_state", {})
+            else:
+                pipeline_state_dict, output_state = self.load_default_workflow()
         except Exception as e:
             logger.warning(f"Error loading presaved pipeline state: {e}")
-            output_state =
-            workflow = self.defaults["init_workflow"]
-            pipeline_state_dict = TossupPipelineState.from_workflow(workflow).model_dump()
+            pipeline_state_dict, output_state = self.load_default_workflow()
         return browser_state, not pipeline_change, pipeline_state_dict, output_state
 
     # ------------------------------------------ INTERFACE RENDER FUNCTIONS -------------------------------------------
@@ -256,18 +262,6 @@ class TossupInterface:
             return UNSELECTED_PIPELINE_NAME, gr.skip(), gr.skip(), gr.update(visible=True, value=error_msg)
 
     # ------------------------------------- Agent Functions -----------------------------------------------------------
-    def get_agent_outputs(
-        self, example: dict, pipeline_state: TossupPipelineState, early_stop: bool
-    ) -> list[ScoredTossupResult]:
-        """Get the model outputs for a given question ID."""
-        question_runs = []
-        tokens = example["question"].split()
-        for run_idx in example["run_indices"]:
-            question_runs.append(" ".join(tokens[: run_idx + 1]))
-        agent = QuizBowlTossupAgent(pipeline_state.workflow)
-        outputs = list(agent.run(question_runs, early_stop=early_stop))
-        outputs = add_model_scores(outputs, example["clean_answers"], example["run_indices"])
-        return outputs
 
     def single_run(
         self,
@@ -295,15 +289,20 @@ class TossupInterface:
             if not self.ds or question_id < 0 or question_id >= len(self.ds):
                 raise gr.Error("Invalid question ID or dataset not loaded")
             example = self.ds[question_id]
-            outputs =
-
+            outputs = run_and_evaluate_tossup(
+                QuizBowlTossupAgent(pipeline_state.workflow),
+                example,
+                return_extras=True,
+                early_stop=early_stop,
+            )
+            run_outputs = outputs["run_outputs"]
             # Process results and prepare visualization data
             confidence_threshold = workflow.buzzer.confidence_threshold
             prob_threshold = workflow.buzzer.prob_threshold
             tokens_html, plot_data, output_state, step_outputs = initialize_eval_interface(
-                example,
+                example, run_outputs, workflow.inputs, confidence_threshold, prob_threshold
             )
-            df = process_tossup_results(
+            df = process_tossup_results(run_outputs)
 
             return (
                 tokens_html,
@@ -332,10 +331,10 @@ class TossupInterface:
             if not self.ds or not self.ds.num_rows:
                 return "No dataset loaded", None, None
             pipeline_state = validation.validate_tossup_workflow(state_dict)
-
-
-
-
+            agent = QuizBowlTossupAgent(pipeline_state.workflow)
+            model_outputs = run_and_eval_tossup_dataset(
+                agent, self.ds, return_extras=True, tqdm_provider=progress.tqdm, num_workers=2
+            )
             eval_df = prepare_tossup_results_df(model_outputs, self.ds["run_indices"])
             plot_data = create_tossup_eval_dashboard(self.ds["run_indices"], eval_df)
             output_df = create_tossup_eval_table(eval_df)
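As with the bonus interface, tossup runs now go through the shared runners. A rough sketch of the new flow, using the call shapes from this diff (runner signatures are defined in the shared/workflows submodule and assumed here):

```python
import gradio as gr

from shared.workflows.qb_agents import QuizBowlTossupAgent
from shared.workflows.runners import run_and_eval_tossup_dataset, run_and_evaluate_tossup


def evaluate_tossup(pipeline_state, example, dataset, progress: gr.Progress):
    agent = QuizBowlTossupAgent(pipeline_state.workflow)

    # Single question: "run_outputs" feeds initialize_eval_interface() and
    # process_tossup_results() (token_position, correct, confidence, guess, logprob).
    outputs = run_and_evaluate_tossup(agent, example, return_extras=True, early_stop=True)
    run_outputs = outputs["run_outputs"]

    # Whole dataset: results go straight into prepare_tossup_results_df().
    model_outputs = run_and_eval_tossup_dataset(
        agent, dataset, return_extras=True, tqdm_provider=progress.tqdm, num_workers=2
    )
    return run_outputs, model_outputs
```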
src/components/quizbowl/utils.py
CHANGED
@@ -1,7 +1,3 @@
-from typing import Any, Dict, List
-
-import pandas as pd
-
 from shared.workflows.errors import ProviderAPIError, WorkflowExecutionError
 
 
@@ -15,61 +11,3 @@ def create_error_message(e: Exception) -> str:
         return f"Invalid input -- {e}. Please try again. \n\nIf the problem persists, please contact support."
     else:
         return "An unexpected error occurred. Please contact support."
-
-
-def _create_confidence_plot_data(results: List[Dict], top_k_mode: bool = False) -> pd.DataFrame:
-    """Create a DataFrame for the confidence plot."""
-    if not top_k_mode:
-        return pd.DataFrame(
-            {
-                "position": [r["position"] for r in results],
-                "confidence": [r["confidence"] for r in results],
-                "answer": [r["answer"] for r in results],
-            }
-        )
-
-    # For top-k mode, extract and plot top answers
-    return _create_top_k_plot_data(results)
-
-
-def _create_top_k_plot_data(results: List[Dict]) -> pd.DataFrame:
-    """Create plot data for top-k mode."""
-    # Find top answers across all positions (limited to top 5)
-    top_answers = set()
-    for r in results:
-        for g in r.get("guesses", [])[:3]:  # Get top 3 from each position
-            if g.get("answer"):
-                top_answers.add(g.get("answer"))
-
-    top_answers = list(top_answers)[:5]  # Limit to 5 total answers
-
-    # Create plot data for each answer
-    all_data = []
-    for position_idx, result in enumerate(results):
-        position = result["position"]
-        for answer in top_answers:
-            confidence = 0
-            for guess in result.get("guesses", []):
-                if guess.get("answer") == answer:
-                    confidence = guess.get("confidence", 0)
-                    break
-            all_data.append({"position": position, "confidence": confidence, "answer": answer})
-
-    return pd.DataFrame(all_data)
-
-
-def _create_top_k_dataframe(results: List[Dict]) -> pd.DataFrame:
-    """Create a DataFrame for top-k results."""
-    df_rows = []
-    for result in results:
-        position = result["position"]
-        for i, guess in enumerate(result.get("guesses", [])):
-            df_rows.append(
-                {
-                    "position": position,
-                    "answer": guess.get("answer", ""),
-                    "confidence": guess.get("confidence", 0),
-                    "rank": i + 1,
-                }
-            )
-    return pd.DataFrame(df_rows)
src/display/custom_css.py
CHANGED
@@ -12,6 +12,7 @@ css_pipeline = """
     --input-text-size: var(--text-sm) !important;
     --body-text-size: 14px !important;
     --input-background-fill-focus: var(--secondary-300) !important;
+    --link-text-color: blue !important;
 
     // Button Colors
     --button-primary-background-fill: var(--primary-800) !important;
@@ -38,6 +39,7 @@ css_pipeline = """
     --text-lg: 16px !important;
     --input-text-size: var(--text-sm) !important;
     --body-text-size: 14px !important;
+    --link-text-color: blue !important;
 
     --button-primary-background-fill: var(--neutral-100) !important;
     --button-secondary-background-fill: var(--secondary-300) !important;
src/envs.py
CHANGED
@@ -41,6 +41,7 @@ PLAYGROUND_DATASET_NAMES = {
 
 # If you setup a cache later, just change HF_HOME
 CACHE_PATH = os.getenv("HF_HOME", ".")
+LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")
 
 # Local caches
 LLM_CACHE_PATH = os.path.join(CACHE_PATH, "llm-cache")
src/submission/structs.py
CHANGED
@@ -6,7 +6,7 @@ from pydantic import BaseModel, Field
 from shared.workflows.structs import TossupWorkflow, Workflow
 
 CompetitionType = Literal["tossup", "bonus"]
-SubmissionType = Literal["python_file", "simple_workflow", "complex_workflow"]
+SubmissionType = Literal["python_file", "simple_workflow", "complex_workflow", "hf_pipeline"]
 SubmissionStatus = Literal["submitted", "in_progress", "completed", "failed"]
 
 
src/submission/submit.py
CHANGED
@@ -40,7 +40,7 @@ def get_user_submissions(username: str, competition_type: str, pattern: str = No
 def get_user_submission_names(competition_type: str, profile: gr.OAuthProfile | None) -> list[str]:
     """Get all submission model names for a user."""
     if profile is None:
-        logger.
+        logger.info("No user profile provided. Returning empty list.")
         return []
     submissions = get_user_submissions(profile.username, competition_type)
     return [f"{s.username}/{s.model_name}" for s in submissions]
@@ -88,7 +88,7 @@ def get_time_until_next_submission(tz: timezone = timezone.utc) -> str:
     return remaining_time_str
 
 
-def
+def create_workflow_submission(
     username: str,
     model_name: str,
     description: str,
@@ -125,6 +125,41 @@ def create_submission(
     return submission
 
 
+def create_hf_submission(
+    username: str,
+    model_name: str,
+    description: str,
+    competition_type: CompetitionType,
+) -> Submission:
+    """
+    Create a submission for a tossup model.
+
+    Args:
+        username: Username of the user who created the submission
+        model_name: Name of the model
+        description: Detailed description of what the submission does
+        competition_type: Type of competition
+
+    Returns:
+        Submission object if successful, None if validation fails
+    """
+    # Create the submission
+    dt = datetime.now(timezone.utc)
+    submission = Submission(
+        id=f"{competition_type}__hf__{dt.strftime('%Y%m%d_%H%M%S')}__{username}__{model_name.lower().replace(' ', '_')}",
+        model_name=model_name,
+        username=username,
+        description=description,
+        competition_type=competition_type,
+        submission_type="hf_pipeline",
+        status="submitted",
+        created_at=dt.isoformat(),
+        updated_at=dt.isoformat(),
+    )
+
+    return submission
+
+
 def validate_model_name(model_name: str):
     # check if model_name has no white spaces, no special characters apart from _ and -
     if " " in model_name:
@@ -177,7 +212,7 @@ def submit_model(
         return styled_error(f"Submission Error! Invalid model name '{model_name}'.<br>{error_msg}")
 
     try:
-        submission =
+        submission = create_workflow_submission(
            username=username,
            model_name=model_name,
            description=description,
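A minimal sketch of how the new create_hf_submission helper could be used (argument values are illustrative; the wrapper below is not part of the repo):

```python
from submission.submit import create_hf_submission

# Illustrative values; competition_type must be "tossup" or "bonus".
submission = create_hf_submission(
    username="example-user",
    model_name="my-tossup-pipeline",
    description="HF pipeline submission for the tossup track",
    competition_type="tossup",
)

# submission_type is fixed to "hf_pipeline"; the id encodes the competition type,
# a UTC timestamp, the username, and the lower-cased, underscored model name.
print(submission.submission_type, submission.id)
```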