Maharshi Gor committed
Commit f064c62 · 1 Parent(s): bdbc03c

Updated workflow APIs, code cleanup, and minor functions for HF pipeline support

app.py CHANGED
@@ -1,4 +1,5 @@
  import json
+ import sys

  import datasets
  import gradio as gr
@@ -6,17 +7,20 @@ from apscheduler.schedulers.background import BackgroundScheduler
  from huggingface_hub import snapshot_download
  from loguru import logger

- import populate
- from about import LEADERBOARD_INTRODUCTION_TEXT, LEADERBOARD_TITLE
+ from envs import LOG_LEVEL
+
+ # Set the log level to INFO
+ logger.remove()
+ logger.add(sys.stdout, level=LOG_LEVEL, diagnose=False)
+
  from app_configs import DEFAULT_SELECTIONS, THEME
- from components.leaderboard import create_leaderboard_interface
+ from components.hf_pipelines import create_hf_pipeline_submission_interface
  from components.quizbowl.bonus import BonusInterface
  from components.quizbowl.tossup import TossupInterface
  from components.typed_dicts import PipelineInterfaceDefaults, TossupInterfaceDefaults
  from display.css_html_js import fonts_header, js_head, leaderboard_css
  from display.custom_css import css_bonus, css_pipeline, css_tossup
  from display.guide import BUILDING_MARKDOWN, QUICKSTART_MARKDOWN
- from display.utils import AutoEvalColumn, fields

  # Constants
  from envs import (
@@ -40,9 +44,11 @@ from envs import (
  from hf_datasets_utils import download_dataset_snapshot
  from shared.workflows import factory
  from shared.workflows.configs import AVAILABLE_MODELS
+ from shared.workflows.llms import llm_cache


  def restart_space():
+     llm_cache.sync_to_hf()
      API.restart_space(repo_id=REPO_ID)


@@ -112,7 +118,7 @@ if __name__ == "__main__":
      with gr.Row():
          with gr.Column(scale=5):
              gr.Markdown(
-                 f"## 🤖 Welcome to QANTA 2025 Quizbowl Arena!     🏆 [Leaderboard]({LEADERBOARD_URL}) "
+                 f"## 🤖 Welcome to QANTA 2025 Quizbowl Arena! &emsp;&emsp;&emsp; 👉 🏆 [Leaderboard]({LEADERBOARD_URL}) 👈"
                  "\n### 🎲 Create, play around, and submit your quizbowl agents."
                  f"<br>📋 [Register]({REGISTRATION_URL}) to participate in our [QANTA 2025 Human-AI Quizbowl Competition]({COMPETITION_URL}).",
                  elem_classes="welcome-text",
@@ -140,10 +146,8 @@ if __name__ == "__main__":
          **DEFAULT_SELECTIONS["bonus"], init_workflow=factory.create_simple_qb_bonus_workflow()
      )
      bonus_interface = BonusInterface(demo, browser_state, bonus_ds, AVAILABLE_MODELS, defaults)
-     # with gr.Tab("🏅 Leaderboard", elem_id="llm-benchmark-tab-table", id="leaderboard"):
-     #     gr.Markdown("<a id='leaderboard' href='#leaderboard'>QANTA Leaderboard</a>")
-     #     gr.Markdown(LEADERBOARD_INTRODUCTION_TEXT)
-     #     create_leaderboard_interface(demo)
+     # with gr.Tab("🤗 HuggingFace Pipelines", elem_id="hf-pipeline-tab", id="hf-pipeline-tab"):
+     #     hf_pipeline_interface = create_hf_pipeline_submission_interface(demo)
      with gr.Tab("❓ Help", id="help"):
          with gr.Row():
              with gr.Column():
@@ -153,6 +157,8 @@ if __name__ == "__main__":

      # Event Listeners

+     # This is used to retrieve the pipeline state user was working on before login.
+     # makes things less annoying when progress is lost due to login.
      login_btn.click(
          fn=presave_pipeline_state,
          inputs=[
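
The two functional changes in app.py are the LOG_LEVEL-driven logging setup and the LLM cache sync before a Space restart. A minimal standalone sketch of the same loguru pattern, assuming (as src/envs.py below does) that LOG_LEVEL defaults to "INFO":

    import os
    import sys

    from loguru import logger

    # Mirrors the committed setup: read the level from the environment, default to INFO.
    LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")

    # Replace loguru's default stderr sink with a stdout sink at the configured level.
    # diagnose=False keeps local variable values out of rendered tracebacks.
    logger.remove()
    logger.add(sys.stdout, level=LOG_LEVEL, diagnose=False)

    logger.debug("only emitted when LOG_LEVEL=DEBUG")
    logger.info("emitted at the default INFO level")

The llm_cache.sync_to_hf() call added to restart_space() comes from the shared.workflows submodule bumped below; pushing the local LLM cache to the Hub before a restart is implied by the name, not shown here.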
shared/workflows CHANGED
@@ -1 +1 @@
- Subproject commit 6f9f3742d977eca6a385d6dbc5f28b5b16287cf1
+ Subproject commit e5b9e225ca82372ee86f6d340b1523d4574bed3d
src/components/model_pipeline/model_pipeline.py CHANGED
@@ -13,7 +13,6 @@ from components.model_step.model_step import ModelStepComponent
  from components.structs import ModelStepUIState, PipelineState, PipelineUIState
  from components.utils import make_state
  from shared.workflows.structs import ModelStep, Workflow
- from shared.workflows.validators import WorkflowValidationError, WorkflowValidator

  from .state_manager import get_output_panel_state

@@ -117,7 +116,6 @@ class PipelineInterface:
          return step_interface

      is_multi_step = n_steps > 1
-     # logger.debug(f"Rendering step {position} of {n_steps}")

      # Add step controls below
      with gr.Row(elem_classes="step-controls", visible=is_multi_step):
src/components/model_pipeline/state_manager.py CHANGED
@@ -215,7 +215,6 @@ class PipelineStateManager:
          """Update a workflow from a YAML string."""
          try:
              workflow = self.parse_yaml_workflow(yaml_str, strict=True)
-             logger.debug(f"Validator: {self.validator}")
              self.validator and self.validator(workflow)
              state = self.pipeline_state_cls.from_workflow(workflow)
              return state.model_dump(), not change_state, gr.update(visible=False)
src/components/quizbowl/bonus.py CHANGED
@@ -15,6 +15,7 @@ from display.formatting import styled_error
  from shared.workflows import factory
  from shared.workflows.metrics import evaluate_prediction
  from shared.workflows.qb_agents import QuizBowlBonusAgent
+ from shared.workflows.runners import run_and_eval_bonus_dataset, run_and_evaluate_bonus
  from submission import submit

  from . import populate, validation
@@ -28,10 +29,10 @@ def process_bonus_results(results: list[dict]) -> pd.DataFrame:
      return pd.DataFrame(
          [
              {
-                 "Part": f"Part {r['part_number']}",
-                 "Correct?": "✅" if r["score"] == 1 else "❌",
+                 "Part": f"Part {r['number']}",
+                 "Correct?": "✅" if r["correct"] == 1 else "❌",
                  "Confidence": r["confidence"],
-                 "Prediction": r["answer"],
+                 "Prediction": r["guess"],
                  "Explanation": r["explanation"],
              }
              for r in results
@@ -39,20 +40,20 @@
      )


- def initialize_eval_interface(example: dict, model_outputs: list[dict], input_vars: list[str]):
+ def initialize_eval_interface(example: dict, part_outputs: list[dict], input_vars: list[str]):
      """Initialize the interface with example text."""
      try:
          html_content = create_bonus_html(example["leadin"], example["parts"])

          # Create confidence plot data
-         plot_data = create_bonus_confidence_plot(example["parts"], model_outputs)
+         plot_data = create_bonus_confidence_plot(example["parts"], part_outputs)

          # Store state
-         state = {"parts": example["parts"], "outputs": model_outputs}
+         state = {"parts": example["parts"], "outputs": part_outputs}

          # Preparing step outputs for the model
          step_outputs = {}
-         for i, output in enumerate(model_outputs):
+         for i, output in enumerate(part_outputs):
              key = f"part {i + 1}"
              step_outputs[key] = {k: v for k, v in output["step_outputs"].items() if k not in input_vars}
              if output["logprob"] is not None:
@@ -60,8 +61,9 @@ def initialize_eval_interface(example: dict, model_outputs: list[dict], input_va
                  step_outputs[key]["output_probability"] = float(np.exp(output["logprob"]))

          return html_content, plot_data, state, step_outputs
      except Exception as e:
-         logger.exception(f"Error initializing interface: {e.args}")
-         return f"<div>Error initializing interface: {str(e)}</div>", pd.DataFrame(), {}, {}
+         error_msg = f"Error initializing interface: {str(e)}"
+         logger.exception(error_msg)
+         return styled_error(error_msg), pd.DataFrame(), {}, {}


  class BonusInterface:
@@ -79,19 +81,23 @@ class BonusInterface:
          self.render()

      # ------------------------------------- LOAD PIPELINE STATE FROM BROWSER STATE -------------------------------------
+     def load_default_workflow(self):
+         workflow = self.defaults["init_workflow"]
+         pipeline_state_dict = PipelineState.from_workflow(workflow).model_dump()
+         return pipeline_state_dict, {}

      def load_presaved_pipeline_state(self, browser_state: dict, pipeline_change: bool):
-         logger.debug(f"Loading presaved pipeline state from browser state:\n{json.dumps(browser_state, indent=4)}")
          try:
              state_dict = browser_state["bonus"].get("pipeline_state", {})
-             pipeline_state = PipelineState.model_validate(state_dict)
-             pipeline_state_dict = pipeline_state.model_dump()
-             output_state = browser_state["bonus"].get("output_state", {})
+             if state_dict:
+                 pipeline_state = PipelineState.model_validate(state_dict)
+                 pipeline_state_dict = pipeline_state.model_dump()
+                 output_state = browser_state["bonus"].get("output_state", {})
+             else:
+                 pipeline_state_dict, output_state = self.load_default_workflow()
          except Exception as e:
              logger.warning(f"Error loading presaved pipeline state: {e}")
-             output_state = {}
-             workflow = self.defaults["init_workflow"]
-             pipeline_state_dict = PipelineState.from_workflow(workflow).model_dump()
+             pipeline_state_dict, output_state = self.load_default_workflow()
          return browser_state, not pipeline_change, pipeline_state_dict, output_state

      # ------------------------------------------ INTERFACE RENDER FUNCTIONS -------------------------------------------
@@ -101,6 +107,7 @@ class BonusInterface:
          self.pipeline_selector = commons.get_pipeline_selector([])
          self.load_btn = gr.Button("⬇️ Import Pipeline", variant="secondary")
          self.import_error_display = gr.HTML(label="Import Error", elem_id="import-error-display", visible=False)
+         logger.info(f"Rendering {self.__class__.__name__} with pipeline state: {pipeline_state}")
          self.pipeline_interface = PipelineInterface(
              self.app,
              pipeline_state.workflow,
@@ -135,7 +142,7 @@
      def render(self):
          """Create the Gradio interface."""
          self.hidden_input = gr.Textbox(value="", visible=False, elem_id="hidden-index")
-         workflow = factory.create_empty_tossup_workflow()
+         workflow = factory.create_empty_bonus_workflow()
          pipeline_state = PipelineState.from_workflow(workflow)

          with gr.Row():
@@ -195,25 +202,7 @@
              error_msg = styled_error(f"Error loading pipeline: {str(e)}")
              return UNSELECTED_PIPELINE_NAME, gr.skip(), gr.skip(), gr.update(visible=True, value=error_msg)

-     # ------------------------------------- Agent Functions -----------------------------------------------------------
-
-     def get_agent_outputs(self, example: dict, pipeline_state: PipelineState):
-         """Get the model outputs for a given question ID."""
-         outputs = []
-         leadin = example["leadin"]
-         agent = QuizBowlBonusAgent(pipeline_state.workflow)
-
-         for i, part in enumerate(example["parts"]):
-             # Run model for each part
-             part_output = agent.run(leadin, part["part"])
-
-             # Add part number and evaluate score
-             part_output["part_number"] = i + 1
-             part_output["score"] = evaluate_prediction(part_output["answer"], part["clean_answers"])
-
-             outputs.append(part_output)
-
-         return outputs
+     # ------------------------------------- Agent Functions -----------------------------------------------------------

      def single_run(
          self,
@@ -237,13 +226,14 @@
                  raise gr.Error("Invalid question ID or dataset not loaded")

              example = self.ds[question_id]
-             outputs = self.get_agent_outputs(example, pipeline_state)
-
+             agent = QuizBowlBonusAgent(pipeline_state.workflow)
+             model_output = run_and_evaluate_bonus(agent, example, return_extras=True)
+             part_outputs = model_output["part_outputs"]
              # Process results and prepare visualization data
              html_content, plot_data, output_state, step_outputs = initialize_eval_interface(
-                 example, outputs, pipeline_state.workflow.inputs
+                 example, part_outputs, pipeline_state.workflow.inputs
              )
-             df = process_bonus_results(outputs)
+             df = process_bonus_results(part_outputs)

              return (
                  html_content,
@@ -254,7 +244,7 @@
              )
          except Exception as e:
              error_msg = styled_error(create_error_message(e))
-             logger.exception(f"Error running tossup: {e}")
+             logger.exception(f"Error running bonus: {e}")
              return (
                  gr.skip(),
                  gr.skip(),
@@ -271,27 +261,26 @@
              if not self.ds or not self.ds.num_rows:
                  return "No dataset loaded", None, None

-             total_correct = 0
+             agent = QuizBowlBonusAgent(pipeline_state.workflow)
+             model_outputs = run_and_eval_bonus_dataset(
+                 agent, self.ds, num_workers=2, return_extras=True, tqdm_provider=progress.tqdm
+             )
+             n_parts_correct = 0
              total_parts = 0
-             part_scores = []
-             part_numbers = []
-
-             for example in progress.tqdm(self.ds, desc="Evaluating bonus questions"):
-                 model_outputs = self.get_agent_outputs(example, pipeline_state)
-
-                 for output in model_outputs:
-                     total_parts += 1
-                     if output["score"] == 1:
-                         total_correct += 1
-                     part_scores.append(output["score"])
-                     part_numbers.append(output["part_number"])
-
-             accuracy = total_correct / total_parts
+             n_questions_correct = 0
+             for model_output in model_outputs:
+                 part_outputs = model_output["part_outputs"]
+                 n_parts_correct += sum(output["correct"] for output in part_outputs)
+                 total_parts += len(part_outputs)
+                 n_questions_correct += int(n_parts_correct == len(part_outputs))
+
+             p_accuracy = n_parts_correct / total_parts
+             q_accuracy = n_questions_correct / len(self.ds)
              df = pd.DataFrame(
                  [
                      {
-                         "Part Accuracy": f"{accuracy:.2%}",
-                         "Total Score": f"{total_correct}/{total_parts}",
+                         "Question Accuracy": f"{q_accuracy:.2%}",
+                         "Part Accuracy": f"{p_accuracy:.2%}",
                          "Questions Evaluated": len(self.ds),
                      }
                  ]
@@ -305,7 +294,7 @@
              )
          except Exception as e:
              error_msg = styled_error(create_error_message(e))
-             logger.exception(f"Error evaluating tossups: {e}")
+             logger.exception(f"Error evaluating bonus: {e}")
              return gr.skip(), gr.skip(), gr.update(visible=True, value=error_msg)

      def submit_model(
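
For reference, a small self-contained sketch of the part-level and question-level accuracy arithmetic the bonus dataset evaluation now reports, assuming each dataset-level output is a dict with a part_outputs list whose entries carry a 0/1 correct flag (the shape used above). The sketch tallies each question's correct parts in a local variable before comparing against its number of parts, which is the per-question reading of "Question Accuracy"; note that the committed loop compares the cumulative n_parts_correct counter instead.

    def bonus_accuracy(model_outputs: list[dict]) -> tuple[float, float]:
        """Return (part_accuracy, question_accuracy) for evaluated bonus outputs."""
        n_parts_correct = 0
        total_parts = 0
        n_questions_correct = 0
        for model_output in model_outputs:
            part_outputs = model_output["part_outputs"]
            question_correct = sum(o["correct"] for o in part_outputs)
            n_parts_correct += question_correct
            total_parts += len(part_outputs)
            # A question counts as fully correct only when every part is correct.
            n_questions_correct += int(question_correct == len(part_outputs))
        return n_parts_correct / total_parts, n_questions_correct / len(model_outputs)


    # Two 3-part bonuses: one fully correct, one with a single miss.
    outputs = [
        {"part_outputs": [{"correct": 1}, {"correct": 1}, {"correct": 1}]},
        {"part_outputs": [{"correct": 1}, {"correct": 0}, {"correct": 1}]},
    ]
    print(bonus_accuracy(outputs))  # (0.833..., 0.5)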
src/components/quizbowl/plotting.py CHANGED
@@ -37,14 +37,14 @@ def _create_token_tooltip_html(values) -> str:
          return ""
      confidence = values.get("confidence", 0)
      buzz = values.get("buzz", 0)
-     score = values.get("score", 0)
-     answer = values.get("answer", "")
-     answer_tokens = answer.split()
-     if len(answer_tokens) > 10:
-         k = len(answer_tokens) - 10
-         answer = " ".join(answer_tokens[:10]) + f"...[{k} more words]"
+     correct = values.get("correct", 0)
+     guess = values.get("guess", "")
+     guess_tokens = guess.split()
+     if len(guess_tokens) > 10:
+         k = len(guess_tokens) - 10
+         guess = " ".join(guess_tokens[:10]) + f"...[{k} more words]"

-     color = "#a3c9a3" if score else "#ebbec4"  # Light green for correct, light pink for incorrect
+     color = "#a3c9a3" if correct else "#ebbec4"  # Light green for correct, light pink for incorrect

      if values.get("logprob", None) is not None:
          prob = np.exp(values["logprob"])
@@ -56,10 +56,10 @@
      <div class="tooltip card" style="background-color: {color}; border-radius: 8px; padding: 12px; box-shadow: 2px 4px 8px rgba(0, 0, 0, 0.15);">
          <div class="tooltip-content" style="font-family: 'Arial', sans-serif; color: #000;">
              <h4 style="margin: 0 0 8px; color: #000;">💡 Answer</h4>
-             <p><code style="font-weight: bold; margin: 0 0 8px; color: #000;">{answer}</code></p>
+             <p><code style="font-weight: bold; margin: 0 0 8px; color: #000;">{guess}</code></p>
              <p style="margin: 0 0 4px; color: #000;">📈 <b style="color: #000;">Confidence:</b> {confidence:.2f}</p>
              {prob_str}
-             <p style="margin: 0; color: #000;">🔍 <b style="color: #000;">Status:</b> {"✅ Correct" if score else "❌ Incorrect" if buzz else "🚫 No Buzz"}</p>
+             <p style="margin: 0; color: #000;">🔍 <b style="color: #000;">Status:</b> {"✅ Correct" if correct else "❌ Incorrect" if buzz else "🚫 No Buzz"}</p>
          </div>
      </div>
      """
@@ -68,14 +68,14 @@
  def create_token_html(token: str, values: dict, i: int) -> str:
      confidence = values.get("confidence", None)
      buzz = values.get("buzz", 0)
-     score = values.get("score", 0)
+     correct = values.get("correct", 0)

      # Replace non-word characters for proper display in HTML
      display_token = f"{token} 🚨" if buzz else f"{token} 💭" if values else token
      if not re.match(r"\w+", token):
          display_token = token.replace(" ", "&nbsp;")

-     css_class = _get_token_classes(confidence, buzz, score)
+     css_class = _get_token_classes(confidence, buzz, correct)
      # Add tooltip if we have values for this token
      tooltip_html = _create_token_tooltip_html(values)

@@ -98,8 +98,8 @@ def create_tossup_html(
      marker_indices = set(marker_indices)

      html_tokens = []
-     for i, token in enumerate(tokens):
-         token_html = create_token_html(token, ep.get(i, {}), i + 1)
+     for i, token in enumerate(tokens, start=1):
+         token_html = create_token_html(token, ep.get(i, {}), i)
          html_tokens.append(token_html)

      answer_html = _make_answer_html(answer_primary, clean_answers)
@@ -156,7 +156,7 @@ def create_bonus_html(leadin: str, parts: list[dict]) -> str:

  def create_tossup_confidence_pyplot(
      tokens: list[str],
-     eval_points: list[tuple[int, dict]],
+     run_outputs: list[dict],
      confidence_threshold: float = 0.5,
      prob_threshold: float | None = None,
  ) -> plt.Figure:
@@ -164,25 +164,26 @@
      plt.style.use("ggplot")  # Set theme to grid paper
      fig = plt.figure(figsize=(10, 4), dpi=300)  # Set figure size to 11x5
      ax = fig.add_subplot(111)
-     x = [0] + [int(i + 1) for i, _ in eval_points]
-     y_conf = [0] + [v["confidence"] for _, v in eval_points]
-     logprob_values = [v["logprob"] for _, v in eval_points if v["logprob"] is not None]
-     y_prob = [0] + [np.exp(v) for v in logprob_values]
+     x = [0] + [o["token_position"] for o in run_outputs]
+     y_conf = [0] + [o["confidence"] for o in run_outputs]
+     logprobs = [o["logprob"] for o in run_outputs if o["logprob"] is not None]
+     y_prob = [0] + [np.exp(v) for v in logprobs]

      ax.plot(x, y_prob, "o-", color="#f2b150", label="Probability")
      ax.plot(x, y_conf, "o-", color="#4996de", label="Confidence")
-     for i, v in eval_points:
-         if not v["buzz"]:
+     for o in run_outputs:
+         if not o["buzz"]:
              continue
-         color = "green" if v["score"] else "red"
-         conf = v["confidence"]
-         ax.plot(i + 1, conf, "o", color=color, markerfacecolor="none", markersize=12, markeredgewidth=2.5)
-         if v["logprob"] is not None:
-             prob = np.exp(v["logprob"])
-             ax.plot(i + 1, prob, "o", color=color, markerfacecolor="none", markersize=12, markeredgewidth=2.5)
-         if i >= len(tokens):
-             print(f"Token index {i} is out of bounds for n_tokens: {len(tokens)}")
-         ax.annotate(f"{tokens[i]}", (i + 1, conf), textcoords="offset points", xytext=(0, 10), ha="center")
+         color = "green" if o["correct"] else "red"
+         conf = o["confidence"]
+         i = o["token_position"]
+         ax.plot(i, conf, "o", color=color, markerfacecolor="none", markersize=12, markeredgewidth=2.5)
+         if o["logprob"] is not None:
+             prob = np.exp(o["logprob"])
+             ax.plot(i, prob, "o", color=color, markerfacecolor="none", markersize=12, markeredgewidth=2.5)
+         if i > len(tokens):
+             print(f"1-indexed token index {i} is out of bounds for n_tokens: {len(tokens)}")
+         ax.annotate(f"{tokens[i - 1]}", (i, conf), textcoords="offset points", xytext=(0, 10), ha="center")

      # Add horizontal dashed line for confidence threshold
      ax.axhline(y=confidence_threshold, color="#9370DB", linestyle="--", xmin=0, xmax=1, label="Confidence Threshold")
@@ -228,7 +229,7 @@ def create_bonus_confidence_plot(parts: list[dict], model_outputs: list[dict]) -
      # Plot confidence for each part
      x = range(1, len(parts) + 1)
      confidences = [output["confidence"] for output in model_outputs]
-     scores = [output["score"] for output in model_outputs]
+     scores = [output["correct"] for output in model_outputs]

      # Plot confidence bars
      bars = ax.bar(x, confidences, color="#4698cf")
@@ -287,13 +288,16 @@ def create_tossup_eval_table(df: pd.DataFrame) -> pd.DataFrame:
      pos_gaps = gaps.loc[gaps >= 0]
      neg_gaps = gaps.loc[gaps < 0]

-     mean_tossup_score = df["tossup_score"].sum() / len(df)
+     mean_tossup_score = df["raw_score"].sum() / len(df)
+     expected_score = df["expected_score"].sum() / len(df)
+     buzz_precision = df["is_correct"].sum() / df["buzz"].sum()

      return pd.DataFrame(
          [
              {
-                 "Tossup Score (10)": f"{mean_tossup_score:5.1f}",
-                 "Buzz Accuracy": f"{df['is_correct'].mean():5.1%}",
+                 "Raw Score": f"{mean_tossup_score:5.1f}",
+                 "Expected Score": f"{expected_score:5.1f}",
+                 "Buzz Precision": f"{buzz_precision:5.1%}",
                  "Buzz Position": f"{np.mean(positions):5.1f}",
                  "+ve Gap": f"{pos_gaps.mean():5.1f}",
                  "-ve Gap": f"{neg_gaps.mean():5.1f}",
@@ -493,16 +497,16 @@ def create_dummy_model_outputs(n_entries=10, n_positions=5):

          outputs.append(
              {
-                 "position": i + 1,
+                 "run_idx": i + 1,
                  "buzz": will_buzz,
-                 "score": 1 if is_correct else 0,
+                 "correct": 1 if is_correct else 0,
                  "confidence": np.random.random(),
                  "logprob": np.log(np.random.random()),
-                 "answer": f"Answer {i + 1}",
+                 "guess": f"Answer {i + 1}",
              }
          )

-     dummy_outputs.append({"run_indices": run_indices, "outputs": outputs})
+     dummy_outputs.append({"run_indices": run_indices, "run_outputs": outputs})

      return dummy_outputs
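
The plotting changes above move from 0-indexed (position, values) tuples to run outputs that carry a 1-indexed token_position. A small sketch of that convention with hypothetical tokens and outputs; only the field names visible in the diff are used:

    tokens = "This is a sample tossup question".split()

    # Hypothetical run outputs in the new shape: token_position is 1-indexed.
    run_outputs = [
        {"token_position": 3, "confidence": 0.42, "buzz": False, "correct": 0, "logprob": None},
        {"token_position": 6, "confidence": 0.91, "buzz": True, "correct": 1, "logprob": None},
    ]

    # Tokens are enumerated from 1 so lookups line up with token_position,
    # and the underlying token is recovered with tokens[i - 1].
    ep = {o["token_position"]: o for o in run_outputs}
    for i, token in enumerate(tokens, start=1):
        values = ep.get(i, {})
        marker = " <- buzz" if values.get("buzz") else ""
        print(f"{i}:{token}{marker}")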
 
src/components/quizbowl/tossup.py CHANGED
@@ -16,6 +16,7 @@ from shared.workflows import factory
  from shared.workflows.metrics import evaluate_prediction
  from shared.workflows.metrics.qb_metrics import prepare_tossup_results_df
  from shared.workflows.qb_agents import QuizBowlTossupAgent, TossupResult
+ from shared.workflows.runners import run_and_eval_tossup_dataset, run_and_evaluate_tossup
  from submission import submit

  from . import populate, validation
@@ -28,9 +29,6 @@ from .plotting import (
  from .utils import create_error_message
  from .validation import UserInputWorkflowValidator

- # TODO: Error handling on run tossup and evaluate tossup and show correct messages
- # TODO: ^^ Same for Bonus
-

  class ScoredTossupResult(TossupResult):
      """Result of a tossup question with evaluation score and position."""
@@ -44,8 +42,8 @@ def add_model_scores(
  ) -> list[ScoredTossupResult]:
      """Add model scores to the model outputs."""
      for output in run_outputs:
-         output["score"] = evaluate_prediction(output["answer"], clean_answers)
-         output["token_position"] = run_indices[output["position"] - 1]
+         output["correct"] = evaluate_prediction(output["guess"], clean_answers)
+         output["token_position"] = run_indices[output["run_idx"] - 1]
      return run_outputs


@@ -58,7 +56,7 @@ def prepare_buzz_evals(
          return [], []
      eval_points = []
      for o in model_outputs:
-         token_position = run_indices[o["position"] - 1]
+         token_position = run_indices[o["run_idx"] - 1]
          eval_points.append((token_position, o))

      return eval_points
@@ -80,9 +78,11 @@ def initialize_eval_interface(
          eval_points = [(o["token_position"], o) for o in run_outputs]

          if not tokens:
-             return "<div>No tokens found in the provided text.</div>", pd.DataFrame(), "{}"
+             error_msg = "No tokens found in the provided text."
+             logger.exception(error_msg)
+             return styled_error(error_msg), pd.DataFrame(), {}, {}
          html_content = create_tossup_html(tokens, answer, clean_answers, run_indices, eval_points)
-         plot_data = create_tossup_confidence_pyplot(tokens, eval_points, confidence_threshold, prob_threshold)
+         plot_data = create_tossup_confidence_pyplot(tokens, run_outputs, confidence_threshold, prob_threshold)

          # Store tokens, values, and buzzes as JSON for later use
          state = {"tokens": tokens, "values": eval_points}
@@ -91,15 +91,16 @@
          step_outputs = {}
          for output in run_outputs:
              tok_pos = output["token_position"]
-             key = "{pos}:{token}".format(pos=tok_pos + 1, token=tokens[tok_pos])
+             key = "{pos}:{token}".format(pos=tok_pos, token=tokens[tok_pos - 1])
              step_outputs[key] = {k: v for k, v in output["step_outputs"].items() if k not in input_vars}
              if output["logprob"] is not None:
                  step_outputs[key]["output_probability"] = float(np.exp(output["logprob"]))

          return html_content, plot_data, state, step_outputs
      except Exception as e:
-         logger.exception(f"Error initializing interface: {e.args}")
-         return f"<div>Error initializing interface: {str(e)}</div>", pd.DataFrame(), "{}", {}
+         error_msg = f"Error initializing interface: {str(e)}"
+         logger.exception(error_msg)
+         return styled_error(error_msg), pd.DataFrame(), {}, {}


  def process_tossup_results(results: list[dict]) -> pd.DataFrame:
@@ -108,12 +109,12 @@ def process_tossup_results(results: list[dict]) -> pd.DataFrame:
      for r in results:
          entry = {
              "Token Position": r["token_position"],
-             "Correct?": "✅" if r["score"] == 1 else "❌",
+             "Correct?": "✅" if r["correct"] == 1 else "❌",
              "Confidence": r["confidence"],
          }
          if r["logprob"] is not None:
              entry["Probability"] = f"{np.exp(r['logprob']):.3f}"
-         entry["Prediction"] = r["answer"]
+         entry["Prediction"] = r["guess"]
          data.append(entry)
      return pd.DataFrame(data)

@@ -141,18 +142,23 @@ class TossupInterface:

      # ------------------------------------- LOAD PIPELINE STATE FROM BROWSER STATE ------------------------------------

+     def load_default_workflow(self):
+         workflow = self.defaults["init_workflow"]
+         pipeline_state_dict = TossupPipelineState.from_workflow(workflow).model_dump()
+         return pipeline_state_dict, {}
+
      def load_presaved_pipeline_state(self, browser_state: dict, pipeline_change: bool):
-         logger.debug(f"Loading presaved pipeline state from browser state:\n{json.dumps(browser_state, indent=4)}")
          try:
              state_dict = browser_state["tossup"].get("pipeline_state", {})
-             pipeline_state = TossupPipelineState.model_validate(state_dict)
-             pipeline_state_dict = pipeline_state.model_dump()
-             output_state = browser_state["tossup"].get("output_state", {})
+             if state_dict:
+                 pipeline_state = TossupPipelineState.model_validate(state_dict)
+                 pipeline_state_dict = pipeline_state.model_dump()
+                 output_state = browser_state["tossup"].get("output_state", {})
+             else:
+                 pipeline_state_dict, output_state = self.load_default_workflow()
          except Exception as e:
              logger.warning(f"Error loading presaved pipeline state: {e}")
-             output_state = {}
-             workflow = self.defaults["init_workflow"]
-             pipeline_state_dict = TossupPipelineState.from_workflow(workflow).model_dump()
+             pipeline_state_dict, output_state = self.load_default_workflow()
          return browser_state, not pipeline_change, pipeline_state_dict, output_state

      # ------------------------------------------ INTERFACE RENDER FUNCTIONS -------------------------------------------
@@ -256,18 +262,6 @@
              return UNSELECTED_PIPELINE_NAME, gr.skip(), gr.skip(), gr.update(visible=True, value=error_msg)

      # ------------------------------------- Agent Functions -----------------------------------------------------------
-     def get_agent_outputs(
-         self, example: dict, pipeline_state: TossupPipelineState, early_stop: bool
-     ) -> list[ScoredTossupResult]:
-         """Get the model outputs for a given question ID."""
-         question_runs = []
-         tokens = example["question"].split()
-         for run_idx in example["run_indices"]:
-             question_runs.append(" ".join(tokens[: run_idx + 1]))
-         agent = QuizBowlTossupAgent(pipeline_state.workflow)
-         outputs = list(agent.run(question_runs, early_stop=early_stop))
-         outputs = add_model_scores(outputs, example["clean_answers"], example["run_indices"])
-         return outputs

      def single_run(
          self,
@@ -295,15 +289,20 @@
          if not self.ds or question_id < 0 or question_id >= len(self.ds):
              raise gr.Error("Invalid question ID or dataset not loaded")
          example = self.ds[question_id]
-         outputs = self.get_agent_outputs(example, pipeline_state, early_stop)
-
+         outputs = run_and_evaluate_tossup(
+             QuizBowlTossupAgent(pipeline_state.workflow),
+             example,
+             return_extras=True,
+             early_stop=early_stop,
+         )
+         run_outputs = outputs["run_outputs"]
          # Process results and prepare visualization data
          confidence_threshold = workflow.buzzer.confidence_threshold
          prob_threshold = workflow.buzzer.prob_threshold
          tokens_html, plot_data, output_state, step_outputs = initialize_eval_interface(
-             example, outputs, workflow.inputs, confidence_threshold, prob_threshold
+             example, run_outputs, workflow.inputs, confidence_threshold, prob_threshold
          )
-         df = process_tossup_results(outputs)
+         df = process_tossup_results(run_outputs)

          return (
              tokens_html,
@@ -332,10 +331,10 @@
          if not self.ds or not self.ds.num_rows:
              return "No dataset loaded", None, None
          pipeline_state = validation.validate_tossup_workflow(state_dict)
-         model_outputs = []
-         for example in progress.tqdm(self.ds, desc="Evaluating tossup questions"):
-             run_outputs = self.get_agent_outputs(example, pipeline_state, early_stop=True)
-             model_outputs.append(run_outputs)
+         agent = QuizBowlTossupAgent(pipeline_state.workflow)
+         model_outputs = run_and_eval_tossup_dataset(
+             agent, self.ds, return_extras=True, tqdm_provider=progress.tqdm, num_workers=2
+         )
          eval_df = prepare_tossup_results_df(model_outputs, self.ds["run_indices"])
          plot_data = create_tossup_eval_dashboard(self.ds["run_indices"], eval_df)
          output_df = create_tossup_eval_table(eval_df)
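
With get_agent_outputs removed, both the single-question and dataset paths go through the shared runners. A hedged sketch of the single-question path, using only the call shape visible in the diff; the return schema beyond "run_outputs" and the evaluated fields shown above is assumed:

    from shared.workflows.qb_agents import QuizBowlTossupAgent
    from shared.workflows.runners import run_and_evaluate_tossup


    def preview_single_run(workflow, example: dict) -> None:
        """Run one tossup example through the shared runner and print per-run evals.

        `workflow` is the TossupWorkflow held in the pipeline state and `example` is a
        row of the playground dataset, as in TossupInterface.single_run above.
        """
        agent = QuizBowlTossupAgent(workflow)
        result = run_and_evaluate_tossup(agent, example, return_extras=True, early_stop=True)
        # Field names follow the renamed output schema used throughout this commit.
        for run in result["run_outputs"]:
            print(run["token_position"], run["confidence"], run["buzz"], run["correct"], run["guess"])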
src/components/quizbowl/utils.py CHANGED
@@ -1,7 +1,3 @@
- from typing import Any, Dict, List
-
- import pandas as pd
-
  from shared.workflows.errors import ProviderAPIError, WorkflowExecutionError


@@ -15,61 +11,3 @@ def create_error_message(e: Exception) -> str:
          return f"Invalid input -- {e}. Please try again. \n\nIf the problem persists, please contact support."
      else:
          return "An unexpected error occurred. Please contact support."
-
-
- def _create_confidence_plot_data(results: List[Dict], top_k_mode: bool = False) -> pd.DataFrame:
-     """Create a DataFrame for the confidence plot."""
-     if not top_k_mode:
-         return pd.DataFrame(
-             {
-                 "position": [r["position"] for r in results],
-                 "confidence": [r["confidence"] for r in results],
-                 "answer": [r["answer"] for r in results],
-             }
-         )
-
-     # For top-k mode, extract and plot top answers
-     return _create_top_k_plot_data(results)
-
-
- def _create_top_k_plot_data(results: List[Dict]) -> pd.DataFrame:
-     """Create plot data for top-k mode."""
-     # Find top answers across all positions (limited to top 5)
-     top_answers = set()
-     for r in results:
-         for g in r.get("guesses", [])[:3]:  # Get top 3 from each position
-             if g.get("answer"):
-                 top_answers.add(g.get("answer"))
-
-     top_answers = list(top_answers)[:5]  # Limit to 5 total answers
-
-     # Create plot data for each answer
-     all_data = []
-     for position_idx, result in enumerate(results):
-         position = result["position"]
-         for answer in top_answers:
-             confidence = 0
-             for guess in result.get("guesses", []):
-                 if guess.get("answer") == answer:
-                     confidence = guess.get("confidence", 0)
-                     break
-             all_data.append({"position": position, "confidence": confidence, "answer": answer})
-
-     return pd.DataFrame(all_data)
-
-
- def _create_top_k_dataframe(results: List[Dict]) -> pd.DataFrame:
-     """Create a DataFrame for top-k results."""
-     df_rows = []
-     for result in results:
-         position = result["position"]
-         for i, guess in enumerate(result.get("guesses", [])):
-             df_rows.append(
-                 {
-                     "position": position,
-                     "answer": guess.get("answer", ""),
-                     "confidence": guess.get("confidence", 0),
-                     "rank": i + 1,
-                 }
-             )
-     return pd.DataFrame(df_rows)
src/display/custom_css.py CHANGED
@@ -12,6 +12,7 @@ css_pipeline = """
      --input-text-size: var(--text-sm) !important;
      --body-text-size: 14px !important;
      --input-background-fill-focus: var(--secondary-300) !important;
+     --link-text-color: blue !important;

      // Button Colors
      --button-primary-background-fill: var(--primary-800) !important;
@@ -38,6 +39,7 @@ css_pipeline = """
      --text-lg: 16px !important;
      --input-text-size: var(--text-sm) !important;
      --body-text-size: 14px !important;
+     --link-text-color: blue !important;

      --button-primary-background-fill: var(--neutral-100) !important;
      --button-secondary-background-fill: var(--secondary-300) !important;
src/envs.py CHANGED
@@ -41,6 +41,7 @@ PLAYGROUND_DATASET_NAMES = {

  # If you setup a cache later, just change HF_HOME
  CACHE_PATH = os.getenv("HF_HOME", ".")
+ LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")

  # Local caches
  LLM_CACHE_PATH = os.path.join(CACHE_PATH, "llm-cache")
src/submission/structs.py CHANGED
@@ -6,7 +6,7 @@ from pydantic import BaseModel, Field
  from shared.workflows.structs import TossupWorkflow, Workflow

  CompetitionType = Literal["tossup", "bonus"]
- SubmissionType = Literal["python_file", "simple_workflow", "complex_workflow"]
+ SubmissionType = Literal["python_file", "simple_workflow", "complex_workflow", "hf_pipeline"]
  SubmissionStatus = Literal["submitted", "in_progress", "completed", "failed"]

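
Because SubmissionType is a typing.Literal, pydantic models that use it will now accept "hf_pipeline" and keep rejecting unknown values at validation time. A tiny illustrative sketch with a standalone model (not the project's Submission class):

    from typing import Literal

    from pydantic import BaseModel, ValidationError

    SubmissionType = Literal["python_file", "simple_workflow", "complex_workflow", "hf_pipeline"]


    class ExampleSubmission(BaseModel):
        submission_type: SubmissionType


    print(ExampleSubmission(submission_type="hf_pipeline"))  # accepted after this commit

    try:
        ExampleSubmission(submission_type="notebook")
    except ValidationError:
        print("rejected: not a known submission type")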
 
src/submission/submit.py CHANGED
@@ -40,7 +40,7 @@ def get_user_submissions(username: str, competition_type: str, pattern: str = No
  def get_user_submission_names(competition_type: str, profile: gr.OAuthProfile | None) -> list[str]:
      """Get all submission model names for a user."""
      if profile is None:
-         logger.warning("No user profile provided. Returning empty list.")
+         logger.info("No user profile provided. Returning empty list.")
          return []
      submissions = get_user_submissions(profile.username, competition_type)
      return [f"{s.username}/{s.model_name}" for s in submissions]
@@ -88,7 +88,7 @@ def get_time_until_next_submission(tz: timezone = timezone.utc) -> str:
      return remaining_time_str


- def create_submission(
+ def create_workflow_submission(
      username: str,
      model_name: str,
      description: str,
@@ -125,6 +125,41 @@ def create_submission(
      return submission


+ def create_hf_submission(
+     username: str,
+     model_name: str,
+     description: str,
+     competition_type: CompetitionType,
+ ) -> Submission:
+     """
+     Create a submission for a tossup model.
+
+     Args:
+         username: Username of the user who created the submission
+         model_name: Name of the model
+         description: Detailed description of what the submission does
+         competition_type: Type of competition
+
+     Returns:
+         Submission object if successful, None if validation fails
+     """
+     # Create the submission
+     dt = datetime.now(timezone.utc)
+     submission = Submission(
+         id=f"{competition_type}__hf__{dt.strftime('%Y%m%d_%H%M%S')}__{username}__{model_name.lower().replace(' ', '_')}",
+         model_name=model_name,
+         username=username,
+         description=description,
+         competition_type=competition_type,
+         submission_type="hf_pipeline",
+         status="submitted",
+         created_at=dt.isoformat(),
+         updated_at=dt.isoformat(),
+     )
+
+     return submission
+
+
  def validate_model_name(model_name: str):
      # check if model_name has no white spaces, no special characters apart from _ and -
      if " " in model_name:
@@ -177,7 +212,7 @@ def submit_model(
          return styled_error(f"Submission Error! Invalid model name '{model_name}'.<br>{error_msg}")

      try:
-         submission = create_submission(
+         submission = create_workflow_submission(
              username=username,
              model_name=model_name,
              description=description,
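
Finally, a hedged usage sketch of the new create_hf_submission helper. The signature, the "hf_pipeline" submission type, and the ID format come from the diff above; the import path is assumed from the repository layout (src/submission/submit.py), and the argument values are hypothetical:

    from submission.submit import create_hf_submission  # assumed project-local import path

    submission = create_hf_submission(
        username="quizbowl-user",
        model_name="My HF Pipeline",
        description="Tossup agent wrapping a Hugging Face text-generation pipeline.",
        competition_type="tossup",
    )

    # IDs follow "<competition_type>__hf__<UTC timestamp>__<username>__<model name, lowercased>",
    # e.g. "tossup__hf__20250101_120000__quizbowl-user__my_hf_pipeline".
    print(submission.id, submission.submission_type)  # submission_type == "hf_pipeline"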