Maharshi Gor committed on
Commit 5d637a7 · 1 Parent(s): d43ec9f

Changed OWNER name at places,
leaderboard for tossups and bonus,
submission id now contains username

app.py CHANGED
@@ -55,10 +55,16 @@ def download_dataset_snapshot(repo_id, local_dir):
 download_dataset_snapshot(QUEUE_REPO, EVAL_REQUESTS_PATH)
 
 
-def fetch_leaderboard_df():
-    logger.info("Leaderboard fetched...")
+def fetch_tossup_leaderboard():
+    logger.info("Tossup leaderboard fetched...")
     download_dataset_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH)
-    return populate.get_leaderboard_df(EVAL_RESULTS_PATH)
+    return populate.get_tossups_leaderboard_df(EVAL_RESULTS_PATH, "tiny_eval")
+
+
+def fetch_bonus_leaderboard():
+    logger.info("Bonus leaderboard fetched...")
+    download_dataset_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH)
+    return populate.get_bonuses_leaderboard_df(EVAL_RESULTS_PATH, "tiny_eval")
 
 
 def load_dataset(mode: str):

@@ -150,16 +156,29 @@ if __name__ == "__main__":
             gr.Markdown("<a id='leaderboard' href='#leaderboard'>QANTA Leaderboard</a>")
             gr.Markdown(LEADERBOARD_INTRODUCTION_TEXT)
             refresh_btn = gr.Button("🔄 Refresh")
-            leaderboard_table = gr.Dataframe(
-                value=fetch_leaderboard_df,
+
+            gr.Markdown("## 📚 Tossup Round Leaderboard")
+            tossup_leaderboard = gr.Dataframe(
+                value=fetch_tossup_leaderboard,
                 every=leaderboard_timer,
                 headers=[c.name for c in fields(AutoEvalColumn)],
                 datatype=[c.type for c in fields(AutoEvalColumn)],
-                elem_id="leaderboard-table",
+                elem_id="tossup-table",
                 interactive=False,
                 visible=True,
             )
-            refresh_btn.click(fn=fetch_leaderboard_df, inputs=[], outputs=leaderboard_table)
+
+            gr.Markdown("## 📚 Bonus Round Leaderboard")
+            bonus_leaderboard = gr.Dataframe(
+                value=fetch_bonus_leaderboard,
+                every=leaderboard_timer,
+                headers=[c.name for c in fields(AutoEvalColumn)],
+                datatype=[c.type for c in fields(AutoEvalColumn)],
+                elem_id="bonus-table",
+            )
+
+            refresh_btn.click(fn=fetch_tossup_leaderboard, inputs=[], outputs=tossup_leaderboard)
+            refresh_btn.click(fn=fetch_bonus_leaderboard, inputs=[], outputs=bonus_leaderboard)
         with gr.Tab("❓ Help", id="help"):
             with gr.Row():
                 with gr.Column():
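
The two fetch functions can also be exercised outside the Gradio app when inspecting the results dataset locally. A minimal sketch, assuming the results snapshot has already been pulled into EVAL_RESULTS_PATH and that the src/ modules are importable (the import paths are an assumption):

    # Sketch only: build the leaderboard DataFrames directly from a local results snapshot.
    import populate
    from envs import EVAL_RESULTS_PATH

    tossup_df = populate.get_tossups_leaderboard_df(EVAL_RESULTS_PATH, "tiny_eval")
    bonus_df = populate.get_bonuses_leaderboard_df(EVAL_RESULTS_PATH, "tiny_eval")
    print(tossup_df.head())
    print(bonus_df.head())
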
src/components/quizbowl/populate.py CHANGED
@@ -6,6 +6,7 @@ from loguru import logger
 from app_configs import UNSELECTED_PIPELINE_NAME
 from components.structs import TossupWorkflow, Workflow
 from display.formatting import styled_error
+from src.envs import OWNER
 from submission import submit
 
 
@@ -30,7 +31,7 @@ def load_workflow(
     if not model_name or model_name == UNSELECTED_PIPELINE_NAME:
         return None
     username, model_name = model_name.split("/")
-    if username == "umdclip":
+    if username == OWNER:
         workflow = submit.load_demo_example(model_name, competition_type)
     elif profile is not None:
         submission = submit.load_submission(model_name, competition_type, profile)
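
For orientation, the username == OWNER branch routes any pipeline whose prefix matches the configured owner to the bundled demo examples rather than a user submission. A tiny illustration (the pipeline name is hypothetical):

    OWNER = "qanta-challenge"                      # value from src/envs.py
    model_name = "qanta-challenge/demo-pipeline"   # hypothetical dropdown entry
    username, name = model_name.split("/")
    print(username == OWNER)  # True -> loaded via submit.load_demo_example(...)
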
src/components/quizbowl/tossup.py CHANGED
@@ -35,10 +35,12 @@ class ScoredTossupResult(TossupResult):
     """Result of a tossup question with evaluation score and position."""
 
     score: int  # Correctness score of the answer
-    token_position: int  # Position in the question where prediction was made
+    token_position: int  # 0-indexed position in the question where prediction was made
 
 
-def add_model_scores(run_outputs: list[dict], clean_answers: list[str], run_indices: list[int]) -> list[dict]:
+def add_model_scores(
+    run_outputs: list[TossupResult], clean_answers: list[str], run_indices: list[int]
+) -> list[ScoredTossupResult]:
     """Add model scores to the model outputs."""
     for output in run_outputs:
         output["score"] = evaluate_prediction(output["answer"], clean_answers)
src/envs.py CHANGED
@@ -14,9 +14,10 @@ OWNER = "qanta-challenge"
 
 REPO_ID = f"{OWNER}/quizbowl-submission"
 QUEUE_REPO = f"{OWNER}/advcal-requests"
-RESULTS_REPO = f"{OWNER}/model-results"  # TODO: change to advcal-results after testing is done
+RESULTS_REPO = f"{OWNER}/advcal-results"
 LLM_CACHE_REPO = f"{OWNER}/advcal-llm-cache"
 USERS_REPO = f"{OWNER}/registered-users"
+EVAL_SPLITS = ["tiny_eval"]
 
 DOCS_REPO_URL = "https://github.com/qanta-challenge/QANTA25"
 DOCS_REPO_BRANCH = "main"
src/populate.py CHANGED
@@ -2,31 +2,79 @@ import json
 import os
 
 import pandas as pd
+from loguru import logger
 
 from display.formatting import make_clickable_model
 from display.utils_old import EvalQueueColumn
 
 
-def get_leaderboard_df(results_path: str) -> pd.DataFrame:
-    model_result_filepaths = []
-    for root, _, files in os.walk(results_path):
+def fetch_model_results(repo_dir: str, competition_type: str, eval_split: str) -> list[dict]:
+    model_results = []
+    dirpath = os.path.join(repo_dir, competition_type, eval_split)
+    for root, _, files in os.walk(dirpath):
         if len(files) == 0 or not all(f.endswith(".json") for f in files):
             continue
         for file in files:
-            model_result_filepaths.append(os.path.join(root, file))
-
-    eval_results = {"model": [], "buzz_accuracy": [], "win_rate_human": [], "win_rate_model": []}
-    for model_result_filepath in model_result_filepaths:
-        with open(model_result_filepath, "r") as fin:
-            model_result = json.load(fin)
-        model_id = model_result["model_id"]
-        buzz_accuracy = model_result["buzz_accuracy"]
-        win_rate_human = model_result["win_rate_human"]
-        win_rate_model = model_result["win_rate_model"]
-        eval_results["model"].append(model_id)
-        eval_results["buzz_accuracy"].append(buzz_accuracy)
-        eval_results["win_rate_human"].append(win_rate_human)
-        eval_results["win_rate_model"].append(win_rate_model)
+            filepath = os.path.join(root, file)
+            try:
+                with open(filepath, "r") as fp:
+                    result = json.load(fp)
+                model_results.append(result)
+            except Exception as e:
+                logger.error(f"Error loading model result from {filepath}: {e}")
+                continue
+
+    return model_results
+
+
+def get_tossups_leaderboard_df(repo_dir: str, eval_split: str) -> pd.DataFrame:
+    model_results = fetch_model_results(repo_dir, "tossup", eval_split)
+
+    eval_results = []
+    for result in model_results:
+        try:
+            metrics = result["metrics"]
+            username = result["username"]
+            model_name = result["model_name"]
+            buzz_accuracy = metrics["buzz_accuracy"]
+
+            row = {
+                "Submission": f"{username}/{model_name}",
+                "Avg Score (Max 10)": metrics["tossup_score"],
+                "Buzzer Accuracy": buzz_accuracy,
+                "Buzzer Position": metrics["buzz_position"],
+            }
+            if "human_win_rate" in metrics:
+                row["Win Rate w/ Human"] = metrics["human_win_rate"]
+                row["Win Rate w/ Human (Aggressive)"] = metrics["human_win_rate_strict"]
+            eval_results.append(row)
+        except Exception as e:
+            logger.error(f"Error processing model result: {e}")
+            continue
+
+    return pd.DataFrame(eval_results)
+
+
+def get_bonuses_leaderboard_df(repo_dir: str, eval_split: str) -> pd.DataFrame:
+    model_results = fetch_model_results(repo_dir, "bonus", eval_split)
+
+    eval_results = []
+    for result in model_results:
+        try:
+            metrics = result["metrics"]
+            username = result["username"]
+            model_name = result["model_name"]
+
+            row = {
+                "Submission": f"{username}/{model_name}",
+                "Question Accuracy": metrics["question_accuracy"],
+                "Part Accuracy": metrics["part_accuracy"],
+            }
+            eval_results.append(row)
+        except Exception as e:
+            logger.error(f"Error processing model result: {e}")
+            continue
+
     return pd.DataFrame(eval_results)
 
 
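
For context, fetch_model_results walks {repo_dir}/{competition_type}/{eval_split}/ and loads every JSON file it finds, so one result file becomes one leaderboard row. An illustrative tossup result file (the field names mirror what get_tossups_leaderboard_df reads; the values are invented):

    example_tossup_result = {
        "username": "alice",              # hypothetical submitter
        "model_name": "two-step-buzzer",  # hypothetical pipeline name
        "metrics": {
            "tossup_score": 6.4,
            "buzz_accuracy": 0.72,
            "buzz_position": 61.3,
            "human_win_rate": 0.55,        # optional pair of keys
            "human_win_rate_strict": 0.41,
        },
    }
    # -> Submission="alice/two-step-buzzer" plus the metric columns above;
    #    the two win-rate columns appear only when "human_win_rate" is present.
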
src/submission/structs.py CHANGED
@@ -10,6 +10,17 @@ SubmissionType = Literal["python_file", "simple_workflow", "complex_workflow"]
 SubmissionStatus = Literal["submitted", "in_progress", "completed", "failed"]
 
 
+class User(BaseModel):
+    """
+    Represents a user in the competition system, formatted for HuggingFace datasets.
+    """
+
+    username: str = Field(description="HuggingFace username of the user")
+    name: str = Field(description="Full name of the user")
+    email: str = Field(description="Contact email of the user")
+    affiliation: str = Field(description="Affiliation of the user")
+
+
 class Submission(BaseModel):
     """
     Represents a submission in the competition system, formatted for HuggingFace datasets.
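
A quick instantiation of the new User model (hypothetical values), mainly to show it is a plain Pydantic record ready to be stored as a dataset row:

    user = User(
        username="alice",
        name="Alice Example",
        email="alice@example.com",
        affiliation="Example University",
    )
    print(user.model_dump())
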
src/submission/submit.py CHANGED
@@ -12,7 +12,7 @@ from loguru import logger
 
 from app_configs import DAILY_SUBMISSION_LIMIT_PER_USER
 from display.formatting import styled_error, styled_message
-from envs import API, EVAL_REQUESTS_PATH, EXAMPLES_PATH, QUEUE_REPO
+from envs import API, EVAL_REQUESTS_PATH, EXAMPLES_PATH, OWNER, QUEUE_REPO
 from submission.structs import CompetitionType, Submission, SubmissionStatus
 from workflows.structs import TossupWorkflow, Workflow
 
@@ -49,7 +49,7 @@ def get_user_submission_names(competition_type: str, profile: gr.OAuthProfile |
 def get_demo_example_submissions(competition_type: str) -> list[str]:
     """Get all submissions for a demo example."""
     examples_dir = f"{EXAMPLES_PATH}/{competition_type}"
-    return [f"umdclip/{os.path.basename(f).removesuffix('.yaml')}" for f in glob.glob(f"{examples_dir}/*.yaml")]
+    return [f"{OWNER}/{os.path.basename(f).removesuffix('.yaml')}" for f in glob.glob(f"{examples_dir}/*.yaml")]
 
 
 def get_user_submissions_by_date(
 
@@ -110,7 +110,7 @@ def create_submission(
     # Create the submission
     dt = datetime.now(timezone.utc)
     submission = Submission(
-        id=f"{competition_type}_{dt.strftime('%Y%m%d_%H%M%S')}_{model_name.lower().replace(' ', '_')}",
+        id=f"{competition_type}__{dt.strftime('%Y%m%d_%H%M%S')}__{username}__{model_name.lower().replace(' ', '_')}",
         model_name=model_name,
         username=username,
         description=description,
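
The reworked id separates its parts with double underscores and now embeds the submitter's username, so ids stay unique across users even when model names collide. An example with hypothetical values:

    from datetime import datetime, timezone

    competition_type, username, model_name = "tossup", "alice", "My First Agent"
    dt = datetime(2025, 4, 1, 12, 30, 0, tzinfo=timezone.utc)  # stand-in timestamp
    submission_id = (
        f"{competition_type}__{dt.strftime('%Y%m%d_%H%M%S')}__{username}"
        f"__{model_name.lower().replace(' ', '_')}"
    )
    print(submission_id)  # tossup__20250401_123000__alice__my_first_agent
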
src/workflows/llmcache.py CHANGED
@@ -7,7 +7,7 @@ import time
 from pathlib import Path
 from typing import Any, Optional
 
-from datasets import Dataset, Features, Value
+from datasets import Dataset, load_dataset, load_from_disk
 from huggingface_hub import snapshot_download
 from loguru import logger
 
@@ -21,6 +21,7 @@ def load_dataset_from_hf(repo_id, local_dir):
         etag_timeout=30,
         token=os.environ["HF_TOKEN"],
     )
+    return load_dataset(repo_id)
 
 
 class CacheDB:
 
@@ -394,43 +395,49 @@ class LLMCache:
 
         try:
             # Check for new commits before loading the dataset
-            dataset = load_dataset_from_hf(self.hf_repo_id, self.cache_dir / "hf_cache")
-            if dataset:
-                existing_keys = self.db.get_existing_keys()
-
-                # Prepare batch items for insertion
-                items_to_insert = []
-                for item in dataset:
-                    key = item["key"]
-                    # Only update if not in local cache to prioritize local changes
-                    if key in existing_keys:
-                        continue
-                    # Create request JSON
-                    request_data = {
-                        "model": item["model"],
-                        "system": item["system"],
-                        "prompt": item["prompt"],
-                        "temperature": item["temperature"],
-                        "response_format": None,  # We can't fully reconstruct this
-                    }
+            ds_path = (self.cache_dir / "hf_cache").as_posix()
+            dataset = load_dataset_from_hf(self.hf_repo_id, ds_path)["train"]
+            if not dataset:
+                logger.info("No new items to merge from HF dataset")
+                return
+
+            existing_keys = self.db.get_existing_keys()
+
+            logger.info(f"Found {len(dataset)} items in HF dataset. Existing keys: {len(existing_keys)}")
+
+            # Prepare batch items for insertion
+            items_to_insert = []
+            for item in dataset:
+                key = item["key"]
+                # Only update if not in local cache to prioritize local changes
+                if key in existing_keys:
+                    continue
+                # Create request JSON
+                request_data = {
+                    "model": item["model"],
+                    "system": item["system"],
+                    "prompt": item["prompt"],
+                    "temperature": item["temperature"],
+                    "response_format": None,  # We can't fully reconstruct this
+                }
 
-                    items_to_insert.append(
-                        (
-                            key,
-                            json.dumps(request_data),
-                            item["response"],  # This is already a JSON string
-                        )
-                    )
-                    logger.info(
-                        f"Inserting item: {key} with temperature: {item['temperature']} and response: {item['response']}"
+                items_to_insert.append(
+                    (
+                        key,
+                        json.dumps(request_data),
+                        item["response"],  # This is already a JSON string
                     )
-
-                # Bulk insert new items
-                if items_to_insert:
-                    inserted_count = self.db.bulk_insert(items_to_insert)
-                    logger.info(f"Merged {inserted_count} items from HF dataset into SQLite cache")
-                else:
-                    logger.info("No new items to merge from HF dataset")
+                )
+                logger.info(
+                    f"Inserting item: {key} with temperature: {item['temperature']} and response: {item['response']}"
+                )
+
+            # Bulk insert new items
+            if items_to_insert:
+                inserted_count = self.db.bulk_insert(items_to_insert)
+                logger.info(f"Merged {inserted_count} items from HF dataset into SQLite cache")
+            else:
+                logger.info("No new items to merge from HF dataset")
         except Exception as e:
            logger.warning(f"Could not load cache from HF dataset: {e}")
 
@@ -449,6 +456,8 @@ class LLMCache:
         if not self.hf_repo_id:
             return
 
+        self._load_cache_from_hf()
+
         # Get all entries from the database
         cache = self.db.get_all_entries()
  cache = self.db.get_all_entries()
463
 
src/workflows/llms.py CHANGED
@@ -13,13 +13,27 @@ from langchain_openai import ChatOpenAI
 from loguru import logger
 from openai import OpenAI
 from pydantic import BaseModel, Field
+from pydantic._internal._core_utils import CoreSchemaOrField, is_core_schema
+from pydantic.json_schema import GenerateJsonSchema
 from rich import print as rprint
 
+# Initialize global cache
+from src.envs import CACHE_PATH, LLM_CACHE_REPO
+
 from .configs import AVAILABLE_MODELS
 from .llmcache import LLMCache
 
-# Initialize global cache
-llm_cache = LLMCache(cache_dir=".", hf_repo="umdclip/advcal-llm-cache")
+llm_cache = LLMCache(cache_dir=CACHE_PATH, hf_repo=LLM_CACHE_REPO)
+
+
+class CohereSchemaGenerator(GenerateJsonSchema):
+    """Generates JSON schema for Cohere models without default titles."""
+
+    def field_title_should_be_set(self, schema: CoreSchemaOrField) -> bool:
+        return_value = super().field_title_should_be_set(schema)
+        if return_value and is_core_schema(schema):
+            return False
+        return return_value
 
 
 def _openai_is_json_mode_supported(model_name: str) -> bool:
 
@@ -52,10 +66,17 @@ def _cohere_completion(
         {"role": "user", "content": prompt},
     ]
     client = cohere.ClientV2(api_key=os.getenv("COHERE_API_KEY"))
+    schema = response_model.model_json_schema(schema_generator=CohereSchemaGenerator)
+    if "title" in schema:
+        del schema["title"]
+    response_format = {
+        "type": "json_object",
+        "schema": schema,
+    }
     response = client.chat(
         model=model,
         messages=messages,
-        response_format={"type": "json_schema", "json_schema": response_model.model_json_schema()},
+        response_format=response_format,
         logprobs=logprobs,
         temperature=temperature,
     )
 
@@ -180,10 +201,10 @@ def completion(
     # Check cache first
    cached_response = llm_cache.get(model, system, prompt, response_format, temperature)
     if cached_response and (not logprobs or cached_response.get("logprob")):
-        logger.info(f"Cache hit for model {model}")
+        logger.debug(f"Cache hit for model {model}")
         return cached_response
 
-    logger.info(f"Cache miss for model {model}, calling API. Logprobs: {logprobs}")
+    logger.debug(f"Cache miss for model {model}, calling API. Logprobs: {logprobs}")
 
     # Continue with the original implementation for cache miss
     response = _llm_completion(model, system, prompt, response_format, temperature, logprobs)
 
@@ -217,7 +238,7 @@ if __name__ == "__main__":
     system = "You are an accurate and concise explainer of scientific concepts."
     prompt = "Which planet is closest to the sun in the Milky Way galaxy? Answer directly, no explanation needed."
 
-    llm_cache = LLMCache(cache_dir=".", hf_repo="umdclip/advcal-llm-cache", reset=True)
+    llm_cache = LLMCache(cache_dir=".", hf_repo="qanta-challenge/advcal-llm-cache", reset=True)
 
     # First call - should be a cache miss
     logger.info("First call - should be a cache miss")
src/workflows/qb_agents.py CHANGED
@@ -18,13 +18,13 @@ def _get_workflow_response(
 
 
 class TossupResult(TypedDict):
-    answer: str
-    confidence: float
-    logprob: float | None
-    buzz: bool
-    question_fragment: str
-    position: int
-    step_contents: list[str]
+    answer: str  # the model's answer
+    confidence: float  # confidence score
+    logprob: float | None  # log probability of the answer
+    buzz: bool  # whether the agent buzzed
+    question_fragment: str  # prefix of the question text so far
+    position: int  # 1-indexed question run index
+    step_contents: list[str]  # string content outputs of each step
     response_time: float
     step_outputs: dict[str, Any]
 
@@ -64,7 +64,14 @@ class QuizBowlTossupAgent:
             raise ValueError(f"Output variable {out_var} not found in workflow outputs")
 
     def _single_run(self, question_run: str, position: int) -> TossupResult:
-        """Process a single question run."""
+        """Process a single question run.
+        Args:
+            question_run: The question run to process
+            position: The position of the question run
+
+        Returns:
+            A TossupResult containing the answer, confidence, logprob, buzz, question fragment, position, step contents, response time, and step outputs
+        """
         answer_var_step = self.workflow.outputs["answer"].split(".")[0]
         workflow_output, response_time = _get_workflow_response(
             self.workflow, {self.external_input_variable: question_run}, logprob_step=answer_var_step
 
@@ -181,7 +188,7 @@ if __name__ == "__main__":
 
     from workflows.factory import create_quizbowl_bonus_workflow, create_quizbowl_tossup_workflow
 
-    ds_name = "umdclip/leaderboard_co_set"
+    ds_name = "qanta-challenge/leaderboard_co_set"
     ds = load_dataset(ds_name, split="train")
 
     # Create the agents with multi-step workflows
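
With the new inline comments, a single run's payload looks roughly like this (illustrative values only; the import path is assumed):

    from workflows.qb_agents import TossupResult  # assumed import path

    result: TossupResult = {
        "answer": "Nikola Tesla",
        "confidence": 0.87,
        "logprob": -0.31,
        "buzz": True,
        "question_fragment": "This inventor feuded with Edison over alternating current ...",
        "position": 3,  # 1-indexed question run index
        "step_contents": ["AC power pioneer, rival of Edison", "Nikola Tesla"],
        "response_time": 0.42,
        "step_outputs": {"answer_step": {"answer": "Nikola Tesla"}},
    }
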