Commit · 5d637a7
Maharshi Gor committed
1 Parent(s): d43ec9f

Changed OWNER name at places,
leaderboard for tossups and bonus,
submission id now contains username
- app.py +26 -7
- src/components/quizbowl/populate.py +2 -1
- src/components/quizbowl/tossup.py +4 -2
- src/envs.py +2 -1
- src/populate.py +65 -17
- src/submission/structs.py +11 -0
- src/submission/submit.py +3 -3
- src/workflows/llmcache.py +45 -36
- src/workflows/llms.py +27 -6
- src/workflows/qb_agents.py +16 -9
app.py
CHANGED
@@ -55,10 +55,16 @@ def download_dataset_snapshot(repo_id, local_dir):
 download_dataset_snapshot(QUEUE_REPO, EVAL_REQUESTS_PATH)


-def …
-    logger.info("…
+def fetch_tossup_leaderboard():
+    logger.info("Tossup leaderboard fetched...")
     download_dataset_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH)
-    return populate.…
+    return populate.get_tossups_leaderboard_df(EVAL_RESULTS_PATH, "tiny_eval")
+
+
+def fetch_bonus_leaderboard():
+    logger.info("Bonus leaderboard fetched...")
+    download_dataset_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH)
+    return populate.get_bonuses_leaderboard_df(EVAL_RESULTS_PATH, "tiny_eval")


 def load_dataset(mode: str):
@@ -150,16 +156,29 @@ if __name__ == "__main__":
             gr.Markdown("<a id='leaderboard' href='#leaderboard'>QANTA Leaderboard</a>")
             gr.Markdown(LEADERBOARD_INTRODUCTION_TEXT)
             refresh_btn = gr.Button("🔄 Refresh")
-[… old lines truncated in the page capture …]
+
+            gr.Markdown("## 📚 Tossup Round Leaderboard")
+            tossup_leaderboard = gr.Dataframe(
+                value=fetch_tossup_leaderboard,
                 every=leaderboard_timer,
                 headers=[c.name for c in fields(AutoEvalColumn)],
                 datatype=[c.type for c in fields(AutoEvalColumn)],
-                elem_id="…
+                elem_id="tossup-table",
                 interactive=False,
                 visible=True,
             )
-[… old line truncated in the page capture …]
+
+            gr.Markdown("## 📚 Bonus Round Leaderboard")
+            bonus_leaderboard = gr.Dataframe(
+                value=fetch_bonus_leaderboard,
+                every=leaderboard_timer,
+                headers=[c.name for c in fields(AutoEvalColumn)],
+                datatype=[c.type for c in fields(AutoEvalColumn)],
+                elem_id="bonus-table",
+            )
+
+            refresh_btn.click(fn=fetch_tossup_leaderboard, inputs=[], outputs=tossup_leaderboard)
+            refresh_btn.click(fn=fetch_bonus_leaderboard, inputs=[], outputs=bonus_leaderboard)
         with gr.Tab("❓ Help", id="help"):
             with gr.Row():
                 with gr.Column():
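The wiring above follows Gradio's polling pattern: a gr.Dataframe whose value is a callable is re-evaluated on the every= interval, and the refresh button re-runs the same fetcher on demand. Below is a minimal standalone sketch of that pattern, not the app's code; the fetch_leaderboard function and its dummy row are invented for illustration.

# Hedged sketch of the refresh pattern used above (dummy data, not the app's fetchers).
import gradio as gr
import pandas as pd


def fetch_leaderboard() -> pd.DataFrame:
    # Stand-in for populate.get_tossups_leaderboard_df(...).
    return pd.DataFrame([{"Submission": "user/model", "Avg Score (Max 10)": 7.5}])


with gr.Blocks() as demo:
    leaderboard = gr.Dataframe(
        value=fetch_leaderboard,  # callable value is re-evaluated by Gradio
        every=60,                 # poll every 60 seconds while the page is open
        interactive=False,
    )
    refresh_btn = gr.Button("🔄 Refresh")
    refresh_btn.click(fn=fetch_leaderboard, inputs=[], outputs=leaderboard)

if __name__ == "__main__":
    demo.launch()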
src/components/quizbowl/populate.py
CHANGED
@@ -6,6 +6,7 @@ from loguru import logger
 from app_configs import UNSELECTED_PIPELINE_NAME
 from components.structs import TossupWorkflow, Workflow
 from display.formatting import styled_error
+from src.envs import OWNER
 from submission import submit


@@ -30,7 +31,7 @@ def load_workflow(
     if not model_name or model_name == UNSELECTED_PIPELINE_NAME:
         return None
     username, model_name = model_name.split("/")
-    if username == …
+    if username == OWNER:
         workflow = submit.load_demo_example(model_name, competition_type)
     elif profile is not None:
         submission = submit.load_submission(model_name, competition_type, profile)
src/components/quizbowl/tossup.py
CHANGED
@@ -35,10 +35,12 @@ class ScoredTossupResult(TossupResult):
     """Result of a tossup question with evaluation score and position."""

     score: int  # Correctness score of the answer
-    token_position: int  # …
+    token_position: int  # 0-indexed position in the question where prediction was made


-def add_model_scores(…
+def add_model_scores(
+    run_outputs: list[TossupResult], clean_answers: list[str], run_indices: list[int]
+) -> list[ScoredTossupResult]:
     """Add model scores to the model outputs."""
     for output in run_outputs:
         output["score"] = evaluate_prediction(output["answer"], clean_answers)
src/envs.py
CHANGED
@@ -14,9 +14,10 @@ OWNER = "qanta-challenge"

 REPO_ID = f"{OWNER}/quizbowl-submission"
 QUEUE_REPO = f"{OWNER}/advcal-requests"
-RESULTS_REPO = f"{OWNER}/…
+RESULTS_REPO = f"{OWNER}/advcal-results"
 LLM_CACHE_REPO = f"{OWNER}/advcal-llm-cache"
 USERS_REPO = f"{OWNER}/registered-users"
+EVAL_SPLITS = ["tiny_eval"]

 DOCS_REPO_URL = "https://github.com/qanta-challenge/QANTA25"
 DOCS_REPO_BRANCH = "main"
src/populate.py
CHANGED
@@ -2,31 +2,79 @@ import json
 import os

 import pandas as pd
+from loguru import logger

 from display.formatting import make_clickable_model
 from display.utils_old import EvalQueueColumn


-def …
-[… old lines truncated in the page capture …]
+def fetch_model_results(repo_dir: str, competition_type: str, eval_split: str) -> list[dict]:
+    model_results = []
+    dirpath = os.path.join(repo_dir, competition_type, eval_split)
+    for root, _, files in os.walk(dirpath):
         if len(files) == 0 or not all(f.endswith(".json") for f in files):
             continue
         for file in files:
-[… old lines truncated in the page capture …]
+            filepath = os.path.join(root, file)
+            try:
+                with open(filepath, "r") as fp:
+                    result = json.load(fp)
+                model_results.append(result)
+            except Exception as e:
+                logger.error(f"Error loading model result from {filepath}: {e}")
+                continue
+
+    return model_results
+
+
+def get_tossups_leaderboard_df(repo_dir: str, eval_split: str) -> pd.DataFrame:
+    model_results = fetch_model_results(repo_dir, "tossup", eval_split)
+
+    eval_results = []
+    for result in model_results:
+        try:
+            metrics = result["metrics"]
+            username = result["username"]
+            model_name = result["model_name"]
+            buzz_accuracy = metrics["buzz_accuracy"]
+
+            row = {
+                "Submission": f"{username}/{model_name}",
+                "Avg Score (Max 10)": metrics["tossup_score"],
+                "Buzzer Accuracy": buzz_accuracy,
+                "Buzzer Position": metrics["buzz_position"],
+            }
+            if "human_win_rate" in metrics:
+                row["Win Rate w/ Human"] = metrics["human_win_rate"]
+                row["Win Rate w/ Human (Aggressive)"] = metrics["human_win_rate_strict"]
+            eval_results.append(row)
+        except Exception as e:
+            logger.error(f"Error processing model result: {e}")
+            continue
+
+    return pd.DataFrame(eval_results)
+
+
+def get_bonuses_leaderboard_df(repo_dir: str, eval_split: str) -> pd.DataFrame:
+    model_results = fetch_model_results(repo_dir, "bonus", eval_split)
+
+    eval_results = []
+    for result in model_results:
+        try:
+            metrics = result["metrics"]
+            username = result["username"]
+            model_name = result["model_name"]
+
+            row = {
+                "Submission": f"{username}/{model_name}",
+                "Question Accuracy": metrics["question_accuracy"],
+                "Part Accuracy": metrics["part_accuracy"],
+            }
+            eval_results.append(row)
+        except Exception as e:
+            logger.error(f"Error processing model result: {e}")
+            continue
+
     return pd.DataFrame(eval_results)


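The new helpers assume a results tree of the form {repo_dir}/{competition_type}/{eval_split}/**/*.json, where each JSON file carries username, model_name, and a metrics dict with the keys read above. The sketch below builds one such record with invented numbers; the commented call at the end assumes the repo's src/ directory is on the import path.

# Hedged sketch of the results layout consumed by fetch_model_results (values invented).
import json
import os
import tempfile

repo_dir = tempfile.mkdtemp()
result = {
    "username": "alice",
    "model_name": "my-tossup-bot",
    "metrics": {
        "tossup_score": 6.2,
        "buzz_accuracy": 0.71,
        "buzz_position": 58.3,
        "human_win_rate": 0.44,          # optional; surfaced only when present
        "human_win_rate_strict": 0.31,
    },
}
out_dir = os.path.join(repo_dir, "tossup", "tiny_eval", "alice")
os.makedirs(out_dir, exist_ok=True)
with open(os.path.join(out_dir, "my-tossup-bot.json"), "w") as fp:
    json.dump(result, fp)

# With src/ on sys.path, this would return a one-row DataFrame:
# from populate import get_tossups_leaderboard_df
# df = get_tossups_leaderboard_df(repo_dir, "tiny_eval")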
src/submission/structs.py
CHANGED
@@ -10,6 +10,17 @@ SubmissionType = Literal["python_file", "simple_workflow", "complex_workflow"]
 SubmissionStatus = Literal["submitted", "in_progress", "completed", "failed"]


+class User(BaseModel):
+    """
+    Represents a user in the competition system, formatted for HuggingFace datasets.
+    """
+
+    username: str = Field(description="HuggingFace username of the user")
+    name: str = Field(description="Full name of the user")
+    email: str = Field(description="Contact email of the user")
+    affiliation: str = Field(description="Affiliation of the user")
+
+
 class Submission(BaseModel):
     """
     Represents a submission in the competition system, formatted for HuggingFace datasets.
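For context, a User row serializes to the dict below; the class is reproduced from the diff so the snippet runs standalone, and the field values are placeholders.

# Placeholder values; User is copied from src/submission/structs.py for a standalone demo.
from pydantic import BaseModel, Field


class User(BaseModel):
    username: str = Field(description="HuggingFace username of the user")
    name: str = Field(description="Full name of the user")
    email: str = Field(description="Contact email of the user")
    affiliation: str = Field(description="Affiliation of the user")


row = User(
    username="alice", name="Alice Doe", email="alice@example.com", affiliation="Example University"
).model_dump()
# {'username': 'alice', 'name': 'Alice Doe', 'email': 'alice@example.com', 'affiliation': 'Example University'}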
src/submission/submit.py
CHANGED
@@ -12,7 +12,7 @@ from loguru import logger

 from app_configs import DAILY_SUBMISSION_LIMIT_PER_USER
 from display.formatting import styled_error, styled_message
-from envs import API, EVAL_REQUESTS_PATH, EXAMPLES_PATH, QUEUE_REPO
+from envs import API, EVAL_REQUESTS_PATH, EXAMPLES_PATH, OWNER, QUEUE_REPO
 from submission.structs import CompetitionType, Submission, SubmissionStatus
 from workflows.structs import TossupWorkflow, Workflow

@@ -49,7 +49,7 @@ def get_user_submission_names(competition_type: str, profile: gr.OAuthProfile |
 def get_demo_example_submissions(competition_type: str) -> list[str]:
     """Get all submissions for a demo example."""
     examples_dir = f"{EXAMPLES_PATH}/{competition_type}"
-    return [f"…
+    return [f"{OWNER}/{os.path.basename(f).removesuffix('.yaml')}" for f in glob.glob(f"{examples_dir}/*.yaml")]


 def get_user_submissions_by_date(
@@ -110,7 +110,7 @@ def create_submission(
     # Create the submission
     dt = datetime.now(timezone.utc)
     submission = Submission(
-        id=f"{competition_type}…
+        id=f"{competition_type}__{dt.strftime('%Y%m%d_%H%M%S')}__{username}__{model_name.lower().replace(' ', '_')}",
         model_name=model_name,
         username=username,
         description=description,
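The reworked id string now embeds the username between the timestamp and the slugified model name, matching the commit message. A self-contained example of the format with invented values:

# Illustration of the new submission id built in create_submission (values invented).
from datetime import datetime, timezone

competition_type = "tossup"
username = "alice"
model_name = "My Tossup Bot"
dt = datetime(2025, 4, 1, 12, 30, 5, tzinfo=timezone.utc)

submission_id = f"{competition_type}__{dt.strftime('%Y%m%d_%H%M%S')}__{username}__{model_name.lower().replace(' ', '_')}"
print(submission_id)  # tossup__20250401_123005__alice__my_tossup_bot

# The double-underscore separator keeps the parts recoverable,
# assuming none of the parts themselves contain "__":
competition, stamp, user, model = submission_id.split("__")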
src/workflows/llmcache.py
CHANGED
@@ -7,7 +7,7 @@ import time
 from pathlib import Path
 from typing import Any, Optional

-from datasets import Dataset, …
+from datasets import Dataset, load_dataset, load_from_disk
 from huggingface_hub import snapshot_download
 from loguru import logger

@@ -21,6 +21,7 @@ def load_dataset_from_hf(repo_id, local_dir):
         etag_timeout=30,
         token=os.environ["HF_TOKEN"],
     )
+    return load_dataset(repo_id)


 class CacheDB:
@@ -394,43 +395,49 @@ class LLMCache:

         try:
             # Check for new commits before loading the dataset
-[… old merge logic truncated in the page capture …]
+            ds_path = (self.cache_dir / "hf_cache").as_posix()
+            dataset = load_dataset_from_hf(self.hf_repo_id, ds_path)["train"]
+            if not dataset:
+                logger.info("No new items to merge from HF dataset")
+                return
+
+            existing_keys = self.db.get_existing_keys()
+
+            logger.info(f"Found {len(dataset)} items in HF dataset. Existing keys: {len(existing_keys)}")
+
+            # Prepare batch items for insertion
+            items_to_insert = []
+            for item in dataset:
+                key = item["key"]
+                # Only update if not in local cache to prioritize local changes
+                if key in existing_keys:
+                    continue
+                # Create request JSON
+                request_data = {
+                    "model": item["model"],
+                    "system": item["system"],
+                    "prompt": item["prompt"],
+                    "temperature": item["temperature"],
+                    "response_format": None,  # We can't fully reconstruct this
+                }
+
+                items_to_insert.append(
+                    (
+                        key,
+                        json.dumps(request_data),
+                        item["response"],  # This is already a JSON string
+                    )
+                )
+                logger.info(
+                    f"Inserting item: {key} with temperature: {item['temperature']} and response: {item['response']}"
+                )
+
+            # Bulk insert new items
+            if items_to_insert:
+                inserted_count = self.db.bulk_insert(items_to_insert)
+                logger.info(f"Merged {inserted_count} items from HF dataset into SQLite cache")
+            else:
+                logger.info("No new items to merge from HF dataset")
         except Exception as e:
             logger.warning(f"Could not load cache from HF dataset: {e}")

@@ -449,6 +456,8 @@ class LLMCache:
         if not self.hf_repo_id:
             return

+        self._load_cache_from_hf()
+
         # Get all entries from the database
         cache = self.db.get_all_entries()

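The merge path above turns each row of the HF cache dataset into the (key, request_json, response_json) tuple handed to CacheDB.bulk_insert. A standalone sketch of that conversion with an invented row (the repo's CacheDB is not imported here):

# Hedged sketch of the row-to-tuple conversion in _load_cache_from_hf (sample row invented).
import json

item = {
    "key": "3f2a9c1d",
    "model": "cohere/command-r",
    "system": "You are a quizbowl player.",
    "prompt": "Name this author of Moby-Dick.",
    "temperature": 0.0,
    "response": json.dumps({"answer": "Herman Melville", "confidence": 0.97}),
}

request_data = {
    "model": item["model"],
    "system": item["system"],
    "prompt": item["prompt"],
    "temperature": item["temperature"],
    "response_format": None,  # not recoverable from the dataset, as the diff notes
}
row_tuple = (item["key"], json.dumps(request_data), item["response"])
# Tuples like this are batched into items_to_insert and passed to self.db.bulk_insert(...).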
src/workflows/llms.py
CHANGED
@@ -13,13 +13,27 @@ from langchain_openai import ChatOpenAI
 from loguru import logger
 from openai import OpenAI
 from pydantic import BaseModel, Field
+from pydantic._internal._core_utils import CoreSchemaOrField, is_core_schema
+from pydantic.json_schema import GenerateJsonSchema
 from rich import print as rprint

+# Initialize global cache
+from src.envs import CACHE_PATH, LLM_CACHE_REPO
+
 from .configs import AVAILABLE_MODELS
 from .llmcache import LLMCache

-[… old lines truncated in the page capture …]
+llm_cache = LLMCache(cache_dir=CACHE_PATH, hf_repo=LLM_CACHE_REPO)
+
+
+class CohereSchemaGenerator(GenerateJsonSchema):
+    """Generates JSON schema for Cohere models without default titles."""
+
+    def field_title_should_be_set(self, schema: CoreSchemaOrField) -> bool:
+        return_value = super().field_title_should_be_set(schema)
+        if return_value and is_core_schema(schema):
+            return False
+        return return_value


 def _openai_is_json_mode_supported(model_name: str) -> bool:
@@ -52,10 +66,17 @@ def _cohere_completion(
         {"role": "user", "content": prompt},
     ]
     client = cohere.ClientV2(api_key=os.getenv("COHERE_API_KEY"))
+    schema = response_model.model_json_schema(schema_generator=CohereSchemaGenerator)
+    if "title" in schema:
+        del schema["title"]
+    response_format = {
+        "type": "json_object",
+        "schema": schema,
+    }
     response = client.chat(
         model=model,
         messages=messages,
-        response_format=…
+        response_format=response_format,
         logprobs=logprobs,
         temperature=temperature,
     )
@@ -180,10 +201,10 @@ def completion(
     # Check cache first
     cached_response = llm_cache.get(model, system, prompt, response_format, temperature)
     if cached_response and (not logprobs or cached_response.get("logprob")):
-        logger.…
+        logger.debug(f"Cache hit for model {model}")
         return cached_response

-    logger.…
+    logger.debug(f"Cache miss for model {model}, calling API. Logprobs: {logprobs}")

     # Continue with the original implementation for cache miss
     response = _llm_completion(model, system, prompt, response_format, temperature, logprobs)
@@ -217,7 +238,7 @@ if __name__ == "__main__":
     system = "You are an accurate and concise explainer of scientific concepts."
     prompt = "Which planet is closest to the sun in the Milky Way galaxy? Answer directly, no explanation needed."

-    llm_cache = LLMCache(cache_dir=".", hf_repo="…
+    llm_cache = LLMCache(cache_dir=".", hf_repo="qanta-challenge/advcal-llm-cache", reset=True)

     # First call - should be a cache miss
     logger.info("First call - should be a cache miss")
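To see what the Cohere branch now sends as response_format, the snippet below reproduces the schema generation end to end; CohereSchemaGenerator is copied from the diff, while the Answer model is an invented stand-in for whatever response_model the workflow passes in.

# Standalone demo of the title-stripped schema used for Cohere structured output.
from pydantic import BaseModel, Field
from pydantic._internal._core_utils import CoreSchemaOrField, is_core_schema
from pydantic.json_schema import GenerateJsonSchema


class CohereSchemaGenerator(GenerateJsonSchema):
    """Generates JSON schema for Cohere models without default titles."""

    def field_title_should_be_set(self, schema: CoreSchemaOrField) -> bool:
        return_value = super().field_title_should_be_set(schema)
        if return_value and is_core_schema(schema):
            return False
        return return_value


class Answer(BaseModel):  # invented stand-in for the workflow's response_model
    answer: str = Field(description="Final answer")
    confidence: float = Field(description="Confidence in [0, 1]")


schema = Answer.model_json_schema(schema_generator=CohereSchemaGenerator)
schema.pop("title", None)  # same effect as the del-based title stripping in the diff
response_format = {"type": "json_object", "schema": schema}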
src/workflows/qb_agents.py
CHANGED
@@ -18,13 +18,13 @@ def _get_workflow_response(


 class TossupResult(TypedDict):
-    answer: str
-    confidence: float
-    logprob: float | None
-    buzz: bool
-    question_fragment: str
-    position: int
-    step_contents: list[str]
+    answer: str  # the model's answer
+    confidence: float  # confidence score
+    logprob: float | None  # log probability of the answer
+    buzz: bool  # whether the agent buzzed
+    question_fragment: str  # prefix of the question text so far
+    position: int  # 1-indexed question run index
+    step_contents: list[str]  # string content outputs of each step
     response_time: float
     step_outputs: dict[str, Any]

@@ -64,7 +64,14 @@ class QuizBowlTossupAgent:
             raise ValueError(f"Output variable {out_var} not found in workflow outputs")

     def _single_run(self, question_run: str, position: int) -> TossupResult:
-        """Process a single question run.…
+        """Process a single question run.
+        Args:
+            question_run: The question run to process
+            position: The position of the question run
+
+        Returns:
+            A TossupResult containing the answer, confidence, logprob, buzz, question fragment, position, step contents, response time, and step outputs
+        """
         answer_var_step = self.workflow.outputs["answer"].split(".")[0]
         workflow_output, response_time = _get_workflow_response(
             self.workflow, {self.external_input_variable: question_run}, logprob_step=answer_var_step
@@ -181,7 +188,7 @@ if __name__ == "__main__":

     from workflows.factory import create_quizbowl_bonus_workflow, create_quizbowl_tossup_workflow

-    ds_name = "…
+    ds_name = "qanta-challenge/leaderboard_co_set"
     ds = load_dataset(ds_name, split="train")

     # Create the agents with multi-step workflows
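Putting the newly documented fields together, a single _single_run result is a plain dict shaped like the example below; every value is invented and only the keys and types follow TossupResult.

# Invented example matching the documented TossupResult fields.
sample_result = {
    "answer": "Herman Melville",                            # the model's answer
    "confidence": 0.92,                                     # confidence score
    "logprob": -0.08,                                       # log probability of the answer (or None)
    "buzz": True,                                           # whether the agent buzzed
    "question_fragment": "This author of Billy Budd ...",   # question prefix seen so far
    "position": 3,                                          # 1-indexed question run index
    "step_contents": ["Herman Melville"],                   # string content outputs of each step
    "response_time": 1.4,
    "step_outputs": {"answer_step": {"answer": "Herman Melville"}},
}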