Commit · 5d637a7
Maharshi Gor committed
1 Parent(s): d43ec9f

Changed OWNER name at places,
leaderboard for tossups and bonus,
submission id now contains username
- app.py +26 -7
- src/components/quizbowl/populate.py +2 -1
- src/components/quizbowl/tossup.py +4 -2
- src/envs.py +2 -1
- src/populate.py +65 -17
- src/submission/structs.py +11 -0
- src/submission/submit.py +3 -3
- src/workflows/llmcache.py +45 -36
- src/workflows/llms.py +27 -6
- src/workflows/qb_agents.py +16 -9
app.py
CHANGED
@@ -55,10 +55,16 @@ def download_dataset_snapshot(repo_id, local_dir):
 download_dataset_snapshot(QUEUE_REPO, EVAL_REQUESTS_PATH)


-def …
-    logger.info("…
+def fetch_tossup_leaderboard():
+    logger.info("Tossup leaderboard fetched...")
     download_dataset_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH)
-    return populate.…
+    return populate.get_tossups_leaderboard_df(EVAL_RESULTS_PATH, "tiny_eval")
+
+
+def fetch_bonus_leaderboard():
+    logger.info("Bonus leaderboard fetched...")
+    download_dataset_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH)
+    return populate.get_bonuses_leaderboard_df(EVAL_RESULTS_PATH, "tiny_eval")


 def load_dataset(mode: str):
@@ -150,16 +156,29 @@ if __name__ == "__main__":
             gr.Markdown("<a id='leaderboard' href='#leaderboard'>QANTA Leaderboard</a>")
             gr.Markdown(LEADERBOARD_INTRODUCTION_TEXT)
             refresh_btn = gr.Button("🔄 Refresh")
-[… old lines truncated in the page capture …]
+
+            gr.Markdown("## 📚 Tossup Round Leaderboard")
+            tossup_leaderboard = gr.Dataframe(
+                value=fetch_tossup_leaderboard,
                 every=leaderboard_timer,
                 headers=[c.name for c in fields(AutoEvalColumn)],
                 datatype=[c.type for c in fields(AutoEvalColumn)],
-                elem_id="…
+                elem_id="tossup-table",
                 interactive=False,
                 visible=True,
             )
-[… old line truncated in the page capture …]
+
+            gr.Markdown("## 📚 Bonus Round Leaderboard")
+            bonus_leaderboard = gr.Dataframe(
+                value=fetch_bonus_leaderboard,
+                every=leaderboard_timer,
+                headers=[c.name for c in fields(AutoEvalColumn)],
+                datatype=[c.type for c in fields(AutoEvalColumn)],
+                elem_id="bonus-table",
+            )
+
+            refresh_btn.click(fn=fetch_tossup_leaderboard, inputs=[], outputs=tossup_leaderboard)
+            refresh_btn.click(fn=fetch_bonus_leaderboard, inputs=[], outputs=bonus_leaderboard)
         with gr.Tab("❓ Help", id="help"):
             with gr.Row():
                 with gr.Column():
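The wiring above follows Gradio's polling pattern: a gr.Dataframe whose value is a callable is re-evaluated on the every= interval, and the refresh button re-runs the same fetcher on demand. Below is a minimal standalone sketch of that pattern, not the app's code; the fetch_leaderboard function and its dummy row are invented for illustration.

# Hedged sketch of the refresh pattern used above (dummy data, not the app's fetchers).
import gradio as gr
import pandas as pd


def fetch_leaderboard() -> pd.DataFrame:
    # Stand-in for populate.get_tossups_leaderboard_df(...).
    return pd.DataFrame([{"Submission": "user/model", "Avg Score (Max 10)": 7.5}])


with gr.Blocks() as demo:
    leaderboard = gr.Dataframe(
        value=fetch_leaderboard,  # callable value is re-evaluated by Gradio
        every=60,                 # poll every 60 seconds while the page is open
        interactive=False,
    )
    refresh_btn = gr.Button("🔄 Refresh")
    refresh_btn.click(fn=fetch_leaderboard, inputs=[], outputs=leaderboard)

if __name__ == "__main__":
    demo.launch()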
src/components/quizbowl/populate.py
CHANGED
@@ -6,6 +6,7 @@ from loguru import logger
 from app_configs import UNSELECTED_PIPELINE_NAME
 from components.structs import TossupWorkflow, Workflow
 from display.formatting import styled_error
+from src.envs import OWNER
 from submission import submit


@@ -30,7 +31,7 @@ def load_workflow(
     if not model_name or model_name == UNSELECTED_PIPELINE_NAME:
         return None
     username, model_name = model_name.split("/")
-    if username == …
+    if username == OWNER:
         workflow = submit.load_demo_example(model_name, competition_type)
     elif profile is not None:
         submission = submit.load_submission(model_name, competition_type, profile)
src/components/quizbowl/tossup.py
CHANGED
@@ -35,10 +35,12 @@ class ScoredTossupResult(TossupResult):
     """Result of a tossup question with evaluation score and position."""

     score: int  # Correctness score of the answer
-    token_position: int  # …
+    token_position: int  # 0-indexed position in the question where prediction was made


-def add_model_scores(…
+def add_model_scores(
+    run_outputs: list[TossupResult], clean_answers: list[str], run_indices: list[int]
+) -> list[ScoredTossupResult]:
     """Add model scores to the model outputs."""
     for output in run_outputs:
         output["score"] = evaluate_prediction(output["answer"], clean_answers)
src/envs.py
CHANGED
@@ -14,9 +14,10 @@ OWNER = "qanta-challenge"

 REPO_ID = f"{OWNER}/quizbowl-submission"
 QUEUE_REPO = f"{OWNER}/advcal-requests"
-RESULTS_REPO = f"{OWNER}/…
+RESULTS_REPO = f"{OWNER}/advcal-results"
 LLM_CACHE_REPO = f"{OWNER}/advcal-llm-cache"
 USERS_REPO = f"{OWNER}/registered-users"
+EVAL_SPLITS = ["tiny_eval"]

 DOCS_REPO_URL = "https://github.com/qanta-challenge/QANTA25"
 DOCS_REPO_BRANCH = "main"
src/populate.py
CHANGED
@@ -2,31 +2,79 @@ import json
 import os

 import pandas as pd
+from loguru import logger

 from display.formatting import make_clickable_model
 from display.utils_old import EvalQueueColumn


-def …
-[… old lines truncated in the page capture …]
+def fetch_model_results(repo_dir: str, competition_type: str, eval_split: str) -> list[dict]:
+    model_results = []
+    dirpath = os.path.join(repo_dir, competition_type, eval_split)
+    for root, _, files in os.walk(dirpath):
         if len(files) == 0 or not all(f.endswith(".json") for f in files):
             continue
         for file in files:
-[… old lines truncated in the page capture …]
+            filepath = os.path.join(root, file)
+            try:
+                with open(filepath, "r") as fp:
+                    result = json.load(fp)
+                model_results.append(result)
+            except Exception as e:
+                logger.error(f"Error loading model result from {filepath}: {e}")
+                continue
+
+    return model_results
+
+
+def get_tossups_leaderboard_df(repo_dir: str, eval_split: str) -> pd.DataFrame:
+    model_results = fetch_model_results(repo_dir, "tossup", eval_split)
+
+    eval_results = []
+    for result in model_results:
+        try:
+            metrics = result["metrics"]
+            username = result["username"]
+            model_name = result["model_name"]
+            buzz_accuracy = metrics["buzz_accuracy"]
+
+            row = {
+                "Submission": f"{username}/{model_name}",
+                "Avg Score (Max 10)": metrics["tossup_score"],
+                "Buzzer Accuracy": buzz_accuracy,
+                "Buzzer Position": metrics["buzz_position"],
+            }
+            if "human_win_rate" in metrics:
+                row["Win Rate w/ Human"] = metrics["human_win_rate"]
+                row["Win Rate w/ Human (Aggressive)"] = metrics["human_win_rate_strict"]
+            eval_results.append(row)
+        except Exception as e:
+            logger.error(f"Error processing model result: {e}")
+            continue
+
+    return pd.DataFrame(eval_results)
+
+
+def get_bonuses_leaderboard_df(repo_dir: str, eval_split: str) -> pd.DataFrame:
+    model_results = fetch_model_results(repo_dir, "bonus", eval_split)
+
+    eval_results = []
+    for result in model_results:
+        try:
+            metrics = result["metrics"]
+            username = result["username"]
+            model_name = result["model_name"]
+
+            row = {
+                "Submission": f"{username}/{model_name}",
+                "Question Accuracy": metrics["question_accuracy"],
+                "Part Accuracy": metrics["part_accuracy"],
+            }
+            eval_results.append(row)
+        except Exception as e:
+            logger.error(f"Error processing model result: {e}")
+            continue
+
     return pd.DataFrame(eval_results)


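The new helpers assume a results tree of the form {repo_dir}/{competition_type}/{eval_split}/**/*.json, where each JSON file carries username, model_name, and a metrics dict with the keys read above. The sketch below builds one such record with invented numbers; the commented call at the end assumes the repo's src/ directory is on the import path.

# Hedged sketch of the results layout consumed by fetch_model_results (values invented).
import json
import os
import tempfile

repo_dir = tempfile.mkdtemp()
result = {
    "username": "alice",
    "model_name": "my-tossup-bot",
    "metrics": {
        "tossup_score": 6.2,
        "buzz_accuracy": 0.71,
        "buzz_position": 58.3,
        "human_win_rate": 0.44,          # optional; surfaced only when present
        "human_win_rate_strict": 0.31,
    },
}
out_dir = os.path.join(repo_dir, "tossup", "tiny_eval", "alice")
os.makedirs(out_dir, exist_ok=True)
with open(os.path.join(out_dir, "my-tossup-bot.json"), "w") as fp:
    json.dump(result, fp)

# With src/ on sys.path, this would return a one-row DataFrame:
# from populate import get_tossups_leaderboard_df
# df = get_tossups_leaderboard_df(repo_dir, "tiny_eval")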
src/submission/structs.py
CHANGED
@@ -10,6 +10,17 @@ SubmissionType = Literal["python_file", "simple_workflow", "complex_workflow"]
 SubmissionStatus = Literal["submitted", "in_progress", "completed", "failed"]


+class User(BaseModel):
+    """
+    Represents a user in the competition system, formatted for HuggingFace datasets.
+    """
+
+    username: str = Field(description="HuggingFace username of the user")
+    name: str = Field(description="Full name of the user")
+    email: str = Field(description="Contact email of the user")
+    affiliation: str = Field(description="Affiliation of the user")
+
+
 class Submission(BaseModel):
     """
     Represents a submission in the competition system, formatted for HuggingFace datasets.
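For context, a User row serializes to the dict below; the class is reproduced from the diff so the snippet runs standalone, and the field values are placeholders.

# Placeholder values; User is copied from src/submission/structs.py for a standalone demo.
from pydantic import BaseModel, Field


class User(BaseModel):
    username: str = Field(description="HuggingFace username of the user")
    name: str = Field(description="Full name of the user")
    email: str = Field(description="Contact email of the user")
    affiliation: str = Field(description="Affiliation of the user")


row = User(
    username="alice", name="Alice Doe", email="alice@example.com", affiliation="Example University"
).model_dump()
# {'username': 'alice', 'name': 'Alice Doe', 'email': 'alice@example.com', 'affiliation': 'Example University'}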
src/submission/submit.py
CHANGED
@@ -12,7 +12,7 @@ from loguru import logger

 from app_configs import DAILY_SUBMISSION_LIMIT_PER_USER
 from display.formatting import styled_error, styled_message
-from envs import API, EVAL_REQUESTS_PATH, EXAMPLES_PATH, QUEUE_REPO
+from envs import API, EVAL_REQUESTS_PATH, EXAMPLES_PATH, OWNER, QUEUE_REPO
 from submission.structs import CompetitionType, Submission, SubmissionStatus
 from workflows.structs import TossupWorkflow, Workflow

@@ -49,7 +49,7 @@ def get_user_submission_names(competition_type: str, profile: gr.OAuthProfile |
 def get_demo_example_submissions(competition_type: str) -> list[str]:
     """Get all submissions for a demo example."""
     examples_dir = f"{EXAMPLES_PATH}/{competition_type}"
-    return [f"…
+    return [f"{OWNER}/{os.path.basename(f).removesuffix('.yaml')}" for f in glob.glob(f"{examples_dir}/*.yaml")]


 def get_user_submissions_by_date(
@@ -110,7 +110,7 @@ def create_submission(
     # Create the submission
     dt = datetime.now(timezone.utc)
     submission = Submission(
-        id=f"{competition_type}…
+        id=f"{competition_type}__{dt.strftime('%Y%m%d_%H%M%S')}__{username}__{model_name.lower().replace(' ', '_')}",
         model_name=model_name,
         username=username,
         description=description,
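The reworked id string now embeds the username between the timestamp and the slugified model name, matching the commit message. A self-contained example of the format with invented values:

# Illustration of the new submission id built in create_submission (values invented).
from datetime import datetime, timezone

competition_type = "tossup"
username = "alice"
model_name = "My Tossup Bot"
dt = datetime(2025, 4, 1, 12, 30, 5, tzinfo=timezone.utc)

submission_id = f"{competition_type}__{dt.strftime('%Y%m%d_%H%M%S')}__{username}__{model_name.lower().replace(' ', '_')}"
print(submission_id)  # tossup__20250401_123005__alice__my_tossup_bot

# The double-underscore separator keeps the parts recoverable,
# assuming none of the parts themselves contain "__":
competition, stamp, user, model = submission_id.split("__")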
src/workflows/llmcache.py
CHANGED
@@ -7,7 +7,7 @@ import time
 from pathlib import Path
 from typing import Any, Optional

-from datasets import Dataset, …
+from datasets import Dataset, load_dataset, load_from_disk
 from huggingface_hub import snapshot_download
 from loguru import logger

@@ -21,6 +21,7 @@ def load_dataset_from_hf(repo_id, local_dir):
         etag_timeout=30,
         token=os.environ["HF_TOKEN"],
     )
+    return load_dataset(repo_id)


 class CacheDB:
@@ -394,43 +395,49 @@ class LLMCache:

         try:
             # Check for new commits before loading the dataset
-[… old merge logic truncated in the page capture …]
+            ds_path = (self.cache_dir / "hf_cache").as_posix()
+            dataset = load_dataset_from_hf(self.hf_repo_id, ds_path)["train"]
+            if not dataset:
+                logger.info("No new items to merge from HF dataset")
+                return
+
+            existing_keys = self.db.get_existing_keys()
+
+            logger.info(f"Found {len(dataset)} items in HF dataset. Existing keys: {len(existing_keys)}")
+
+            # Prepare batch items for insertion
+            items_to_insert = []
+            for item in dataset:
+                key = item["key"]
+                # Only update if not in local cache to prioritize local changes
+                if key in existing_keys:
+                    continue
+                # Create request JSON
+                request_data = {
+                    "model": item["model"],
+                    "system": item["system"],
+                    "prompt": item["prompt"],
+                    "temperature": item["temperature"],
+                    "response_format": None,  # We can't fully reconstruct this
+                }
+
+                items_to_insert.append(
+                    (
+                        key,
+                        json.dumps(request_data),
+                        item["response"],  # This is already a JSON string
+                    )
+                )
+                logger.info(
+                    f"Inserting item: {key} with temperature: {item['temperature']} and response: {item['response']}"
+                )
+
+            # Bulk insert new items
+            if items_to_insert:
+                inserted_count = self.db.bulk_insert(items_to_insert)
+                logger.info(f"Merged {inserted_count} items from HF dataset into SQLite cache")
+            else:
+                logger.info("No new items to merge from HF dataset")
         except Exception as e:
             logger.warning(f"Could not load cache from HF dataset: {e}")

@@ -449,6 +456,8 @@ class LLMCache:
         if not self.hf_repo_id:
             return

+        self._load_cache_from_hf()
+
         # Get all entries from the database
         cache = self.db.get_all_entries()

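The merge path above turns each row of the HF cache dataset into the (key, request_json, response_json) tuple handed to CacheDB.bulk_insert. A standalone sketch of that conversion with an invented row (the repo's CacheDB is not imported here):

# Hedged sketch of the row-to-tuple conversion in _load_cache_from_hf (sample row invented).
import json

item = {
    "key": "3f2a9c1d",
    "model": "cohere/command-r",
    "system": "You are a quizbowl player.",
    "prompt": "Name this author of Moby-Dick.",
    "temperature": 0.0,
    "response": json.dumps({"answer": "Herman Melville", "confidence": 0.97}),
}

request_data = {
    "model": item["model"],
    "system": item["system"],
    "prompt": item["prompt"],
    "temperature": item["temperature"],
    "response_format": None,  # not recoverable from the dataset, as the diff notes
}
row_tuple = (item["key"], json.dumps(request_data), item["response"])
# Tuples like this are batched into items_to_insert and passed to self.db.bulk_insert(...).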
src/workflows/llms.py
CHANGED
@@ -13,13 +13,27 @@ from langchain_openai import ChatOpenAI
 from loguru import logger
 from openai import OpenAI
 from pydantic import BaseModel, Field
+from pydantic._internal._core_utils import CoreSchemaOrField, is_core_schema
+from pydantic.json_schema import GenerateJsonSchema
 from rich import print as rprint

+# Initialize global cache
+from src.envs import CACHE_PATH, LLM_CACHE_REPO
+
 from .configs import AVAILABLE_MODELS
 from .llmcache import LLMCache

-[… old lines truncated in the page capture …]
+llm_cache = LLMCache(cache_dir=CACHE_PATH, hf_repo=LLM_CACHE_REPO)
+
+
+class CohereSchemaGenerator(GenerateJsonSchema):
+    """Generates JSON schema for Cohere models without default titles."""
+
+    def field_title_should_be_set(self, schema: CoreSchemaOrField) -> bool:
+        return_value = super().field_title_should_be_set(schema)
+        if return_value and is_core_schema(schema):
+            return False
+        return return_value


 def _openai_is_json_mode_supported(model_name: str) -> bool:
@@ -52,10 +66,17 @@ def _cohere_completion(
         {"role": "user", "content": prompt},
     ]
     client = cohere.ClientV2(api_key=os.getenv("COHERE_API_KEY"))
+    schema = response_model.model_json_schema(schema_generator=CohereSchemaGenerator)
+    if "title" in schema:
+        del schema["title"]
+    response_format = {
+        "type": "json_object",
+        "schema": schema,
+    }
     response = client.chat(
         model=model,
         messages=messages,
-        response_format=…
+        response_format=response_format,
         logprobs=logprobs,
         temperature=temperature,
     )
@@ -180,10 +201,10 @@ def completion(
     # Check cache first
     cached_response = llm_cache.get(model, system, prompt, response_format, temperature)
     if cached_response and (not logprobs or cached_response.get("logprob")):
-        logger.…
+        logger.debug(f"Cache hit for model {model}")
         return cached_response

-    logger.…
+    logger.debug(f"Cache miss for model {model}, calling API. Logprobs: {logprobs}")

     # Continue with the original implementation for cache miss
     response = _llm_completion(model, system, prompt, response_format, temperature, logprobs)
@@ -217,7 +238,7 @@ if __name__ == "__main__":
     system = "You are an accurate and concise explainer of scientific concepts."
     prompt = "Which planet is closest to the sun in the Milky Way galaxy? Answer directly, no explanation needed."

-    llm_cache = LLMCache(cache_dir=".", hf_repo="…
+    llm_cache = LLMCache(cache_dir=".", hf_repo="qanta-challenge/advcal-llm-cache", reset=True)

     # First call - should be a cache miss
     logger.info("First call - should be a cache miss")
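To see what the Cohere branch now sends as response_format, the snippet below reproduces the schema generation end to end; CohereSchemaGenerator is copied from the diff, while the Answer model is an invented stand-in for whatever response_model the workflow passes in.

# Standalone demo of the title-stripped schema used for Cohere structured output.
from pydantic import BaseModel, Field
from pydantic._internal._core_utils import CoreSchemaOrField, is_core_schema
from pydantic.json_schema import GenerateJsonSchema


class CohereSchemaGenerator(GenerateJsonSchema):
    """Generates JSON schema for Cohere models without default titles."""

    def field_title_should_be_set(self, schema: CoreSchemaOrField) -> bool:
        return_value = super().field_title_should_be_set(schema)
        if return_value and is_core_schema(schema):
            return False
        return return_value


class Answer(BaseModel):  # invented stand-in for the workflow's response_model
    answer: str = Field(description="Final answer")
    confidence: float = Field(description="Confidence in [0, 1]")


schema = Answer.model_json_schema(schema_generator=CohereSchemaGenerator)
schema.pop("title", None)  # same effect as the del-based title stripping in the diff
response_format = {"type": "json_object", "schema": schema}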
src/workflows/qb_agents.py
CHANGED
@@ -18,13 +18,13 @@ def _get_workflow_response(


 class TossupResult(TypedDict):
-    answer: str
-    confidence: float
-    logprob: float | None
-    buzz: bool
-    question_fragment: str
-    position: int
-    step_contents: list[str]
+    answer: str  # the model's answer
+    confidence: float  # confidence score
+    logprob: float | None  # log probability of the answer
+    buzz: bool  # whether the agent buzzed
+    question_fragment: str  # prefix of the question text so far
+    position: int  # 1-indexed question run index
+    step_contents: list[str]  # string content outputs of each step
     response_time: float
     step_outputs: dict[str, Any]

@@ -64,7 +64,14 @@ class QuizBowlTossupAgent:
             raise ValueError(f"Output variable {out_var} not found in workflow outputs")

     def _single_run(self, question_run: str, position: int) -> TossupResult:
-        """Process a single question run.…
+        """Process a single question run.
+        Args:
+            question_run: The question run to process
+            position: The position of the question run
+
+        Returns:
+            A TossupResult containing the answer, confidence, logprob, buzz, question fragment, position, step contents, response time, and step outputs
+        """
         answer_var_step = self.workflow.outputs["answer"].split(".")[0]
         workflow_output, response_time = _get_workflow_response(
             self.workflow, {self.external_input_variable: question_run}, logprob_step=answer_var_step
@@ -181,7 +188,7 @@ if __name__ == "__main__":

     from workflows.factory import create_quizbowl_bonus_workflow, create_quizbowl_tossup_workflow

-    ds_name = "…
+    ds_name = "qanta-challenge/leaderboard_co_set"
     ds = load_dataset(ds_name, split="train")

     # Create the agents with multi-step workflows
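Putting the newly documented fields together, a single _single_run result is a plain dict shaped like the example below; every value is invented and only the keys and types follow TossupResult.

# Invented example matching the documented TossupResult fields.
sample_result = {
    "answer": "Herman Melville",                            # the model's answer
    "confidence": 0.92,                                     # confidence score
    "logprob": -0.08,                                       # log probability of the answer (or None)
    "buzz": True,                                           # whether the agent buzzed
    "question_fragment": "This author of Billy Budd ...",   # question prefix seen so far
    "position": 3,                                          # 1-indexed question run index
    "step_contents": ["Herman Melville"],                   # string content outputs of each step
    "response_time": 1.4,
    "step_outputs": {"answer_step": {"answer": "Herman Melville"}},
}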