Commit · dd7ade0
1 Parent(s): 35bf5a2
add

Files changed:
- app.py +9 -5
- src/about.py +2 -0
app.py
CHANGED
@@ -49,8 +49,10 @@ def restart_space():
 # except Exception:
 # restart_space()
 
-
-
+task = ['Overall', 'Acrostic', 'Crossword', 'Cryptogram', 'Logic_Puzzle', 'Sudoku', 'Drop_Quote']
+leaderboard_dict = {}
+for t in task:
+    leaderboard_dict[t] = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS, task=t)
 
 (
     finished_eval_queue_df,
@@ -112,9 +114,11 @@ with demo:
     with gr.Tabs(elem_id="main-tabs", elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
             # leaderboard = init_leaderboard(LEADERBOARD_DF)
-            with gr.Tabs(
-
-
+            with gr.Tabs():
+                for i, t in enumerate(task):
+                    with gr.TabItem(t.replace("_", " "), elem_id=f"llm-benchmark-tab-table-{t}", id=i):
+                        leaderboard = init_leaderboard(leaderboard_dict[t])
+                        gr.Row(leaderboard)
 
 
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
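The app.py change builds one leaderboard DataFrame per task up front and then renders one sub-tab per task inside the "LLM Benchmark" tab. Below is a minimal, self-contained sketch of that pattern. `get_leaderboard_df` and `init_leaderboard` are this Space's own helpers and are not reproduced here; the dummy scores, the `fake_leaderboard_df` stand-in, and the use of `gr.Dataframe` are illustrative assumptions so the sketch runs on its own, not part of the commit.

```python
# Sketch of the per-task tab layout introduced in app.py. The real Space calls
# get_leaderboard_df(..., task=t) and init_leaderboard(df); a dummy DataFrame and
# gr.Dataframe stand in here so the example is runnable by itself.
import gradio as gr
import pandas as pd

TASKS = ['Overall', 'Acrostic', 'Crossword', 'Cryptogram', 'Logic_Puzzle', 'Sudoku', 'Drop_Quote']

def fake_leaderboard_df(task: str) -> pd.DataFrame:
    # Stand-in for get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS, task=task).
    return pd.DataFrame({"Model": ["model-a", "model-b"], f"{task} score": [0.42, 0.37]})

# Build every per-task DataFrame once, mirroring leaderboard_dict in app.py.
leaderboard_dict = {t: fake_leaderboard_df(t) for t in TASKS}

with gr.Blocks() as demo:
    with gr.Tabs():
        for i, t in enumerate(TASKS):
            # One sub-tab per task; underscores become spaces in the tab label.
            with gr.TabItem(t.replace("_", " "), id=i):
                gr.Dataframe(leaderboard_dict[t])

if __name__ == "__main__":
    demo.launch()
```

Precomputing the dict outside the layout loop keeps the tab-building code purely declarative, which mirrors how the commit splits data loading (first hunk) from UI layout (second hunk).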
src/about.py
CHANGED
@@ -45,6 +45,8 @@ INTRODUCTION_TEXT = """
 <strong>LR<sup>2</sup>Bench</strong> is a novel benchmark designed to evaluate the <strong>L</strong>ong-chain <strong>R</strong>eflective <strong>R</strong>easoning capabilities of LLMs. LR<sup>2</sup>Bench comprises 850 samples across six Constraint Satisfaction Problems (CSPs) where reflective reasoning is crucial for deriving solutions that meet all given constraints. Each type of task focuses on distinct constraint patterns, such as knowledge-based, logical, and spatial constraints, providing a comprehensive evaluation of diverse problem-solving scenarios.
 """
 
+
+
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
 ## How it works