UltraRonin committed
Commit dd7ade0 · 1 Parent(s): 35bf5a2
Files changed (2):
  1. app.py +9 -5
  2. src/about.py +2 -0
app.py CHANGED
@@ -49,8 +49,10 @@ def restart_space():
 # except Exception:
 #     restart_space()
 
-
-LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS, task="Overall")
+task = ['Overall', 'Acrostic', 'Crossword', 'Cryptogram', 'Logic_Puzzle', 'Sudoku', 'Drop_Quote']
+leaderboard_dict = {}
+for t in task:
+    leaderboard_dict[t] = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS, task=t)
 
 (
     finished_eval_queue_df,
@@ -112,9 +114,11 @@ with demo:
     with gr.Tabs(elem_id="main-tabs", elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
             # leaderboard = init_leaderboard(LEADERBOARD_DF)
-            with gr.Tabs(elem_id="nested-tabs", elem_classes="nested-tab-buttons"):
-                with gr.TabItem("Overall", elem_id="overall", id=0):
-                    leaderboard = init_leaderboard(LEADERBOARD_DF)
+            with gr.Tabs():
+                for i, t in enumerate(task):
+                    with gr.TabItem(t.replace("_", " "), elem_id=f"llm-benchmark-tab-table-{t}", id=i):
+                        leaderboard = init_leaderboard(leaderboard_dict[t])
+                        gr.Row(leaderboard)
 
 
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
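Note: the pattern in this diff builds one leaderboard DataFrame per task up front and then creates one nested sub-tab per task in a loop, so each table is rendered once at build time rather than fetched on tab switch. Below is a minimal, self-contained sketch of that pattern. It assumes placeholder pandas DataFrames in place of get_leaderboard_df() and uses a plain gr.Dataframe instead of the Space's init_leaderboard() helper; all names in the sketch are illustrative, not the repository's actual implementation.

# Minimal sketch of the per-task nested-tab pattern (assumptions: placeholder
# DataFrames stand in for get_leaderboard_df(); gr.Dataframe stands in for
# init_leaderboard()).
import gradio as gr
import pandas as pd

task = ['Overall', 'Acrostic', 'Crossword', 'Cryptogram', 'Logic_Puzzle', 'Sudoku', 'Drop_Quote']

# One DataFrame per task, keyed by task name (stand-in for leaderboard_dict).
leaderboard_dict = {
    t: pd.DataFrame({"Model": ["model-a", "model-b"], "Score": [0.0, 0.0]})
    for t in task
}

with gr.Blocks() as demo:
    with gr.Tabs(elem_id="main-tabs"):
        with gr.TabItem("🏅 LLM Benchmark", id=0):
            # Nested tabs: one sub-tab per task, created dynamically in a loop.
            with gr.Tabs():
                for i, t in enumerate(task):
                    with gr.TabItem(t.replace("_", " "), elem_id=f"llm-benchmark-tab-table-{t}", id=i):
                        gr.Dataframe(value=leaderboard_dict[t])

if __name__ == "__main__":
    demo.launch()

Because the dict is computed before the Blocks context is entered, adding a new task only requires appending its name to the task list; the loop then produces both its DataFrame and its sub-tab.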
src/about.py CHANGED
@@ -45,6 +45,8 @@ INTRODUCTION_TEXT = """
 <strong>LR<sup>2</sup>Bench</strong> is a novel benchmark designed to evaluate the <strong>L</strong>ong-chain <strong>R</strong>eflective <strong>R</strong>easoning capabilities of LLMs. LR<sup>2</sup>Bench comprises 850 samples across six Constraint Satisfaction Problems (CSPs) where reflective reasoning is crucial for deriving solutions that meet all given constraints. Each type of task focuses on distinct constraint patterns, such as knowledge-based, logical, and spatial constraints, providing a comprehensive evaluation of diverse problem-solving scenarios.
 """
 
+
+
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
 ## How it works