Quazim0t0 committed on
Commit
f71f535
·
verified ·
1 Parent(s): ae8cd7f

Delete leaderboard.py

Browse files
Files changed (1) hide show
  1. leaderboard.py +0 -396
leaderboard.py DELETED
@@ -1,396 +0,0 @@
1
- """
2
- Leaderboard module for Dynamic Highscores system.
3
-
4
- This module implements the unified leaderboard with tag-based filtering
5
- for displaying all evaluated models.
6
- """
7
-
8
- import os
9
- import json
10
- import pandas as pd
11
- import gradio as gr
12
- import plotly.express as px
13
- import plotly.graph_objects as go
14
-
15
class Leaderboard:
    """Manages the unified leaderboard with filtering capabilities.

    The class fetches evaluation results through the injected database
    manager and turns them into a display table and plotly figures.
    Every method that takes a DataFrame treats ``None`` the same way as
    an empty frame.
    """

    # Column headers of the display table, in display order.
    DISPLAY_COLUMNS = ['Model', 'Benchmark', 'Tag', 'Score', 'Completed']

    def __init__(self, db_manager):
        """Initialize the leaderboard manager.

        Args:
            db_manager: Database manager instance. Only its
                ``get_leaderboard_df`` method is used by this class.
        """
        self.db_manager = db_manager
        self.model_tags = [
            "All", "Merge", "Agent", "Reasoning", "Coding",
            "General", "Specialized", "Instruction", "Chat",
        ]

        # Define color scheme for tags ("All" is a filter value, not a tag,
        # so it has no color of its own).
        self.tag_colors = {
            "Merge": "#FF6B6B",
            "Agent": "#4ECDC4",
            "Reasoning": "#FFD166",
            "Coding": "#6B5B95",
            "General": "#88D8B0",
            "Specialized": "#FF8C42",
            "Instruction": "#5D9CEC",
            "Chat": "#AC92EB",
        }

    @staticmethod
    def _is_empty(df):
        """Return True when ``df`` is ``None`` or has no rows/columns."""
        return df is None or df.empty

    @staticmethod
    def _empty_figure(x_title, y_title):
        """Build the placeholder figure shown when there is nothing to plot."""
        fig = go.Figure()
        fig.update_layout(
            title="No data available",
            xaxis_title=x_title,
            yaxis_title=y_title,
        )
        return fig

    def get_leaderboard_data(self, tag=None, benchmark_id=None):
        """Get leaderboard data, optionally filtered by tag or benchmark.

        Args:
            tag: Model tag to filter by. ``None``, an empty value or
                ``"All"`` means no tag filter; in that case the ``tag``
                keyword is not passed to the database manager at all.
            benchmark_id: Benchmark ID to filter by (None for all).

        Returns:
            Whatever ``db_manager.get_leaderboard_df`` returns
            (expected to be a ``pd.DataFrame``).
        """
        filters = {"benchmark_id": benchmark_id}
        if tag and tag != "All":
            filters["tag"] = tag
        return self.db_manager.get_leaderboard_df(**filters)

    def format_leaderboard_for_display(self, df):
        """Format leaderboard data for display.

        Args:
            df: Leaderboard DataFrame with at least the columns
                ``model_name``, ``benchmark_name``, ``tag``, ``score``
                and ``completed_at``. ``None`` is treated as empty.

        Returns:
            pd.DataFrame: A new frame with the columns
            ``DISPLAY_COLUMNS``, scores rounded to two decimals and rows
            sorted by score, highest first. The input is not modified.
        """
        if self._is_empty(df):
            return pd.DataFrame(columns=self.DISPLAY_COLUMNS)

        # Select and rename columns for display
        display_df = df[['model_name', 'benchmark_name', 'tag', 'score', 'completed_at']].copy()
        display_df.columns = self.DISPLAY_COLUMNS

        # Round score to 2 decimal places
        display_df['Score'] = display_df['Score'].round(2)

        # Sort by score (descending). A stable sort keeps tied rows in the
        # order the database returned them, so repeated refreshes do not
        # shuffle equal scores.
        return display_df.sort_values('Score', ascending=False, kind='mergesort')

    def create_performance_chart(self, df, chart_type="bar"):
        """Create a performance chart from leaderboard data.

        Args:
            df: Leaderboard DataFrame with at least the columns
                ``model_name``, ``benchmark_name``, ``tag`` and ``score``.
                ``None`` is treated as empty.
            chart_type: ``"scatter"`` for a scatter plot; any other value
                produces the default grouped bar chart.

        Returns:
            plotly.graph_objects.Figure: Performance chart, or a
            "No data available" placeholder when nothing can be plotted.
        """
        if self._is_empty(df):
            return self._empty_figure("Model", "Score")

        # Prepare data for visualization
        plot_df = df[['model_name', 'benchmark_name', 'tag', 'score']].copy()
        plot_df.columns = ['Model', 'Benchmark', 'Tag', 'Score']

        # Rows without a score have no position on the score axis, and a
        # missing value is not a valid marker size, so leave them out.
        plot_df = plot_df.dropna(subset=['Score'])
        if plot_df.empty:
            return self._empty_figure("Model", "Score")

        shared_args = dict(
            x="Model",
            y="Score",
            color="Tag",
            hover_data=["Model", "Benchmark", "Score"],
            color_discrete_map=self.tag_colors,
        )

        # Create chart based on type
        if chart_type == "scatter":
            scatter_args = {"symbol": "Benchmark"}
            # Marker sizes must be non-negative; only scale markers by
            # score when every score qualifies.
            if (plot_df['Score'] >= 0).all():
                scatter_args["size"] = "Score"
            fig = px.scatter(plot_df, **shared_args, **scatter_args)
        else:  # Default to bar chart
            fig = px.bar(plot_df, barmode="group", **shared_args)

        # Customize layout
        fig.update_layout(
            title="Model Performance Comparison",
            xaxis_title="Model",
            yaxis_title="Score",
            legend_title="Tag",
            font=dict(size=12),
        )

        return fig

    def create_tag_distribution_chart(self, df):
        """Create a chart showing distribution of models by tag.

        Args:
            df: Leaderboard DataFrame with a ``tag`` column. ``None`` is
                treated as empty.

        Returns:
            plotly.graph_objects.Figure: Pie chart with one slice per tag,
            sized by the number of rows carrying that tag.
        """
        if self._is_empty(df):
            return self._empty_figure("Tag", "Count")

        # Count rows by tag
        tag_counts = df['tag'].value_counts().reset_index()
        tag_counts.columns = ['Tag', 'Count']

        # Create pie chart
        fig = px.pie(
            tag_counts,
            names='Tag',
            values='Count',
            title='Model Distribution by Tag',
            color='Tag',
            color_discrete_map=self.tag_colors,
        )

        # Customize layout
        fig.update_layout(font=dict(size=12))

        return fig

    def create_benchmark_comparison_chart(self, df):
        """Create a chart comparing performance across benchmarks.

        Args:
            df: Leaderboard DataFrame with ``benchmark_name`` and
                ``score`` columns. ``None`` is treated as empty.

        Returns:
            plotly.graph_objects.Figure: Bar chart of the mean score per
            benchmark.
        """
        if self._is_empty(df):
            return self._empty_figure("Benchmark", "Average Score")

        # Calculate average score by benchmark
        benchmark_avg = df.groupby('benchmark_name')['score'].mean().reset_index()
        benchmark_avg.columns = ['Benchmark', 'Average Score']

        # Create bar chart
        fig = px.bar(
            benchmark_avg,
            x='Benchmark',
            y='Average Score',
            title='Average Performance by Benchmark',
            color='Benchmark',
        )

        # Customize layout
        fig.update_layout(
            xaxis_title="Benchmark",
            yaxis_title="Average Score",
            font=dict(size=12),
        )

        return fig
219
-
220
- # Leaderboard UI components
221
def create_leaderboard_ui(leaderboard, db_manager):
    """Create the leaderboard UI components.

    Builds a Gradio Blocks layout with tag/benchmark filters, a chart-type
    and view-type selector, and three mutually exclusive views (table,
    single chart, dashboard). Data is loaded once when the page loads and
    again whenever the refresh button is clicked; switching the view type
    only toggles visibility and does not reload data.

    Args:
        leaderboard: Leaderboard instance
        db_manager: Database manager instance; only ``get_benchmarks()``
            is called here, and each returned item is indexed with
            ``"id"`` and ``"name"``.

    Returns:
        gr.Blocks: Gradio Blocks component with leaderboard UI
    """
    table_columns = ['Model', 'Benchmark', 'Tag', 'Score', 'Completed']

    # Gradio choice tuples are (label shown to the user, value passed to
    # the handler). The handler compares the value against "all".
    all_benchmarks_choice = ("All Benchmarks", "all")

    with gr.Blocks() as leaderboard_ui:
        gr.Markdown("# Dynamic Highscores Leaderboard")

        with gr.Row():
            with gr.Column(scale=1):
                tag_filter = gr.Dropdown(
                    choices=leaderboard.model_tags,
                    value="All",
                    label="Filter by Tag"
                )

                benchmark_filter = gr.Dropdown(
                    choices=[all_benchmarks_choice],
                    value="all",
                    label="Filter by Benchmark"
                )

                refresh_button = gr.Button("Refresh Leaderboard")

            with gr.Column(scale=2):
                chart_type = gr.Radio(
                    choices=["bar", "scatter"],
                    value="bar",
                    label="Chart Type"
                )

                view_type = gr.Radio(
                    choices=["Table", "Chart", "Dashboard"],
                    value="Table",
                    label="View Type"
                )

        # Table view
        leaderboard_table = gr.Dataframe(
            headers=table_columns,
            label="Leaderboard",
            visible=True
        )

        # Chart view
        with gr.Row(visible=False) as chart_view:
            performance_chart = gr.Plot(label="Performance Chart")

        # Dashboard view
        with gr.Row(visible=False) as dashboard_view:
            with gr.Column(scale=2):
                dashboard_performance_chart = gr.Plot(label="Performance Comparison")

            with gr.Column(scale=1):
                with gr.Row():
                    tag_distribution_chart = gr.Plot(label="Model Distribution")

                with gr.Row():
                    benchmark_comparison_chart = gr.Plot(label="Benchmark Comparison")

        # Each data refresh writes to every component below. The table
        # appears once and receives its data and visibility in a single
        # update instead of being listed twice.
        refresh_outputs = [
            leaderboard_table,
            performance_chart,
            dashboard_performance_chart,
            tag_distribution_chart,
            benchmark_comparison_chart,
            chart_view,
            dashboard_view
        ]

        # Event handlers
        def refresh_benchmarks():
            """Reload the benchmark dropdown choices from the database."""
            try:
                benchmarks = db_manager.get_benchmarks()

                # Format for dropdown: show the name, submit the id.
                choices = [all_benchmarks_choice]
                choices.extend((b["name"], str(b["id"])) for b in benchmarks)

                return gr.update(choices=choices)
            except Exception as e:
                # UI boundary: keep the page usable with the default choice.
                print(f"Error refreshing benchmarks: {e}")
                return gr.update(choices=[all_benchmarks_choice])

        def update_leaderboard(tag, benchmark_id, chart_type_val, view_type_val):
            """Fetch data and return one update per entry of refresh_outputs."""
            try:
                # Get leaderboard data
                if benchmark_id == "all":
                    benchmark_id = None

                df = leaderboard.get_leaderboard_data(tag=tag, benchmark_id=benchmark_id)

                # Format for display
                display_df = leaderboard.format_leaderboard_for_display(df)

                # Create charts
                perf_chart = leaderboard.create_performance_chart(df, chart_type=chart_type_val)
                tag_chart = leaderboard.create_tag_distribution_chart(df)
                benchmark_chart = leaderboard.create_benchmark_comparison_chart(df)

                return (
                    gr.update(value=display_df, visible=view_type_val == "Table"),
                    perf_chart,
                    perf_chart,  # Same chart for both views
                    tag_chart,
                    benchmark_chart,
                    gr.update(visible=view_type_val == "Chart"),
                    gr.update(visible=view_type_val == "Dashboard")
                )
            except Exception as e:
                # UI boundary: show an empty table and placeholder charts
                # rather than surfacing the exception in the browser.
                print(f"Error updating leaderboard: {e}")
                empty_df = pd.DataFrame(columns=table_columns)
                empty_chart = go.Figure()
                empty_chart.update_layout(title="Error loading data")

                return (
                    gr.update(value=empty_df, visible=True),
                    empty_chart,
                    empty_chart,
                    empty_chart,
                    empty_chart,
                    gr.update(visible=False),
                    gr.update(visible=False)
                )

        def switch_view(view_t):
            """Show only the container that matches the selected view."""
            return (
                gr.update(visible=view_t == "Table"),
                gr.update(visible=view_t == "Chart"),
                gr.update(visible=view_t == "Dashboard")
            )

        # Connect event handlers
        refresh_button.click(
            fn=update_leaderboard,
            inputs=[tag_filter, benchmark_filter, chart_type, view_type],
            outputs=refresh_outputs
        )

        view_type.change(
            fn=switch_view,
            inputs=[view_type],
            outputs=[leaderboard_table, chart_view, dashboard_view]
        )

        # Initialize on load
        leaderboard_ui.load(
            fn=refresh_benchmarks,
            inputs=[],
            outputs=[benchmark_filter]
        )

        # The literals mirror the components' initial values above.
        leaderboard_ui.load(
            fn=lambda: update_leaderboard("All", "all", "bar", "Table"),
            inputs=[],
            outputs=refresh_outputs
        )

    return leaderboard_ui