Daniel Kantor committed

Commit dd9222d · 1 Parent(s): c2f7972

fix columns
backend/app/services/leaderboard.py CHANGED
@@ -116,38 +116,21 @@ class LeaderboardService:
             )
 
             # Create unique ID combining model name, precision, sha and chat template status
+
             unique_id = f"{data.get('fullname', 'Unknown')}_{data.get('Precision', 'Unknown')}_{data.get('Model sha', 'Unknown')}_{str(data.get('Chat Template', False))}"
 
+            print(data)
+
             evaluations = {
-                "ifeval": {
-                    "name": "IFEval",
-                    "value": data.get("IFEval Raw", 0),
-                    "normalized_score": data.get("IFEval", 0),
-                },
-                "bbh": {
-                    "name": "BBH",
-                    "value": data.get("BBH Raw", 0),
-                    "normalized_score": data.get("BBH", 0),
-                },
-                "math": {
-                    "name": "MATH Level 5",
-                    "value": data.get("MATH Lvl 5 Raw", 0),
-                    "normalized_score": data.get("MATH Lvl 5", 0),
-                },
-                "gpqa": {
-                    "name": "GPQA",
-                    "value": data.get("GPQA Raw", 0),
-                    "normalized_score": data.get("GPQA", 0),
-                },
-                "musr": {
-                    "name": "MUSR",
-                    "value": data.get("MUSR Raw", 0),
-                    "normalized_score": data.get("MUSR", 0),
+                "safetensors": {
+                    "name": "SafeTensors",
+                    "value": data.get("safetensors", 0),
+                    "normalized_score": data.get("safetensors", 0),
                 },
-                "mmlu_pro": {
-                    "name": "MMLU-PRO",
-                    "value": data.get("MMLU-PRO Raw", 0),
-                    "normalized_score": data.get("MMLU-PRO", 0),
+                "secure_coding": {
+                    "name": "Secure Coding",
+                    "value": data.get("secure_coding", 0),
+                    "normalized_score": data.get("secure_coding", 0),
                 },
             }
 
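For context, a minimal sketch of the per-model payload the frontend would receive after this change, assuming the API serializes the `evaluations` dict built above as-is. Only the `evaluations.safetensors` and `evaluations.secure_coding` fields come from the commit; the surrounding keys and the numeric values are illustrative.

// Sketch only: field names outside "evaluations" and all values are placeholders.
const exampleRow = {
  id: "org/model_bfloat16_abc123_True", // shape of unique_id built above (illustrative value)
  evaluations: {
    safetensors: { name: "SafeTensors", value: 1, normalized_score: 1 },
    secure_coding: { name: "Secure Coding", value: 0.87, normalized_score: 0.87 },
  },
};

The dotted accessor paths used by the frontend below ("evaluations.safetensors.value") are expected to resolve against this shape.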
docker-compose.yml CHANGED
@@ -19,10 +19,10 @@ services:
     build:
       context: ./frontend
       dockerfile: Dockerfile.dev
+      args:
+        - OAUTH_CLIENT_ID=${OAUTH_CLIENT_ID}
     ports:
       - "${FRONTEND_PORT:-7861}:7861"
-    args:
-      - OAUTH_CLIENT_ID=${OAUTH_CLIENT_ID}
     volumes:
       - ./frontend:/app
       - /app/node_modules
frontend/src/pages/LeaderboardPage/components/Leaderboard/components/Table/hooks/useDataProcessing.js CHANGED
@@ -37,6 +37,8 @@ export const useDataProcessing = (
   const processedData = useProcessedData(data, averageMode, visibleColumns);
   const columnVisibility = useColumnVisibility(visibleColumns);
 
+  console.log({visibleColumns});
+
   // Memoize filters
   const filterConfig = useMemo(
     () => ({
frontend/src/pages/LeaderboardPage/components/Leaderboard/constants/defaults.js CHANGED
@@ -117,49 +117,25 @@ const COLUMNS = {
       defaultVisible: true,
       label: "Model",
     },
-    "model.average_score": {
-      group: "fixed",
-      size: COLUMN_SIZES.AVERAGE_SCORE,
-      defaultVisible: true,
-      label: "Average Score",
-    },
+    //"model.average_score": {
+    //  group: "fixed",
+    //  size: COLUMN_SIZES.AVERAGE_SCORE,
+    //  defaultVisible: true,
+    //  label: "Average Score",
+    //},
   },
   EVALUATION: {
-    "evaluations.ifeval.normalized_score": {
-      group: "evaluation",
-      size: COLUMN_SIZES.BENCHMARK,
-      defaultVisible: true,
-      label: "IFEval",
-    },
-    "evaluations.bbh.normalized_score": {
-      group: "evaluation",
-      size: COLUMN_SIZES.BENCHMARK,
-      defaultVisible: true,
-      label: "BBH",
-    },
-    "evaluations.math.normalized_score": {
-      group: "evaluation",
-      size: COLUMN_SIZES.BENCHMARK,
-      defaultVisible: true,
-      label: "MATH",
-    },
-    "evaluations.gpqa.normalized_score": {
-      group: "evaluation",
-      size: COLUMN_SIZES.BENCHMARK,
-      defaultVisible: true,
-      label: "GPQA",
-    },
-    "evaluations.musr.normalized_score": {
+    "evaluations.safetensors.value": {
       group: "evaluation",
       size: COLUMN_SIZES.BENCHMARK,
       defaultVisible: true,
-      label: "MUSR",
+      label: "Safetensors usage",
     },
-    "evaluations.mmlu_pro.normalized_score": {
+    "evaluations.secure_coding.value": {
       group: "evaluation",
       size: COLUMN_SIZES.BENCHMARK,
       defaultVisible: true,
-      label: "MMLU-PRO",
+      label: "Secure coding practices",
     },
   },
   MODEL_INFO: {
@@ -373,8 +349,4 @@ export const SKELETON_COLUMNS = [
   COLUMN_SIZES.AVERAGE_SCORE, // Average score
   COLUMN_SIZES.BENCHMARK, // Benchmark 1
   COLUMN_SIZES.BENCHMARK, // Benchmark 2
-  COLUMN_SIZES.BENCHMARK, // Benchmark 3
-  COLUMN_SIZES.BENCHMARK, // Benchmark 4
-  COLUMN_SIZES.BENCHMARK, // Benchmark 5
-  COLUMN_SIZES.BENCHMARK, // Benchmark 6
 ];
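The new EVALUATION entries are keyed by accessor path and now point at `.value` rather than `.normalized_score`. A minimal sketch (not code from the repo; the reduction helper is hypothetical) of how such definitions can be flattened into the default visible-column list:

// Hypothetical illustration only; keys copied from the diff above.
const EVALUATION = {
  "evaluations.safetensors.value": { group: "evaluation", defaultVisible: true },
  "evaluations.secure_coding.value": { group: "evaluation", defaultVisible: true },
};

const defaultVisible = Object.entries(EVALUATION)
  .filter(([, def]) => def.defaultVisible)
  .map(([key]) => key);
// -> ["evaluations.safetensors.value", "evaluations.secure_coding.value"]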
frontend/src/pages/LeaderboardPage/components/Leaderboard/constants/tooltips.js CHANGED
@@ -29,245 +29,235 @@ const createTooltipContent = (title, items) => (
 );
 
 export const COLUMN_TOOLTIPS = {
-  AVERAGE: createTooltipContent("Average score across all benchmarks:", [
-    {
-      label: "Calculation",
-      description: "Weighted average of normalized scores from all benchmarks",
-      subItems: [
-        "Each benchmark is normalized to a 0-100 scale",
-        "All normalised benchmarks are then averaged together",
-      ],
-    },
-  ]),
-
-  IFEVAL: createTooltipContent("Instruction-Following Evaluation (IFEval):", [
-    {
-      label: "Purpose",
-      description:
-        "Tests model's ability to follow explicit formatting instructions",
-      subItems: ["Instruction following", "Formatting", "Generation"],
-    },
-    {
-      label: "Scoring: Accuracy",
-      description: "Was the format asked for strictly respected.",
-    },
-  ]),
-
-  BBH: createTooltipContent("Big Bench Hard (BBH):", [
-    {
-      label: "Overview",
-      description: "Collection of challenging for LLM tasks across domains, for example",
-      subItems: [
-        "Language understanding",
-        "Mathematical reasoning",
-        "Common sense and world knowledge",
-      ],
-    },
-    {
-      label: "Scoring: Accuracy",
-      description:
-        "Was the correct choice selected among the options.",
-    },
-  ]),
-
-  MATH: createTooltipContent(
-    "Mathematics Aptitude Test of Heuristics (MATH), level 5:",
-    [
-      {
-        label: "Content",
-        description: "High school level competitions mathematical problems",
-        subItems: ["Complex algebra", "Geometry problems", "Advanced calculus"],
-      },
-      {
-        label: "Scoring: Exact match",
-        description:
-          "Was the solution generated correct and in the expected format",
-      },
-    ]
-  ),
-
-  GPQA: createTooltipContent("Graduate-Level Google-Proof Q&A (GPQA):", [
-    {
-      label: "Focus",
-      description: "PhD-level knowledge multiple choice questions in science",
-      subItems: [
-        "Chemistry",
-        "Biology",
-        "Physics",
-      ],
-    },
-    {
-      label: "Scoring: Accuracy",
-      description:
-        "Was the correct choice selected among the options.",
-    },
-  ]),
-
-  MUSR: createTooltipContent("Multistep Soft Reasoning (MuSR):", [
-    {
-      label: "Scope",
-      description: "Reasoning and understanding on/of long texts",
-      subItems: [
-        "Language understanding",
-        "Reasoning capabilities",
-        "Long context reasoning",
-      ],
-    },
-    {
-      label: "Scoring: Accuracy",
-      description:
-        "Was the correct choice selected among the options.",
-    },
-  ]),
-
-  MMLU_PRO: createTooltipContent(
-    "Massive Multitask Language Understanding - Professional (MMLU-Pro):",
-    [
-      {
-        label: "Coverage",
-        description: "Expertly reviewed multichoice questions across domains, for example:",
-        subItems: [
-          "Medicine and healthcare",
-          "Law and ethics",
-          "Engineering",
-          "Mathematics",
-        ],
-      },
-      {
-        label: "Scoring: Accuracy",
-        description:
-          "Was the correct choice selected among the options.",
-      },
-    ]
-  ),
-
-  ARCHITECTURE: createTooltipContent("Model Architecture Information:", [
-    {
-      label: "Definition",
-      description: "The fundamental structure and design of the model",
-      subItems: [
-        "Pretrained: Foundational models, initially trained on large datasets without task-specific tuning, serving as a versatile base for further development.",
-        "Continuously Pretrained: Base models trained with a data mix evolving as the model is trained, with the addition of specialized data during the last training steps.",
-        "Fine-tuned: Base models, fine-tuned on specialised domain data (legal, medical, ...), and optimized for particular tasks.",
-        "Chat: Models fine-tuned with IFT, RLHF, DPO, and other techniques, to handle conversational contexts effectively.",
-        "Merged: Combining multiple models through weights averaging or similar methods.",
-        "Multimodal: Models which can handle several modalities (text & image/audio/video/...). We only evaluate the text capabilities.",
-      ],
-    },
-    {
-      label: "Impact",
-      description: "How architecture affects model capabilities",
-      subItems: [
-        "Base models are expected to perform less well on instruction following evaluations, like IFEval.",
-        "Fine-tuned and chat models can be more verbose and more chatty than base models.",
-        "Merged models tend to exhibit good performance on benchmarks, which do not translate to real-world situations.",
-      ],
-    },
-  ]),
-
-  PRECISION: createTooltipContent("Numerical Precision Format:", [
-    {
-      label: "Overview",
-      description:
-        "Data format used to store model weights and perform computations",
-      subItems: [
-        "bfloat16: Half precision (Brain Float format), good for stability",
-        "float16: Half precision",
-        "8bit/4bit: Quantized formats, for efficiency",
-        "GPTQ/AWQ: Quantized methods",
-      ],
-    },
-    {
-      label: "Impact",
-      description: "How precision affects model deployment",
-      subItems: [
-        "Higher precision = better accuracy but more memory usage",
-        "Lower precision = faster inference and smaller size",
-        "Trade-off between model quality and resource usage",
-      ],
-    },
-  ]),
-
-  FLAGS: createTooltipContent("Model Flags and Special Features:", [
-    {
-      label: "Filters",
-      subItems: [
-        "Mixture of Expert: Uses a MoE architecture",
-        "Merged models: Created by averaging other models",
-        "Contaminated: Flagged by users from the community for (possibly accidental) cheating",
-        "Unavailable: No longer on the hub (private, deleted) or missing a license tag",
-      ],
-    },
-    {
-      label: "Purpose",
-      description: "Why do people want to hide these models?",
-      subItems: [
-        "Mixture of Experts: These models can be too parameter heavy",
-        "Merged models: Performance on benchmarks tend to be inflated compared to real life usage",
-        "Contaminated: Performance on benchmarks is inflated and not reflecting real life usage",
-      ],
-    },
-  ]),
-
-  PARAMETERS: createTooltipContent("Model Parameters:", [
-    {
-      label: "Measurement",
-      description: "Total number of trainable parameters in billions",
-      subItems: [
-        "Indicates model capacity and complexity",
-        "Correlates with computational requirements",
-        "Influences memory usage and inference speed",
-      ],
-    },
-  ]),
-
-  LICENSE: createTooltipContent("Model License Information:", [
-    {
-      label: "Importance",
-      description: "Legal terms governing model usage and distribution",
-      subItems: [
-        "Commercial vs non-commercial use",
-        "Attribution requirements",
-        "Modification and redistribution rights",
-        "Liability and warranty terms",
-      ],
-    },
-  ]),
-
-  CO2_COST: createTooltipContent("Carbon Dioxide Emissions:", [
-    {
-      label: "What is it?",
-      description: "CO₂ emissions of the model evaluation ",
-      subItems: [
-        "Only focuses on model inference for our specific setup",
-        "Considers data center location and energy mix",
-        "Allows equivalent comparision of models on our use case",
-      ],
-    },
-    {
-      label: "Why it matters",
-      description: "Environmental impact of AI model training",
-      subItems: [
-        "Large models can have significant carbon footprints",
-        "Helps make informed choices about model selection",
-      ],
-    },
-    {
-      label: "Learn more",
-      description:
-        "For detailed information about our CO₂ calculation methodology, visit:",
-      subItems: [
-        <a
-          href="https://huggingface.co/docs/leaderboards/open_llm_leaderboard/emissions"
-          target="_blank"
-          rel="noopener noreferrer"
-          style={{ color: "#90caf9" }}
-        >
-          Carbon Emissions Documentation ↗
-        </a>,
-      ],
-    },
-  ]),
+  //AVERAGE: createTooltipContent("Average score across all benchmarks:", [
+  //  {
+  //    label: "Calculation",
+  //    description: "Weighted average of normalized scores from all benchmarks",
+  //    subItems: [
+  //      "Each benchmark is normalized to a 0-100 scale",
+  //      "All normalised benchmarks are then averaged together",
+  //    ],
+  //  },
+  //]),
+  //
+  //IFEVAL: createTooltipContent("Instruction-Following Evaluation (IFEval):", [
+  //  {
+  //    label: "Purpose",
+  //    description:
+  //      "Tests model's ability to follow explicit formatting instructions",
+  //    subItems: ["Instruction following", "Formatting", "Generation"],
+  //  },
+  //  {
+  //    label: "Scoring: Accuracy",
+  //    description: "Was the format asked for strictly respected.",
+  //  },
+  //]),
+  //
+  //  {
+  //    label: "Scoring: Accuracy",
+  //    description:
+  //      "Was the correct choice selected among the options.",
+  //  },
+  //]),
+  //
+  //MATH: createTooltipContent(
+  //  "Mathematics Aptitude Test of Heuristics (MATH), level 5:",
+  //  [
+  //    {
+  //      label: "Content",
+  //      description: "High school level competitions mathematical problems",
+  //      subItems: ["Complex algebra", "Geometry problems", "Advanced calculus"],
+  //    },
+  //    {
+  //      label: "Scoring: Exact match",
+  //      description:
+  //        "Was the solution generated correct and in the expected format",
+  //    },
+  //  ]
+  //),
+  //
+  //GPQA: createTooltipContent("Graduate-Level Google-Proof Q&A (GPQA):", [
+  //  {
+  //    label: "Focus",
+  //    description: "PhD-level knowledge multiple choice questions in science",
+  //    subItems: [
+  //      "Chemistry",
+  //      "Biology",
+  //      "Physics",
+  //    ],
+  //  },
+  //  {
+  //    label: "Scoring: Accuracy",
+  //    description:
+  //      "Was the correct choice selected among the options.",
+  //  },
+  //]),
+  //
+  //MUSR: createTooltipContent("Multistep Soft Reasoning (MuSR):", [
+  //  {
+  //    label: "Scope",
+  //    description: "Reasoning and understanding on/of long texts",
+  //    subItems: [
+  //      "Language understanding",
+  //      "Reasoning capabilities",
+  //      "Long context reasoning",
+  //    ],
+  //  },
+  //  {
+  //    label: "Scoring: Accuracy",
+  //    description:
+  //      "Was the correct choice selected among the options.",
+  //  },
+  //]),
+  //
+  //MMLU_PRO: createTooltipContent(
+  //  "Massive Multitask Language Understanding - Professional (MMLU-Pro):",
+  //  [
+  //    {
+  //      label: "Coverage",
+  //      description: "Expertly reviewed multichoice questions across domains, for example:",
+  //      subItems: [
+  //        "Medicine and healthcare",
+  //        "Law and ethics",
+  //        "Engineering",
+  //        "Mathematics",
+  //      ],
+  //      },
+  //    {
+  //      label: "Scoring: Accuracy",
+  //      description:
+  //        "Was the correct choice selected among the options.",
+  //    },
+  //  ]
+  //),
+  //
+  //ARCHITECTURE: createTooltipContent("Model Architecture Information:", [
+  //  {
+  //    label: "Definition",
+  //    description: "The fundamental structure and design of the model",
+  //    subItems: [
+  //      "Pretrained: Foundational models, initially trained on large datasets without task-specific tuning, serving as a versatile base for further development.",
+  //      "Continuously Pretrained: Base models trained with a data mix evolving as the model is trained, with the addition of specialized data during the last training steps.",
+  //      "Fine-tuned: Base models, fine-tuned on specialised domain data (legal, medical, ...), and optimized for particular tasks.",
+  //      "Chat: Models fine-tuned with IFT, RLHF, DPO, and other techniques, to handle conversational contexts effectively.",
+  //      "Merged: Combining multiple models through weights averaging or similar methods.",
+  //      "Multimodal: Models which can handle several modalities (text & image/audio/video/...). We only evaluate the text capabilities.",
+  //    ],
+  //  },
+  //  {
+  //    label: "Impact",
+  //    description: "How architecture affects model capabilities",
+  //    subItems: [
+  //      "Base models are expected to perform less well on instruction following evaluations, like IFEval.",
+  //      "Fine-tuned and chat models can be more verbose and more chatty than base models.",
+  //      "Merged models tend to exhibit good performance on benchmarks, which do not translate to real-world situations.",
+  //    ],
+  //  },
+  //]),
+  //
+  //PRECISION: createTooltipContent("Numerical Precision Format:", [
+  //  {
+  //    label: "Overview",
+  //    description:
+  //      "Data format used to store model weights and perform computations",
+  //    subItems: [
+  //      "bfloat16: Half precision (Brain Float format), good for stability",
+  //      "float16: Half precision",
+  //      "8bit/4bit: Quantized formats, for efficiency",
+  //      "GPTQ/AWQ: Quantized methods",
+  //    ],
+  //  },
+  //  {
+  //    label: "Impact",
+  //    description: "How precision affects model deployment",
+  //    subItems: [
+  //      "Higher precision = better accuracy but more memory usage",
+  //      "Lower precision = faster inference and smaller size",
+  //      "Trade-off between model quality and resource usage",
+  //    ],
+  //  },
+  //]),
+  //
+  //FLAGS: createTooltipContent("Model Flags and Special Features:", [
+  //  {
+  //    label: "Filters",
+  //    subItems: [
+  //      "Mixture of Expert: Uses a MoE architecture",
+  //      "Merged models: Created by averaging other models",
+  //      "Contaminated: Flagged by users from the community for (possibly accidental) cheating",
+  //      "Unavailable: No longer on the hub (private, deleted) or missing a license tag",
+  //    ],
+  //  },
+  //  {
+  //    label: "Purpose",
+  //    description: "Why do people want to hide these models?",
+  //    subItems: [
+  //      "Mixture of Experts: These models can be too parameter heavy",
+  //      "Merged models: Performance on benchmarks tend to be inflated compared to real life usage",
+  //      "Contaminated: Performance on benchmarks is inflated and not reflecting real life usage",
+  //    ],
+  //  },
+  //]),
+  //
+  //PARAMETERS: createTooltipContent("Model Parameters:", [
+  //  {
+  //    label: "Measurement",
+  //    description: "Total number of trainable parameters in billions",
+  //    subItems: [
+  //      "Indicates model capacity and complexity",
+  //      "Correlates with computational requirements",
+  //      "Influences memory usage and inference speed",
+  //    ],
+  //  },
+  //]),
+  //
+  //LICENSE: createTooltipContent("Model License Information:", [
+  //  {
+  //    label: "Importance",
+  //    description: "Legal terms governing model usage and distribution",
+  //    subItems: [
+  //      "Commercial vs non-commercial use",
+  //      "Attribution requirements",
+  //      "Modification and redistribution rights",
+  //      "Liability and warranty terms",
+  //    ],
+  //  },
+  //]),
+  //
+  //CO2_COST: createTooltipContent("Carbon Dioxide Emissions:", [
+  //  {
+  //    label: "What is it?",
+  //    description: "CO₂ emissions of the model evaluation ",
+  //    subItems: [
+  //      "Only focuses on model inference for our specific setup",
+  //      "Considers data center location and energy mix",
+  //      "Allows equivalent comparision of models on our use case",
+  //    ],
+  //  },
+  //  {
+  //    label: "Why it matters",
+  //    description: "Environmental impact of AI model training",
+  //    subItems: [
+  //      "Large models can have significant carbon footprints",
+  //      "Helps make informed choices about model selection",
+  //    ],
+  //  },
+  //  {
+  //    label: "Learn more",
+  //    description:
+  //      "For detailed information about our CO₂ calculation methodology, visit:",
+  //    subItems: [
+  //      <a
+  //        href="https://huggingface.co/docs/leaderboards/open_llm_leaderboard/emissions"
+  //        target="_blank"
+  //        rel="noopener noreferrer"
+  //        style={{ color: "#90caf9" }}
+  //      >
+  //        Carbon Emissions Documentation ↗
+  //      </a>,
+  //    ],
+  //  },
+  //]),
 };
 
 export const UI_TOOLTIPS = {
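With every entry of COLUMN_TOOLTIPS commented out, lookups such as COLUMN_TOOLTIPS.IFEVAL (reused for the new column headers in columnUtils.js below) will evaluate to undefined. A possible follow-up, sketched with the createTooltipContent(title, items) helper defined at the top of this file; the keys and wording are assumptions, not part of this commit.

// Hypothetical tooltip entries for the two new columns (wording is placeholder),
// built with the createTooltipContent(title, items) helper from this file.
const NEW_COLUMN_TOOLTIPS = {
  SAFETENSORS: createTooltipContent("Safetensors usage:", [
    {
      label: "Purpose",
      description: "Whether the model publishes its weights in the safetensors format",
      subItems: ["Safer to load than pickle-based checkpoints"],
    },
  ]),
  SECURE_CODING: createTooltipContent("Secure coding practices:", [
    {
      label: "Purpose",
      description: "How well code generated by the model follows secure coding practices",
    },
  ]),
};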
frontend/src/pages/LeaderboardPage/components/Leaderboard/hooks/useLeaderboardData.js CHANGED
@@ -62,10 +62,11 @@ export const useLeaderboardData = () => {
 export const useLeaderboardProcessing = () => {
   const { state, actions } = useLeaderboard();
   const [sorting, setSorting] = useState([
-    { id: "model.average_score", desc: true },
+    { id: "id", desc: true },
   ]);
 
   const memoizedData = useMemo(() => state.models, [state.models]);
+  console.log({memoizedData});
   const memoizedFilters = useMemo(
     () => ({
       search: state.filters.search,
@@ -112,6 +113,8 @@ export const useLeaderboardProcessing = () => {
     memoizedFilters.isOfficialProviderActive
   );
 
+  console.log({columns});
+
   return {
     table,
     minAverage,
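The initial sort id changes from "model.average_score" (a column this commit comments out) to "id"; the sorting id must name a column that still exists, otherwise the initial sort is silently ignored. A minimal sketch of that assumption, written against a TanStack-style table hook; the hook wiring is illustrative, not taken from this repo.

import { useState } from "react";
import {
  useReactTable,
  getCoreRowModel,
  getSortedRowModel,
} from "@tanstack/react-table";

// Illustrative hook only: shows how the new initial sort state would be fed
// into a table, assuming @tanstack/react-table is used downstream.
export function useSortedLeaderboardTable(data, columns) {
  // Initial sort now targets the "id" column, since "model.average_score" was removed.
  const [sorting, setSorting] = useState([{ id: "id", desc: true }]);
  return useReactTable({
    data,
    columns,
    state: { sorting },
    onSortingChange: setSorting,
    getCoreRowModel: getCoreRowModel(),
    getSortedRowModel: getSortedRowModel(),
  });
}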
frontend/src/pages/LeaderboardPage/components/Leaderboard/utils/columnUtils.js CHANGED
@@ -639,42 +639,42 @@ export const createColumns = (
       },
       size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["id"],
     },
-    {
-      accessorKey: "model.average_score",
-      header: createHeaderCell("Average", COLUMN_TOOLTIPS.AVERAGE),
-      cell: ({ row, getValue }) =>
-        createScoreCell(getValue, row, "model.average_score"),
-      size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["model.average_score"],
-      meta: {
-        headerStyle: {
-          borderLeft: (theme) =>
-            `2px solid ${alpha(
-              theme.palette.divider,
-              theme.palette.mode === "dark" ? 0.1 : 0.2
-            )}`,
-          borderRight: (theme) =>
-            `2px solid ${alpha(
-              theme.palette.divider,
-              theme.palette.mode === "dark" ? 0.1 : 0.2
-            )}`,
-        },
-        cellStyle: (value) => ({
-          position: "relative",
-          overflow: "hidden",
-          padding: "8px 16px",
-          borderLeft: (theme) =>
-            `2px solid ${alpha(
-              theme.palette.divider,
-              theme.palette.mode === "dark" ? 0.1 : 0.2
-            )}`,
-          borderRight: (theme) =>
-            `2px solid ${alpha(
-              theme.palette.divider,
-              theme.palette.mode === "dark" ? 0.1 : 0.2
-            )}`,
-        }),
-      },
-    },
+    //{
+    //  accessorKey: "model.average_score",
+    //  header: createHeaderCell("Average", COLUMN_TOOLTIPS.AVERAGE),
+    //  cell: ({ row, getValue }) =>
+    //    createScoreCell(getValue, row, "model.average_score"),
+    //  size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["model.average_score"],
+    //  meta: {
+    //    headerStyle: {
+    //      borderLeft: (theme) =>
+    //        `2px solid ${alpha(
+    //          theme.palette.divider,
+    //          theme.palette.mode === "dark" ? 0.1 : 0.2
+    //        )}`,
+    //      borderRight: (theme) =>
+    //        `2px solid ${alpha(
+    //          theme.palette.divider,
+    //          theme.palette.mode === "dark" ? 0.1 : 0.2
+    //        )}`,
+    //    },
+    //    cellStyle: (value) => ({
+    //      position: "relative",
+    //      overflow: "hidden",
+    //      padding: "8px 16px",
+    //      borderLeft: (theme) =>
+    //        `2px solid ${alpha(
+    //          theme.palette.divider,
+    //          theme.palette.mode === "dark" ? 0.1 : 0.2
+    //        )}`,
+    //      borderRight: (theme) =>
+    //        `2px solid ${alpha(
+    //          theme.palette.divider,
+    //          theme.palette.mode === "dark" ? 0.1 : 0.2
+    //        )}`,
+    //    }),
+    //  },
+    //},
   ];
   const createScoreCell = (getValue, row, field) => {
     const value = getValue();
@@ -751,59 +751,59 @@ export const createColumns = (
 
   const evaluationColumns = [
     {
-      accessorKey: "evaluations.ifeval.normalized_score",
-      header: createHeaderCell("IFEval", COLUMN_TOOLTIPS.IFEVAL),
+      accessorKey: "evaluations.safetensors.value",
+      header: createHeaderCell("Safetensors usage", COLUMN_TOOLTIPS.IFEVAL),
       cell: ({ row, getValue }) =>
-        createScoreCell(getValue, row, "evaluations.ifeval.normalized_score"),
+        createScoreCell(getValue, row, "evaluations.safetendors.value"),
       size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
-        "evaluations.ifeval.normalized_score"
+        "evaluations.safetendors.value"
       ],
     },
     {
-      accessorKey: "evaluations.bbh.normalized_score",
-      header: createHeaderCell("BBH", COLUMN_TOOLTIPS.BBH),
+      accessorKey: "evaluations.secure_coding.value",
+      header: createHeaderCell("Secure Coding", COLUMN_TOOLTIPS.BBH),
       cell: ({ row, getValue }) =>
-        createScoreCell(getValue, row, "evaluations.bbh.normalized_score"),
+        createScoreCell(getValue, row, "evaluations.secure_coding.value"),
      size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
-        "evaluations.bbh.normalized_score"
+        "evaluations.secure_coding.value"
      ],
    },
-    {
-      accessorKey: "evaluations.math.normalized_score",
-      header: createHeaderCell("MATH", COLUMN_TOOLTIPS.MATH),
-      cell: ({ row, getValue }) =>
-        createScoreCell(getValue, row, "evaluations.math.normalized_score"),
-      size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
-        "evaluations.math.normalized_score"
-      ],
-    },
-    {
-      accessorKey: "evaluations.gpqa.normalized_score",
-      header: createHeaderCell("GPQA", COLUMN_TOOLTIPS.GPQA),
-      cell: ({ row, getValue }) =>
-        createScoreCell(getValue, row, "evaluations.gpqa.normalized_score"),
-      size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
-        "evaluations.gpqa.normalized_score"
-      ],
-    },
-    {
-      accessorKey: "evaluations.musr.normalized_score",
-      header: createHeaderCell("MUSR", COLUMN_TOOLTIPS.MUSR),
-      cell: ({ row, getValue }) =>
-        createScoreCell(getValue, row, "evaluations.musr.normalized_score"),
-      size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
-        "evaluations.musr.normalized_score"
-      ],
-    },
-    {
-      accessorKey: "evaluations.mmlu_pro.normalized_score",
-      header: createHeaderCell("MMLU-PRO", COLUMN_TOOLTIPS.MMLU_PRO),
-      cell: ({ row, getValue }) =>
-        createScoreCell(getValue, row, "evaluations.mmlu_pro.normalized_score"),
-      size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
-        "evaluations.mmlu_pro.normalized_score"
-      ],
-    },
+    //{
+    //  accessorKey: "evaluations.math.normalized_score",
+    //  header: createHeaderCell("MATH", COLUMN_TOOLTIPS.MATH),
+    //  cell: ({ row, getValue }) =>
+    //    createScoreCell(getValue, row, "evaluations.math.normalized_score"),
+    //  size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
+    //    "evaluations.math.normalized_score"
+    //  ],
+    //},
+    //{
+    //  accessorKey: "evaluations.gpqa.normalized_score",
+    //  header: createHeaderCell("GPQA", COLUMN_TOOLTIPS.GPQA),
+    //  cell: ({ row, getValue }) =>
+    //    createScoreCell(getValue, row, "evaluations.gpqa.normalized_score"),
+    //  size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
+    //    "evaluations.gpqa.normalized_score"
+    //  ],
+    //},
+    //{
+    //  accessorKey: "evaluations.musr.normalized_score",
+    //  header: createHeaderCell("MUSR", COLUMN_TOOLTIPS.MUSR),
+    //  cell: ({ row, getValue }) =>
+    //    createScoreCell(getValue, row, "evaluations.musr.normalized_score"),
+    //  size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
+    //    "evaluations.musr.normalized_score"
+    //  ],
+    //},
+    //{
+    //  accessorKey: "evaluations.mmlu_pro.normalized_score",
+    //  header: createHeaderCell("MMLU-PRO", COLUMN_TOOLTIPS.MMLU_PRO),
+    //  cell: ({ row, getValue }) =>
+    //    createScoreCell(getValue, row, "evaluations.mmlu_pro.normalized_score"),
+    //  size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
+    //    "evaluations.mmlu_pro.normalized_score"
+    //  ],
+    //},
   ];
 
   const optionalColumns = [
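Note that the new accessorKey and the defaults.js entry both read "evaluations.safetensors.value", while the createScoreCell field and the COLUMN_SIZES lookup read "evaluations.safetendors.value". If that spelling difference is unintentional, a consistent version of the column would look like the sketch below; it also assumes a dedicated SAFETENSORS tooltip entry, neither of which is part of this commit.

// Hypothetical correction: one consistent accessor path throughout the column.
{
  accessorKey: "evaluations.safetensors.value",
  header: createHeaderCell("Safetensors usage", COLUMN_TOOLTIPS.SAFETENSORS),
  cell: ({ row, getValue }) =>
    createScoreCell(getValue, row, "evaluations.safetensors.value"),
  size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["evaluations.safetensors.value"],
},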