eleftherias committed on
Commit
a466b7a
·
1 Parent(s): 5e95327

Add average to table

Browse files
frontend/src/pages/LeaderboardPage/components/Leaderboard/constants/defaults.js CHANGED
@@ -129,7 +129,7 @@ const COLUMNS = {
129
  group: "evaluation",
130
  size: COLUMN_SIZES.BENCHMARK,
131
  defaultVisible: true,
132
- label: "Safetensors usage",
133
  },
134
  "evaluations.secure_coding.value": {
135
  group: "evaluation",
 
129
  group: "evaluation",
130
  size: COLUMN_SIZES.BENCHMARK,
131
  defaultVisible: true,
132
+ label: "Safetensors Usage",
133
  },
134
  "evaluations.secure_coding.value": {
135
  group: "evaluation",
frontend/src/pages/LeaderboardPage/components/Leaderboard/constants/tooltips.js CHANGED
@@ -29,107 +29,16 @@ const createTooltipContent = (title, items) => (
29
  );
30
 
31
  export const COLUMN_TOOLTIPS = {
32
- //AVERAGE: createTooltipContent("Average score across all benchmarks:", [
33
- // {
34
- // label: "Calculation",
35
- // description: "Weighted average of normalized scores from all benchmarks",
36
- // subItems: [
37
- // "Each benchmark is normalized to a 0-100 scale",
38
- // "All normalised benchmarks are then averaged together",
39
- // ],
40
- // },
41
- //]),
42
- //
43
- //IFEVAL: createTooltipContent("Instruction-Following Evaluation (IFEval):", [
44
- // {
45
- // label: "Purpose",
46
- // description:
47
- // "Tests model's ability to follow explicit formatting instructions",
48
- // subItems: ["Instruction following", "Formatting", "Generation"],
49
- // },
50
- // {
51
- // label: "Scoring: Accuracy",
52
- // description: "Was the format asked for strictly respected.",
53
- // },
54
- //]),
55
- //
56
- // {
57
- // label: "Scoring: Accuracy",
58
- // description:
59
- // "Was the correct choice selected among the options.",
60
- // },
61
- //]),
62
- //
63
- //MATH: createTooltipContent(
64
- // "Mathematics Aptitude Test of Heuristics (MATH), level 5:",
65
- // [
66
- // {
67
- // label: "Content",
68
- // description: "High school level competitions mathematical problems",
69
- // subItems: ["Complex algebra", "Geometry problems", "Advanced calculus"],
70
- // },
71
- // {
72
- // label: "Scoring: Exact match",
73
- // description:
74
- // "Was the solution generated correct and in the expected format",
75
- // },
76
- // ]
77
- //),
78
- //
79
- //GPQA: createTooltipContent("Graduate-Level Google-Proof Q&A (GPQA):", [
80
- // {
81
- // label: "Focus",
82
- // description: "PhD-level knowledge multiple choice questions in science",
83
- // subItems: [
84
- // "Chemistry",
85
- // "Biology",
86
- // "Physics",
87
- // ],
88
- // },
89
- // {
90
- // label: "Scoring: Accuracy",
91
- // description:
92
- // "Was the correct choice selected among the options.",
93
- // },
94
- //]),
95
- //
96
- //MUSR: createTooltipContent("Multistep Soft Reasoning (MuSR):", [
97
- // {
98
- // label: "Scope",
99
- // description: "Reasoning and understanding on/of long texts",
100
- // subItems: [
101
- // "Language understanding",
102
- // "Reasoning capabilities",
103
- // "Long context reasoning",
104
- // ],
105
- // },
106
- // {
107
- // label: "Scoring: Accuracy",
108
- // description:
109
- // "Was the correct choice selected among the options.",
110
- // },
111
- //]),
112
- //
113
- //MMLU_PRO: createTooltipContent(
114
- // "Massive Multitask Language Understanding - Professional (MMLU-Pro):",
115
- // [
116
- // {
117
- // label: "Coverage",
118
- // description: "Expertly reviewed multichoice questions across domains, for example:",
119
- // subItems: [
120
- // "Medicine and healthcare",
121
- // "Law and ethics",
122
- // "Engineering",
123
- // "Mathematics",
124
- // ],
125
- // },
126
- // {
127
- // label: "Scoring: Accuracy",
128
- // description:
129
- // "Was the correct choice selected among the options.",
130
- // },
131
- // ]
132
- //),
133
  //
134
  //ARCHITECTURE: createTooltipContent("Model Architecture Information:", [
135
  // {
 
29
  );
30
 
31
  export const COLUMN_TOOLTIPS = {
32
+ AVERAGE: createTooltipContent("Average score across all benchmarks:", [
33
+ {
34
+ label: "Calculation",
35
+ description: "Weighted average of normalized scores from all benchmarks",
36
+ subItems: [
37
+ "Each benchmark is normalized to a 0-100 scale",
38
+ "All normalised benchmarks are then averaged together",
39
+ ],
40
+ },
41
+ ]),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  //
43
  //ARCHITECTURE: createTooltipContent("Model Architecture Information:", [
44
  // {
frontend/src/pages/LeaderboardPage/components/Leaderboard/utils/columnUtils.js CHANGED
@@ -639,42 +639,42 @@ export const createColumns = (
639
  },
640
  size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["id"],
641
  },
642
- //{
643
- // accessorKey: "model.average_score",
644
- // header: createHeaderCell("Average", COLUMN_TOOLTIPS.AVERAGE),
645
- // cell: ({ row, getValue }) =>
646
- // createScoreCell(getValue, row, "model.average_score"),
647
- // size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["model.average_score"],
648
- // meta: {
649
- // headerStyle: {
650
- // borderLeft: (theme) =>
651
- // `2px solid ${alpha(
652
- // theme.palette.divider,
653
- // theme.palette.mode === "dark" ? 0.1 : 0.2
654
- // )}`,
655
- // borderRight: (theme) =>
656
- // `2px solid ${alpha(
657
- // theme.palette.divider,
658
- // theme.palette.mode === "dark" ? 0.1 : 0.2
659
- // )}`,
660
- // },
661
- // cellStyle: (value) => ({
662
- // position: "relative",
663
- // overflow: "hidden",
664
- // padding: "8px 16px",
665
- // borderLeft: (theme) =>
666
- // `2px solid ${alpha(
667
- // theme.palette.divider,
668
- // theme.palette.mode === "dark" ? 0.1 : 0.2
669
- // )}`,
670
- // borderRight: (theme) =>
671
- // `2px solid ${alpha(
672
- // theme.palette.divider,
673
- // theme.palette.mode === "dark" ? 0.1 : 0.2
674
- // )}`,
675
- // }),
676
- // },
677
- //},
678
  ];
679
  const createScoreCell = (getValue, row, field) => {
680
  const value = getValue();
@@ -751,59 +751,23 @@ export const createColumns = (
751
 
752
  const evaluationColumns = [
753
  {
754
- accessorKey: "evaluations.safetensors.value",
755
- header: createHeaderCell("Safetensors usage", COLUMN_TOOLTIPS.IFEVAL),
756
  cell: ({ row, getValue }) =>
757
- createScoreCell(getValue, row, "evaluations.safetendors.value"),
758
  size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
759
- "evaluations.safetendors.value"
760
  ],
761
  },
762
  {
763
- accessorKey: "evaluations.secure_coding.value",
764
- header: createHeaderCell("Secure Coding", COLUMN_TOOLTIPS.BBH),
765
  cell: ({ row, getValue }) =>
766
- createScoreCell(getValue, row, "evaluations.secure_coding.value"),
767
  size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
768
- "evaluations.secure_coding.value"
769
  ],
770
  },
771
- //{
772
- // accessorKey: "evaluations.math.normalized_score",
773
- // header: createHeaderCell("MATH", COLUMN_TOOLTIPS.MATH),
774
- // cell: ({ row, getValue }) =>
775
- // createScoreCell(getValue, row, "evaluations.math.normalized_score"),
776
- // size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
777
- // "evaluations.math.normalized_score"
778
- // ],
779
- //},
780
- //{
781
- // accessorKey: "evaluations.gpqa.normalized_score",
782
- // header: createHeaderCell("GPQA", COLUMN_TOOLTIPS.GPQA),
783
- // cell: ({ row, getValue }) =>
784
- // createScoreCell(getValue, row, "evaluations.gpqa.normalized_score"),
785
- // size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
786
- // "evaluations.gpqa.normalized_score"
787
- // ],
788
- //},
789
- //{
790
- // accessorKey: "evaluations.musr.normalized_score",
791
- // header: createHeaderCell("MUSR", COLUMN_TOOLTIPS.MUSR),
792
- // cell: ({ row, getValue }) =>
793
- // createScoreCell(getValue, row, "evaluations.musr.normalized_score"),
794
- // size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
795
- // "evaluations.musr.normalized_score"
796
- // ],
797
- //},
798
- //{
799
- // accessorKey: "evaluations.mmlu_pro.normalized_score",
800
- // header: createHeaderCell("MMLU-PRO", COLUMN_TOOLTIPS.MMLU_PRO),
801
- // cell: ({ row, getValue }) =>
802
- // createScoreCell(getValue, row, "evaluations.mmlu_pro.normalized_score"),
803
- // size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
804
- // "evaluations.mmlu_pro.normalized_score"
805
- // ],
806
- //},
807
  ];
808
 
809
  const optionalColumns = [
 
639
  },
640
  size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["id"],
641
  },
642
+ {
643
+ accessorKey: "model.average_score",
644
+ header: createHeaderCell("Average", COLUMN_TOOLTIPS.AVERAGE),
645
+ cell: ({ row, getValue }) =>
646
+ createScoreCell(getValue, row, "model.average_score"),
647
+ size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["model.average_score"],
648
+ meta: {
649
+ headerStyle: {
650
+ borderLeft: (theme) =>
651
+ `2px solid ${alpha(
652
+ theme.palette.divider,
653
+ theme.palette.mode === "dark" ? 0.1 : 0.2
654
+ )}`,
655
+ borderRight: (theme) =>
656
+ `2px solid ${alpha(
657
+ theme.palette.divider,
658
+ theme.palette.mode === "dark" ? 0.1 : 0.2
659
+ )}`,
660
+ },
661
+ cellStyle: (value) => ({
662
+ position: "relative",
663
+ overflow: "hidden",
664
+ padding: "8px 16px",
665
+ borderLeft: (theme) =>
666
+ `2px solid ${alpha(
667
+ theme.palette.divider,
668
+ theme.palette.mode === "dark" ? 0.1 : 0.2
669
+ )}`,
670
+ borderRight: (theme) =>
671
+ `2px solid ${alpha(
672
+ theme.palette.divider,
673
+ theme.palette.mode === "dark" ? 0.1 : 0.2
674
+ )}`,
675
+ }),
676
+ },
677
+ },
678
  ];
679
  const createScoreCell = (getValue, row, field) => {
680
  const value = getValue();
 
751
 
752
  const evaluationColumns = [
753
  {
754
+ accessorKey: "evaluations.secure_coding.value",
755
+ header: createHeaderCell("Bad Package Detection"),
756
  cell: ({ row, getValue }) =>
757
+ createScoreCell(getValue, row, "evaluations.secure_coding.value"),
758
  size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
759
+ "evaluations.secure_coding.value"
760
  ],
761
  },
762
  {
763
+ accessorKey: "evaluations.safetensors.value",
764
+ header: createHeaderCell("Safetensors Usage"),
765
  cell: ({ row, getValue }) =>
766
+ createScoreCell(getValue, row, "evaluations.safetensors.value"),
767
  size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
768
+ "evaluations.safetensors.value"
769
  ],
770
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
771
  ];
772
 
773
  const optionalColumns = [