llm-security-leaderboard

Running

App Files Community

eleftherias committed on Mar 14

Commit

a466b7a

1 Parent(s): 5e95327

Add average to table

Browse files

Files changed (3) hide show

frontend/src/pages/LeaderboardPage/components/Leaderboard/constants/defaults.js +1 -1
frontend/src/pages/LeaderboardPage/components/Leaderboard/constants/tooltips.js +10 -101
frontend/src/pages/LeaderboardPage/components/Leaderboard/utils/columnUtils.js +44 -80

frontend/src/pages/LeaderboardPage/components/Leaderboard/constants/defaults.js CHANGED Viewed

@@ -129,7 +129,7 @@ const COLUMNS = {
       group: "evaluation",
       size: COLUMN_SIZES.BENCHMARK,
       defaultVisible: true,
-      label: "Safetensors usage",
     },
     "evaluations.secure_coding.value": {
       group: "evaluation",

       group: "evaluation",
       size: COLUMN_SIZES.BENCHMARK,
       defaultVisible: true,
+      label: "Safetensors Usage",
     },
     "evaluations.secure_coding.value": {
       group: "evaluation",

frontend/src/pages/LeaderboardPage/components/Leaderboard/constants/tooltips.js CHANGED Viewed

@@ -29,107 +29,16 @@ const createTooltipContent = (title, items) => (
 );
 export const COLUMN_TOOLTIPS = {
-  //AVERAGE: createTooltipContent("Average score across all benchmarks:", [
-  //  {
-  //    label: "Calculation",
-  //    description: "Weighted average of normalized scores from all benchmarks",
-  //    subItems: [
-  //      "Each benchmark is normalized to a 0-100 scale",
-  //      "All normalised benchmarks are then averaged together",
-  //    ],
-  //  },
-  //]),
-  //
-  //IFEVAL: createTooltipContent("Instruction-Following Evaluation (IFEval):", [
-  //  {
-  //    label: "Purpose",
-  //    description:
-  //      "Tests model's ability to follow explicit formatting instructions",
-  //    subItems: ["Instruction following", "Formatting", "Generation"],
-  //  },
-  //  {
-  //    label: "Scoring: Accuracy",
-  //    description: "Was the format asked for strictly respected.",
-  //  },
-  //]),
-  //
-  //  {
-  //    label: "Scoring: Accuracy",
-  //    description:
-  //      "Was the correct choice selected among the options.",
-  //  },
-  //]),
-  //
-  //MATH: createTooltipContent(
-  //  "Mathematics Aptitude Test of Heuristics (MATH), level 5:",
-  //  [
-  //    {
-  //      label: "Content",
-  //      description: "High school level competitions mathematical problems",
-  //      subItems: ["Complex algebra", "Geometry problems", "Advanced calculus"],
-  //    },
-  //    {
-  //      label: "Scoring: Exact match",
-  //      description:
-  //        "Was the solution generated correct and in the expected format",
-  //    },
-  //  ]
-  //),
-  //
-  //GPQA: createTooltipContent("Graduate-Level Google-Proof Q&A (GPQA):", [
-  //  {
-  //    label: "Focus",
-  //    description: "PhD-level knowledge multiple choice questions in science",
-  //    subItems: [
-  //      "Chemistry",
-  //      "Biology",
-  //      "Physics",
-  //    ],
-  //  },
-  //  {
-  //    label: "Scoring: Accuracy",
-  //    description:
-  //      "Was the correct choice selected among the options.",
-  //  },
-  //]),
-  //
-  //MUSR: createTooltipContent("Multistep Soft Reasoning (MuSR):", [
-  //  {
-  //    label: "Scope",
-  //    description: "Reasoning and understanding on/of long texts",
-  //    subItems: [
-  //      "Language understanding",
-  //      "Reasoning capabilities",
-  //      "Long context reasoning",
-  //    ],
-  //  },
-  //  {
-  //    label: "Scoring: Accuracy",
-  //    description:
-  //      "Was the correct choice selected among the options.",
-  //  },
-  //]),
-  //
-  //MMLU_PRO: createTooltipContent(
-  //  "Massive Multitask Language Understanding - Professional (MMLU-Pro):",
-  //  [
-  //    {
-  //      label: "Coverage",
-  //      description: "Expertly reviewed multichoice questions across domains, for example:",
-  //      subItems: [
-  //        "Medicine and healthcare",
-  //        "Law and ethics",
-  //        "Engineering",
-  //        "Mathematics",
-  //      ],
-  //    },
-  //    {
-  //      label: "Scoring: Accuracy",
-  //      description:
-  //        "Was the correct choice selected among the options.",
-  //    },
-  //  ]
-  //),
   //
   //ARCHITECTURE: createTooltipContent("Model Architecture Information:", [
   //  {

 );
 export const COLUMN_TOOLTIPS = {
+  AVERAGE: createTooltipContent("Average score across all benchmarks:", [
+   {
+     label: "Calculation",
+     description: "Weighted average of normalized scores from all benchmarks",
+     subItems: [
+       "Each benchmark is normalized to a 0-100 scale",
+       "All normalised benchmarks are then averaged together",
+     ],
+   },
+  ]),
   //
   //ARCHITECTURE: createTooltipContent("Model Architecture Information:", [
   //  {

frontend/src/pages/LeaderboardPage/components/Leaderboard/utils/columnUtils.js CHANGED Viewed

@@ -639,42 +639,42 @@ export const createColumns = (
       },
       size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["id"],
     },
-    //{
-    //  accessorKey: "model.average_score",
-    //  header: createHeaderCell("Average", COLUMN_TOOLTIPS.AVERAGE),
-    //  cell: ({ row, getValue }) =>
-    //    createScoreCell(getValue, row, "model.average_score"),
-    //  size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["model.average_score"],
-    //  meta: {
-    //    headerStyle: {
-    //      borderLeft: (theme) =>
-    //        `2px solid ${alpha(
-    //          theme.palette.divider,
-    //          theme.palette.mode === "dark" ? 0.1 : 0.2
-    //        )}`,
-    //      borderRight: (theme) =>
-    //        `2px solid ${alpha(
-    //          theme.palette.divider,
-    //          theme.palette.mode === "dark" ? 0.1 : 0.2
-    //        )}`,
-    //    },
-    //    cellStyle: (value) => ({
-    //      position: "relative",
-    //      overflow: "hidden",
-    //      padding: "8px 16px",
-    //      borderLeft: (theme) =>
-    //        `2px solid ${alpha(
-    //          theme.palette.divider,
-    //          theme.palette.mode === "dark" ? 0.1 : 0.2
-    //        )}`,
-    //      borderRight: (theme) =>
-    //        `2px solid ${alpha(
-    //          theme.palette.divider,
-    //          theme.palette.mode === "dark" ? 0.1 : 0.2
-    //        )}`,
-    //    }),
-    //  },
-    //},
   ];
   const createScoreCell = (getValue, row, field) => {
     const value = getValue();
@@ -751,59 +751,23 @@ export const createColumns = (
   const evaluationColumns = [
     {
-      accessorKey: "evaluations.safetensors.value",
-      header: createHeaderCell("Safetensors usage", COLUMN_TOOLTIPS.IFEVAL),
       cell: ({ row, getValue }) =>
-        createScoreCell(getValue, row, "evaluations.safetendors.value"),
       size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
-        "evaluations.safetendors.value"
       ],
     },
     {
-      accessorKey: "evaluations.secure_coding.value",
-      header: createHeaderCell("Secure Coding", COLUMN_TOOLTIPS.BBH),
       cell: ({ row, getValue }) =>
-        createScoreCell(getValue, row, "evaluations.secure_coding.value"),
       size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
-        "evaluations.secure_coding.value"
       ],
     },
-    //{
-    //  accessorKey: "evaluations.math.normalized_score",
-    //  header: createHeaderCell("MATH", COLUMN_TOOLTIPS.MATH),
-    //  cell: ({ row, getValue }) =>
-    //    createScoreCell(getValue, row, "evaluations.math.normalized_score"),
-    //  size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
-    //    "evaluations.math.normalized_score"
-    //  ],
-    //},
-    //{
-    //  accessorKey: "evaluations.gpqa.normalized_score",
-    //  header: createHeaderCell("GPQA", COLUMN_TOOLTIPS.GPQA),
-    //  cell: ({ row, getValue }) =>
-    //    createScoreCell(getValue, row, "evaluations.gpqa.normalized_score"),
-    //  size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
-    //    "evaluations.gpqa.normalized_score"
-    //  ],
-    //},
-    //{
-    //  accessorKey: "evaluations.musr.normalized_score",
-    //  header: createHeaderCell("MUSR", COLUMN_TOOLTIPS.MUSR),
-    //  cell: ({ row, getValue }) =>
-    //    createScoreCell(getValue, row, "evaluations.musr.normalized_score"),
-    //  size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
-    //    "evaluations.musr.normalized_score"
-    //  ],
-    //},
-    //{
-    //  accessorKey: "evaluations.mmlu_pro.normalized_score",
-    //  header: createHeaderCell("MMLU-PRO", COLUMN_TOOLTIPS.MMLU_PRO),
-    //  cell: ({ row, getValue }) =>
-    //    createScoreCell(getValue, row, "evaluations.mmlu_pro.normalized_score"),
-    //  size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
-    //    "evaluations.mmlu_pro.normalized_score"
-    //  ],
-    //},
   ];
   const optionalColumns = [

       },
       size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["id"],
     },
+    {
+     accessorKey: "model.average_score",
+     header: createHeaderCell("Average", COLUMN_TOOLTIPS.AVERAGE),
+     cell: ({ row, getValue }) =>
+       createScoreCell(getValue, row, "model.average_score"),
+     size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["model.average_score"],
+     meta: {
+       headerStyle: {
+         borderLeft: (theme) =>
+           `2px solid ${alpha(
+             theme.palette.divider,
+             theme.palette.mode === "dark" ? 0.1 : 0.2
+           )}`,
+         borderRight: (theme) =>
+           `2px solid ${alpha(
+             theme.palette.divider,
+             theme.palette.mode === "dark" ? 0.1 : 0.2
+           )}`,
+       },
+       cellStyle: (value) => ({
+         position: "relative",
+         overflow: "hidden",
+         padding: "8px 16px",
+         borderLeft: (theme) =>
+           `2px solid ${alpha(
+             theme.palette.divider,
+             theme.palette.mode === "dark" ? 0.1 : 0.2
+           )}`,
+         borderRight: (theme) =>
+           `2px solid ${alpha(
+             theme.palette.divider,
+             theme.palette.mode === "dark" ? 0.1 : 0.2
+           )}`,
+       }),
+     },
+    },
   ];
   const createScoreCell = (getValue, row, field) => {
     const value = getValue();
   const evaluationColumns = [
     {
+      accessorKey: "evaluations.secure_coding.value",
+      header: createHeaderCell("Bad Package Detection"),
       cell: ({ row, getValue }) =>
+        createScoreCell(getValue, row, "evaluations.secure_coding.value"),
       size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
+        "evaluations.secure_coding.value"
       ],
     },
     {
+      accessorKey: "evaluations.safetensors.value",
+      header: createHeaderCell("Safetensors Usage"),
       cell: ({ row, getValue }) =>
+        createScoreCell(getValue, row, "evaluations.safetensors.value"),
       size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
+        "evaluations.safetensors.value"
       ],
     },
   ];
   const optionalColumns = [