Daniel Kantor committed

Commit dd9222d · 1 Parent(s): c2f7972

fix columns
backend/app/services/leaderboard.py CHANGED
@@ -116,38 +116,21 @@ class LeaderboardService:
             )
 
             # Create unique ID combining model name, precision, sha and chat template status
+
             unique_id = f"{data.get('fullname', 'Unknown')}_{data.get('Precision', 'Unknown')}_{data.get('Model sha', 'Unknown')}_{str(data.get('Chat Template', False))}"
 
+            print(data)
+
             evaluations = {
-                "ifeval": {
-                    "name": "IFEval",
-                    "value": data.get("IFEval Raw", 0),
-                    "normalized_score": data.get("IFEval", 0),
-                },
-                "bbh": {
-                    "name": "BBH",
-                    "value": data.get("BBH Raw", 0),
-                    "normalized_score": data.get("BBH", 0),
-                },
-                "math": {
-                    "name": "MATH Level 5",
-                    "value": data.get("MATH Lvl 5 Raw", 0),
-                    "normalized_score": data.get("MATH Lvl 5", 0),
-                },
-                "gpqa": {
-                    "name": "GPQA",
-                    "value": data.get("GPQA Raw", 0),
-                    "normalized_score": data.get("GPQA", 0),
-                },
-                "musr": {
-                    "name": "MUSR",
-                    "value": data.get("MUSR Raw", 0),
-                    "normalized_score": data.get("MUSR", 0),
+                "safetensors": {
+                    "name": "SafeTensors",
+                    "value": data.get("safetensors", 0),
+                    "normalized_score": data.get("safetensors", 0),
                 },
-                "mmlu_pro": {
-                    "name": "MMLU-PRO",
-                    "value": data.get("MMLU-PRO Raw", 0),
-                    "normalized_score": data.get("MMLU-PRO", 0),
+                "secure_coding": {
+                    "name": "Secure Coding",
+                    "value": data.get("secure_coding", 0),
+                    "normalized_score": data.get("secure_coding", 0),
                 },
             }
 
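For context, a minimal sketch of the per-model payload the frontend would receive after this change, assuming the API serializes the `evaluations` dict built above as-is. Only the `evaluations.safetensors` and `evaluations.secure_coding` fields come from the commit; the surrounding keys and the numeric values are illustrative.

// Sketch only: field names outside "evaluations" and all values are placeholders.
const exampleRow = {
  id: "org/model_bfloat16_abc123_True", // shape of unique_id built above (illustrative value)
  evaluations: {
    safetensors: { name: "SafeTensors", value: 1, normalized_score: 1 },
    secure_coding: { name: "Secure Coding", value: 0.87, normalized_score: 0.87 },
  },
};

The dotted accessor paths used by the frontend below ("evaluations.safetensors.value") are expected to resolve against this shape.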
docker-compose.yml CHANGED
@@ -19,10 +19,10 @@ services:
     build:
       context: ./frontend
       dockerfile: Dockerfile.dev
+      args:
+        - OAUTH_CLIENT_ID=${OAUTH_CLIENT_ID}
     ports:
       - "${FRONTEND_PORT:-7861}:7861"
-    args:
-      - OAUTH_CLIENT_ID=${OAUTH_CLIENT_ID}
     volumes:
       - ./frontend:/app
       - /app/node_modules
frontend/src/pages/LeaderboardPage/components/Leaderboard/components/Table/hooks/useDataProcessing.js CHANGED
@@ -37,6 +37,8 @@ export const useDataProcessing = (
   const processedData = useProcessedData(data, averageMode, visibleColumns);
   const columnVisibility = useColumnVisibility(visibleColumns);
 
+  console.log({visibleColumns});
+
   // Memoize filters
   const filterConfig = useMemo(
     () => ({
frontend/src/pages/LeaderboardPage/components/Leaderboard/constants/defaults.js CHANGED
@@ -117,49 +117,25 @@ const COLUMNS = {
       defaultVisible: true,
       label: "Model",
     },
-    "model.average_score": {
-      group: "fixed",
-      size: COLUMN_SIZES.AVERAGE_SCORE,
-      defaultVisible: true,
-      label: "Average Score",
-    },
+    //"model.average_score": {
+    //  group: "fixed",
+    //  size: COLUMN_SIZES.AVERAGE_SCORE,
+    //  defaultVisible: true,
+    //  label: "Average Score",
+    //},
   },
   EVALUATION: {
-    "evaluations.ifeval.normalized_score": {
-      group: "evaluation",
-      size: COLUMN_SIZES.BENCHMARK,
-      defaultVisible: true,
-      label: "IFEval",
-    },
-    "evaluations.bbh.normalized_score": {
-      group: "evaluation",
-      size: COLUMN_SIZES.BENCHMARK,
-      defaultVisible: true,
-      label: "BBH",
-    },
-    "evaluations.math.normalized_score": {
-      group: "evaluation",
-      size: COLUMN_SIZES.BENCHMARK,
-      defaultVisible: true,
-      label: "MATH",
-    },
-    "evaluations.gpqa.normalized_score": {
-      group: "evaluation",
-      size: COLUMN_SIZES.BENCHMARK,
-      defaultVisible: true,
-      label: "GPQA",
-    },
-    "evaluations.musr.normalized_score": {
+    "evaluations.safetensors.value": {
       group: "evaluation",
       size: COLUMN_SIZES.BENCHMARK,
       defaultVisible: true,
-      label: "MUSR",
+      label: "Safetensors usage",
     },
-    "evaluations.mmlu_pro.normalized_score": {
+    "evaluations.secure_coding.value": {
       group: "evaluation",
       size: COLUMN_SIZES.BENCHMARK,
       defaultVisible: true,
-      label: "MMLU-PRO",
+      label: "Secure coding practices",
     },
   },
   MODEL_INFO: {
@@ -373,8 +349,4 @@ export const SKELETON_COLUMNS = [
   COLUMN_SIZES.AVERAGE_SCORE, // Average score
   COLUMN_SIZES.BENCHMARK, // Benchmark 1
   COLUMN_SIZES.BENCHMARK, // Benchmark 2
-  COLUMN_SIZES.BENCHMARK, // Benchmark 3
-  COLUMN_SIZES.BENCHMARK, // Benchmark 4
-  COLUMN_SIZES.BENCHMARK, // Benchmark 5
-  COLUMN_SIZES.BENCHMARK, // Benchmark 6
 ];
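The new EVALUATION entries are keyed by accessor path and now point at `.value` rather than `.normalized_score`. A minimal sketch (not code from the repo; the reduction helper is hypothetical) of how such definitions can be flattened into the default visible-column list:

// Hypothetical illustration only; keys copied from the diff above.
const EVALUATION = {
  "evaluations.safetensors.value": { group: "evaluation", defaultVisible: true },
  "evaluations.secure_coding.value": { group: "evaluation", defaultVisible: true },
};

const defaultVisible = Object.entries(EVALUATION)
  .filter(([, def]) => def.defaultVisible)
  .map(([key]) => key);
// -> ["evaluations.safetensors.value", "evaluations.secure_coding.value"]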
frontend/src/pages/LeaderboardPage/components/Leaderboard/constants/tooltips.js CHANGED
@@ -29,245 +29,235 @@ const createTooltipContent = (title, items) => (
 );
 
 export const COLUMN_TOOLTIPS = {
-  AVERAGE: createTooltipContent("Average score across all benchmarks:", [
-    {
-      label: "Calculation",
-      description: "Weighted average of normalized scores from all benchmarks",
-      subItems: [
-        "Each benchmark is normalized to a 0-100 scale",
-        "All normalised benchmarks are then averaged together",
-      ],
-    },
-  ]),
-
-  IFEVAL: createTooltipContent("Instruction-Following Evaluation (IFEval):", [
-    {
-      label: "Purpose",
-      description:
-        "Tests model's ability to follow explicit formatting instructions",
-      subItems: ["Instruction following", "Formatting", "Generation"],
-    },
-    {
-      label: "Scoring: Accuracy",
-      description: "Was the format asked for strictly respected.",
-    },
-  ]),
-
-  BBH: createTooltipContent("Big Bench Hard (BBH):", [
-    {
-      label: "Overview",
-      description: "Collection of challenging for LLM tasks across domains, for example",
-      subItems: [
-        "Language understanding",
-        "Mathematical reasoning",
-        "Common sense and world knowledge",
-      ],
-    },
-    {
-      label: "Scoring: Accuracy",
-      description:
-        "Was the correct choice selected among the options.",
-    },
-  ]),
-
-  MATH: createTooltipContent(
-    "Mathematics Aptitude Test of Heuristics (MATH), level 5:",
-    [
-      {
-        label: "Content",
-        description: "High school level competitions mathematical problems",
-        subItems: ["Complex algebra", "Geometry problems", "Advanced calculus"],
-      },
-      {
-        label: "Scoring: Exact match",
-        description:
-          "Was the solution generated correct and in the expected format",
-      },
-    ]
-  ),
-
-  GPQA: createTooltipContent("Graduate-Level Google-Proof Q&A (GPQA):", [
-    {
-      label: "Focus",
-      description: "PhD-level knowledge multiple choice questions in science",
-      subItems: [
-        "Chemistry",
-        "Biology",
-        "Physics",
-      ],
-    },
-    {
-      label: "Scoring: Accuracy",
-      description:
-        "Was the correct choice selected among the options.",
-    },
-  ]),
-
-  MUSR: createTooltipContent("Multistep Soft Reasoning (MuSR):", [
-    {
-      label: "Scope",
-      description: "Reasoning and understanding on/of long texts",
-      subItems: [
-        "Language understanding",
-        "Reasoning capabilities",
-        "Long context reasoning",
-      ],
-    },
-    {
-      label: "Scoring: Accuracy",
-      description:
-        "Was the correct choice selected among the options.",
-    },
-  ]),
-
-  MMLU_PRO: createTooltipContent(
-    "Massive Multitask Language Understanding - Professional (MMLU-Pro):",
-    [
-      {
-        label: "Coverage",
-        description: "Expertly reviewed multichoice questions across domains, for example:",
-        subItems: [
-          "Medicine and healthcare",
-          "Law and ethics",
-          "Engineering",
-          "Mathematics",
-        ],
-      },
-      {
-        label: "Scoring: Accuracy",
-        description:
-          "Was the correct choice selected among the options.",
-      },
-    ]
-  ),
-
-  ARCHITECTURE: createTooltipContent("Model Architecture Information:", [
-    {
-      label: "Definition",
-      description: "The fundamental structure and design of the model",
-      subItems: [
-        "Pretrained: Foundational models, initially trained on large datasets without task-specific tuning, serving as a versatile base for further development.",
-        "Continuously Pretrained: Base models trained with a data mix evolving as the model is trained, with the addition of specialized data during the last training steps.",
-        "Fine-tuned: Base models, fine-tuned on specialised domain data (legal, medical, ...), and optimized for particular tasks.",
-        "Chat: Models fine-tuned with IFT, RLHF, DPO, and other techniques, to handle conversational contexts effectively.",
-        "Merged: Combining multiple models through weights averaging or similar methods.",
-        "Multimodal: Models which can handle several modalities (text & image/audio/video/...). We only evaluate the text capabilities.",
-      ],
-    },
-    {
-      label: "Impact",
-      description: "How architecture affects model capabilities",
-      subItems: [
-        "Base models are expected to perform less well on instruction following evaluations, like IFEval.",
-        "Fine-tuned and chat models can be more verbose and more chatty than base models.",
-        "Merged models tend to exhibit good performance on benchmarks, which do not translate to real-world situations.",
-      ],
-    },
-  ]),
-
-  PRECISION: createTooltipContent("Numerical Precision Format:", [
-    {
-      label: "Overview",
-      description:
-        "Data format used to store model weights and perform computations",
-      subItems: [
-        "bfloat16: Half precision (Brain Float format), good for stability",
-        "float16: Half precision",
-        "8bit/4bit: Quantized formats, for efficiency",
-        "GPTQ/AWQ: Quantized methods",
-      ],
-    },
-    {
-      label: "Impact",
-      description: "How precision affects model deployment",
-      subItems: [
-        "Higher precision = better accuracy but more memory usage",
-        "Lower precision = faster inference and smaller size",
-        "Trade-off between model quality and resource usage",
-      ],
-    },
-  ]),
-
-  FLAGS: createTooltipContent("Model Flags and Special Features:", [
-    {
-      label: "Filters",
-      subItems: [
-        "Mixture of Expert: Uses a MoE architecture",
-        "Merged models: Created by averaging other models",
-        "Contaminated: Flagged by users from the community for (possibly accidental) cheating",
-        "Unavailable: No longer on the hub (private, deleted) or missing a license tag",
-      ],
-    },
-    {
-      label: "Purpose",
-      description: "Why do people want to hide these models?",
-      subItems: [
-        "Mixture of Experts: These models can be too parameter heavy",
-        "Merged models: Performance on benchmarks tend to be inflated compared to real life usage",
-        "Contaminated: Performance on benchmarks is inflated and not reflecting real life usage",
-      ],
-    },
-  ]),
-
-  PARAMETERS: createTooltipContent("Model Parameters:", [
-    {
-      label: "Measurement",
-      description: "Total number of trainable parameters in billions",
-      subItems: [
-        "Indicates model capacity and complexity",
-        "Correlates with computational requirements",
-        "Influences memory usage and inference speed",
-      ],
-    },
-  ]),
-
-  LICENSE: createTooltipContent("Model License Information:", [
-    {
-      label: "Importance",
-      description: "Legal terms governing model usage and distribution",
-      subItems: [
-        "Commercial vs non-commercial use",
-        "Attribution requirements",
-        "Modification and redistribution rights",
-        "Liability and warranty terms",
-      ],
-    },
-  ]),
-
-  CO2_COST: createTooltipContent("Carbon Dioxide Emissions:", [
-    {
-      label: "What is it?",
-      description: "CO₂ emissions of the model evaluation ",
-      subItems: [
-        "Only focuses on model inference for our specific setup",
-        "Considers data center location and energy mix",
-        "Allows equivalent comparision of models on our use case",
-      ],
-    },
-    {
-      label: "Why it matters",
-      description: "Environmental impact of AI model training",
-      subItems: [
-        "Large models can have significant carbon footprints",
-        "Helps make informed choices about model selection",
-      ],
-    },
-    {
-      label: "Learn more",
-      description:
-        "For detailed information about our CO₂ calculation methodology, visit:",
-      subItems: [
-        <a
-          href="https://huggingface.co/docs/leaderboards/open_llm_leaderboard/emissions"
-          target="_blank"
-          rel="noopener noreferrer"
-          style={{ color: "#90caf9" }}
-        >
-          Carbon Emissions Documentation ↗
-        </a>,
-      ],
-    },
-  ]),
+  //AVERAGE: createTooltipContent("Average score across all benchmarks:", [
+  //  {
+  //    label: "Calculation",
+  //    description: "Weighted average of normalized scores from all benchmarks",
+  //    subItems: [
+  //      "Each benchmark is normalized to a 0-100 scale",
+  //      "All normalised benchmarks are then averaged together",
+  //    ],
+  //  },
+  //]),
+  //
+  //IFEVAL: createTooltipContent("Instruction-Following Evaluation (IFEval):", [
+  //  {
+  //    label: "Purpose",
+  //    description:
+  //      "Tests model's ability to follow explicit formatting instructions",
+  //    subItems: ["Instruction following", "Formatting", "Generation"],
+  //  },
+  //  {
+  //    label: "Scoring: Accuracy",
+  //    description: "Was the format asked for strictly respected.",
+  //  },
+  //]),
+  //
+  //  {
+  //    label: "Scoring: Accuracy",
+  //    description:
+  //      "Was the correct choice selected among the options.",
+  //  },
+  //]),
+  //
+  //MATH: createTooltipContent(
+  //  "Mathematics Aptitude Test of Heuristics (MATH), level 5:",
+  //  [
+  //    {
+  //      label: "Content",
+  //      description: "High school level competitions mathematical problems",
+  //      subItems: ["Complex algebra", "Geometry problems", "Advanced calculus"],
+  //    },
+  //    {
+  //      label: "Scoring: Exact match",
+  //      description:
+  //        "Was the solution generated correct and in the expected format",
+  //    },
+  //  ]
+  //),
+  //
+  //GPQA: createTooltipContent("Graduate-Level Google-Proof Q&A (GPQA):", [
+  //  {
+  //    label: "Focus",
+  //    description: "PhD-level knowledge multiple choice questions in science",
+  //    subItems: [
+  //      "Chemistry",
+  //      "Biology",
+  //      "Physics",
+  //    ],
+  //  },
+  //  {
+  //    label: "Scoring: Accuracy",
+  //    description:
+  //      "Was the correct choice selected among the options.",
+  //  },
+  //]),
+  //
+  //MUSR: createTooltipContent("Multistep Soft Reasoning (MuSR):", [
+  //  {
+  //    label: "Scope",
+  //    description: "Reasoning and understanding on/of long texts",
+  //    subItems: [
+  //      "Language understanding",
+  //      "Reasoning capabilities",
+  //      "Long context reasoning",
+  //    ],
+  //  },
+  //  {
+  //    label: "Scoring: Accuracy",
+  //    description:
+  //      "Was the correct choice selected among the options.",
+  //  },
+  //]),
+  //
+  //MMLU_PRO: createTooltipContent(
+  //  "Massive Multitask Language Understanding - Professional (MMLU-Pro):",
+  //  [
+  //    {
+  //      label: "Coverage",
+  //      description: "Expertly reviewed multichoice questions across domains, for example:",
+  //      subItems: [
+  //        "Medicine and healthcare",
+  //        "Law and ethics",
+  //        "Engineering",
+  //        "Mathematics",
+  //      ],
+  //      },
+  //    {
+  //      label: "Scoring: Accuracy",
+  //      description:
+  //        "Was the correct choice selected among the options.",
+  //    },
+  //  ]
+  //),
+  //
+  //ARCHITECTURE: createTooltipContent("Model Architecture Information:", [
+  //  {
+  //    label: "Definition",
+  //    description: "The fundamental structure and design of the model",
+  //    subItems: [
+  //      "Pretrained: Foundational models, initially trained on large datasets without task-specific tuning, serving as a versatile base for further development.",
+  //      "Continuously Pretrained: Base models trained with a data mix evolving as the model is trained, with the addition of specialized data during the last training steps.",
+  //      "Fine-tuned: Base models, fine-tuned on specialised domain data (legal, medical, ...), and optimized for particular tasks.",
+  //      "Chat: Models fine-tuned with IFT, RLHF, DPO, and other techniques, to handle conversational contexts effectively.",
+  //      "Merged: Combining multiple models through weights averaging or similar methods.",
+  //      "Multimodal: Models which can handle several modalities (text & image/audio/video/...). We only evaluate the text capabilities.",
+  //    ],
+  //  },
+  //  {
+  //    label: "Impact",
+  //    description: "How architecture affects model capabilities",
+  //    subItems: [
+  //      "Base models are expected to perform less well on instruction following evaluations, like IFEval.",
+  //      "Fine-tuned and chat models can be more verbose and more chatty than base models.",
+  //      "Merged models tend to exhibit good performance on benchmarks, which do not translate to real-world situations.",
+  //    ],
+  //  },
+  //]),
+  //
+  //PRECISION: createTooltipContent("Numerical Precision Format:", [
+  //  {
+  //    label: "Overview",
+  //    description:
+  //      "Data format used to store model weights and perform computations",
+  //    subItems: [
+  //      "bfloat16: Half precision (Brain Float format), good for stability",
+  //      "float16: Half precision",
+  //      "8bit/4bit: Quantized formats, for efficiency",
+  //      "GPTQ/AWQ: Quantized methods",
+  //    ],
+  //  },
+  //  {
+  //    label: "Impact",
+  //    description: "How precision affects model deployment",
+  //    subItems: [
+  //      "Higher precision = better accuracy but more memory usage",
+  //      "Lower precision = faster inference and smaller size",
+  //      "Trade-off between model quality and resource usage",
+  //    ],
+  //  },
+  //]),
+  //
+  //FLAGS: createTooltipContent("Model Flags and Special Features:", [
+  //  {
+  //    label: "Filters",
+  //    subItems: [
+  //      "Mixture of Expert: Uses a MoE architecture",
+  //      "Merged models: Created by averaging other models",
+  //      "Contaminated: Flagged by users from the community for (possibly accidental) cheating",
+  //      "Unavailable: No longer on the hub (private, deleted) or missing a license tag",
+  //    ],
+  //  },
+  //  {
+  //    label: "Purpose",
+  //    description: "Why do people want to hide these models?",
+  //    subItems: [
+  //      "Mixture of Experts: These models can be too parameter heavy",
+  //      "Merged models: Performance on benchmarks tend to be inflated compared to real life usage",
+  //      "Contaminated: Performance on benchmarks is inflated and not reflecting real life usage",
+  //    ],
+  //  },
+  //]),
+  //
+  //PARAMETERS: createTooltipContent("Model Parameters:", [
+  //  {
+  //    label: "Measurement",
+  //    description: "Total number of trainable parameters in billions",
+  //    subItems: [
+  //      "Indicates model capacity and complexity",
+  //      "Correlates with computational requirements",
+  //      "Influences memory usage and inference speed",
+  //    ],
+  //  },
+  //]),
+  //
+  //LICENSE: createTooltipContent("Model License Information:", [
+  //  {
+  //    label: "Importance",
+  //    description: "Legal terms governing model usage and distribution",
+  //    subItems: [
+  //      "Commercial vs non-commercial use",
+  //      "Attribution requirements",
+  //      "Modification and redistribution rights",
+  //      "Liability and warranty terms",
+  //    ],
+  //  },
+  //]),
+  //
+  //CO2_COST: createTooltipContent("Carbon Dioxide Emissions:", [
+  //  {
+  //    label: "What is it?",
+  //    description: "CO₂ emissions of the model evaluation ",
+  //    subItems: [
+  //      "Only focuses on model inference for our specific setup",
+  //      "Considers data center location and energy mix",
+  //      "Allows equivalent comparision of models on our use case",
+  //    ],
+  //  },
+  //  {
+  //    label: "Why it matters",
+  //    description: "Environmental impact of AI model training",
+  //    subItems: [
+  //      "Large models can have significant carbon footprints",
+  //      "Helps make informed choices about model selection",
+  //    ],
+  //  },
+  //  {
+  //    label: "Learn more",
+  //    description:
+  //      "For detailed information about our CO₂ calculation methodology, visit:",
+  //    subItems: [
+  //      <a
+  //        href="https://huggingface.co/docs/leaderboards/open_llm_leaderboard/emissions"
+  //        target="_blank"
+  //        rel="noopener noreferrer"
+  //        style={{ color: "#90caf9" }}
+  //      >
+  //        Carbon Emissions Documentation ↗
+  //      </a>,
+  //    ],
+  //  },
+  //]),
 };
 
 export const UI_TOOLTIPS = {
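With every entry of COLUMN_TOOLTIPS commented out, lookups such as COLUMN_TOOLTIPS.IFEVAL (reused for the new column headers in columnUtils.js below) will evaluate to undefined. A possible follow-up, sketched with the createTooltipContent(title, items) helper defined at the top of this file; the keys and wording are assumptions, not part of this commit.

// Hypothetical tooltip entries for the two new columns (wording is placeholder),
// built with the createTooltipContent(title, items) helper from this file.
const NEW_COLUMN_TOOLTIPS = {
  SAFETENSORS: createTooltipContent("Safetensors usage:", [
    {
      label: "Purpose",
      description: "Whether the model publishes its weights in the safetensors format",
      subItems: ["Safer to load than pickle-based checkpoints"],
    },
  ]),
  SECURE_CODING: createTooltipContent("Secure coding practices:", [
    {
      label: "Purpose",
      description: "How well code generated by the model follows secure coding practices",
    },
  ]),
};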
frontend/src/pages/LeaderboardPage/components/Leaderboard/hooks/useLeaderboardData.js CHANGED
@@ -62,10 +62,11 @@ export const useLeaderboardData = () => {
 export const useLeaderboardProcessing = () => {
   const { state, actions } = useLeaderboard();
   const [sorting, setSorting] = useState([
-    { id: "model.average_score", desc: true },
+    { id: "id", desc: true },
   ]);
 
   const memoizedData = useMemo(() => state.models, [state.models]);
+  console.log({memoizedData});
   const memoizedFilters = useMemo(
     () => ({
       search: state.filters.search,
@@ -112,6 +113,8 @@ export const useLeaderboardProcessing = () => {
     memoizedFilters.isOfficialProviderActive
   );
 
+  console.log({columns});
+
   return {
     table,
     minAverage,
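The initial sort id changes from "model.average_score" (a column this commit comments out) to "id"; the sorting id must name a column that still exists, otherwise the initial sort is silently ignored. A minimal sketch of that assumption, written against a TanStack-style table hook; the hook wiring is illustrative, not taken from this repo.

import { useState } from "react";
import {
  useReactTable,
  getCoreRowModel,
  getSortedRowModel,
} from "@tanstack/react-table";

// Illustrative hook only: shows how the new initial sort state would be fed
// into a table, assuming @tanstack/react-table is used downstream.
export function useSortedLeaderboardTable(data, columns) {
  // Initial sort now targets the "id" column, since "model.average_score" was removed.
  const [sorting, setSorting] = useState([{ id: "id", desc: true }]);
  return useReactTable({
    data,
    columns,
    state: { sorting },
    onSortingChange: setSorting,
    getCoreRowModel: getCoreRowModel(),
    getSortedRowModel: getSortedRowModel(),
  });
}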
frontend/src/pages/LeaderboardPage/components/Leaderboard/utils/columnUtils.js CHANGED
@@ -639,42 +639,42 @@ export const createColumns = (
       },
       size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["id"],
     },
-    {
-      accessorKey: "model.average_score",
-      header: createHeaderCell("Average", COLUMN_TOOLTIPS.AVERAGE),
-      cell: ({ row, getValue }) =>
-        createScoreCell(getValue, row, "model.average_score"),
-      size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["model.average_score"],
-      meta: {
-        headerStyle: {
-          borderLeft: (theme) =>
-            `2px solid ${alpha(
-              theme.palette.divider,
-              theme.palette.mode === "dark" ? 0.1 : 0.2
-            )}`,
-          borderRight: (theme) =>
-            `2px solid ${alpha(
-              theme.palette.divider,
-              theme.palette.mode === "dark" ? 0.1 : 0.2
-            )}`,
-        },
-        cellStyle: (value) => ({
-          position: "relative",
-          overflow: "hidden",
-          padding: "8px 16px",
-          borderLeft: (theme) =>
-            `2px solid ${alpha(
-              theme.palette.divider,
-              theme.palette.mode === "dark" ? 0.1 : 0.2
-            )}`,
-          borderRight: (theme) =>
-            `2px solid ${alpha(
-              theme.palette.divider,
-              theme.palette.mode === "dark" ? 0.1 : 0.2
-            )}`,
-        }),
-      },
-    },
+    //{
+    //  accessorKey: "model.average_score",
+    //  header: createHeaderCell("Average", COLUMN_TOOLTIPS.AVERAGE),
+    //  cell: ({ row, getValue }) =>
+    //    createScoreCell(getValue, row, "model.average_score"),
+    //  size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["model.average_score"],
+    //  meta: {
+    //    headerStyle: {
+    //      borderLeft: (theme) =>
+    //        `2px solid ${alpha(
+    //          theme.palette.divider,
+    //          theme.palette.mode === "dark" ? 0.1 : 0.2
+    //        )}`,
+    //      borderRight: (theme) =>
+    //        `2px solid ${alpha(
+    //          theme.palette.divider,
+    //          theme.palette.mode === "dark" ? 0.1 : 0.2
+    //        )}`,
+    //    },
+    //    cellStyle: (value) => ({
+    //      position: "relative",
+    //      overflow: "hidden",
+    //      padding: "8px 16px",
+    //      borderLeft: (theme) =>
+    //        `2px solid ${alpha(
+    //          theme.palette.divider,
+    //          theme.palette.mode === "dark" ? 0.1 : 0.2
+    //        )}`,
+    //      borderRight: (theme) =>
+    //        `2px solid ${alpha(
+    //          theme.palette.divider,
+    //          theme.palette.mode === "dark" ? 0.1 : 0.2
+    //        )}`,
+    //    }),
+    //  },
+    //},
   ];
   const createScoreCell = (getValue, row, field) => {
     const value = getValue();
@@ -751,59 +751,59 @@ export const createColumns = (
 
   const evaluationColumns = [
     {
-      accessorKey: "evaluations.ifeval.normalized_score",
-      header: createHeaderCell("IFEval", COLUMN_TOOLTIPS.IFEVAL),
+      accessorKey: "evaluations.safetensors.value",
+      header: createHeaderCell("Safetensors usage", COLUMN_TOOLTIPS.IFEVAL),
       cell: ({ row, getValue }) =>
-        createScoreCell(getValue, row, "evaluations.ifeval.normalized_score"),
+        createScoreCell(getValue, row, "evaluations.safetendors.value"),
       size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
-        "evaluations.ifeval.normalized_score"
+        "evaluations.safetendors.value"
       ],
     },
     {
-      accessorKey: "evaluations.bbh.normalized_score",
-      header: createHeaderCell("BBH", COLUMN_TOOLTIPS.BBH),
+      accessorKey: "evaluations.secure_coding.value",
+      header: createHeaderCell("Secure Coding", COLUMN_TOOLTIPS.BBH),
       cell: ({ row, getValue }) =>
-        createScoreCell(getValue, row, "evaluations.bbh.normalized_score"),
+        createScoreCell(getValue, row, "evaluations.secure_coding.value"),
      size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
-        "evaluations.bbh.normalized_score"
+        "evaluations.secure_coding.value"
      ],
    },
-    {
-      accessorKey: "evaluations.math.normalized_score",
-      header: createHeaderCell("MATH", COLUMN_TOOLTIPS.MATH),
-      cell: ({ row, getValue }) =>
-        createScoreCell(getValue, row, "evaluations.math.normalized_score"),
-      size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
-        "evaluations.math.normalized_score"
-      ],
-    },
-    {
-      accessorKey: "evaluations.gpqa.normalized_score",
-      header: createHeaderCell("GPQA", COLUMN_TOOLTIPS.GPQA),
-      cell: ({ row, getValue }) =>
-        createScoreCell(getValue, row, "evaluations.gpqa.normalized_score"),
-      size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
-        "evaluations.gpqa.normalized_score"
-      ],
-    },
-    {
-      accessorKey: "evaluations.musr.normalized_score",
-      header: createHeaderCell("MUSR", COLUMN_TOOLTIPS.MUSR),
-      cell: ({ row, getValue }) =>
-        createScoreCell(getValue, row, "evaluations.musr.normalized_score"),
-      size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
-        "evaluations.musr.normalized_score"
-      ],
-    },
-    {
-      accessorKey: "evaluations.mmlu_pro.normalized_score",
-      header: createHeaderCell("MMLU-PRO", COLUMN_TOOLTIPS.MMLU_PRO),
-      cell: ({ row, getValue }) =>
-        createScoreCell(getValue, row, "evaluations.mmlu_pro.normalized_score"),
-      size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
-        "evaluations.mmlu_pro.normalized_score"
-      ],
-    },
+    //{
+    //  accessorKey: "evaluations.math.normalized_score",
+    //  header: createHeaderCell("MATH", COLUMN_TOOLTIPS.MATH),
+    //  cell: ({ row, getValue }) =>
+    //    createScoreCell(getValue, row, "evaluations.math.normalized_score"),
+    //  size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
+    //    "evaluations.math.normalized_score"
+    //  ],
+    //},
+    //{
+    //  accessorKey: "evaluations.gpqa.normalized_score",
+    //  header: createHeaderCell("GPQA", COLUMN_TOOLTIPS.GPQA),
+    //  cell: ({ row, getValue }) =>
+    //    createScoreCell(getValue, row, "evaluations.gpqa.normalized_score"),
+    //  size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
+    //    "evaluations.gpqa.normalized_score"
+    //  ],
+    //},
+    //{
+    //  accessorKey: "evaluations.musr.normalized_score",
+    //  header: createHeaderCell("MUSR", COLUMN_TOOLTIPS.MUSR),
+    //  cell: ({ row, getValue }) =>
+    //    createScoreCell(getValue, row, "evaluations.musr.normalized_score"),
+    //  size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
+    //    "evaluations.musr.normalized_score"
+    //  ],
+    //},
+    //{
+    //  accessorKey: "evaluations.mmlu_pro.normalized_score",
+    //  header: createHeaderCell("MMLU-PRO", COLUMN_TOOLTIPS.MMLU_PRO),
+    //  cell: ({ row, getValue }) =>
+    //    createScoreCell(getValue, row, "evaluations.mmlu_pro.normalized_score"),
+    //  size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
+    //    "evaluations.mmlu_pro.normalized_score"
+    //  ],
+    //},
   ];
 
   const optionalColumns = [
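Note that the new accessorKey and the defaults.js entry both read "evaluations.safetensors.value", while the createScoreCell field and the COLUMN_SIZES lookup read "evaluations.safetendors.value". If that spelling difference is unintentional, a consistent version of the column would look like the sketch below; it also assumes a dedicated SAFETENSORS tooltip entry, neither of which is part of this commit.

// Hypothetical correction: one consistent accessor path throughout the column.
{
  accessorKey: "evaluations.safetensors.value",
  header: createHeaderCell("Safetensors usage", COLUMN_TOOLTIPS.SAFETENSORS),
  cell: ({ row, getValue }) =>
    createScoreCell(getValue, row, "evaluations.safetensors.value"),
  size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["evaluations.safetensors.value"],
},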