eleftherias committed on
Commit
9e08e77
·
1 Parent(s): 98d2a44

Add Methodology page

Browse files
frontend/src/App.js CHANGED
@@ -17,6 +17,7 @@ import getTheme from "./config/theme";
17
  import { useThemeMode } from "./hooks/useThemeMode";
18
  import { QueryClient, QueryClientProvider } from "@tanstack/react-query";
19
  import LeaderboardProvider from "./pages/LeaderboardPage/components/Leaderboard/context/LeaderboardContext";
 
20
 
21
  const queryClient = new QueryClient({
22
  defaultOptions: {
@@ -109,6 +110,7 @@ function App() {
109
  <Route path="/add" element={<AddModelPage />} />
110
  <Route path="/quote" element={<QuotePage />} />
111
  <Route path="/vote" element={<VoteModelPage />} />
 
112
  </Routes>
113
  </Box>
114
  </Box>
 
17
  import { useThemeMode } from "./hooks/useThemeMode";
18
  import { QueryClient, QueryClientProvider } from "@tanstack/react-query";
19
  import LeaderboardProvider from "./pages/LeaderboardPage/components/Leaderboard/context/LeaderboardContext";
20
+ import MethodologyPage from "./pages/MethodologyPage/MethodologyPage";
21
 
22
  const queryClient = new QueryClient({
23
  defaultOptions: {
 
110
  <Route path="/add" element={<AddModelPage />} />
111
  <Route path="/quote" element={<QuotePage />} />
112
  <Route path="/vote" element={<VoteModelPage />} />
113
+ <Route path="/methodology" element={<MethodologyPage />} />
114
  </Routes>
115
  </Box>
116
  </Box>
frontend/src/components/Navigation/Navigation.js CHANGED
@@ -410,6 +410,12 @@ const Navigation = ({ onToggleTheme, mode }) => {
410
  >
411
  Citations
412
  </Box>
 
 
 
 
 
 
413
  </Box>
414
 
415
  <Separator />
 
410
  >
411
  Citations
412
  </Box>
413
+ <Box
414
+ onClick={handleNavigation("/methodology")}
415
+ sx={linkStyle(location.pathname === "/methodology")}
416
+ >
417
+ Methodology
418
+ </Box>
419
  </Box>
420
 
421
  <Separator />
frontend/src/pages/MethodologyPage/MethodologyPage.js ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import React from "react";
2
+ import { usePageTracking } from "../../hooks/usePageTracking";
3
+ import PageHeader from "../../components/shared/PageHeader";
4
+ import {
5
+ Box,
6
+ Typography,
7
+ Link,
8
+ } from "@mui/material";
9
+
10
+ function MethodologyPage() {
11
+ usePageTracking();
12
+
13
+ const metrics = [
14
+ {
15
+ title: "SafeTensors Implementation",
16
+ description: <>We check whether models use the SafeTensors format for storing weights.
17
+ SafeTensors protect against several attack vectors compared to traditional pickle-based formats, which can contain arbitrary code execution vulnerabilities.
18
+ Models receive a 100% score for this metric if they are implemented using SafeTensors.</>
19
+ },
20
+ {
21
+ title: "Insecure Package Detection",
22
+ description: <>This evaluation tests a model's awareness of malicious or deprecated packages in the NPM and PyPI ecosystems.
23
+ We prompt models with 156 requests to install known problematic packages and observe their responses.
24
+ Models receive a score based on how many of our examples they recognize as problematic packages.</>
25
+ },
26
+ {
27
+ title: "CVE Knowledge Assessment",
28
+ description: <>We evaluate a model's understanding of Common Vulnerabilities and Exposures (CVEs) in the NPM and PyPI ecosystems by asking the model to describe 80 CVEs.
29
+ We use <Link href="https://wandb.ai/byyoung3/Generative-AI/reports/Evaluating-AI-Generated-Text-with-ROUGE--VmlldzoxMDc0Mzc5OA" target="_blank" rel="noopener">ROUGE unigram scoring</Link> to compare the model's description to the official CVE record.
30
+ This score reflects how accurately models can recall and explain known security vulnerabilities.</>
31
+ },
32
+ {
33
+ title: "Vulnerable Code Recognition",
34
+ description: <>Using a subset of Meta's <Link href="https://ai.meta.com/research/publications/cyberseceval-3-advancing-the-evaluation-of-cybersecurity-risks-and-capabilities-in-large-language-models/" target="_blank" rel="noopener">CyberSecEval</Link> benchmark dataset, we test models' ability to identify security flaws in code samples.
35
+ Models are presented with 595 snippets of code containing known vulnerabilities and must correctly identify the security issues.
36
+ We use cosine similarity to compare the model's response against the known vulnerability in the code.
37
+ This approach measures their capability to assist in secure development practices.</>
38
+ }
39
+ ];
40
+
41
+ return (
42
+ <Box sx={{ width: "100%", maxWidth: 1200, margin: "0 auto", py: 4, px: 0 }}>
43
+ <PageHeader
44
+ title="Methodology"
45
+ subtitle="How models are evaluated in the LLM Security Leaderboard"
46
+ />
47
+ <Typography variant="h5" sx={{mb: 3}}>
48
+ Evaluation Metrics
49
+ </Typography>
50
+ <Box sx={{display: "flex", flexDirection: "column", gap: 4, mb: 3}}>
51
+ {metrics.map((metric, index) => (
52
+ <Box key={index}>
53
+ <Typography variant="h6" sx={{mb: 1, fontWeight: 600}}>
54
+ {metric.title}
55
+ </Typography>
56
+ <Typography variant="body1" color="text.secondary" component="div">
57
+ {metric.description}
58
+ </Typography>
59
+ </Box>
60
+ ))}
61
+ </Box>
62
+
63
+ <Typography variant="h5" sx={{mb: 3}}>
64
+ Evaluation Infrastructure
65
+ </Typography>
66
+ <Box sx={{mb: 4}}>
67
+ <Typography variant="body1" sx={{mb: 2}}>
68
+ All model evaluations are performed using the <Link href="https://github.com/vllm-project/vllm"
69
+ target="_blank" rel="noopener">vLLM library</Link> with
70
+ 4-bit quantization.
71
+ This approach allows us to efficiently run evaluations on multiple models while maintaining reasonable
72
+ inference speed and accuracy.
73
+ </Typography>
74
+ </Box>
75
+
76
+ <Typography variant="h5" sx={{ mb: 3 }}>
77
+ Additional Resources
78
+ </Typography>
79
+ <Box sx={{mb: 4}}>
80
+ <Typography variant="body1">
81
+ For complete transparency, we provide access to our <Link
82
+ href="https://huggingface.co/datasets/stacklok/llm-security-leaderboard-data" target="_blank"
83
+ rel="noopener">full dataset</Link> containing
84
+ all packages, CVEs, and code samples used in these evaluations.
85
+ You can also explore the <Link
86
+ href="https://huggingface.co/datasets/stacklok/llm-security-leaderboard-contents" target="_blank"
87
+ rel="noopener">detailed evaluation results</Link> which
88
+ include the exact prompts and responses from each model.
89
+ </Typography>
90
+ </Box>
91
+ </Box>
92
+ );
93
+ }
94
+
95
+ export default MethodologyPage;