apsys committed on
Commit b5d5c8b · 1 Parent(s): 0d1b00a

style theme

Files changed (2)
  1. app.py +4 -2
  2. src/about.py +5 -25
app.py CHANGED
@@ -731,6 +731,8 @@ with demo:
                 inputs=[column_selector],
                 outputs=[leaderboard] + [category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)]
             )
+            with gr.Row():
+                gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
         with gr.TabItem("Visualize", elem_id="guardbench-viz-tab", id=1):
             with gr.Row():
@@ -781,8 +783,8 @@ with demo:
                 outputs=[model_selector]
             )
 
-        with gr.TabItem("About", elem_id="guardbench-about-tab", id=2):
-            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+        # with gr.TabItem("About", elem_id="guardbench-about-tab", id=2):
+        #     gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
         with gr.TabItem("Submit", elem_id="guardbench-submit-tab", id=3):
            gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
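For orientation, here is a minimal sketch of the tab layout these two hunks produce. It assumes the surrounding `gr.Blocks`/`gr.Tabs` structure of the Space; the leaderboard tab label and `id=0`, the placeholder strings, and the Visualize tab body are assumptions, while the relocated `gr.Row`/`gr.Markdown` and the commented-out About tab come from the diff above.

```python
# Sketch only: reconstructs the layout after this commit, not the real app.py.
# The actual leaderboard components, selectors, and text constants are defined
# elsewhere in the Space (src/about.py and the rest of app.py).
import gradio as gr

# Placeholder constants; in the Space these come from src/about.py.
LLM_BENCHMARKS_TEXT = "## GuardBench evaluation methodology\n..."
EVALUATION_QUEUE_TEXT = "## Evaluation queue\n..."

with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("Leaderboard", id=0):  # tab label and id assumed
            # Leaderboard tables and column-selector wiring live here (unchanged).
            # New in this commit: the methodology text is rendered inside this
            # tab instead of a standalone About tab.
            with gr.Row():
                gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

        with gr.TabItem("Visualize", elem_id="guardbench-viz-tab", id=1):
            with gr.Row():
                gr.Markdown("Visualization controls go here.")  # placeholder

        # The About tab is commented out rather than deleted, so it can be
        # restored if a standalone page is wanted again:
        # with gr.TabItem("About", elem_id="guardbench-about-tab", id=2):
        #     gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

        with gr.TabItem("Submit", elem_id="guardbench-submit-tab", id=3):
            gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

if __name__ == "__main__":
    demo.launch()
```

The net effect is that `LLM_BENCHMARKS_TEXT` now appears below the leaderboard tables rather than on its own tab.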
src/about.py CHANGED
@@ -12,7 +12,7 @@ INTRODUCTION_TEXT = """
 ## Introduction
 
 GuardBench is a comprehensive benchmark for evaluating the safety guardrails of large language models (LLMs).
-This leaderboard tracks model performance across various safety categories, including harmful content detection,
+This leaderboard tracks model performance across various safety categories, including harmful content detection,
 jailbreak resistance, and more.
 
 Models are evaluated on their ability to properly refuse harmful requests and detect problematic content
@@ -20,33 +20,13 @@ across multiple categories and test scenarios.
 """
 
 LLM_BENCHMARKS_TEXT = """
-## GuardBench Evaluation Methodology
+## GuardBench evaluation methodology
 
-GuardBench evaluates models across multiple safety categories:
+GuardBench checks how well models handle safety challenges — from misinformation and self-harm to sexual content and corruption.
 
-- **Criminal, Violent, and Terrorist Activity**
-- **Manipulation, Deception, and Misinformation**
-- **Creative Content Involving Illicit Themes**
-- **Sexual Content and Violence**
-- **Political Corruption and Legal Evasion**
-- **Labor Exploitation and Human Trafficking**
-- **Environmental and Industrial Harm**
-- **Animal Cruelty and Exploitation**
-- **Self-Harm and Suicidal Ideation**
-- **Safe Prompts**
+Models are tested with regular and adversarial prompts to see if they can avoid saying harmful things.
 
-Each category is tested with:
-- **Default Prompts**: Standard harmful requests
-- **Jailbreaked Prompts**: Adversarial attempts to bypass safety measures
-- **Default Answers**: Evaluation of harmful content in responses
-- **Jailbreaked Answers**: Evaluation of harmful content in responses to jailbreak attempts
-
-Metrics include:
-- **F1 Binary**: Harmonic mean of precision and recall
-- **Recall Binary**: Proportion of actual harmful content correctly identified
-- **Precision Binary**: Proportion of identified harmful content that was actually harmful
-- **Error Ratio**: Rate of errors during evaluation
-- **Average Runtime**: Average processing time in milliseconds
+We track how accurate they are, how often they make mistakes, and how fast they respond.
 """
 
 EVALUATION_QUEUE_TEXT = """
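The metric names in the removed methodology text (F1 Binary, Recall Binary, Precision Binary, Error Ratio, Average Runtime) correspond to standard binary-classification definitions. The following is a minimal sketch of those definitions, assuming scikit-learn and made-up labels, predictions, timings, and error counts; it is not the leaderboard's actual scoring code.

```python
# Illustrative only: standard binary metrics matching the names in the removed
# about.py text. All data below is made up.
from sklearn.metrics import f1_score, precision_score, recall_score

# 1 = harmful content that should be flagged, 0 = safe.
y_true = [1, 1, 0, 1, 0, 0, 1, 0]          # ground-truth labels (made up)
y_pred = [1, 0, 0, 1, 0, 1, 1, 0]          # guard-model verdicts (made up)
runtimes_ms = [112.0, 98.5, 130.2, 101.7]  # per-request latencies (made up)
n_errors, n_requests = 1, 100              # evaluation failures (made up)

print("F1 Binary:       ", f1_score(y_true, y_pred))
print("Recall Binary:   ", recall_score(y_true, y_pred))
print("Precision Binary:", precision_score(y_true, y_pred))
print("Error Ratio:     ", n_errors / n_requests)
print("Average Runtime: ", sum(runtimes_ms) / len(runtimes_ms), "ms")
```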