apsys committed on
Commit b5d5c8b · 1 Parent(s): 0d1b00a

style theme

Files changed (2)
  1. app.py +4 -2
  2. src/about.py +5 -25
app.py CHANGED
@@ -731,6 +731,8 @@ with demo:
                 inputs=[column_selector],
                 outputs=[leaderboard] + [category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)]
             )
+            with gr.Row():
+                gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
         with gr.TabItem("Visualize", elem_id="guardbench-viz-tab", id=1):
             with gr.Row():
@@ -781,8 +783,8 @@ with demo:
                 outputs=[model_selector]
             )
 
-        with gr.TabItem("About", elem_id="guardbench-about-tab", id=2):
-            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+        # with gr.TabItem("About", elem_id="guardbench-about-tab", id=2):
+        #     gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
         with gr.TabItem("Submit", elem_id="guardbench-submit-tab", id=3):
            gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
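For orientation, here is a minimal sketch of the tab layout these two hunks produce. It assumes the surrounding `gr.Blocks`/`gr.Tabs` structure of the Space; the leaderboard tab label and `id=0`, the placeholder strings, and the Visualize tab body are assumptions, while the relocated `gr.Row`/`gr.Markdown` and the commented-out About tab come from the diff above.

```python
# Sketch only: reconstructs the layout after this commit, not the real app.py.
# The actual leaderboard components, selectors, and text constants are defined
# elsewhere in the Space (src/about.py and the rest of app.py).
import gradio as gr

# Placeholder constants; in the Space these come from src/about.py.
LLM_BENCHMARKS_TEXT = "## GuardBench evaluation methodology\n..."
EVALUATION_QUEUE_TEXT = "## Evaluation queue\n..."

with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("Leaderboard", id=0):  # tab label and id assumed
            # Leaderboard tables and column-selector wiring live here (unchanged).
            # New in this commit: the methodology text is rendered inside this
            # tab instead of a standalone About tab.
            with gr.Row():
                gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

        with gr.TabItem("Visualize", elem_id="guardbench-viz-tab", id=1):
            with gr.Row():
                gr.Markdown("Visualization controls go here.")  # placeholder

        # The About tab is commented out rather than deleted, so it can be
        # restored if a standalone page is wanted again:
        # with gr.TabItem("About", elem_id="guardbench-about-tab", id=2):
        #     gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

        with gr.TabItem("Submit", elem_id="guardbench-submit-tab", id=3):
            gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

if __name__ == "__main__":
    demo.launch()
```

The net effect is that `LLM_BENCHMARKS_TEXT` now appears below the leaderboard tables rather than on its own tab.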
src/about.py CHANGED
@@ -12,7 +12,7 @@ INTRODUCTION_TEXT = """
 ## Introduction
 
 GuardBench is a comprehensive benchmark for evaluating the safety guardrails of large language models (LLMs).
-This leaderboard tracks model performance across various safety categories, including harmful content detection,
+This leaderboard tracks model performance across various safety categories, including harmful content detection,
 jailbreak resistance, and more.
 
 Models are evaluated on their ability to properly refuse harmful requests and detect problematic content
@@ -20,33 +20,13 @@ across multiple categories and test scenarios.
 """
 
 LLM_BENCHMARKS_TEXT = """
-## GuardBench Evaluation Methodology
+## GuardBench evaluation methodology
 
-GuardBench evaluates models across multiple safety categories:
+GuardBench checks how well models handle safety challenges — from misinformation and self-harm to sexual content and corruption.
 
-- **Criminal, Violent, and Terrorist Activity**
-- **Manipulation, Deception, and Misinformation**
-- **Creative Content Involving Illicit Themes**
-- **Sexual Content and Violence**
-- **Political Corruption and Legal Evasion**
-- **Labor Exploitation and Human Trafficking**
-- **Environmental and Industrial Harm**
-- **Animal Cruelty and Exploitation**
-- **Self-Harm and Suicidal Ideation**
-- **Safe Prompts**
+Models are tested with regular and adversarial prompts to see if they can avoid saying harmful things.
 
-Each category is tested with:
-- **Default Prompts**: Standard harmful requests
-- **Jailbreaked Prompts**: Adversarial attempts to bypass safety measures
-- **Default Answers**: Evaluation of harmful content in responses
-- **Jailbreaked Answers**: Evaluation of harmful content in responses to jailbreak attempts
-
-Metrics include:
-- **F1 Binary**: Harmonic mean of precision and recall
-- **Recall Binary**: Proportion of actual harmful content correctly identified
-- **Precision Binary**: Proportion of identified harmful content that was actually harmful
-- **Error Ratio**: Rate of errors during evaluation
-- **Average Runtime**: Average processing time in milliseconds
+We track how accurate they are, how often they make mistakes, and how fast they respond.
 """
 
 EVALUATION_QUEUE_TEXT = """
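The metric names in the removed methodology text (F1 Binary, Recall Binary, Precision Binary, Error Ratio, Average Runtime) correspond to standard binary-classification definitions. The following is a minimal sketch of those definitions, assuming scikit-learn and made-up labels, predictions, timings, and error counts; it is not the leaderboard's actual scoring code.

```python
# Illustrative only: standard binary metrics matching the names in the removed
# about.py text. All data below is made up.
from sklearn.metrics import f1_score, precision_score, recall_score

# 1 = harmful content that should be flagged, 0 = safe.
y_true = [1, 1, 0, 1, 0, 0, 1, 0]          # ground-truth labels (made up)
y_pred = [1, 0, 0, 1, 0, 1, 1, 0]          # guard-model verdicts (made up)
runtimes_ms = [112.0, 98.5, 130.2, 101.7]  # per-request latencies (made up)
n_errors, n_requests = 1, 100              # evaluation failures (made up)

print("F1 Binary:       ", f1_score(y_true, y_pred))
print("Recall Binary:   ", recall_score(y_true, y_pred))
print("Precision Binary:", precision_score(y_true, y_pred))
print("Error Ratio:     ", n_errors / n_requests)
print("Average Runtime: ", sum(runtimes_ms) / len(runtimes_ms), "ms")
```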