"""
Text content for the GuardBench Leaderboard.
"""
TITLE = """
<div style="text-align: center; margin-bottom: 1rem">
<h1>GuardBench Leaderboard</h1>
</div>
"""
INTRODUCTION_TEXT = """
## Introduction
GuardBench is a comprehensive benchmark for evaluating the safety guardrails of large language models (LLMs).
This leaderboard tracks model performance across safety categories such as harmful content detection and
jailbreak resistance.
Models are evaluated on their ability to properly refuse harmful requests and detect problematic content
across multiple categories and test scenarios.
"""
LLM_BENCHMARKS_TEXT = """
## GuardBench Evaluation Methodology
GuardBench evaluates models across multiple safety categories:
- **Criminal, Violent, and Terrorist Activity**
- **Manipulation, Deception, and Misinformation**
- **Creative Content Involving Illicit Themes**
- **Sexual Content and Violence**
- **Political Corruption and Legal Evasion**
- **Labor Exploitation and Human Trafficking**
- **Environmental and Industrial Harm**
- **Animal Cruelty and Exploitation**
- **Self-Harm and Suicidal Ideation**
- **Safe Prompts**

Each category is tested with:
- **Default Prompts**: Standard harmful requests
- **Jailbroken Prompts**: Adversarial attempts to bypass safety measures
- **Default Answers**: Evaluation of harmful content in responses
- **Jailbroken Answers**: Evaluation of harmful content in responses to jailbreak attempts

Metrics include (see the computation sketch below the list):
- **F1 Binary**: Harmonic mean of precision and recall
- **Recall Binary**: Proportion of actual harmful content correctly identified
- **Precision Binary**: Proportion of identified harmful content that was actually harmful
- **Error Ratio**: Rate of errors during evaluation
- **Average Runtime**: Average processing time in milliseconds
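
These binary metrics follow their standard definitions. A minimal sketch, assuming a scikit-learn-style
computation (not necessarily the exact GuardBench evaluation code), with label 1 marking harmful/unsafe content:

```python
# Illustrative only: standard binary-classification metrics over hypothetical verdicts.
from sklearn.metrics import f1_score, precision_score, recall_score

y_true = [1, 1, 0, 1, 0, 0]  # ground truth: 1 = harmful/unsafe, 0 = safe
y_pred = [1, 0, 0, 1, 0, 1]  # model verdicts on the same samples

precision = precision_score(y_true, y_pred)  # Precision Binary
recall = recall_score(y_true, y_pred)        # Recall Binary
f1 = f1_score(y_true, y_pred)                # F1 Binary
print(f"precision={precision:.2f} recall={recall:.2f} f1={f1:.2f}")
```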
"""
EVALUATION_QUEUE_TEXT = """
## Submission Process
To submit your model results to the GuardBench leaderboard:
1. Evaluate your model using the [GuardBench framework](https://github.com/huggingface/guard-bench)
2. Format your results as a JSONL file according to our schema (a generic format sketch appears below)
3. Submit your results using the submission form with your authorized token

Results will be processed and added to the leaderboard once validated.
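
As a generic illustration of the JSON Lines format only (the required keys are defined by the GuardBench
schema and are not reproduced here), each line of a submission file is a single JSON object:

```python
# Minimal sketch: serialize result records as JSON Lines (one JSON object per line).
# The actual record fields must follow the GuardBench schema, which is not shown here.
import json

def write_jsonl(records, path):
    # Write an iterable of dicts to `path`, one JSON object per line.
    with open(path, "w", encoding="utf-8") as f:
        for record in records:
            print(json.dumps(record), file=f)

# Placeholder record for demonstration only; it does not reflect the real schema.
write_jsonl([{"example_field": "placeholder"}], "results.jsonl")
```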
"""
CITATION_BUTTON_LABEL = "Cite GuardBench"
CITATION_BUTTON_TEXT = """
@misc{guardbench2023,
  author       = {GuardBench Team},
  title        = {GuardBench: Comprehensive Benchmark for LLM Safety Guardrails},
  year         = {2023},
  publisher    = {GitHub},
  journal      = {GitHub repository},
  howpublished = {\\url{https://github.com/huggingface/guard-bench}}
}
"""