import gradio as gr
import os
import spaces
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
from threading import Thread
import re
import torch
# Set environment variables
HF_TOKEN = os.environ.get("HF_TOKEN", None)
DESCRIPTION = '''
<div>
<h1 style="text-align: center;">JudgeLRM</h1>
<p>This Space demonstrates the <a href="https://huggingface.co/nuojohnchen/JudgeLRM-7B"><b>JudgeLRM</b></a> model, designed to evaluate the quality of two AI assistant responses. JudgeLRM is a family of judgment-oriented LLMs trained with reinforcement learning (RL) using judge-wise, outcome-driven rewards. JudgeLRM models consistently outperform both SFT-tuned and state-of-the-art reasoning models: notably, JudgeLRM-3B surpasses GPT-4, and JudgeLRM-7B outperforms DeepSeek-R1 by 2.79% in F1 score, particularly excelling in judge tasks that require deep reasoning.</p>
<p>Enter an instruction and two responses, and the model will reason about them and score each on a scale of 1-10 (higher is better).</p>
<p>You can also select Hugging Face models to automatically generate responses for evaluation.</p>
</div>
'''
LICENSE = """
<div style="font-family: monospace; white-space: pre; margin-top: 20px; line-height: 1.2;">
@misc{nuo2025judgelrm,
title={JudgeLRM: Large Reasoning Models as a Judge},
author={Nuo Chen and Zhiyuan Hu and Qingyun Zou and Jiaying Wu and Qian Wang and Bryan Hooi and Bingsheng He},
year={2025},
eprint={2504.00050},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2504.00050},
}
@misc{wang2025assessingjudgingbias,
title = {Assessing Judging Bias in Large Reasoning Models: An Empirical Study},
author = {Qian Wang and Zhanzhi Lou and Zhenheng Tang and Nuo Chen and Xuandong Zhao and Wenxuan Zhang and Dawn Song and Bingsheng He},
year={2025},
eprint={2504.09946},
archivePrefix={arXiv},
primaryClass={cs.CY},
url={https://arxiv.org/abs/2504.09946},
}
</div>
"""
PLACEHOLDER = """
<div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
<h1 style="font-size: 28px; margin-bottom: 2px; opacity: 0.55;">JudgeLRM</h1>
<p style="font-size: 18px; margin-bottom: 2px; opacity: 0.65;">Enter an instruction and two responses, and I'll evaluate their quality...</p>
</div>
"""
css = """
h1 {
text-align: center;
display: block;
}
#duplicate-button {
margin: auto;
color: white;
background: #1565c0;
border-radius: 100vh;
}
"""
# Model paths
MODEL_PATHS = {
"JudgeLRM-3B": "nuojohnchen/JudgeLRM-3B",
"JudgeLRM-7B": "nuojohnchen/JudgeLRM-7B"
}
# Popular models for dropdown selection
POPULAR_MODELS = [
"Qwen/Qwen2.5-7B-Instruct",
"01-ai/Yi-6B-Chat",
"FreedomIntelligence/Apollo-7B",
"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
"openchat/openchat-3.5-0106"
]
# Global variables for model and tokenizer
tokenizer = None
model = None
current_model_name = None
def extract_scores(text):
"""Extract scores from generated text"""
    # Expect two adjacent <answer>N</answer> tags; tolerate optional whitespace around the digits and between the tags
    pattern = r'<answer>\s*(\d+)\s*</answer>\s*<answer>\s*(\d+)\s*</answer>'
match = re.search(pattern, text)
if match:
return int(match.group(1)), int(match.group(2))
return None, None
# Function to determine which model path to use
def get_model_path(dropdown_value, custom_value):
"""Return custom value if provided, otherwise return dropdown value"""
if custom_value and custom_value.strip():
return custom_value.strip()
return dropdown_value
# Function to generate response from a model
def generate_response(instruction, model_path, progress=gr.Progress()):
"""Generate a response from a specified model"""
progress(0, desc=f"Loading model {model_path}...")
try:
# Load model and tokenizer
response_tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
response_model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", torch_dtype=torch.float16)
progress(0.5, desc=f"Generating response from {model_path}...")
# Create prompt (simple format, adjust as needed for specific models)
prompt = f"<|user|>\n{instruction}\n<|assistant|>"
# Generate response
input_ids = response_tokenizer.encode(prompt, return_tensors="pt").to(response_model.device)
output = response_model.generate(
input_ids=input_ids,
max_new_tokens=1024,
temperature=0.7,
do_sample=True
)
full_response = response_tokenizer.decode(output[0], skip_special_tokens=True)
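        # Strip the echoed prompt and any follow-on turns so that only the assistant's reply remains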
clean_response = full_response.replace(f"<|user|>\n{instruction}\n<|assistant|>", "").strip()
if "<|user|>" in clean_response:
clean_response = clean_response.split("<|user|>")[0].strip()
for token in ["<user>", "User:", "Human:"]:
if token in clean_response:
clean_response = clean_response.split(token)[0].strip()
# If the model doesn't use these exact tokens, try to extract just the assistant's response
if clean_response == full_response:
# Try to find where the assistant's response starts
parts = full_response.split(instruction)
if len(parts) > 1:
clean_response = parts[1].strip()
# Further clean any remaining tokens
for token in ["<|assistant|>", "<assistant>", "Assistant:", "A:"]:
clean_response = clean_response.replace(token, "").strip()
# Clean up resources
del response_model
del response_tokenizer
torch.cuda.empty_cache()
progress(1.0, desc=f"Response from {model_path} generated")
return clean_response
except Exception as e:
return f"Error generating response: {str(e)}"
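# ZeroGPU: allocate a GPU for up to 200 seconds while the judge model runs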
@spaces.GPU(duration=200)
def judge_responses(instruction, response1, response2, model_name, temperature=0.1, max_new_tokens=2048):
"""
Evaluate the quality of two responses
Args:
instruction (str): Instruction/question
response1 (str): First response
response2 (str): Second response
model_name (str): Model to use for evaluation
temperature (float): Generation temperature
max_new_tokens (int): Maximum number of tokens to generate
Returns:
str: Generated evaluation result
"""
global tokenizer, model, current_model_name
# Load model on demand if it's not already loaded or if a different model is requested
if model is None or model_name != current_model_name:
# Clear GPU memory if a model is already loaded
if model is not None:
del model
del tokenizer
torch.cuda.empty_cache()
# Update status
yield f"Loading {model_name}... Please wait."
# Load the requested model
try:
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATHS[model_name], use_fast=False)
model = AutoModelForCausalLM.from_pretrained(MODEL_PATHS[model_name], device_map="auto")
current_model_name = model_name
except Exception as e:
yield f"Error loading model: {str(e)}"
return
# Build prompt
prompt = """<|im_start|>system\nYou are a helpful assistant. The assistant first performs a detailed, step-by-step reasoning process in its mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and<answer> </answer> tags, respectively, i.e., <think> detailed reasoning process here, explaining each step of your evaluation for both assistants </think><answer> answer here </answer>. Now the user asks you to judge the performance of two AI assistants in response to the question. Score assistants 1-10 (higher=better). Criteria includes helpfulness, relevance, accuracy, and level of detail. Avoid order, length, style or other bias. After thinking, when you finally reach a conclusion, clearly provide your evaluation scores within <answer> </answer> tags, i.e. for example,<answer>3</answer><answer>5</answer>\n<|im_end|>\n<|im_start|>user\n[Question]\n{question}\n\n[Assistant 1's Answer]\n{answer_1}\n\n[Assistant 2's Answer]\n{answer_2}\n<|im_end|>\n<|im_start|>assistant\n"""
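    # The template asks the judge to reason inside <think></think> tags and then emit two <answer>N</answer> scores, which extract_scores() parses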
formatted_prompt = prompt.format(question=instruction, answer_1=response1, answer_2=response2)
# Set up streaming output
input_ids = tokenizer.encode(formatted_prompt, return_tensors="pt").to(model.device)
streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=False)
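    # skip_prompt=True keeps the echoed prompt out of the stream; the timeout guards against a stalled generation thread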
generate_kwargs = dict(
input_ids=input_ids,
streamer=streamer,
max_new_tokens=max_new_tokens,
do_sample=True,
temperature=temperature,
)
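    # Fall back to greedy decoding when temperature is 0 (sampling is undefined at zero temperature)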
if temperature == 0:
generate_kwargs['do_sample'] = False
# Run generation in a separate thread
t = Thread(target=model.generate, kwargs=generate_kwargs)
t.start()
# Collect generated text
outputs = []
for text in streamer:
outputs.append(text)
full_text = "".join(outputs)
# Try to extract scores
score1, score2 = extract_scores(full_text)
if score1 and score2:
result = f"{full_text}\n\n**Evaluation Results:** Response 1 Score: {score1}/10, Response 2 Score: {score2}/10"
else:
result = full_text
yield result
@spaces.GPU(duration=200)
def generate_and_judge(instruction, model_dropdown_1, custom_model_1, model_dropdown_2, custom_model_2, judge_model_name, temperature=0.1, max_new_tokens=2048, progress=gr.Progress()):
"""Generate responses from two models and judge them"""
progress(0, desc="Starting generation process")
# Determine which model paths to use
model_path_1 = get_model_path(model_dropdown_1, custom_model_1)
model_path_2 = get_model_path(model_dropdown_2, custom_model_2)
# Generate responses from both models
progress(0.1, desc=f"Generating response from {model_path_1}")
response1 = generate_response(instruction, model_path_1, progress)
progress(0.4, desc=f"Generating response from {model_path_2}")
response2 = generate_response(instruction, model_path_2, progress)
    # The judge's evaluation is streamed separately via stream_evaluation (wired up with .then() below),
    # so return an empty string here to clear the results box before streaming starts
    return response1, response2, ""
# Function to stream evaluation results after responses are generated
@spaces.GPU(duration=200)
def stream_evaluation(instruction, response1, response2, judge_model_name, temperature=0.1, max_new_tokens=2048):
"""Stream evaluation results after responses are generated"""
for result in judge_responses(instruction, response1, response2, judge_model_name, temperature, max_new_tokens):
yield result
# Create Gradio interface
with gr.Blocks(fill_height=True, css=css) as demo:
gr.Markdown(DESCRIPTION)
with gr.Tabs():
# Auto-Generate Responses tab (now first)
with gr.TabItem("Auto-Generate Responses"):
with gr.Row():
with gr.Column(scale=1):
# Model selection for judge
auto_model_dropdown = gr.Dropdown(
choices=list(MODEL_PATHS.keys()),
value="JudgeLRM-7B", # Default selection
label="Select Judge Model"
)
                    auto_instruction = gr.Textbox(label="Instruction/Question", placeholder="Will a computer science PhD graduate be unemployed?", lines=3)
# Model 1 selection
with gr.Row():
model_dropdown_1 = gr.Dropdown(
choices=POPULAR_MODELS,
value=POPULAR_MODELS[0],
label="Select Model 1",
scale=2
)
custom_model_1 = gr.Textbox(
label="Or enter custom model path",
placeholder="e.g., Qwen/Qwen2.5-7B-Instruct",
scale=3
)
# Model 2 selection
with gr.Row():
model_dropdown_2 = gr.Dropdown(
choices=POPULAR_MODELS,
value=POPULAR_MODELS[1],
label="Select Model 2",
scale=2
)
custom_model_2 = gr.Textbox(
label="Or enter custom model path",
placeholder="e.g., deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
scale=3
)
with gr.Accordion("⚙️ Parameters", open=False):
auto_temperature = gr.Slider(minimum=0, maximum=1, step=0.1, value=0.1, label="Judge Temperature")
auto_max_tokens = gr.Slider(minimum=128, maximum=4096, step=1, value=2048, label="Judge Max Tokens")
auto_submit_btn = gr.Button("Generate Responses and Evaluate")
with gr.Row():
with gr.Column():
auto_response1 = gr.Textbox(label="Response from Model 1", lines=10)
with gr.Column():
auto_response2 = gr.Textbox(label="Response from Model 2", lines=10)
with gr.Row():
auto_output = gr.Textbox(label="Evaluation Results", lines=15)
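            # Two-stage flow: first generate both responses, then stream the judge's evaluation into the results box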
auto_submit_btn.click(
fn=generate_and_judge,
inputs=[auto_instruction, model_dropdown_1, custom_model_1, model_dropdown_2, custom_model_2, auto_model_dropdown, auto_temperature, auto_max_tokens],
outputs=[auto_response1, auto_response2, auto_output]
).then(
fn=stream_evaluation,
inputs=[auto_instruction, auto_response1, auto_response2, auto_model_dropdown, auto_temperature, auto_max_tokens],
outputs=auto_output
)
# Examples for auto-generation
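            # The second example is a Chinese medical question: "I've heard that some people have high blood pressure but no symptoms at all. Is that true?"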
auto_examples = [
["Write a short poem about artificial intelligence",
"Qwen/Qwen2.5-7B-Instruct",
"01-ai/Yi-6B-Chat"],
["我听说有些人有高血压却没有任何症状。这是真的吗?",
"FreedomIntelligence/Apollo-7B",
"openchat/openchat-3.5-0106"]
]
            # Handler for example clicks
            def process_example(instruction, model1, model2):
                """Handle an example click: select the models in the dropdowns and clear the custom model fields"""
                return instruction, model1, "", model2, ""
            # Wire the handler into gr.Examples
            gr.Examples(
                examples=auto_examples,
                inputs=[auto_instruction, model_dropdown_1, model_dropdown_2],
                fn=process_example,
                outputs=[auto_instruction, model_dropdown_1, custom_model_1, model_dropdown_2, custom_model_2],
                run_on_click=True  # needed so process_example runs (and the custom fields are reset) when an example is clicked
            )
# Manual Evaluation tab (now second)
with gr.TabItem("Manual Evaluation"):
with gr.Row():
with gr.Column():
# Model selection
model_dropdown = gr.Dropdown(
choices=list(MODEL_PATHS.keys()),
value="JudgeLRM-7B", # Default selection
label="Select Judge Model"
)
instruction = gr.Textbox(label="Instruction/Question", lines=3)
response1 = gr.Textbox(label="Response 1", lines=8)
response2 = gr.Textbox(label="Response 2", lines=8)
with gr.Accordion("⚙️ Parameters", open=False):
temperature = gr.Slider(minimum=0, maximum=1, step=0.1, value=0.1, label="Temperature")
max_tokens = gr.Slider(minimum=128, maximum=4096, step=1, value=2048, label="Max Tokens")
submit_btn = gr.Button("Evaluate Responses")
with gr.Column():
output = gr.Textbox(label="Evaluation Results", lines=20)
# Handle evaluation
submit_btn.click(
fn=judge_responses,
inputs=[instruction, response1, response2, model_dropdown, temperature, max_tokens],
outputs=output
)
# Examples
examples = [
["Include important study notes and key points that someone should know about the given subject. \n history of the USA",
"The history of the United States is one of the most influential and influential countries in the world. It has had a profound impact on the world and continues to shape the world today. The United States has had a long and storied history, from its founding in 1775 to the present day. It has had a tumultuous and tumultuous history, from the Revolutionary War to the Civil War and beyond. The United States has had a tumultuous and tumultuous history, from the Revolutionary War to the Civil War and beyond. The United States has had a long and storied history, from its founding in 1775 to the present day. It has had a profound impact on the world and continues to shape the world today.",
"1. The United States of America was founded in 1776.\n2. The Declaration of Independence was signed in 1776.\n3. The Constitution of the United States of America was signed in 1787.\n4. The Civil War began in 1861.\n5. The Emancipation Proclamation was issued in 1863.\n6. The 13th Amendment was ratified in 1865.\n7. The 14th Amendment was ratified in 1868.\n8. The 15th Amendment was ratified in 1870.\n9. The 16th Amendment was ratified in 1913.\n10. The 17th Amendment was ratified in 1913.\n11. The 18th Amendment was ratified in 1919.\n12. The 19th Amendment was ratified in 1920.\n13. The 20th Amendment was ratified in 1933.\n14. The 21st Amendment was ratified in 1933.\n15. The 22nd Amendment was ratified in"]
]
gr.Examples(examples=examples, inputs=[instruction, response1, response2])
gr.HTML(LICENSE)
if __name__ == "__main__":
demo.launch() |