import gradio as gr
import os
import spaces
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
from threading import Thread
import re
import torch

# Set environment variables
HF_TOKEN = os.environ.get("HF_TOKEN", None)

DESCRIPTION = '''
<div>
<h1 style="text-align: center;">JudgeLRM</h1>
<p>This Space demonstrates the <a href="https://huggingface.co/nuojohnchen/JudgeLRM-7B"><b>JudgeLRM</b></a> model, designed to evaluate the quality of two AI assistant responses. JudgeLRM is a family of judgment-oriented LLMs trained using reinforcement learning (RL) with judge-wise, outcome-driven rewards. JudgeLRM models consistently outperform both SFT-tuned and state-of-the-art reasoning models. Notably, JudgeLRM-3B surpasses GPT-4, and JudgeLRM-7B outperforms DeepSeek-R1 by 2.79% in F1 score, particularly excelling in judge tasks requiring deep reasoning.</p>
<p>Enter an instruction and two responses, and the model will think, reason and score them on a scale of 1-10 (higher is better).</p>
<p>You can also select Hugging Face models to automatically generate responses for evaluation.</p>
</div>
'''

LICENSE = """
<div style="font-family: monospace; white-space: pre; margin-top: 20px; line-height: 1.2;">
@misc{nuo2025judgelrm,
      title={JudgeLRM: Large Reasoning Models as a Judge}, 
      author={Nuo Chen, Zhiyuan Hu, Qingyun Zou, Jiaying Wu, Qian Wang, Bryan Hooi, Bingsheng He},
      year={2025},
      eprint={2504.00050},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2504.00050}, 
}
@misc{wang2025assessingjudgingbias,
      title={Assessing Judging Bias in Large Reasoning Models: An Empirical Study},
      author={Qian Wang, Zhanzhi Lou, Zhenheng Tang, Nuo Chen, Xuandong Zhao, Wenxuan Zhang, Dawn Song, Bingsheng He},
      year={2025},
      eprint={2504.09946},
      archivePrefix={arXiv},
      primaryClass={cs.CY},
      url={https://arxiv.org/abs/2504.09946}, 
}
</div>
"""

PLACEHOLDER = """
<div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
   <h1 style="font-size: 28px; margin-bottom: 2px; opacity: 0.55;">JudgeLRM</h1>
   <p style="font-size: 18px; margin-bottom: 2px; opacity: 0.65;">Enter an instruction and two responses, I'll evaluate their quality...</p>
</div>
"""

css = """
h1 {
  text-align: center;
  display: block;
}
#duplicate-button {
  margin: auto;
  color: white;
  background: #1565c0;
  border-radius: 100vh;
}
"""

# Model paths
MODEL_PATHS = {
    "JudgeLRM-3B": "nuojohnchen/JudgeLRM-3B",
    "JudgeLRM-7B": "nuojohnchen/JudgeLRM-7B"
}

# Popular models for dropdown selection
POPULAR_MODELS = [
    "Qwen/Qwen2.5-7B-Instruct",
    "01-ai/Yi-6B-Chat",
    "FreedomIntelligence/Apollo-7B",
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
    "openchat/openchat-3.5-0106"
]

# Global variables for model and tokenizer
tokenizer = None
model = None
current_model_name = None
response_model_1 = None
response_model_2 = None
response_tokenizer_1 = None
response_tokenizer_2 = None

def extract_scores(text):
    """Extract scores from generated text"""
    pattern = r'<answer>(\d+)</answer>\s*<answer>(\d+)</answer>'  # allow optional whitespace between the two tags
    match = re.search(pattern, text)
    if match:
        return int(match.group(1)), int(match.group(2))
    return None, None

# Function to determine which model path to use
def get_model_path(dropdown_value, custom_value):
    """Return custom value if provided, otherwise return dropdown value"""
    if custom_value and custom_value.strip():
        return custom_value.strip()
    return dropdown_value

# Function to generate response from a model
def generate_response(instruction, model_path, progress=gr.Progress()):
    """Generate a response from a specified model"""
    progress(0, desc=f"Loading model {model_path}...")
    try:
        # Load model and tokenizer
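        # device_map="auto" places the weights on the available GPU(s); float16 roughly halves memory use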
        response_tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
        response_model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", torch_dtype=torch.float16)
        
        progress(0.5, desc=f"Generating response from {model_path}...")
        
        # Create prompt (simple format, adjust as needed for specific models)
        prompt = f"<|user|>\n{instruction}\n<|assistant|>"
        
        # Generate response
        input_ids = response_tokenizer.encode(prompt, return_tensors="pt").to(response_model.device)
        output = response_model.generate(
            input_ids=input_ids,
            max_new_tokens=1024,
            temperature=0.7,
            do_sample=True
        )
        
        full_response = response_tokenizer.decode(output[0], skip_special_tokens=True)
    
        clean_response = full_response.replace(f"<|user|>\n{instruction}\n<|assistant|>", "").strip()

        if "<|user|>" in clean_response:
            clean_response = clean_response.split("<|user|>")[0].strip()

        for token in ["<user>", "User:", "Human:"]:
            if token in clean_response:
                clean_response = clean_response.split(token)[0].strip()

        # If the model doesn't use these exact tokens, try to extract just the assistant's response
        if clean_response == full_response:
            # Try to find where the assistant's response starts
            parts = full_response.split(instruction)
            if len(parts) > 1:
                clean_response = parts[1].strip()
                # Further clean any remaining tokens
                for token in ["<|assistant|>", "<assistant>", "Assistant:", "A:"]:
                    clean_response = clean_response.replace(token, "").strip()
        
        # Clean up resources
        del response_model
        del response_tokenizer
        torch.cuda.empty_cache()
        
        progress(1.0, desc=f"Response from {model_path} generated")
        return clean_response
    except Exception as e:
        return f"Error generating response: {str(e)}"

@spaces.GPU(duration=200)
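# On ZeroGPU Spaces, @spaces.GPU reserves a GPU for the decorated call (here for up to 200 seconds)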
def judge_responses(instruction, response1, response2, model_name, temperature=0.1, max_new_tokens=2048):
    """
    Evaluate the quality of two responses
    Args:
        instruction (str): Instruction/question
        response1 (str): First response
        response2 (str): Second response
        model_name (str): Model to use for evaluation
        temperature (float): Generation temperature
        max_new_tokens (int): Maximum number of tokens to generate
    Returns:
        str: Generated evaluation result
    """
    global tokenizer, model, current_model_name
    
    # Load model on demand if it's not already loaded or if a different model is requested
    if model is None or model_name != current_model_name:
        # Clear GPU memory if a model is already loaded
        if model is not None:
            del model
            del tokenizer
            torch.cuda.empty_cache()
        
        # Update status
        yield f"Loading {model_name}... Please wait."
        
        # Load the requested model
        try:
            tokenizer = AutoTokenizer.from_pretrained(MODEL_PATHS[model_name], use_fast=False)
            model = AutoModelForCausalLM.from_pretrained(MODEL_PATHS[model_name], device_map="auto")
            current_model_name = model_name
        except Exception as e:
            yield f"Error loading model: {str(e)}"
            return
    
    # Build prompt
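    # ChatML-style <|im_start|>/<|im_end|> markup; the {question}/{answer_1}/{answer_2} slots are filled in below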
    prompt = """<|im_start|>system\nYou are a helpful assistant. The assistant first performs a detailed, step-by-step reasoning process in its mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and<answer> </answer> tags, respectively, i.e., <think> detailed reasoning process here, explaining each step of your evaluation for both assistants </think><answer> answer here </answer>. Now the user asks you to judge the performance of two AI assistants in response to the question. Score assistants 1-10 (higher=better). Criteria includes helpfulness, relevance, accuracy, and level of detail. Avoid order, length, style or other bias. After thinking, when you finally reach a conclusion, clearly  provide your evaluation scores within <answer> </answer> tags, i.e. for example,<answer>3</answer><answer>5</answer>\n<|im_end|>\n<|im_start|>user\n[Question]\n{question}\n\n[Assistant 1's Answer]\n{answer_1}\n\n[Assistant 2's Answer]\n{answer_2}\n<|im_end|>\n<|im_start|>assistant\n"""
    
    formatted_prompt = prompt.format(question=instruction, answer_1=response1, answer_2=response2)
    
    # Set up streaming output
    input_ids = tokenizer.encode(formatted_prompt, return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=False)
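    # skip_prompt=True keeps the prompt out of the streamed text; special tokens are left in so the raw output is shown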
    
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
    )
    
    if temperature == 0:
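        # a temperature of 0 is not valid for sampling, so fall back to greedy decoding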
        generate_kwargs['do_sample'] = False
    
    # Run generation in a separate thread
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()
    
    # Collect generated text
    outputs = []
    for text in streamer:
        outputs.append(text)
        full_text = "".join(outputs)
        
        # Try to extract scores
        score1, score2 = extract_scores(full_text)
        if score1 is not None and score2 is not None:
            result = f"{full_text}\n\n**Evaluation Results:** Response 1 Score: {score1}/10, Response 2 Score: {score2}/10"
        else:
            result = full_text
            
        yield result

@spaces.GPU(duration=200)
def generate_and_judge(instruction, model_dropdown_1, custom_model_1, model_dropdown_2, custom_model_2, judge_model_name, temperature=0.1, max_new_tokens=2048, progress=gr.Progress()):
    """Generate responses from two models and judge them"""
    progress(0, desc="Starting generation process")
    
    # Determine which model paths to use
    model_path_1 = get_model_path(model_dropdown_1, custom_model_1)
    model_path_2 = get_model_path(model_dropdown_2, custom_model_2)
    
    # Generate responses from both models
    progress(0.1, desc=f"Generating response from {model_path_1}")
    response1 = generate_response(instruction, model_path_1, progress)
    
    progress(0.4, desc=f"Generating response from {model_path_2}")
    response2 = generate_response(instruction, model_path_2, progress)
    
    progress(1.0, desc="Responses generated")

    # Evaluation is not run here; the UI chains stream_evaluation() after this call,
    # so return both responses and clear the results box with None.
    return response1, response2, None

# Function to stream evaluation results after responses are generated
@spaces.GPU(duration=200)
def stream_evaluation(instruction, response1, response2, judge_model_name, temperature=0.1, max_new_tokens=2048):
    """Stream evaluation results after responses are generated"""
    for result in judge_responses(instruction, response1, response2, judge_model_name, temperature, max_new_tokens):
        yield result
        
# Create Gradio interface
with gr.Blocks(fill_height=True, css=css) as demo:
    gr.Markdown(DESCRIPTION)
    
    with gr.Tabs():
        # Auto-Generate Responses tab (now first)
        with gr.TabItem("Auto-Generate Responses"):
            with gr.Row():
                with gr.Column(scale=1):
                    # Model selection for judge
                    auto_model_dropdown = gr.Dropdown(
                        choices=list(MODEL_PATHS.keys()),
                        value="JudgeLRM-7B",  # Default selection
                        label="Select Judge Model"
                    )
                    
                    auto_instruction = gr.Textbox(label="Instruction/Question", placeholder="Will a computer science PhD graduate be unemployed?", lines=3)
                    
                    # Model 1 selection
                    with gr.Row():
                        model_dropdown_1 = gr.Dropdown(
                            choices=POPULAR_MODELS,
                            value=POPULAR_MODELS[0],
                            label="Select Model 1",
                            scale=2
                        )
                        custom_model_1 = gr.Textbox(
                            label="Or enter custom model path",
                            placeholder="e.g., Qwen/Qwen2.5-7B-Instruct",
                            scale=3
                        )
                    
                    # Model 2 selection
                    with gr.Row():
                        model_dropdown_2 = gr.Dropdown(
                            choices=POPULAR_MODELS,
                            value=POPULAR_MODELS[1],
                            label="Select Model 2",
                            scale=2
                        )
                        custom_model_2 = gr.Textbox(
                            label="Or enter custom model path",
                            placeholder="e.g., deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
                            scale=3
                        )
                    
                    with gr.Accordion("⚙️ Parameters", open=False):
                        auto_temperature = gr.Slider(minimum=0, maximum=1, step=0.1, value=0.1, label="Judge Temperature")
                        auto_max_tokens = gr.Slider(minimum=128, maximum=4096, step=1, value=2048, label="Judge Max Tokens")
                    
                    auto_submit_btn = gr.Button("Generate Responses and Evaluate")
            
            with gr.Row():
                with gr.Column():
                    auto_response1 = gr.Textbox(label="Response from Model 1", lines=10)
                
                with gr.Column():
                    auto_response2 = gr.Textbox(label="Response from Model 2", lines=10)
            
            with gr.Row():
                auto_output = gr.Textbox(label="Evaluation Results", lines=15)
            
            # Handle auto-generation, then stream the judge's evaluation
            auto_submit_btn.click(
                fn=generate_and_judge,
                inputs=[auto_instruction, model_dropdown_1, custom_model_1, model_dropdown_2, custom_model_2, auto_model_dropdown, auto_temperature, auto_max_tokens],
                outputs=[auto_response1, auto_response2, auto_output]
            ).then(
                fn=stream_evaluation,
                inputs=[auto_instruction, auto_response1, auto_response2, auto_model_dropdown, auto_temperature, auto_max_tokens],
                outputs=auto_output
            )
            # Examples for auto-generation
            auto_examples = [
                ["Write a short poem about artificial intelligence",
                 "Qwen/Qwen2.5-7B-Instruct",
                 "01-ai/Yi-6B-Chat"],
                ["我听说有些人有高血压却没有任何症状。这是真的吗?",
                 "FreedomIntelligence/Apollo-7B",
                 "openchat/openchat-3.5-0106"]
            ]
            
            # Handler for example clicks: fill the model dropdowns and clear the custom model fields
            def process_example(instruction, model1, model2):
                """Populate both dropdowns from the example and clear the custom path boxes"""
                return instruction, model1, "", model2, ""
            
            # Wire the handler into gr.Examples
            gr.Examples(
                examples=auto_examples,
                inputs=[auto_instruction, model_dropdown_1, model_dropdown_2],
                fn=process_example,
                outputs=[auto_instruction, model_dropdown_1, custom_model_1, model_dropdown_2, custom_model_2],
                run_on_click=True  # run process_example on click; otherwise fn is only used for example caching
            )
        
        # Manual Evaluation tab (now second)
        with gr.TabItem("Manual Evaluation"):
            with gr.Row():
                with gr.Column():
                    # Model selection
                    model_dropdown = gr.Dropdown(
                        choices=list(MODEL_PATHS.keys()),
                        value="JudgeLRM-7B",  # Default selection
                        label="Select Judge Model"
                    )
                    
                    instruction = gr.Textbox(label="Instruction/Question", lines=3)
                    response1 = gr.Textbox(label="Response 1", lines=8)
                    response2 = gr.Textbox(label="Response 2", lines=8)
                    
                    with gr.Accordion("⚙️ Parameters", open=False):
                        temperature = gr.Slider(minimum=0, maximum=1, step=0.1, value=0.1, label="Temperature")
                        max_tokens = gr.Slider(minimum=128, maximum=4096, step=1, value=2048, label="Max Tokens")
                    
                    submit_btn = gr.Button("Evaluate Responses")
                
                with gr.Column():
                    output = gr.Textbox(label="Evaluation Results", lines=20)
            
            # Handle evaluation
            submit_btn.click(
                fn=judge_responses,
                inputs=[instruction, response1, response2, model_dropdown, temperature, max_tokens],
                outputs=output
            )
            
            # Examples
            examples = [
                ["Include important study notes and key points that someone should know about the given subject. \n history of the USA",
                 "The history of the United States is one of the most influential and influential countries in the world. It has had a profound impact on the world and continues to shape the world today. The United States has had a long and storied history, from its founding in 1775 to the present day. It has had a tumultuous and tumultuous history, from the Revolutionary War to the Civil War and beyond. The United States has had a tumultuous and tumultuous history, from the Revolutionary War to the Civil War and beyond. The United States has had a long and storied history, from its founding in 1775 to the present day. It has had a profound impact on the world and continues to shape the world today.",
                 "1. The United States of America was founded in 1776.\n2. The Declaration of Independence was signed in 1776.\n3. The Constitution of the United States of America was signed in 1787.\n4. The Civil War began in 1861.\n5. The Emancipation Proclamation was issued in 1863.\n6. The 13th Amendment was ratified in 1865.\n7. The 14th Amendment was ratified in 1868.\n8. The 15th Amendment was ratified in 1870.\n9. The 16th Amendment was ratified in 1913.\n10. The 17th Amendment was ratified in 1913.\n11. The 18th Amendment was ratified in 1919.\n12. The 19th Amendment was ratified in 1920.\n13. The 20th Amendment was ratified in 1933.\n14. The 21st Amendment was ratified in 1933.\n15. The 22nd Amendment was ratified in"]
            ]
            
            gr.Examples(examples=examples, inputs=[instruction, response1, response2])
    
    gr.HTML(LICENSE)

if __name__ == "__main__":
    demo.launch()