nuojohnchen committed
Commit 28cb369 · verified · 1 Parent(s): 43af5ea

Update app.py

Files changed (1):
  1. app.py +361 -48

app.py CHANGED
@@ -1,64 +1,377 @@
  import gradio as gr
- from huggingface_hub import InferenceClient

  """
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
  """
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")


- def respond(
-     message,
-     history: list[tuple[str, str]],
-     system_message,
-     max_tokens,
-     temperature,
-     top_p,
- ):
-     messages = [{"role": "system", "content": system_message}]

-     for val in history:
-         if val[0]:
-             messages.append({"role": "user", "content": val[0]})
-         if val[1]:
-             messages.append({"role": "assistant", "content": val[1]})

-     messages.append({"role": "user", "content": message})

-     response = ""

-     for message in client.chat_completion(
-         messages,
-         max_tokens=max_tokens,
-         stream=True,
-         temperature=temperature,
-         top_p=top_p,
-     ):
-         token = message.choices[0].delta.content

-         response += token
-         yield response


- """
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
- """
- demo = gr.ChatInterface(
-     respond,
-     additional_inputs=[
-         gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-         gr.Slider(
-             minimum=0.1,
-             maximum=1.0,
-             value=0.95,
-             step=0.05,
-             label="Top-p (nucleus sampling)",
-         ),
-     ],
- )


  if __name__ == "__main__":
-     demo.launch()

  import gradio as gr
+ import os
+ import spaces
+ from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
+ from threading import Thread
+ import re
+ import torch

+ # Set environment variables
+ HF_TOKEN = os.environ.get("HF_TOKEN", None)
+
+ DESCRIPTION = '''
+ <div>
+ <h1 style="text-align: center;">JudgeLRM</h1>
+ <p>This Space demonstrates the <a href="https://huggingface.co/nuojohnchen/JudgeLRM-7B"><b>JudgeLRM</b></a> model, designed to evaluate the quality of two AI assistant responses. JudgeLRM is a family of judgment-oriented LLMs trained using reinforcement learning (RL) with judge-wise, outcome-driven rewards. JudgeLRM models consistently outperform both SFT-tuned and state-of-the-art reasoning models. Notably, JudgeLRM-3B surpasses GPT-4, and JudgeLRM-7B outperforms DeepSeek-R1 by 2.79% in F1 score, particularly excelling in judge tasks requiring deep reasoning.</p>
+ <p>Enter an instruction and two responses, and the model will score them on a scale of 1-10 (higher is better).</p>
+ <p>You can also select Hugging Face models to automatically generate responses for evaluation.</p>
+ </div>
+ '''
+
+ LICENSE = """
+ <div style="font-family: monospace; white-space: pre; margin-top: 20px; line-height: 1.2;">
+ @misc{XtraGPT,
+ title = {JudgeLRM},
+ url = {https://huggingface.co/nuojohnchen/JudgeLRM-7B},
+ author = {Nuo Chen, Zhiyuan Hu, Qingyun Zou, Jiaying Wu, Qian Wang, Bryan Hooi, Bingsheng He},
+ month = {March},
+ year = {2025}
+ }
+ </div>
  """
+
+ PLACEHOLDER = """
+ <div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
+ <h1 style="font-size: 28px; margin-bottom: 2px; opacity: 0.55;">JudgeLRM</h1>
+ <p style="font-size: 18px; margin-bottom: 2px; opacity: 0.65;">Enter an instruction and two responses, I'll evaluate their quality...</p>
+ </div>
  """

+ css = """
+ h1 {
+     text-align: center;
+     display: block;
+ }
+ #duplicate-button {
+     margin: auto;
+     color: white;
+     background: #1565c0;
+     border-radius: 100vh;
+ }
+ """

+ # Model paths
+ MODEL_PATHS = {
+     "JudgeLRM-3B": "nuojohnchen/JudgeLRM-3B",
+     "JudgeLRM-7B": "nuojohnchen/JudgeLRM-7B"
+ }

+ # Popular models for dropdown selection
+ POPULAR_MODELS = [
+     "Qwen/Qwen2.5-7B-Instruct",
+     "01-ai/Yi-6B-Chat",
+     "microsoft/phi-2",
+     "FreedomIntelligence/Apollo-7B",
+     "tiiuae/falcon-7b-instruct",
+     "HuggingFaceH4/zephyr-7b-beta",
+     "stabilityai/stablelm-3b-4e1t",
+     "openchat/openchat-3.5-0106"
+ ]

+ # Global variables for model and tokenizer
+ tokenizer = None
+ model = None
+ current_model_name = None
+ response_model_1 = None
+ response_model_2 = None
+ response_tokenizer_1 = None
+ response_tokenizer_2 = None

+ def extract_scores(text):
+     """Extract scores from generated text"""
+     pattern = r'<answer>(\d+)</answer><answer>(\d+)</answer>'
+     match = re.search(pattern, text)
+     if match:
+         return int(match.group(1)), int(match.group(2))
+     return None, None

+ # Function to determine which model path to use
+ def get_model_path(dropdown_value, custom_value):
+     """Return custom value if provided, otherwise return dropdown value"""
+     if custom_value and custom_value.strip():
+         return custom_value.strip()
+     return dropdown_value

+ # Function to generate response from a model
+ def generate_response(instruction, model_path, progress=gr.Progress()):
+     """Generate a response from a specified model"""
+     progress(0, desc=f"Loading model {model_path}...")
+     try:
+         # Load model and tokenizer
+         response_tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
+         response_model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", torch_dtype=torch.float16)
+
+         progress(0.5, desc=f"Generating response from {model_path}...")
+
+         # Create prompt (simple format, adjust as needed for specific models)
+         prompt = f"<|user|>\n{instruction}\n<|assistant|>"
+
+         # Generate response
+         input_ids = response_tokenizer.encode(prompt, return_tensors="pt").to(response_model.device)
+         output = response_model.generate(
+             input_ids=input_ids,
+             max_new_tokens=1024,
+             temperature=0.7,
+             do_sample=True
+         )
+
+         # Decode response and clean it
+         full_response = response_tokenizer.decode(output[0], skip_special_tokens=True)
+
+         # Remove the prompt part from the response
+         clean_response = full_response.replace(f"<|user|>\n{instruction}\n<|assistant|>", "").strip()
+
+         # If the model doesn't use these exact tokens, try to extract just the assistant's response
+         if clean_response == full_response:
+             # Try to find where the assistant's response starts
+             parts = full_response.split(instruction)
+             if len(parts) > 1:
+                 clean_response = parts[1].strip()
+             # Further clean any remaining tokens
+             for token in ["<|assistant|>", "<assistant>", "Assistant:", "A:"]:
+                 clean_response = clean_response.replace(token, "").strip()
+
+         # Clean up resources
+         del response_model
+         del response_tokenizer
+         torch.cuda.empty_cache()
+
+         progress(1.0, desc=f"Response from {model_path} generated")
+         return clean_response
+     except Exception as e:
+         return f"Error generating response: {str(e)}"

+ @spaces.GPU(duration=120)
+ def judge_responses(instruction, response1, response2, model_name, temperature=0.1, max_new_tokens=2048):
+     """
+     Evaluate the quality of two responses
+     Args:
+         instruction (str): Instruction/question
+         response1 (str): First response
+         response2 (str): Second response
+         model_name (str): Model to use for evaluation
+         temperature (float): Generation temperature
+         max_new_tokens (int): Maximum number of tokens to generate
+     Returns:
+         str: Generated evaluation result
+     """
+     global tokenizer, model, current_model_name
+
+     # Load model on demand if it's not already loaded or if a different model is requested
+     if model is None or model_name != current_model_name:
+         # Clear GPU memory if a model is already loaded
+         if model is not None:
+             del model
+             del tokenizer
+             torch.cuda.empty_cache()
+
+         # Update status
+         yield f"Loading {model_name}... Please wait."
+
+         # Load the requested model
+         try:
+             tokenizer = AutoTokenizer.from_pretrained(MODEL_PATHS[model_name], use_fast=False)
+             model = AutoModelForCausalLM.from_pretrained(MODEL_PATHS[model_name], device_map="auto")
+             current_model_name = model_name
+         except Exception as e:
+             yield f"Error loading model: {str(e)}"
+             return
+
+     # Build prompt
+     prompt = """<|im_start|>system\nYou are a helpful assistant. The assistant first performs a detailed, step-by-step reasoning process in its mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and<answer> </answer> tags, respectively, i.e., <think> detailed reasoning process here, explaining each step of your evaluation for both assistants </think><answer> answer here </answer>. Now the user asks you to judge the performance of two AI assistants in response to the question. Score assistants 1-10 (higher=better). Criteria includes helpfulness, relevance, accuracy, and level of detail. Avoid order, length, style or other bias. After thinking, when you finally reach a conclusion, clearly provide your evaluation scores within <answer> </answer> tags, i.e. for example,<answer>3</answer><answer>5</answer>\n<|im_end|>\n<|im_start|>user\n[Question]\n{question}\n\n[Assistant 1's Answer]\n{answer_1}\n\n[Assistant 2's Answer]\n{answer_2}\n<|im_end|>\n<|im_start|>assistant\n"""
+
+     formatted_prompt = prompt.format(question=instruction, answer_1=response1, answer_2=response2)
+
+     # Set up streaming output
+     input_ids = tokenizer.encode(formatted_prompt, return_tensors="pt").to(model.device)
+     streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=False)
+
+     generate_kwargs = dict(
+         input_ids=input_ids,
+         streamer=streamer,
+         max_new_tokens=max_new_tokens,
+         do_sample=True,
+         temperature=temperature,
+     )
+
+     if temperature == 0:
+         generate_kwargs['do_sample'] = False
+
+     # Run generation in a separate thread
+     t = Thread(target=model.generate, kwargs=generate_kwargs)
+     t.start()
+
+     # Collect generated text
+     outputs = []
+     for text in streamer:
+         outputs.append(text)
+     full_text = "".join(outputs)
+
+     # Try to extract scores
+     score1, score2 = extract_scores(full_text)
+     if score1 and score2:
+         result = f"{full_text}\n\n**Evaluation Results:** Response 1 Score: {score1}/10, Response 2 Score: {score2}/10"
+     else:
+         result = full_text
+
+     yield result

+ @spaces.GPU(duration=120)
+ def generate_and_judge(instruction, model_dropdown_1, custom_model_1, model_dropdown_2, custom_model_2, judge_model_name, temperature=0.1, max_new_tokens=2048, progress=gr.Progress()):
+     """Generate responses from two models and judge them"""
+     progress(0, desc="Starting generation process")
+
+     # Determine which model paths to use
+     model_path_1 = get_model_path(model_dropdown_1, custom_model_1)
+     model_path_2 = get_model_path(model_dropdown_2, custom_model_2)
+
+     # Generate responses from both models
+     progress(0.1, desc=f"Generating response from {model_path_1}")
+     response1 = generate_response(instruction, model_path_1, progress)
+
+     progress(0.4, desc=f"Generating response from {model_path_2}")
+     response2 = generate_response(instruction, model_path_2, progress)
+
+     # Update the response textboxes
+     progress(0.7, desc="Evaluating responses")
+
+     # Use the judge_responses generator but collect all outputs
+     evaluation_results = ""
+     for result in judge_responses(instruction, response1, response2, judge_model_name, temperature, max_new_tokens):
+         evaluation_results = result
+
+     progress(1.0, desc="Evaluation complete")
+
+     return response1, response2, evaluation_results

+ # Create Gradio interface
+ with gr.Blocks(fill_height=True, css=css) as demo:
+     gr.Markdown(DESCRIPTION)
+
+     with gr.Tabs():
+         # Auto-Generate Responses tab (now first)
+         with gr.TabItem("Auto-Generate Responses"):
+             with gr.Row():
+                 with gr.Column(scale=1):
+                     # Model selection for judge
+                     auto_model_dropdown = gr.Dropdown(
+                         choices=list(MODEL_PATHS.keys()),
+                         value="JudgeLRM-7B",  # Default selection
+                         label="Select Judge Model"
+                     )
+
+                     auto_instruction = gr.Textbox(label="Instruction/Question", lines=3)
+
+                     # Model 1 selection
+                     with gr.Row():
+                         model_dropdown_1 = gr.Dropdown(
+                             choices=POPULAR_MODELS,
+                             value=POPULAR_MODELS[0],
+                             label="Select Model 1",
+                             scale=2
+                         )
+                         custom_model_1 = gr.Textbox(
+                             label="Or enter custom model path",
+                             placeholder="e.g., meta-llama/Llama-2-7b-chat-hf",
+                             scale=3
+                         )
+
+                     # Model 2 selection
+                     with gr.Row():
+                         model_dropdown_2 = gr.Dropdown(
+                             choices=POPULAR_MODELS,
+                             value=POPULAR_MODELS[1],
+                             label="Select Model 2",
+                             scale=2
+                         )
+                         custom_model_2 = gr.Textbox(
+                             label="Or enter custom model path",
+                             placeholder="e.g., mistralai/Mistral-7B-Instruct-v0.2",
+                             scale=3
+                         )
+
+                     with gr.Accordion("⚙️ Parameters", open=False):
+                         auto_temperature = gr.Slider(minimum=0, maximum=1, step=0.1, value=0.1, label="Judge Temperature")
+                         auto_max_tokens = gr.Slider(minimum=128, maximum=4096, step=1, value=2048, label="Judge Max Tokens")
+
+                     auto_submit_btn = gr.Button("Generate Responses and Evaluate")
+
+             with gr.Row():
+                 with gr.Column():
+                     auto_response1 = gr.Textbox(label="Response from Model 1", lines=10)
+
+                 with gr.Column():
+                     auto_response2 = gr.Textbox(label="Response from Model 2", lines=10)
+
+             with gr.Row():
+                 auto_output = gr.Textbox(label="Evaluation Results", lines=15)
+
+             # Handle auto-generation and evaluation
+             auto_submit_btn.click(
+                 fn=generate_and_judge,
+                 inputs=[auto_instruction, model_dropdown_1, custom_model_1, model_dropdown_2, custom_model_2, auto_model_dropdown, auto_temperature, auto_max_tokens],
+                 outputs=[auto_response1, auto_response2, auto_output]
+             )
+             # Examples for auto-generation
+             auto_examples = [
+                 ["Write a short poem about artificial intelligence",
+                  "Qwen/Qwen2.5-7B-Instruct",
+                  "Qwen/Qwen2.5-7B-Instruct",
+                  "01-ai/Yi-6B-Chat",
+                  "01-ai/Yi-6B-Chat"],
+                 # Chinese example: "I've heard that some people have high blood pressure without any symptoms. Is that true?"
+                 ["我听说有些人有高血压却没有任何症状。这是真的吗?",
+                  "FreedomIntelligence/Apollo-7B",
+                  "FreedomIntelligence/Apollo-7B",
+                  "microsoft/phi-2",
+                  "microsoft/phi-2"]
+             ]
+
+             gr.Examples(
+                 examples=auto_examples,
+                 inputs=[auto_instruction, model_dropdown_1, custom_model_1, model_dropdown_2, custom_model_2]
+             )
+
+         # Manual Evaluation tab (now second)
+         with gr.TabItem("Manual Evaluation"):
+             with gr.Row():
+                 with gr.Column():
+                     # Model selection
+                     model_dropdown = gr.Dropdown(
+                         choices=list(MODEL_PATHS.keys()),
+                         value="JudgeLRM-7B",  # Default selection
+                         label="Select Judge Model"
+                     )
+
+                     instruction = gr.Textbox(label="Instruction/Question", lines=3)
+                     response1 = gr.Textbox(label="Response 1", lines=8)
+                     response2 = gr.Textbox(label="Response 2", lines=8)
+
+                     with gr.Accordion("⚙️ Parameters", open=False):
+                         temperature = gr.Slider(minimum=0, maximum=1, step=0.1, value=0.1, label="Temperature")
+                         max_tokens = gr.Slider(minimum=128, maximum=4096, step=1, value=2048, label="Max Tokens")
+
+                     submit_btn = gr.Button("Evaluate Responses")
+
+                 with gr.Column():
+                     output = gr.Textbox(label="Evaluation Results", lines=20)
+
+             # Handle evaluation
+             submit_btn.click(
+                 fn=judge_responses,
+                 inputs=[instruction, response1, response2, model_dropdown, temperature, max_tokens],
+                 outputs=output
+             )
+
+             # Examples
+             examples = [
+                 ["Include important study notes and key points that someone should know about the given subject. \n history of the USA",
+                  "The history of the United States is one of the most influential and influential countries in the world. It has had a profound impact on the world and continues to shape the world today. The United States has had a long and storied history, from its founding in 1775 to the present day. It has had a tumultuous and tumultuous history, from the Revolutionary War to the Civil War and beyond. The United States has had a tumultuous and tumultuous history, from the Revolutionary War to the Civil War and beyond. The United States has had a long and storied history, from its founding in 1775 to the present day. It has had a profound impact on the world and continues to shape the world today.",
+                  "1. The United States of America was founded in 1776.\n2. The Declaration of Independence was signed in 1776.\n3. The Constitution of the United States of America was signed in 1787.\n4. The Civil War began in 1861.\n5. The Emancipation Proclamation was issued in 1863.\n6. The 13th Amendment was ratified in 1865.\n7. The 14th Amendment was ratified in 1868.\n8. The 15th Amendment was ratified in 1870.\n9. The 16th Amendment was ratified in 1913.\n10. The 17th Amendment was ratified in 1913.\n11. The 18th Amendment was ratified in 1919.\n12. The 19th Amendment was ratified in 1920.\n13. The 20th Amendment was ratified in 1933.\n14. The 21st Amendment was ratified in 1933.\n15. The 22nd Amendment was ratified in"]
+             ]
+
+             gr.Examples(examples=examples, inputs=[instruction, response1, response2])
+
+     gr.Markdown(LICENSE)

  if __name__ == "__main__":
+     demo.launch()
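
A minimal sketch of the score format this commit relies on: judge_responses prompts the judge model to end with two <answer> tags, and extract_scores pulls the two integers out with a regex. The pattern below is the one defined in app.py above; the sample judge output string is invented purely for illustration.

import re

# Same pattern as extract_scores in app.py
pattern = r'<answer>(\d+)</answer><answer>(\d+)</answer>'

# Hypothetical judge output; real outputs also include a <think>...</think> reasoning section
sample = "<think>Assistant 2 is more accurate and more detailed.</think><answer>3</answer><answer>8</answer>"

match = re.search(pattern, sample)
if match:
    score1, score2 = int(match.group(1)), int(match.group(2))
    print(f"Response 1 Score: {score1}/10, Response 2 Score: {score2}/10")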