nuojohnchen committed
Commit c683b58 · verified · 1 Parent(s): 54b4b29

Update app.py

Files changed (1):
  1. app.py +49 -67

app.py CHANGED
@@ -1,3 +1,19 @@
+ Also, although I would like auto_examples to be populated as in  # Examples for auto-generation
+ auto_examples = [
+     ["Write a short poem about artificial intelligence",
+      "Qwen/Qwen2.5-7B-Instruct",
+      "Qwen/Qwen2.5-7B-Instruct",
+      "01-ai/Yi-6B-Chat",
+      "01-ai/Yi-6B-Chat"],
+     ["我听说有些人有高血压却没有任何症状。这是真的吗?",
+      "FreedomIntelligence/Apollo-7B",
+      "FreedomIntelligence/Apollo-7B",
+      "microsoft/phi-2",
+      "openchat/openchat-3.5-0106"]
+ ] as shown here, but I would like the cases presented on the frontend to show only three columns: instruction/question, model 1, and model 2. Can this be implemented?
+
+
+
  import gradio as gr
  import os
  import spaces
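The stray request committed above asks whether `gr.Examples` can fill five input components while rendering only three columns. One plausible way to get that behavior, sketched under the assumption that Gradio's `run_on_click` mechanism is used; the component and function names here are hypothetical, not taken from app.py:

```python
import gradio as gr

with gr.Blocks() as demo:
    instruction = gr.Textbox(label="Instruction")
    model_1 = gr.Textbox(label="Model 1")
    model_2 = gr.Textbox(label="Model 2")
    custom_model_1 = gr.Textbox(label="Custom model 1")
    custom_model_2 = gr.Textbox(label="Custom model 2")

    # Runs when an example row is clicked; it receives the three visible
    # columns and resets the two custom-model fields that stay hidden.
    def reset_custom(instruction_value, model_1_value, model_2_value):
        return "", ""

    gr.Examples(
        examples=[["Write a short poem about artificial intelligence",
                   "Qwen/Qwen2.5-7B-Instruct",
                   "01-ai/Yi-6B-Chat"]],
        inputs=[instruction, model_1, model_2],  # only these render as columns
        outputs=[custom_model_1, custom_model_2],
        fn=reset_custom,
        run_on_click=True,
    )

demo.launch()
```

Clicking a row fills the three visible inputs directly, and `fn` blanks out the remaining two, so the example table shows exactly instruction, model 1, and model 2.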
@@ -20,7 +36,7 @@ DESCRIPTION = '''
 
  LICENSE = """
  <div style="font-family: monospace; white-space: pre; margin-top: 20px; line-height: 1.2;">
- @misc{JudgeLRM,
+ @misc{XtraGPT,
    title = {JudgeLRM},
    url = {https://huggingface.co/nuojohnchen/JudgeLRM-7B},
    author = {Nuo Chen, Zhiyuan Hu, Qingyun Zou, Jiaying Wu, Qian Wang, Bryan Hooi, Bingsheng He},
@@ -60,8 +76,8 @@ MODEL_PATHS = {
  POPULAR_MODELS = [
      "Qwen/Qwen2.5-7B-Instruct",
      "01-ai/Yi-6B-Chat",
-     "openchat/openchat-3.5-0106",
-     "FreedomIntelligence/Apollo-7B"
+     "FreedomIntelligence/Apollo-7B",
+     "openchat/openchat-3.5-0106"
  ]
 
  # Global variables for model and tokenizer
@@ -88,31 +104,6 @@ def get_model_path(dropdown_value, custom_value):
          return custom_value.strip()
      return dropdown_value
 
- # Function to clean model response
- def clean_response_text(text):
-     """Remove conversation markers and other artifacts from model response"""
-     # Remove any <|user|> or <|assistant|> markers and subsequent conversations
-     user_pattern = r'<\|user\|>.*'
-     assistant_pattern = r'<\|assistant\|>.*'
-
-     # Try to clean with regex first (using re.DOTALL to match across lines)
-     cleaned = re.sub(user_pattern, '', text, flags=re.DOTALL)
-     cleaned = re.sub(assistant_pattern, '', cleaned, flags=re.DOTALL)
-
-     # If that didn't work well, try a more aggressive approach
-     if '<|user|>' in cleaned or '<|assistant|>' in cleaned:
-         parts = text.split('<|user|>')
-         if len(parts) > 0:
-             cleaned = parts[0].strip()
-
-     # Remove other common markers
-     markers = ['<user>', '</user>', '<assistant>', '</assistant>',
-                'User:', 'Assistant:', 'Human:', 'AI:']
-     for marker in markers:
-         cleaned = cleaned.replace(marker, '')
-
-     return cleaned.strip()
-
  # Function to generate response from a model
  def generate_response(instruction, model_path, progress=gr.Progress()):
      """Generate a response from a specified model"""
@@ -136,11 +127,22 @@ def generate_response(instruction, model_path, progress=gr.Progress()):
          do_sample=True
      )
 
-     # Decode response
-     full_response = response_tokenizer.decode(output[0], skip_special_tokens=True)
-
-     # Remove the prompt part from the response
-     clean_response = full_response.replace(f"<|user|>\n{instruction}\n<|assistant|>", "").strip()
+     # Clean the response by removing the prompt text
+     full_response = response_tokenizer.decode(output[0], skip_special_tokens=True)
+
+     # Strip the prompt portion
+     clean_response = full_response.replace(f"<|user|>\n{instruction}\n<|assistant|>", "").strip()
+
+     # If the model generated a multi-turn conversation, keep only the first answer
+     if "<|user|>" in clean_response:
+         clean_response = clean_response.split("<|user|>")[0].strip()
+
+     # Handle multi-turn markers in other formats
+     for token in ["<user>", "User:", "Human:"]:
+         if token in clean_response:
+             clean_response = clean_response.split(token)[0].strip()
+
+
 
      # If the model doesn't use these exact tokens, try to extract just the assistant's response
      if clean_response == full_response:
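The cleanup this hunk inlines truncates the decoded transcript at the first follow-up turn marker. A standalone distillation for illustration only; the function name `keep_first_turn` is hypothetical and does not appear in app.py:

```python
def keep_first_turn(text: str) -> str:
    """Trim a decoded transcript down to the first assistant answer."""
    # Cut at the first marker that signals a new turn, mirroring the
    # inlined handling of <|user|> and the other marker formats above.
    for marker in ["<|user|>", "<user>", "User:", "Human:"]:
        if marker in text:
            text = text.split(marker)[0]
    return text.strip()

print(keep_first_turn("Paris is the capital.\n<|user|>And Spain?"))
# -> "Paris is the capital."
```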
@@ -152,9 +154,6 @@ def generate_response(instruction, model_path, progress=gr.Progress()):
          for token in ["<|assistant|>", "<assistant>", "Assistant:", "A:"]:
              clean_response = clean_response.replace(token, "").strip()
 
-     # Apply additional cleaning to remove conversation markers
-     clean_response = clean_response_text(clean_response)
-
      # Clean up resources
      del response_model
      del response_tokenizer
@@ -165,7 +164,7 @@ def generate_response(instruction, model_path, progress=gr.Progress()):
      except Exception as e:
          return f"Error generating response: {str(e)}"
 
- @spaces.GPU(duration=120)
+ @spaces.GPU(duration=200)
  def judge_responses(instruction, response1, response2, model_name, temperature=0.1, max_new_tokens=2048):
      """
      Evaluate the quality of two responses
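For context, `@spaces.GPU` is the Hugging Face ZeroGPU decorator, and `duration` is the number of seconds of GPU time requested per call, so raising it from 120 to 200 gives slower judge runs more headroom before the allocation expires. A minimal sketch of the pattern; the decorated function below is illustrative, not from this file:

```python
import spaces
import torch

@spaces.GPU(duration=200)  # request up to ~200 s of ZeroGPU time per call
def describe_gpu() -> str:
    # CUDA is attached only while a @spaces.GPU-decorated call is running
    return torch.cuda.get_device_name(0)
```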
@@ -240,15 +239,15 @@ def judge_responses(instruction, response1, response2, model_name, temperature=0
 
      yield result
 
- @spaces.GPU(duration=120)
+ @spaces.GPU(duration=200)
  def generate_and_judge(instruction, model_dropdown_1, custom_model_1, model_dropdown_2, custom_model_2, judge_model_name, temperature=0.1, max_new_tokens=2048, progress=gr.Progress()):
      """Generate responses from two models and judge them"""
+     progress(0, desc="Starting generation process")
+
      # Determine which model paths to use
      model_path_1 = get_model_path(model_dropdown_1, custom_model_1)
      model_path_2 = get_model_path(model_dropdown_2, custom_model_2)
 
-     progress(0, desc="Starting generation process")
-
      # Generate responses from both models
      progress(0.1, desc=f"Generating response from {model_path_1}")
      response1 = generate_response(instruction, model_path_1, progress)
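This hunk only moves the initial `progress(0, ...)` call ahead of model-path resolution so the progress bar appears as soon as the handler starts. For reference, a minimal sketch of the `gr.Progress` reporting pattern with a stand-in task, not code from this commit:

```python
import time
import gradio as gr

def long_task(progress=gr.Progress()):
    progress(0, desc="Starting generation process")  # show the bar immediately
    for _ in progress.tqdm(range(4), desc="Working"):
        time.sleep(0.5)  # stand-in for model loading and generation
    return "done"

demo = gr.Interface(fn=long_task, inputs=None, outputs="text")
```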
@@ -268,14 +267,6 @@ def generate_and_judge(instruction, model_dropdown_1, custom_model_1, model_drop
 
      return response1, response2, evaluation_results
 
- # Function to process examples for display
- def process_example_for_display(example):
-     """Process example data for display in the interface"""
-     instruction = example[0]
-     model1 = example[1]
-     model2 = example[2]
-     return f"**Question:** {instruction}\n\n**Model 1:** {model1}\n\n**Model 2:** {model2}"
-
  # Create Gradio interface
  with gr.Blocks(fill_height=True, css=css) as demo:
      gr.Markdown(DESCRIPTION)
@@ -344,36 +335,27 @@ with gr.Blocks(fill_height=True, css=css) as demo:
          inputs=[auto_instruction, model_dropdown_1, custom_model_1, model_dropdown_2, custom_model_2, auto_model_dropdown, auto_temperature, auto_max_tokens],
          outputs=[auto_response1, auto_response2, auto_output]
      )
-
-     # Examples for auto-generation with simplified display
+     # Examples for auto-generation
      auto_examples = [
          ["Write a short poem about artificial intelligence",
           "Qwen/Qwen2.5-7B-Instruct",
+          "Qwen/Qwen2.5-7B-Instruct",
+          "01-ai/Yi-6B-Chat",
           "01-ai/Yi-6B-Chat"],
         ["我听说有些人有高血压却没有任何症状。这是真的吗?",
          "FreedomIntelligence/Apollo-7B",
+         "FreedomIntelligence/Apollo-7B",
+         "openchat/openchat-3.5-0106",
          "openchat/openchat-3.5-0106"]
      ]
 
-     # Custom examples component with simplified display
-     with gr.Row():
-         gr.Markdown("### Examples")
-
-     for i, example in enumerate(auto_examples):
-         with gr.Row():
-             example_btn = gr.Button(f"Example {i+1}", scale=1)
-             example_display = gr.Markdown(process_example_for_display(example), scale=4)
-
-             # Set up click handler for this example
-             example_btn.click(
-                 lambda instruction, model1, model2: [instruction, model1, "", model2, ""],
-                 inputs=None,
-                 outputs=[auto_instruction, model_dropdown_1, custom_model_1, model_dropdown_2, custom_model_2],
-                 _js=f"() => [{repr(example[0])}, {repr(example[1])}, '', {repr(example[2])}, '']"
-             )
+     gr.Examples(
+         examples=auto_examples,
+         inputs=[auto_instruction, model_dropdown_1, custom_model_1, model_dropdown_2, custom_model_2]
+     )
 
      # Manual Evaluation tab (now second)
-     with gr.TabItem("Manual Evaluation"):
+     with gr.TabItem("Manual Evaluation (Streaming Output)"):
          with gr.Row():
              with gr.Column():
                  # Model selection