Spaces:

nuojohnchen
/

JudgeLRMDemo

Running

App Files Files Community

nuojohnchen committed about 1 month ago

Commit

c683b58

verified ·

1 Parent(s): 54b4b29

Update app.py

Browse files

Files changed (1) hide show

app.py +49 -67

app.py CHANGED Viewed

@@ -1,3 +1,19 @@
 import gradio as gr
 import os
 import spaces
@@ -20,7 +36,7 @@ DESCRIPTION = '''
 LICENSE = """
 <div style="font-family: monospace; white-space: pre; margin-top: 20px; line-height: 1.2;">
-@misc{JudgeLRM,
     title = {JudgeLRM},
     url = {https://huggingface.co/nuojohnchen/JudgeLRM-7B},
     author = {Nuo Chen, Zhiyuan Hu, Qingyun Zou, Jiaying Wu, Qian Wang, Bryan Hooi, Bingsheng He},
@@ -60,8 +76,8 @@ MODEL_PATHS = {
 POPULAR_MODELS = [
     "Qwen/Qwen2.5-7B-Instruct",
     "01-ai/Yi-6B-Chat",
-    "openchat/openchat-3.5-0106",
-    "FreedomIntelligence/Apollo-7B"
 ]
 # Global variables for model and tokenizer
@@ -88,31 +104,6 @@ def get_model_path(dropdown_value, custom_value):
         return custom_value.strip()
     return dropdown_value
-# Function to clean model response
-def clean_response_text(text):
-    """Remove conversation markers and other artifacts from model response"""
-    # Remove any <|user|> or <|assistant|> markers and subsequent conversations
-    user_pattern = r'<\|user\|>.*'
-    assistant_pattern = r'<\|assistant\|>.*'
-    # Try to clean with regex first (using re.DOTALL to match across lines)
-    cleaned = re.sub(user_pattern, '', text, flags=re.DOTALL)
-    cleaned = re.sub(assistant_pattern, '', cleaned, flags=re.DOTALL)
-    # If that didn't work well, try a more aggressive approach
-    if '<|user|>' in cleaned or '<|assistant|>' in cleaned:
-        parts = text.split('<|user|>')
-        if len(parts) > 0:
-            cleaned = parts[0].strip()
-    # Remove other common markers
-    markers = ['<user>', '</user>', '<assistant>', '</assistant>',
-               'User:', 'Assistant:', 'Human:', 'AI:']
-    for marker in markers:
-        cleaned = cleaned.replace(marker, '')
-    return cleaned.strip()
 # Function to generate response from a model
 def generate_response(instruction, model_path, progress=gr.Progress()):
     """Generate a response from a specified model"""
@@ -136,11 +127,22 @@ def generate_response(instruction, model_path, progress=gr.Progress()):
             do_sample=True
         )
-        # Decode response
-        full_response = response_tokenizer.decode(output[0], skip_special_tokens=True)
-        # Remove the prompt part from the response
-        clean_response = full_response.replace(f"<|user|>\n{instruction}\n<|assistant|>", "").strip()
         # If the model doesn't use these exact tokens, try to extract just the assistant's response
         if clean_response == full_response:
@@ -152,9 +154,6 @@ def generate_response(instruction, model_path, progress=gr.Progress()):
                 for token in ["<|assistant|>", "<assistant>", "Assistant:", "A:"]:
                     clean_response = clean_response.replace(token, "").strip()
-        # Apply additional cleaning to remove conversation markers
-        clean_response = clean_response_text(clean_response)
         # Clean up resources
         del response_model
         del response_tokenizer
@@ -165,7 +164,7 @@ def generate_response(instruction, model_path, progress=gr.Progress()):
     except Exception as e:
         return f"Error generating response: {str(e)}"
-@spaces.GPU(duration=120)
 def judge_responses(instruction, response1, response2, model_name, temperature=0.1, max_new_tokens=2048):
     """
     Evaluate the quality of two responses
@@ -240,15 +239,15 @@ def judge_responses(instruction, response1, response2, model_name, temperature=0
         yield result
-@spaces.GPU(duration=120)
 def generate_and_judge(instruction, model_dropdown_1, custom_model_1, model_dropdown_2, custom_model_2, judge_model_name, temperature=0.1, max_new_tokens=2048, progress=gr.Progress()):
     """Generate responses from two models and judge them"""
     # Determine which model paths to use
     model_path_1 = get_model_path(model_dropdown_1, custom_model_1)
     model_path_2 = get_model_path(model_dropdown_2, custom_model_2)
-    progress(0, desc="Starting generation process")
     # Generate responses from both models
     progress(0.1, desc=f"Generating response from {model_path_1}")
     response1 = generate_response(instruction, model_path_1, progress)
@@ -268,14 +267,6 @@ def generate_and_judge(instruction, model_dropdown_1, custom_model_1, model_drop
     return response1, response2, evaluation_results
-# Function to process examples for display
-def process_example_for_display(example):
-    """Process example data for display in the interface"""
-    instruction = example[0]
-    model1 = example[1]
-    model2 = example[2]
-    return f"**Question:** {instruction}\n\n**Model 1:** {model1}\n\n**Model 2:** {model2}"
 # Create Gradio interface
 with gr.Blocks(fill_height=True, css=css) as demo:
     gr.Markdown(DESCRIPTION)
@@ -344,36 +335,27 @@ with gr.Blocks(fill_height=True, css=css) as demo:
                 inputs=[auto_instruction, model_dropdown_1, custom_model_1, model_dropdown_2, custom_model_2, auto_model_dropdown, auto_temperature, auto_max_tokens],
                 outputs=[auto_response1, auto_response2, auto_output]
             )
-            # Examples for auto-generation with simplified display
             auto_examples = [
                 ["Write a short poem about artificial intelligence",
                  "Qwen/Qwen2.5-7B-Instruct",
                  "01-ai/Yi-6B-Chat"],
                 ["我听说有些人有高血压却没有任何症状。这是真的吗？",
                  "FreedomIntelligence/Apollo-7B",
                  "openchat/openchat-3.5-0106"]
             ]
-            # Custom examples component with simplified display
-            with gr.Row():
-                gr.Markdown("### Examples")
-            for i, example in enumerate(auto_examples):
-                with gr.Row():
-                    example_btn = gr.Button(f"Example {i+1}", scale=1)
-                    example_display = gr.Markdown(process_example_for_display(example), scale=4)
-                    # Set up click handler for this example
-                    example_btn.click(
-                        lambda instruction, model1, model2: [instruction, model1, "", model2, ""],
-                        inputs=None,
-                        outputs=[auto_instruction, model_dropdown_1, custom_model_1, model_dropdown_2, custom_model_2],
-                        _js=f"() => [{repr(example[0])}, {repr(example[1])}, '', {repr(example[2])}, '']"
-                    )
         # Manual Evaluation tab (now second)
-        with gr.TabItem("Manual Evaluation"):
             with gr.Row():
                 with gr.Column():
                     # Model selection

+此外，尽管我希望 auto_examples 的填充如下所示：
+            # Examples for auto-generation
+            auto_examples = [
+                ["Write a short poem about artificial intelligence",
+                 "Qwen/Qwen2.5-7B-Instruct",
+                 "Qwen/Qwen2.5-7B-Instruct",
+                 "01-ai/Yi-6B-Chat",
+                 "01-ai/Yi-6B-Chat"],
+                ["我听说有些人有高血压却没有任何症状。这是真的吗？",
+                 "FreedomIntelligence/Apollo-7B",
+                 "FreedomIntelligence/Apollo-7B",
+                 "microsoft/phi-2",
+                 "openchat/openchat-3.5-0106"]
+            ]
+但是我希望呈现在前端的 case 只有 instruction/question、model 1、model 2 三列，这个可以实现吗？
 import gradio as gr
 import os
 import spaces
 LICENSE = """
 <div style="font-family: monospace; white-space: pre; margin-top: 20px; line-height: 1.2;">
+@misc{XtraGPT,
     title = {JudgeLRM},
     url = {https://huggingface.co/nuojohnchen/JudgeLRM-7B},
     author = {Nuo Chen, Zhiyuan Hu, Qingyun Zou, Jiaying Wu, Qian Wang, Bryan Hooi, Bingsheng He},
 POPULAR_MODELS = [
     "Qwen/Qwen2.5-7B-Instruct",
     "01-ai/Yi-6B-Chat",
+    "FreedomIntelligence/Apollo-7B",
+    "openchat/openchat-3.5-0106"
 ]
 # Global variables for model and tokenizer
         return custom_value.strip()
     return dropdown_value
 # Function to generate response from a model
 def generate_response(instruction, model_path, progress=gr.Progress()):
     """Generate a response from a specified model"""
             do_sample=True
         )
+    # 清理响应，移除提示词部分
+    full_response = response_tokenizer.decode(output[0], skip_special_tokens=True)
+    # 移除提示部分
+    clean_response = full_response.replace(f"<|user|>\n{instruction}\n<|assistant|>", "").strip()
+    # 如果模型生成了多轮对话，只保留第一轮回答
+    if "<|user|>" in clean_response:
+        clean_response = clean_response.split("<|user|>")[0].strip()
+    # 如果模型使用其他格式的多轮对话标记
+    for token in ["<user>", "User:", "Human:"]:
+        if token in clean_response:
+            clean_response = clean_response.split(token)[0].strip()
         # If the model doesn't use these exact tokens, try to extract just the assistant's response
         if clean_response == full_response:
                 for token in ["<|assistant|>", "<assistant>", "Assistant:", "A:"]:
                     clean_response = clean_response.replace(token, "").strip()
         # Clean up resources
         del response_model
         del response_tokenizer
     except Exception as e:
         return f"Error generating response: {str(e)}"
+@spaces.GPU(duration=200)
 def judge_responses(instruction, response1, response2, model_name, temperature=0.1, max_new_tokens=2048):
     """
     Evaluate the quality of two responses
         yield result
+@spaces.GPU(duration=200)
 def generate_and_judge(instruction, model_dropdown_1, custom_model_1, model_dropdown_2, custom_model_2, judge_model_name, temperature=0.1, max_new_tokens=2048, progress=gr.Progress()):
     """Generate responses from two models and judge them"""
+    progress(0, desc="Starting generation process")
     # Determine which model paths to use
     model_path_1 = get_model_path(model_dropdown_1, custom_model_1)
     model_path_2 = get_model_path(model_dropdown_2, custom_model_2)
     # Generate responses from both models
     progress(0.1, desc=f"Generating response from {model_path_1}")
     response1 = generate_response(instruction, model_path_1, progress)
     return response1, response2, evaluation_results
 # Create Gradio interface
 with gr.Blocks(fill_height=True, css=css) as demo:
     gr.Markdown(DESCRIPTION)
                 inputs=[auto_instruction, model_dropdown_1, custom_model_1, model_dropdown_2, custom_model_2, auto_model_dropdown, auto_temperature, auto_max_tokens],
                 outputs=[auto_response1, auto_response2, auto_output]
             )
+            # Examples for auto-generation
             auto_examples = [
                 ["Write a short poem about artificial intelligence",
                  "Qwen/Qwen2.5-7B-Instruct",
+                 "Qwen/Qwen2.5-7B-Instruct",
+                 "01-ai/Yi-6B-Chat",
                  "01-ai/Yi-6B-Chat"],
                 ["我听说有些人有高血压却没有任何症状。这是真的吗？",
                  "FreedomIntelligence/Apollo-7B",
+                 "FreedomIntelligence/Apollo-7B",
+                 "openchat/openchat-3.5-0106",
                  "openchat/openchat-3.5-0106"]
             ]
+            gr.Examples(
+                examples=auto_examples,
+                inputs=[auto_instruction, model_dropdown_1, custom_model_1, model_dropdown_2, custom_model_2]
+            )
         # Manual Evaluation tab (now second)
+        with gr.TabItem("Manual Evaluation (Streaming Output)"):
             with gr.Row():
                 with gr.Column():
                     # Model selection