aimeri committed
Commit e4a9a7a · 1 Parent(s): 1638860

Enhance process_input and create_demo functions in app.py to improve multimodal input handling, including better formatting for user messages and integration of TextStreamer for text response generation.
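For reference, transformers' TextStreamer prints decoded tokens to stdout as model.generate() produces them, which is what this commit wires into both generate() calls. A minimal standalone sketch of that API (the "gpt2" checkpoint and prompt below are placeholders for illustration, not the Qwen2.5-Omni model this app loads):

# Minimal TextStreamer sketch; checkpoint and prompt are placeholders.
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("Streaming output demo:", return_tensors="pt")
# skip_prompt=True stops the streamer from echoing the prompt tokens.
streamer = TextStreamer(tokenizer, skip_prompt=True)
model.generate(**inputs, max_new_tokens=32, streamer=streamer)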

Files changed (1)
  app.py +44 -14
app.py CHANGED
@@ -1,6 +1,6 @@
 import gradio as gr
 import torch
-from transformers import Qwen2_5OmniModel, Qwen2_5OmniProcessor
+from transformers import Qwen2_5OmniModel, Qwen2_5OmniProcessor, TextStreamer
 from qwen_omni_utils import process_mm_info
 import soundfile as sf
 import tempfile
@@ -51,7 +51,16 @@ def process_input(image, audio, video, text, chat_history, voice_type, enable_au
             if isinstance(item, list) and len(item) == 2:
                 user_msg, bot_msg = item
                 if bot_msg is not None:  # Only add complete message pairs
-                    conversation.append({"role": "user", "content": user_input_to_content(user_msg)})
+                    # Convert display format back to processable format
+                    processed_msg = user_msg
+                    if "[Image]" in user_msg:
+                        processed_msg = {"type": "text", "text": user_msg.replace("[Image]", "").strip()}
+                    if "[Audio]" in user_msg:
+                        processed_msg = {"type": "text", "text": user_msg.replace("[Audio]", "").strip()}
+                    if "[Video]" in user_msg:
+                        processed_msg = {"type": "text", "text": user_msg.replace("[Video]", "").strip()}
+
+                    conversation.append({"role": "user", "content": processed_msg})
                     conversation.append({"role": "assistant", "content": bot_msg})
             else:
                 # Initialize chat history if it's not a list
@@ -78,14 +87,19 @@ def process_input(image, audio, video, text, chat_history, voice_type, enable_au
     )
     inputs = inputs.to(model.device).to(model.dtype)

-    # Generate response
+    # Generate response with streaming
     if enable_audio_output:
         voice_type_value = VOICE_OPTIONS.get(voice_type, "Chelsie")
         text_ids, audio = model.generate(
             **inputs,
             use_audio_in_video=False,  # Set to False to avoid audio processing issues
             return_audio=True,
-            spk=voice_type_value
+            spk=voice_type_value,
+            max_new_tokens=512,
+            do_sample=True,
+            temperature=0.7,
+            top_p=0.9,
+            streamer=TextStreamer(processor, skip_prompt=True)
         )

         # Save audio to temporary file
@@ -100,7 +114,12 @@ def process_input(image, audio, video, text, chat_history, voice_type, enable_au
         text_ids = model.generate(
             **inputs,
             use_audio_in_video=False,  # Set to False to avoid audio processing issues
-            return_audio=False
+            return_audio=False,
+            max_new_tokens=512,
+            do_sample=True,
+            temperature=0.7,
+            top_p=0.9,
+            streamer=TextStreamer(processor, skip_prompt=True)
         )
         audio_path = None

@@ -111,17 +130,20 @@ def process_input(image, audio, video, text, chat_history, voice_type, enable_au
         clean_up_tokenization_spaces=False
     )[0]

-    # Clean up text response
+    # Clean up text response by removing system/user messages
     text_response = text_response.strip()
+    text_response = text_response.split("assistant")[-1].strip()
+    if text_response.startswith(":"):
+        text_response = text_response[1:].strip()

     # Format user message for chat history display
     user_message_for_display = str(text) if text is not None else ""
     if image is not None:
-        user_message_for_display = (user_message_for_display or "Image uploaded") + " [Image]"
+        user_message_for_display = (user_message_for_display + " " if user_message_for_display.strip() else "") + "[Image]"
     if audio is not None:
-        user_message_for_display = (user_message_for_display or "Audio uploaded") + " [Audio]"
+        user_message_for_display = (user_message_for_display + " " if user_message_for_display.strip() else "") + "[Audio]"
     if video is not None:
-        user_message_for_display = (user_message_for_display or "Video uploaded") + " [Video]"
+        user_message_for_display = (user_message_for_display + " " if user_message_for_display.strip() else "") + "[Video]"

     # If empty, provide a default message
     if not user_message_for_display.strip():
@@ -168,7 +190,12 @@ def create_demo():
         # Chat interface
         with gr.Row():
             with gr.Column(scale=3):
-                chatbot = gr.Chatbot(height=600)
+                chatbot = gr.Chatbot(
+                    height=600,
+                    show_label=False,
+                    avatar_images=["👤", "🤖"],
+                    bubble_full_width=False,
+                )
                 with gr.Accordion("Advanced Options", open=False):
                     voice_type = gr.Dropdown(
                         choices=list(VOICE_OPTIONS.keys()),
@@ -185,9 +212,11 @@
             with gr.TabItem("Text Input"):
                 text_input = gr.Textbox(
                     placeholder="Type your message here...",
-                    label="Text Input"
+                    label="Text Input",
+                    autofocus=True,
+                    container=False,
                 )
-                text_submit = gr.Button("Send Text")
+                text_submit = gr.Button("Send Text", variant="primary")

             with gr.TabItem("Multimodal Input"):
                 with gr.Row():
@@ -205,9 +234,10 @@
                 )
                 additional_text = gr.Textbox(
                     placeholder="Additional text message...",
-                    label="Additional Text"
+                    label="Additional Text",
+                    container=False,
                 )
-                multimodal_submit = gr.Button("Send Multimodal Input")
+                multimodal_submit = gr.Button("Send Multimodal Input", variant="primary")

             clear_button = gr.Button("Clear Chat")
 