Qwen2.5-Omni-7B-Demo

Runtime error

aimeri committed on Mar 28

Commit

513e0c6

1 Parent(s): f0ab3ba

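Update the process_input function in app.py to guard the image, audio, and video inputs against None and to resolve the selected voice through VOICE_OPTIONS (falling back to "Chelsie") before passing it as spk; update demo creation to pass hidden placeholder image, audio, and video components, instead of None, as inputs for text-only submissions.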

Files changed (1) hide show

app.py CHANGED Viewed

@@ -37,9 +37,9 @@ def process_input(image, audio, video, text, chat_history, voice_type, enable_au
     # Combine multimodal inputs
     user_input = {
         "text": text,
-        "image": image,
-        "audio": audio,
-        "video": video
     }
     # Prepare conversation history
@@ -69,11 +69,12 @@ def process_input(image, audio, video, text, chat_history, voice_type, enable_au
     # Generate response
     if enable_audio_output:
         text_ids, audio = model.generate(
             **inputs,
             use_audio_in_video=True,
             return_audio=True,
-            spk=voice_type
         )
         # Save audio to temporary file
@@ -133,6 +134,11 @@ def create_demo():
         gr.Markdown("# Qwen2.5-Omni Multimodal Chat Demo")
         gr.Markdown("Experience the omni-modal capabilities of Qwen2.5-Omni through text, images, audio, and video interactions.")
         # Chat interface
         with gr.Row():
             with gr.Column(scale=3):
@@ -221,7 +227,7 @@ def create_demo():
             queue=False
         ).then(
             fn=process_input,
-            inputs=[None, None, None, text_input, chatbot, voice_type, enable_audio_output],
             outputs=[chatbot, text_output, audio_output]
         )

     # Combine multimodal inputs
     user_input = {
         "text": text,
+        "image": image if image is not None else None,
+        "audio": audio if audio is not None else None,
+        "video": video if video is not None else None
     }
     # Prepare conversation history
     # Generate response
     if enable_audio_output:
+        voice_type_value = VOICE_OPTIONS.get(voice_type, "Chelsie")
         text_ids, audio = model.generate(
             **inputs,
             use_audio_in_video=True,
             return_audio=True,
+            spk=voice_type_value
         )
         # Save audio to temporary file
         gr.Markdown("# Qwen2.5-Omni Multimodal Chat Demo")
         gr.Markdown("Experience the omni-modal capabilities of Qwen2.5-Omni through text, images, audio, and video interactions.")
+        # Hidden placeholder components for text-only input
+        placeholder_image = gr.Image(type="filepath", visible=False)
+        placeholder_audio = gr.Audio(type="filepath", visible=False)
+        placeholder_video = gr.Video(visible=False)
         # Chat interface
         with gr.Row():
             with gr.Column(scale=3):
             queue=False
         ).then(
             fn=process_input,
+            inputs=[placeholder_image, placeholder_audio, placeholder_video, text_input, chatbot, voice_type, enable_audio_output],
             outputs=[chatbot, text_output, audio_output]
         )