Spaces:
Runtime error
Runtime error
Enhance the process_input function in app.py so that image, audio, and video inputs that are None are handled correctly; update demo creation to add hidden placeholder components for image, audio, and video so that text-only input works.
Browse files
app.py
CHANGED
@@ -37,9 +37,9 @@ def process_input(image, audio, video, text, chat_history, voice_type, enable_au
|
|
37 |
# Combine multimodal inputs
|
38 |
user_input = {
|
39 |
"text": text,
|
40 |
-
"image": image,
|
41 |
-
"audio": audio,
|
42 |
-
"video": video
|
43 |
}
|
44 |
|
45 |
# Prepare conversation history
|
@@ -69,11 +69,12 @@ def process_input(image, audio, video, text, chat_history, voice_type, enable_au
|
|
69 |
|
70 |
# Generate response
|
71 |
if enable_audio_output:
|
|
|
72 |
text_ids, audio = model.generate(
|
73 |
**inputs,
|
74 |
use_audio_in_video=True,
|
75 |
return_audio=True,
|
76 |
-
spk=
|
77 |
)
|
78 |
|
79 |
# Save audio to temporary file
|
@@ -133,6 +134,11 @@ def create_demo():
|
|
133 |
gr.Markdown("# Qwen2.5-Omni Multimodal Chat Demo")
|
134 |
gr.Markdown("Experience the omni-modal capabilities of Qwen2.5-Omni through text, images, audio, and video interactions.")
|
135 |
|
|
|
|
|
|
|
|
|
|
|
136 |
# Chat interface
|
137 |
with gr.Row():
|
138 |
with gr.Column(scale=3):
|
@@ -221,7 +227,7 @@ def create_demo():
|
|
221 |
queue=False
|
222 |
).then(
|
223 |
fn=process_input,
|
224 |
-
inputs=[
|
225 |
outputs=[chatbot, text_output, audio_output]
|
226 |
)
|
227 |
|
|
|
37 |
# Combine multimodal inputs
|
38 |
user_input = {
|
39 |
"text": text,
|
40 |
+
"image": image if image is not None else None,
|
41 |
+
"audio": audio if audio is not None else None,
|
42 |
+
"video": video if video is not None else None
|
43 |
}
|
44 |
|
45 |
# Prepare conversation history
|
|
|
69 |
|
70 |
# Generate response
|
71 |
if enable_audio_output:
|
72 |
+
voice_type_value = VOICE_OPTIONS.get(voice_type, "Chelsie")
|
73 |
text_ids, audio = model.generate(
|
74 |
**inputs,
|
75 |
use_audio_in_video=True,
|
76 |
return_audio=True,
|
77 |
+
spk=voice_type_value
|
78 |
)
|
79 |
|
80 |
# Save audio to temporary file
|
|
|
134 |
gr.Markdown("# Qwen2.5-Omni Multimodal Chat Demo")
|
135 |
gr.Markdown("Experience the omni-modal capabilities of Qwen2.5-Omni through text, images, audio, and video interactions.")
|
136 |
|
137 |
+
# Hidden placeholder components for text-only input
|
138 |
+
placeholder_image = gr.Image(type="filepath", visible=False)
|
139 |
+
placeholder_audio = gr.Audio(type="filepath", visible=False)
|
140 |
+
placeholder_video = gr.Video(visible=False)
|
141 |
+
|
142 |
# Chat interface
|
143 |
with gr.Row():
|
144 |
with gr.Column(scale=3):
|
|
|
227 |
queue=False
|
228 |
).then(
|
229 |
fn=process_input,
|
230 |
+
inputs=[placeholder_image, placeholder_audio, placeholder_video, text_input, chatbot, voice_type, enable_audio_output],
|
231 |
outputs=[chatbot, text_output, audio_output]
|
232 |
)
|
233 |
|