aimeri committed on
Commit
513e0c6
·
1 Parent(s): f0ab3ba

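Enhance the process_input function in app.py so that image, audio, and video inputs are handled correctly when they are None, and resolve the selected voice through VOICE_OPTIONS (falling back to "Chelsie") before calling model.generate; update demo creation to include hidden placeholder components for the image, audio, and video inputs.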

Browse files
Files changed (1) hide show
  1. app.py +11 -5
app.py CHANGED
@@ -37,9 +37,9 @@ def process_input(image, audio, video, text, chat_history, voice_type, enable_au
37
  # Combine multimodal inputs
38
  user_input = {
39
  "text": text,
40
- "image": image,
41
- "audio": audio,
42
- "video": video
43
  }
44
 
45
  # Prepare conversation history
@@ -69,11 +69,12 @@ def process_input(image, audio, video, text, chat_history, voice_type, enable_au
69
 
70
  # Generate response
71
  if enable_audio_output:
 
72
  text_ids, audio = model.generate(
73
  **inputs,
74
  use_audio_in_video=True,
75
  return_audio=True,
76
- spk=voice_type
77
  )
78
 
79
  # Save audio to temporary file
@@ -133,6 +134,11 @@ def create_demo():
133
  gr.Markdown("# Qwen2.5-Omni Multimodal Chat Demo")
134
  gr.Markdown("Experience the omni-modal capabilities of Qwen2.5-Omni through text, images, audio, and video interactions.")
135
 
 
 
 
 
 
136
  # Chat interface
137
  with gr.Row():
138
  with gr.Column(scale=3):
@@ -221,7 +227,7 @@ def create_demo():
221
  queue=False
222
  ).then(
223
  fn=process_input,
224
- inputs=[None, None, None, text_input, chatbot, voice_type, enable_audio_output],
225
  outputs=[chatbot, text_output, audio_output]
226
  )
227
 
 
37
  # Combine multimodal inputs
38
  user_input = {
39
  "text": text,
40
+ "image": image if image is not None else None,
41
+ "audio": audio if audio is not None else None,
42
+ "video": video if video is not None else None
43
  }
44
 
45
  # Prepare conversation history
 
69
 
70
  # Generate response
71
  if enable_audio_output:
72
+ voice_type_value = VOICE_OPTIONS.get(voice_type, "Chelsie")
73
  text_ids, audio = model.generate(
74
  **inputs,
75
  use_audio_in_video=True,
76
  return_audio=True,
77
+ spk=voice_type_value
78
  )
79
 
80
  # Save audio to temporary file
 
134
  gr.Markdown("# Qwen2.5-Omni Multimodal Chat Demo")
135
  gr.Markdown("Experience the omni-modal capabilities of Qwen2.5-Omni through text, images, audio, and video interactions.")
136
 
137
+ # Hidden placeholder components for text-only input
138
+ placeholder_image = gr.Image(type="filepath", visible=False)
139
+ placeholder_audio = gr.Audio(type="filepath", visible=False)
140
+ placeholder_video = gr.Video(visible=False)
141
+
142
  # Chat interface
143
  with gr.Row():
144
  with gr.Column(scale=3):
 
227
  queue=False
228
  ).then(
229
  fn=process_input,
230
+ inputs=[placeholder_image, placeholder_audio, placeholder_video, text_input, chatbot, voice_type, enable_audio_output],
231
  outputs=[chatbot, text_output, audio_output]
232
  )
233