aimeri committed on
Commit
f0ab3ba
·
1 Parent(s): f14a0ac

Refactor process_input function in app.py to handle multimodal inputs (image, audio, video, text) and update demo creation logic accordingly.

Browse files
Files changed (1) hide show
  1. app.py +11 -3
app.py CHANGED
@@ -33,7 +33,15 @@ VOICE_OPTIONS = {
33
  }
34
 
35
  @spaces.GPU
36
- def process_input(user_input, chat_history, voice_type, enable_audio_output):
 
 
 
 
 
 
 
 
37
  # Prepare conversation history
38
  conversation = [SYSTEM_PROMPT]
39
 
@@ -213,7 +221,7 @@ def create_demo():
213
  queue=False
214
  ).then(
215
  fn=process_input,
216
- inputs=[text_input, chatbot, voice_type, enable_audio_output],
217
  outputs=[chatbot, text_output, audio_output]
218
  )
219
 
@@ -233,7 +241,7 @@ def create_demo():
233
  queue=False
234
  ).then(
235
  fn=process_input,
236
- inputs=[{"image": image_input, "audio": audio_input, "video": video_input, "text": additional_text},
237
  chatbot, voice_type, enable_audio_output],
238
  outputs=[chatbot, text_output, audio_output]
239
  )
 
33
  }
34
 
35
  @spaces.GPU
36
+ def process_input(image, audio, video, text, chat_history, voice_type, enable_audio_output):
37
+ # Combine multimodal inputs
38
+ user_input = {
39
+ "text": text,
40
+ "image": image,
41
+ "audio": audio,
42
+ "video": video
43
+ }
44
+
45
  # Prepare conversation history
46
  conversation = [SYSTEM_PROMPT]
47
 
 
221
  queue=False
222
  ).then(
223
  fn=process_input,
224
+ inputs=[None, None, None, text_input, chatbot, voice_type, enable_audio_output],
225
  outputs=[chatbot, text_output, audio_output]
226
  )
227
 
 
241
  queue=False
242
  ).then(
243
  fn=process_input,
244
+ inputs=[image_input, audio_input, video_input, additional_text,
245
  chatbot, voice_type, enable_audio_output],
246
  outputs=[chatbot, text_output, audio_output]
247
  )