Spaces:
Runtime error
Runtime error
Refactor the process_input function in app.py to accept multimodal inputs (image, audio, video, text), and update the event-handler inputs in create_demo to match the new signature.
Browse files
app.py
CHANGED
@@ -33,7 +33,15 @@ VOICE_OPTIONS = {
|
|
33 |
}
|
34 |
|
35 |
@spaces.GPU
|
36 |
-
def process_input(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
# Prepare conversation history
|
38 |
conversation = [SYSTEM_PROMPT]
|
39 |
|
@@ -213,7 +221,7 @@ def create_demo():
|
|
213 |
queue=False
|
214 |
).then(
|
215 |
fn=process_input,
|
216 |
-
inputs=[text_input, chatbot, voice_type, enable_audio_output],
|
217 |
outputs=[chatbot, text_output, audio_output]
|
218 |
)
|
219 |
|
@@ -233,7 +241,7 @@ def create_demo():
|
|
233 |
queue=False
|
234 |
).then(
|
235 |
fn=process_input,
|
236 |
-
inputs=[
|
237 |
chatbot, voice_type, enable_audio_output],
|
238 |
outputs=[chatbot, text_output, audio_output]
|
239 |
)
|
|
|
33 |
}
|
34 |
|
35 |
@spaces.GPU
|
36 |
+
def process_input(image, audio, video, text, chat_history, voice_type, enable_audio_output):
|
37 |
+
# Combine multimodal inputs
|
38 |
+
user_input = {
|
39 |
+
"text": text,
|
40 |
+
"image": image,
|
41 |
+
"audio": audio,
|
42 |
+
"video": video
|
43 |
+
}
|
44 |
+
|
45 |
# Prepare conversation history
|
46 |
conversation = [SYSTEM_PROMPT]
|
47 |
|
|
|
221 |
queue=False
|
222 |
).then(
|
223 |
fn=process_input,
|
224 |
+
inputs=[None, None, None, text_input, chatbot, voice_type, enable_audio_output],
|
225 |
outputs=[chatbot, text_output, audio_output]
|
226 |
)
|
227 |
|
|
|
241 |
queue=False
|
242 |
).then(
|
243 |
fn=process_input,
|
244 |
+
inputs=[image_input, audio_input, video_input, additional_text,
|
245 |
chatbot, voice_type, enable_audio_output],
|
246 |
outputs=[chatbot, text_output, audio_output]
|
247 |
)
|