Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -8,10 +8,10 @@ from PIL import Image
|
|
8 |
import time
|
9 |
import os
|
10 |
|
11 |
-
# Set device for computation
|
12 |
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
13 |
|
14 |
-
#
|
15 |
whisper_model = whisper.load_model("small", device=DEVICE)
|
16 |
|
17 |
# Load BLIP model and processor for image captioning
|
@@ -26,7 +26,7 @@ def transcribe(audio_path):
|
|
26 |
start_time = time.time()
|
27 |
try:
|
28 |
audio = whisper.load_audio(audio_path)
|
29 |
-
audio = whisper.pad_or_trim(audio[:16000 * 30]) # Limit to 30 seconds
|
30 |
mel = whisper.log_mel_spectrogram(audio).to(DEVICE)
|
31 |
_, probs = whisper_model.detect_language(mel)
|
32 |
options = whisper.DecodingOptions(language="en", fp16=True)
|
@@ -62,18 +62,18 @@ def text_to_speech(text, output_path="output.mp3"):
|
|
62 |
# Gradio interface function that processes both audio and image inputs
|
63 |
def process_inputs(audio, image_path):
|
64 |
try:
|
65 |
-
# Process audio
|
66 |
speech_to_text_output = transcribe(audio)
|
67 |
|
68 |
# Process image with transcribed text
|
69 |
if image_path:
|
70 |
-
|
71 |
else:
|
72 |
-
|
73 |
|
74 |
-
# Convert text
|
75 |
-
audio_output_path = text_to_speech(
|
76 |
-
return speech_to_text_output,
|
77 |
except Exception as e:
|
78 |
print(f"Error in process_inputs: {e}")
|
79 |
return "Error", "Error", "Error"
|
@@ -86,9 +86,9 @@ iface = gr.Interface(
|
|
86 |
gr.Image(type="filepath", label="Upload an Image")
|
87 |
],
|
88 |
outputs=[
|
89 |
-
gr.Textbox(label="Speech to Text"),
|
90 |
-
gr.Textbox(label="Image Description (BLIP Output)"),
|
91 |
-
gr.Audio(label="Assistant's Response")
|
92 |
],
|
93 |
title="Multimodal Assistant: Speech, Text, and Image Interaction",
|
94 |
description="Interact with the assistant by recording audio and uploading an image. The assistant will describe the image and respond to your query in audio."
|
@@ -96,3 +96,4 @@ iface = gr.Interface(
|
|
96 |
|
97 |
# Launch the Gradio interface
|
98 |
iface.launch(debug=True)
|
|
|
|
8 |
import time
|
9 |
import os
|
10 |
|
11 |
+
# Set device for computation (use GPU if available, otherwise CPU)
|
12 |
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
13 |
|
14 |
+
# Load the Whisper "small" speech-recognition model onto the selected device
|
15 |
whisper_model = whisper.load_model("small", device=DEVICE)
|
16 |
|
17 |
# Load BLIP model and processor for image captioning
|
|
|
26 |
start_time = time.time()
|
27 |
try:
|
28 |
audio = whisper.load_audio(audio_path)
|
29 |
+
audio = whisper.pad_or_trim(audio[:16000 * 30]) # Limit to 30 seconds of audio
|
30 |
mel = whisper.log_mel_spectrogram(audio).to(DEVICE)
|
31 |
_, probs = whisper_model.detect_language(mel)
|
32 |
options = whisper.DecodingOptions(language="en", fp16=True)
|
|
|
62 |
# Gradio interface function that processes both audio and image inputs
|
63 |
def process_inputs(audio, image_path):
    """Run the speech -> caption -> speech pipeline for one Gradio request.

    The audio is transcribed, the transcript is passed together with the
    image to the captioning helper (when an image was supplied), and the
    resulting caption is converted to speech.

    Args:
        audio: Value handed unchanged to ``transcribe``.
        image_path: File path from the ``gr.Image(type="filepath")`` input;
            falsy when no image was uploaded.

    Returns:
        A 3-tuple ``(speech_to_text_output, image_caption,
        audio_output_path)`` lining up with the interface's three outputs
        (two textboxes and one audio player). If any step raises, the two
        text slots are ``"Error"`` and the audio slot is ``None``.
    """
    try:
        # Process audio: convert speech to text
        speech_to_text_output = transcribe(audio)

        # Process image with transcribed text
        if image_path:
            image_caption = img2txt(image_path, speech_to_text_output)
        else:
            image_caption = "No image provided."

        # Convert the generated text into speech (audio output)
        audio_output_path = text_to_speech(image_caption, "Temp.mp3")
        return speech_to_text_output, image_caption, audio_output_path
    except Exception as e:
        print(f"Error in process_inputs: {e}")
        # The third output feeds a gr.Audio component. A plain string such
        # as "Error" would be interpreted as an audio file path and fail a
        # second time inside Gradio; None leaves the player empty instead.
        return "Error", "Error", None
|
|
|
86 |
gr.Image(type="filepath", label="Upload an Image")
|
87 |
],
|
88 |
outputs=[
|
89 |
+
gr.Textbox(label="Speech to Text"), # Output the transcribed text
|
90 |
+
gr.Textbox(label="Image Description (BLIP Output)"), # Output the image caption
|
91 |
+
gr.Audio(label="Assistant's Response") # Audio output of the assistant's response
|
92 |
],
|
93 |
title="Multimodal Assistant: Speech, Text, and Image Interaction",
|
94 |
description="Interact with the assistant by recording audio and uploading an image. The assistant will describe the image and respond to your query in audio."
|
|
|
96 |
|
97 |
# Launch the Gradio interface
|
98 |
iface.launch(debug=True)
|
99 |
+
|