Spaces:

aparna29
/

Multimodel

Sleeping

App Files Files Community

aparna29 committed on Mar 28

Commit

4331b58

verified ·

1 Parent(s): 434734f

Update app.py

Browse files

Files changed (1) hide show

app.py +13 -12

app.py CHANGED Viewed

@@ -8,10 +8,10 @@ from PIL import Image
 import time
 import os
-# Set device for computation
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-# Load Whisper model (use a smaller model for faster performance)
 whisper_model = whisper.load_model("small", device=DEVICE)
 # Load BLIP model and processor for image captioning
@@ -26,7 +26,7 @@ def transcribe(audio_path):
     start_time = time.time()
     try:
         audio = whisper.load_audio(audio_path)
-        audio = whisper.pad_or_trim(audio[:16000 * 30])  # Limit to 30 seconds
         mel = whisper.log_mel_spectrogram(audio).to(DEVICE)
         _, probs = whisper_model.detect_language(mel)
         options = whisper.DecodingOptions(language="en", fp16=True)
@@ -62,18 +62,18 @@ def text_to_speech(text, output_path="output.mp3"):
 # Gradio interface function that processes both audio and image inputs
 def process_inputs(audio, image_path):
     try:
-        # Process audio
         speech_to_text_output = transcribe(audio)
         # Process image with transcribed text
         if image_path:
-            chatgpt_output = img2txt(image_path, speech_to_text_output)
         else:
-            chatgpt_output = "No image provided."
-        # Convert text to speech
-        audio_output_path = text_to_speech(chatgpt_output, "Temp.mp3")
-        return speech_to_text_output, chatgpt_output, audio_output_path
     except Exception as e:
         print(f"Error in process_inputs: {e}")
         return "Error", "Error", "Error"
@@ -86,9 +86,9 @@ iface = gr.Interface(
         gr.Image(type="filepath", label="Upload an Image")
     ],
     outputs=[
-        gr.Textbox(label="Speech to Text"),
-        gr.Textbox(label="Image Description (BLIP Output)"),
-        gr.Audio(label="Assistant's Response")
     ],
     title="Multimodal Assistant: Speech, Text, and Image Interaction",
     description="Interact with the assistant by recording audio and uploading an image. The assistant will describe the image and respond to your query in audio."
@@ -96,3 +96,4 @@ iface = gr.Interface(
 # Launch the Gradio interface
 iface.launch(debug=True)

 import time
 import os
+# Set device for computation (use GPU if available, otherwise CPU)
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+# Ensure Whisper model is loaded correctly
 whisper_model = whisper.load_model("small", device=DEVICE)
 # Load BLIP model and processor for image captioning
     start_time = time.time()
     try:
         audio = whisper.load_audio(audio_path)
+        audio = whisper.pad_or_trim(audio[:16000 * 30])  # Limit to 30 seconds of audio
         mel = whisper.log_mel_spectrogram(audio).to(DEVICE)
         _, probs = whisper_model.detect_language(mel)
         options = whisper.DecodingOptions(language="en", fp16=True)
 # Gradio interface function that processes both audio and image inputs
 def process_inputs(audio, image_path):
     try:
+        # Process audio: Convert speech to text
         speech_to_text_output = transcribe(audio)
         # Process image with transcribed text
         if image_path:
+            image_caption = img2txt(image_path, speech_to_text_output)
         else:
+            image_caption = "No image provided."
+        # Convert the generated text into speech (audio output)
+        audio_output_path = text_to_speech(image_caption, "Temp.mp3")
+        return speech_to_text_output, image_caption, audio_output_path
     except Exception as e:
         print(f"Error in process_inputs: {e}")
         return "Error", "Error", "Error"
         gr.Image(type="filepath", label="Upload an Image")
     ],
     outputs=[
+        gr.Textbox(label="Speech to Text"),  # Output the transcribed text
+        gr.Textbox(label="Image Description (BLIP Output)"),  # Output the image caption
+        gr.Audio(label="Assistant's Response")  # Audio output of the assistant's response
     ],
     title="Multimodal Assistant: Speech, Text, and Image Interaction",
     description="Interact with the assistant by recording audio and uploading an image. The assistant will describe the image and respond to your query in audio."
 # Launch the Gradio interface
 iface.launch(debug=True)