aparna29 committed on
Commit
4331b58
·
verified ·
1 Parent(s): 434734f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -12
app.py CHANGED
@@ -8,10 +8,10 @@ from PIL import Image
8
  import time
9
  import os
10
 
11
- # Set device for computation
12
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
13
 
14
- # Load Whisper model (use a smaller model for faster performance)
15
  whisper_model = whisper.load_model("small", device=DEVICE)
16
 
17
  # Load BLIP model and processor for image captioning
@@ -26,7 +26,7 @@ def transcribe(audio_path):
26
  start_time = time.time()
27
  try:
28
  audio = whisper.load_audio(audio_path)
29
- audio = whisper.pad_or_trim(audio[:16000 * 30]) # Limit to 30 seconds
30
  mel = whisper.log_mel_spectrogram(audio).to(DEVICE)
31
  _, probs = whisper_model.detect_language(mel)
32
  options = whisper.DecodingOptions(language="en", fp16=True)
@@ -62,18 +62,18 @@ def text_to_speech(text, output_path="output.mp3"):
62
  # Gradio interface function that processes both audio and image inputs
63
  def process_inputs(audio, image_path):
64
  try:
65
- # Process audio
66
  speech_to_text_output = transcribe(audio)
67
 
68
  # Process image with transcribed text
69
  if image_path:
70
- chatgpt_output = img2txt(image_path, speech_to_text_output)
71
  else:
72
- chatgpt_output = "No image provided."
73
 
74
- # Convert text to speech
75
- audio_output_path = text_to_speech(chatgpt_output, "Temp.mp3")
76
- return speech_to_text_output, chatgpt_output, audio_output_path
77
  except Exception as e:
78
  print(f"Error in process_inputs: {e}")
79
  return "Error", "Error", "Error"
@@ -86,9 +86,9 @@ iface = gr.Interface(
86
  gr.Image(type="filepath", label="Upload an Image")
87
  ],
88
  outputs=[
89
- gr.Textbox(label="Speech to Text"),
90
- gr.Textbox(label="Image Description (BLIP Output)"),
91
- gr.Audio(label="Assistant's Response")
92
  ],
93
  title="Multimodal Assistant: Speech, Text, and Image Interaction",
94
  description="Interact with the assistant by recording audio and uploading an image. The assistant will describe the image and respond to your query in audio."
@@ -96,3 +96,4 @@ iface = gr.Interface(
96
 
97
  # Launch the Gradio interface
98
  iface.launch(debug=True)
 
 
8
  import time
9
  import os
10
 
11
+ # Set device for computation (use GPU if available, otherwise CPU)
12
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
13
 
14
+ # Ensure Whisper model is loaded correctly
15
  whisper_model = whisper.load_model("small", device=DEVICE)
16
 
17
  # Load BLIP model and processor for image captioning
 
26
  start_time = time.time()
27
  try:
28
  audio = whisper.load_audio(audio_path)
29
+ audio = whisper.pad_or_trim(audio[:16000 * 30]) # Limit to 30 seconds of audio
30
  mel = whisper.log_mel_spectrogram(audio).to(DEVICE)
31
  _, probs = whisper_model.detect_language(mel)
32
  options = whisper.DecodingOptions(language="en", fp16=True)
 
# Gradio interface function that processes both audio and image inputs
def process_inputs(audio, image_path):
    """Run the full multimodal pipeline for one Gradio request.

    Parameters
    ----------
    audio :
        Audio input forwarded to ``transcribe`` (presumably a filepath,
        since ``transcribe`` hands it to ``whisper.load_audio`` — confirm
        against the gr.Audio input config).
    image_path : str or None
        Filepath of the uploaded image, or falsy when no image was given.

    Returns
    -------
    tuple
        ``(speech_to_text_output, image_caption, audio_output_path)``.
        On failure the two text slots are ``"Error"`` and the audio slot
        is ``None`` so the ``gr.Audio`` output stays empty.
    """
    try:
        # Process audio: convert speech to text.
        speech_to_text_output = transcribe(audio)

        # Process image with the transcribed text as the query.
        if image_path:
            image_caption = img2txt(image_path, speech_to_text_output)
        else:
            image_caption = "No image provided."

        # Convert the generated text into speech (audio output).
        audio_output_path = text_to_speech(image_caption, "Temp.mp3")
        return speech_to_text_output, image_caption, audio_output_path
    except Exception as e:
        print(f"Error in process_inputs: {e}")
        # Fix: return None (not the string "Error") for the audio slot —
        # gr.Audio interprets its value as a filepath, and a bogus path
        # "Error" would make the output component fail to load.
        return "Error", "Error", None
 
86
  gr.Image(type="filepath", label="Upload an Image")
87
  ],
88
  outputs=[
89
+ gr.Textbox(label="Speech to Text"), # Output the transcribed text
90
+ gr.Textbox(label="Image Description (BLIP Output)"), # Output the image caption
91
+ gr.Audio(label="Assistant's Response") # Audio output of the assistant's response
92
  ],
93
  title="Multimodal Assistant: Speech, Text, and Image Interaction",
94
  description="Interact with the assistant by recording audio and uploading an image. The assistant will describe the image and respond to your query in audio."
 
96
 
97
  # Launch the Gradio interface
98
  iface.launch(debug=True)
99
+