# Multimodel / app.py
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from gtts import gTTS
import whisper
import gradio as gr
import nltk
from PIL import Image
import time
# Set device for computation (use GPU if available, otherwise CPU)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Load the Whisper speech-recognition model ("small") onto the selected device
whisper_model = whisper.load_model("small", device=DEVICE)
# Load BLIP model and processor for image captioning, placing the model on the same device
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(DEVICE)
# Download NLTK tokenizer data (punkt); nothing below calls NLTK directly, but the download is harmless
nltk.download('punkt')
# Function to transcribe audio to text using Whisper
def transcribe(audio_path):
    start_time = time.time()
    try:
        audio = whisper.load_audio(audio_path)
        audio = whisper.pad_or_trim(audio)  # Whisper decodes fixed 30-second windows
        mel = whisper.log_mel_spectrogram(audio).to(DEVICE)
        # Force English, and enable fp16 only on GPU (fp16 decoding fails on CPU)
        options = whisper.DecodingOptions(language="en", fp16=(DEVICE == "cuda"))
        result = whisper.decode(whisper_model, mel, options)
        print(f"Transcription completed in {time.time() - start_time:.2f} seconds")
        return result.text
    except Exception as e:
        print(f"Error in transcribing audio: {e}")
        return "Error in transcribing audio"
# Function to generate image captions using BLIP; input_text is accepted for
# interface compatibility but unconditional captioning is used here (a conditional
# variant is sketched below)
def img2txt(image_path, input_text):
    try:
        image = Image.open(image_path).convert("RGB")  # BLIP expects 3-channel RGB input
        inputs = processor(images=image, return_tensors="pt").to(DEVICE)
        out = model.generate(**inputs)
        caption = processor.decode(out[0], skip_special_tokens=True)
        return caption
    except Exception as e:
        print(f"Error in generating image caption: {e}")
        return "Error in generating image caption"
# Function to convert text to speech using gTTS
def text_to_speech(text, output_path="output.mp3"):
    try:
        tts = gTTS(text=text, lang='en', slow=False)
        tts.save(output_path)
        return output_path
    except Exception as e:
        print(f"Error in converting text to speech: {e}")
        return None  # Gradio's Audio output expects a filepath or None, not an error string
# Gradio handler that processes both the audio and image inputs
def process_inputs(audio, image_path):
    try:
        # Step 1: convert speech to text
        speech_to_text_output = transcribe(audio)
        # Step 2: caption the image (the transcription is passed along for context)
        if image_path:
            image_caption = img2txt(image_path, speech_to_text_output)
        else:
            image_caption = "No image provided."
        # Step 3: convert the caption into speech for the audio response
        audio_output_path = text_to_speech(image_caption, "Temp.mp3")
        return speech_to_text_output, image_caption, audio_output_path
    except Exception as e:
        print(f"Error in process_inputs: {e}")
        return "Error", "Error", None  # None keeps the Audio output component valid
# Create Gradio interface
iface = gr.Interface(
    fn=process_inputs,
    inputs=[
        gr.Audio(type="filepath", label="Record Audio"),  # Recorded or uploaded audio, passed as a filepath
        gr.Image(type="filepath", label="Upload an Image")
    ],
    outputs=[
        gr.Textbox(label="Speech to Text"),  # Transcribed text
        gr.Textbox(label="Image Description (BLIP Output)"),  # Generated image caption
        gr.Audio(label="Assistant's Response")  # Spoken version of the caption
    ],
    title="Multimodal Assistant: Speech, Text, and Image Interaction",
    description="Interact with the assistant by recording audio and uploading an image. The assistant describes the image and responds to your query as audio."
)
# Launch the Gradio interface
iface.launch(debug=True)
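# When running outside Hugging Face Spaces, launch options such as a public share
# link or an explicit host/port can be passed instead (these values are assumptions):
# iface.launch(debug=True, share=True, server_name="0.0.0.0", server_port=7860)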