import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from gtts import gTTS
import whisper
import gradio as gr
from PIL import Image
import time
# Set device for computation (use GPU if available, otherwise CPU)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Load the Whisper speech-recognition model
whisper_model = whisper.load_model("small", device=DEVICE)
# Load the BLIP processor and model for image captioning, moving the model to DEVICE
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(DEVICE)
# Transcribe audio to text using Whisper
def transcribe(audio_path):
    start_time = time.time()
    try:
        audio = whisper.load_audio(audio_path)
        audio = whisper.pad_or_trim(audio)  # Pad or trim to Whisper's 30-second window
        mel = whisper.log_mel_spectrogram(audio).to(DEVICE)
        # Detect the spoken language (informational only; decoding is forced to English below)
        _, probs = whisper_model.detect_language(mel)
        print(f"Detected language: {max(probs, key=probs.get)}")
        # fp16 decoding is only supported on GPU, so fall back to fp32 on CPU
        options = whisper.DecodingOptions(language="en", fp16=(DEVICE == "cuda"))
        result = whisper.decode(whisper_model, mel, options)
        print(f"Transcription completed in {time.time() - start_time:.2f} seconds")
        return result.text
    except Exception as e:
        print(f"Error in transcribing audio: {e}")
        return "Error in transcribing audio"
# Generate an image caption using BLIP. input_text is accepted so the Gradio
# handler can pass the transcription through, but the caption itself is
# generated unconditionally (see the conditional sketch below).
def img2txt(image_path, input_text):
    try:
        image = Image.open(image_path).convert("RGB")  # BLIP expects RGB input
        inputs = processor(images=image, return_tensors="pt").to(DEVICE)
        out = model.generate(**inputs)
        caption = processor.decode(out[0], skip_special_tokens=True)
        return caption
    except Exception as e:
        print(f"Error in generating image caption: {e}")
        return "Error in generating image caption"
# Convert text to speech using gTTS and save the result as an MP3 file
def text_to_speech(text, output_path="output.mp3"):
    try:
        tts = gTTS(text=text, lang='en', slow=False)
        tts.save(output_path)
        return output_path
    except Exception as e:
        print(f"Error in converting text to speech: {e}")
        return None  # gr.Audio treats None as "no audio to play"
# Gradio handler: transcribe the audio, caption the image, then speak the caption
def process_inputs(audio, image_path):
    try:
        # Speech to text (the Audio component passes None if nothing was recorded)
        if audio:
            speech_to_text_output = transcribe(audio)
        else:
            speech_to_text_output = "No audio provided."
        # Caption the image, passing the transcription through
        if image_path:
            image_caption = img2txt(image_path, speech_to_text_output)
        else:
            image_caption = "No image provided."
        # Convert the caption into speech for the audio output
        audio_output_path = text_to_speech(image_caption, "Temp.mp3")
        return speech_to_text_output, image_caption, audio_output_path
    except Exception as e:
        print(f"Error in process_inputs: {e}")
        return "Error", "Error", None  # None keeps gr.Audio from receiving a bogus file path
# Create the Gradio interface
iface = gr.Interface(
    fn=process_inputs,
    inputs=[
        gr.Audio(type="filepath", label="Record Audio"),  # Accepts recorded or uploaded audio
        gr.Image(type="filepath", label="Upload an Image")
    ],
    outputs=[
        gr.Textbox(label="Speech to Text"),  # The transcribed text
        gr.Textbox(label="Image Description (BLIP Output)"),  # The image caption
        gr.Audio(label="Assistant's Response")  # Spoken version of the caption
    ],
    title="Multimodal Assistant: Speech, Text, and Image Interaction",
    description="Interact with the assistant by recording audio and uploading an image. The assistant will describe the image and respond to your query in audio."
)
# Launch the Gradio interface
iface.launch(debug=True)
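# When running locally, Gradio can also expose a temporary public URL:
# iface.launch(debug=True, share=True)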