import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from gtts import gTTS
import whisper
import gradio as gr
import nltk
from PIL import Image
import time
import os
# Set device for computation (use GPU if available, otherwise CPU)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Load the Whisper speech-to-text model
whisper_model = whisper.load_model("small", device=DEVICE)
# Load BLIP model and processor for image captioning
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(DEVICE)
# Download necessary NLTK data
nltk.download('punkt')
# Function to transcribe audio to text using Whisper
def transcribe(audio_path):
    start_time = time.time()
    try:
        audio = whisper.load_audio(audio_path)
        audio = whisper.pad_or_trim(audio)  # Pad or trim to Whisper's 30-second window
        mel = whisper.log_mel_spectrogram(audio).to(DEVICE)
        _, probs = whisper_model.detect_language(mel)  # Detected language is informational only
        # fp16 is only supported on GPU; fall back to fp32 on CPU
        options = whisper.DecodingOptions(language="en", fp16=(DEVICE == "cuda"))
        result = whisper.decode(whisper_model, mel, options)
        print(f"Transcription completed in {time.time() - start_time:.2f} seconds")
        return result.text
    except Exception as e:
        print(f"Error in transcribing audio: {e}")
        return "Error in transcribing audio"
# Function to generate image captions using BLIP
def img2txt(image_path, input_text):
    try:
        # input_text (the transcribed speech) is accepted for context but not used here;
        # BLIP generates an unconditional caption for the image.
        image = Image.open(image_path).convert("RGB")
        inputs = processor(images=image, return_tensors="pt").to(DEVICE)
        out = model.generate(**inputs)
        caption = processor.decode(out[0], skip_special_tokens=True)
        return caption
    except Exception as e:
        print(f"Error in generating image caption: {e}")
        return "Error in generating image caption"
# Function to convert text to speech using gTTS
def text_to_speech(text, output_path="output.mp3"):
    try:
        tts = gTTS(text=text, lang='en', slow=False)
        tts.save(output_path)
        return output_path
    except Exception as e:
        print(f"Error in converting text to speech: {e}")
        return "Error in text-to-speech conversion"
# Gradio interface function that processes both audio and image inputs
def process_inputs(audio, image_path):
    try:
        # Process audio: convert speech to text (Gradio passes None if nothing was recorded)
        if audio is None:
            speech_to_text_output = "No audio provided."
        else:
            speech_to_text_output = transcribe(audio)
        # Process image with the transcribed text
        if image_path:
            image_caption = img2txt(image_path, speech_to_text_output)
        else:
            image_caption = "No image provided."
        # Convert the generated caption into speech (audio output)
        audio_output_path = text_to_speech(image_caption, "Temp.mp3")
        return speech_to_text_output, image_caption, audio_output_path
    except Exception as e:
        print(f"Error in process_inputs: {e}")
        return "Error", "Error", "Error"
# Create Gradio interface
iface = gr.Interface(
    fn=process_inputs,
    inputs=[
        gr.Audio(type="filepath", label="Record Audio"),        # Audio input, passed to the function as a file path
        gr.Image(type="filepath", label="Upload an Image")
    ],
    outputs=[
        gr.Textbox(label="Speech to Text"),                      # Transcribed text
        gr.Textbox(label="Image Description (BLIP Output)"),     # Generated image caption
        gr.Audio(label="Assistant's Response")                   # Spoken version of the caption
    ],
    title="Multimodal Assistant: Speech, Text, and Image Interaction",
    description="Interact with the assistant by recording audio and uploading an image. The assistant will describe the image and respond to your query with audio."
)
# Launch the Gradio interface
iface.launch(debug=True)
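# Note: when running locally rather than on Hugging Face Spaces, a temporary public
# URL can be obtained by launching with share=True, e.g. iface.launch(debug=True, share=True)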