import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from gtts import gTTS
import whisper
import gradio as gr
from PIL import Image
import time

# Set device for computation (use GPU if available, otherwise CPU)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load the Whisper speech-to-text model ("small" balances speed and accuracy)
whisper_model = whisper.load_model("small", device=DEVICE)

# Load BLIP model and processor for image captioning
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(DEVICE)

# Function to transcribe audio to text using Whisper
def transcribe(audio_path):
    start_time = time.time()
    try:
        audio = whisper.load_audio(audio_path)
        audio = whisper.pad_or_trim(audio)  # Whisper decodes fixed 30-second windows
        mel = whisper.log_mel_spectrogram(audio).to(DEVICE)
        # fp16 decoding is only supported on GPU; fall back to fp32 on CPU
        options = whisper.DecodingOptions(language="en", fp16=(DEVICE == "cuda"))
        result = whisper.decode(whisper_model, mel, options)
        print(f"Transcription completed in {time.time() - start_time:.2f} seconds")
        return result.text
    except Exception as e:
        print(f"Error in transcribing audio: {e}")
        return "Error in transcribing audio"

# Function to generate image captions using BLIP
def img2txt(image_path, input_text):
    try:
        # Note: input_text (the transcribed query) is not used by base BLIP
        # captioning; it stays in the signature for a prompt-aware model later
        image = Image.open(image_path).convert("RGB")
        inputs = processor(images=image, return_tensors="pt").to(DEVICE)
        out = model.generate(**inputs)
        caption = processor.decode(out[0], skip_special_tokens=True)
        return caption
    except Exception as e:
        print(f"Error in generating image caption: {e}")
        return "Error in generating image caption"

# Function to convert text to speech using gTTS
def text_to_speech(text, output_path="output.mp3"):
    try:
        tts = gTTS(text=text, lang='en', slow=False)
        tts.save(output_path)
        return output_path
    except Exception as e:
        print(f"Error in converting text to speech: {e}")
        return None  # Gradio's Audio output treats None as "no audio"

# Gradio interface function that processes both audio and image inputs
def process_inputs(audio, image_path):
    try:
        # Process audio: convert speech to text (audio is None if nothing was recorded)
        speech_to_text_output = transcribe(audio) if audio else "No audio provided."

        # Process image with transcribed text
        if image_path:
            image_caption = img2txt(image_path, speech_to_text_output)
        else:
            image_caption = "No image provided."

        # Convert the generated text into speech (audio output)
        audio_output_path = text_to_speech(image_caption, "Temp.mp3")
        return speech_to_text_output, image_caption, audio_output_path
    except Exception as e:
        print(f"Error in process_inputs: {e}")
        return "Error", "Error", None  # None keeps the audio output empty on failure

# Create Gradio interface
iface = gr.Interface(
    fn=process_inputs,
    inputs=[
        gr.Audio(type="filepath", label="Record Audio"),  # Use microphone for audio recording
        gr.Image(type="filepath", label="Upload an Image")
    ],
    outputs=[
        gr.Textbox(label="Speech to Text"),  # Output the transcribed text
        gr.Textbox(label="Image Description (BLIP Output)"),  # Output the image caption
        gr.Audio(label="Assistant's Response")  # Audio output of the assistant's response
    ],
    title="Multimodal Assistant: Speech, Text, and Image Interaction",
    description="Interact with the assistant by recording audio and uploading an image. The assistant will describe the image and respond to your query in audio."
)

# Launch the Gradio interface when the script is run directly
if __name__ == "__main__":
    iface.launch(debug=True)
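
# To expose a temporary public URL (useful on Colab or a remote machine),
# launch with share=True as well:
# iface.launch(debug=True, share=True)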