# Multimodel / app.py
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from gtts import gTTS
import whisper
import gradio as gr
import nltk
from PIL import Image
import time
# Set device for computation (use GPU if available, otherwise CPU)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Load the Whisper speech-recognition model ("small") onto the selected device
whisper_model = whisper.load_model("small", device=DEVICE)
# Load BLIP model and processor for image captioning, placing the model on the same device
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(DEVICE)
# Download NLTK tokenizer data (punkt); nothing below calls NLTK directly, but the download is harmless
nltk.download('punkt')
# Function to transcribe audio to text using Whisper
def transcribe(audio_path):
    start_time = time.time()
    try:
        audio = whisper.load_audio(audio_path)
        audio = whisper.pad_or_trim(audio)  # Whisper decodes fixed 30-second windows
        mel = whisper.log_mel_spectrogram(audio).to(DEVICE)
        # Force English, and enable fp16 only on GPU (fp16 decoding fails on CPU)
        options = whisper.DecodingOptions(language="en", fp16=(DEVICE == "cuda"))
        result = whisper.decode(whisper_model, mel, options)
        print(f"Transcription completed in {time.time() - start_time:.2f} seconds")
        return result.text
    except Exception as e:
        print(f"Error in transcribing audio: {e}")
        return "Error in transcribing audio"
# Function to generate image captions using BLIP; input_text is accepted for
# interface compatibility but unconditional captioning is used here (a conditional
# variant is sketched below)
def img2txt(image_path, input_text):
    try:
        image = Image.open(image_path).convert("RGB")  # BLIP expects 3-channel RGB input
        inputs = processor(images=image, return_tensors="pt").to(DEVICE)
        out = model.generate(**inputs)
        caption = processor.decode(out[0], skip_special_tokens=True)
        return caption
    except Exception as e:
        print(f"Error in generating image caption: {e}")
        return "Error in generating image caption"
# Function to convert text to speech using gTTS
def text_to_speech(text, output_path="output.mp3"):
    try:
        tts = gTTS(text=text, lang='en', slow=False)
        tts.save(output_path)
        return output_path
    except Exception as e:
        print(f"Error in converting text to speech: {e}")
        return None  # Gradio's Audio output expects a filepath or None, not an error string
# Gradio handler that processes both the audio and image inputs
def process_inputs(audio, image_path):
    try:
        # Step 1: convert speech to text
        speech_to_text_output = transcribe(audio)
        # Step 2: caption the image (the transcription is passed along for context)
        if image_path:
            image_caption = img2txt(image_path, speech_to_text_output)
        else:
            image_caption = "No image provided."
        # Step 3: convert the caption into speech for the audio response
        audio_output_path = text_to_speech(image_caption, "Temp.mp3")
        return speech_to_text_output, image_caption, audio_output_path
    except Exception as e:
        print(f"Error in process_inputs: {e}")
        return "Error", "Error", None  # None keeps the Audio output component valid
# Create Gradio interface
iface = gr.Interface(
    fn=process_inputs,
    inputs=[
        gr.Audio(type="filepath", label="Record Audio"),  # Recorded or uploaded audio, passed as a filepath
        gr.Image(type="filepath", label="Upload an Image")
    ],
    outputs=[
        gr.Textbox(label="Speech to Text"),  # Transcribed text
        gr.Textbox(label="Image Description (BLIP Output)"),  # Generated image caption
        gr.Audio(label="Assistant's Response")  # Spoken version of the caption
    ],
    title="Multimodal Assistant: Speech, Text, and Image Interaction",
    description="Interact with the assistant by recording audio and uploading an image. The assistant describes the image and responds to your query as audio."
)
# Launch the Gradio interface
iface.launch(debug=True)
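# When running outside Hugging Face Spaces, launch options such as a public share
# link or an explicit host/port can be passed instead (these values are assumptions):
# iface.launch(debug=True, share=True, server_name="0.0.0.0", server_port=7860)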