import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from gtts import gTTS
import whisper
import gradio as gr
import nltk
from PIL import Image
import time

# Set device for computation (use GPU if available, otherwise CPU)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load the Whisper speech-recognition model
whisper_model = whisper.load_model("small", device=DEVICE)

# Load the BLIP processor and captioning model, moving the model to the same device
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base"
).to(DEVICE)

# Download necessary NLTK data
nltk.download("punkt")


# Transcribe audio to text using Whisper
def transcribe(audio_path):
    start_time = time.time()
    try:
        audio = whisper.load_audio(audio_path)
        # pad_or_trim already limits the input to Whisper's 30-second window
        audio = whisper.pad_or_trim(audio)
        mel = whisper.log_mel_spectrogram(audio).to(DEVICE)
        # Language is fixed to English, so no language detection is needed;
        # fp16 decoding is only supported on GPU, so fall back to fp32 on CPU
        options = whisper.DecodingOptions(language="en", fp16=(DEVICE == "cuda"))
        result = whisper.decode(whisper_model, mel, options)
        print(f"Transcription completed in {time.time() - start_time:.2f} seconds")
        return result.text
    except Exception as e:
        print(f"Error in transcribing audio: {e}")
        return "Error in transcribing audio"


# Generate an image caption using BLIP
def img2txt(image_path, input_text):
    try:
        image = Image.open(image_path).convert("RGB")
        # input_text (the transcribed query) is accepted for interface
        # compatibility; the base BLIP model generates an unconditional caption
        inputs = processor(images=image, return_tensors="pt").to(DEVICE)
        out = model.generate(**inputs)
        caption = processor.decode(out[0], skip_special_tokens=True)
        return caption
    except Exception as e:
        print(f"Error in generating image caption: {e}")
        return "Error in generating image caption"


# Convert text to speech using gTTS
def text_to_speech(text, output_path="output.mp3"):
    try:
        tts = gTTS(text=text, lang="en", slow=False)
        tts.save(output_path)
        return output_path
    except Exception as e:
        print(f"Error in converting text to speech: {e}")
        return None  # Gradio's Audio output accepts None when no file is available


# Gradio interface function that processes both audio and image inputs
def process_inputs(audio, image_path):
    try:
        # Process audio: convert speech to text
        speech_to_text_output = transcribe(audio)

        # Process image alongside the transcribed text
        if image_path:
            image_caption = img2txt(image_path, speech_to_text_output)
        else:
            image_caption = "No image provided."

        # Convert the generated text into speech (audio output)
        audio_output_path = text_to_speech(image_caption, "Temp.mp3")

        return speech_to_text_output, image_caption, audio_output_path
    except Exception as e:
        print(f"Error in process_inputs: {e}")
        return "Error", "Error", None  # None keeps the Audio output valid


# Create the Gradio interface
iface = gr.Interface(
    fn=process_inputs,
    inputs=[
        # sources=["microphone"] enables in-browser recording (Gradio 4.x API)
        gr.Audio(sources=["microphone"], type="filepath", label="Record Audio"),
        gr.Image(type="filepath", label="Upload an Image"),
    ],
    outputs=[
        gr.Textbox(label="Speech to Text"),  # Transcribed text
        gr.Textbox(label="Image Description (BLIP Output)"),  # Image caption
        gr.Audio(label="Assistant's Response"),  # Spoken response
    ],
    title="Multimodal Assistant: Speech, Text, and Image Interaction",
    description=(
        "Interact with the assistant by recording audio and uploading an image. "
        "The assistant will describe the image and respond to your query in audio."
    ),
)

# Launch the Gradio interface
if __name__ == "__main__":
    iface.launch(debug=True)
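
# --- Optional headless smoke test (a minimal sketch, not part of the original
# script). "sample.wav" and "sample.jpg" are hypothetical placeholders for a
# local recording and image; uncomment to exercise the full pipeline
# (transcribe -> caption -> text-to-speech) without launching the UI:
#
# transcript, caption, reply_audio = process_inputs("sample.wav", "sample.jpg")
# print("Transcript:", transcript)
# print("Caption:", caption)
# print("Assistant audio saved to:", reply_audio)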