File size: 2,794 Bytes
9ad8324
 
 
83ed3dd
 
9ad8324
 
 
 
83ed3dd
9ad8324
83ed3dd
9ad8324
83ed3dd
9ad8324
83ed3dd
 
9ad8324
83ed3dd
9ad8324
83ed3dd
 
9ad8324
83ed3dd
 
 
 
 
9ad8324
 
83ed3dd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import torch
import gradio as gr
from PIL import Image
import numpy as np
import os

# Use a pipeline as a high-level helper
from transformers import pipeline

# Set device: prefer GPU when available; both pipelines accept this device string
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Initialize the pipelines
# NOTE: constructed at import time, so both models are downloaded/loaded
# before the Gradio UI starts serving.
caption_image = pipeline("image-to-text",
                       model="Salesforce/blip-image-captioning-large",
                       device=device)

# Using a different TTS model that's more stable
narrator = pipeline("text-to-speech",
                   model="microsoft/speecht5_tts",
                   device=device)

def ensure_output_dir():
    """Return the path to ~/AudioCaptions, creating the directory if absent."""
    home = os.path.expanduser("~")
    audio_dir = os.path.join(home, "AudioCaptions")
    os.makedirs(audio_dir, exist_ok=True)
    return audio_dir

def generate_audio(text):
    """
    Generate speech audio for *text* and save it as a WAV file.

    Returns the path to the written file.
    Raises gr.Error if synthesis or writing fails.
    """
    try:
        # Generate the speech; the pipeline returns a dict with a raw
        # float waveform under "audio" and its rate under "sampling_rate".
        speech = narrator(text)

        # Create output directory and file path
        output_dir = ensure_output_dir()
        output_path = os.path.join(output_dir, "caption_audio.wav")

        # BUG FIX: speech["audio"] is a raw numpy float array, not an
        # encoded WAV byte stream. Writing it directly with f.write()
        # produced a headerless, unplayable file and discarded the
        # sampling rate. Encode it as a proper WAV instead.
        from scipy.io import wavfile
        audio = np.asarray(speech["audio"]).squeeze()
        wavfile.write(output_path, int(speech["sampling_rate"]), audio)

        return output_path
    except Exception as e:
        print(f"Error generating audio: {str(e)}")
        raise gr.Error(f"Failed to generate audio: {str(e)}")

def caption_my_image(image):
    """
    Generate a caption for *image* and convert it to speech.

    Returns [audio_path, caption_text] for the two Gradio outputs.
    Raises gr.Error when no image is supplied or processing fails.
    """
    try:
        if image is None:
            raise gr.Error("Please upload an image")

        # Generate caption
        captions = caption_image(images=image)
        if not captions:
            raise gr.Error("Could not generate caption for this image")

        caption_text = captions[0]['generated_text']
        print(f"Generated caption: {caption_text}")

        # Generate audio from caption
        audio_path = generate_audio(caption_text)

        return [audio_path, caption_text]
    except gr.Error:
        # BUG FIX: gr.Error messages raised above (e.g. "Please upload an
        # image") were previously caught by the generic handler below and
        # re-wrapped as "Failed to process image: ..." — propagate them as-is.
        raise
    except Exception as e:
        print(f"Error in caption_my_image: {str(e)}")
        raise gr.Error(f"Failed to process image: {str(e)}")

# Create the Gradio interface:
# one image input -> (audio narration, text caption) via caption_my_image.
demo = gr.Interface(
    fn=caption_my_image,
    inputs=[
        gr.Image(label="Upload Image", type="pil")  # passed to the BLIP pipeline as a PIL image
    ],
    outputs=[
        gr.Audio(label="Generated Audio"),
        gr.Textbox(label="Generated Caption")
    ],
    title="Image Captioning with Audio",
    description="""
    Upload an image and the application will:
    1. Generate a descriptive caption for the image
    2. Convert the caption to speech
    """,
    examples=[],
    cache_examples=False
)

# Launch only when run as a script (not when imported, e.g. by tooling)
if __name__ == "__main__":
    demo.launch()