import streamlit as st  # Streamlit for building the web application
from transformers import pipeline  # Hugging Face Transformers pipeline for models
from PIL import Image  # PIL for handling image files
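# Assumed runtime dependencies for this script (not pinned here): streamlit,
# transformers, torch (as the backend for the pipelines), and Pillow. Each
# model is downloaded from the Hugging Face Hub on first use.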

# Function to convert image to text
def img2text(image):
    # Load the image-to-text model
    image_to_text_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
    # Generate a caption for the image
    text = image_to_text_model(image)[0]["generated_text"]
    return text  # Return the generated caption

# Function to generate a story based on the caption
def text2story(text):
    # Load the text generation model
    story_model = pipeline("text-generation", model="gpt2")
    # Generate a story from the caption; note that max_length also counts the prompt tokens
    story_text = story_model(f"Once upon a time, {text}.", max_length=100, num_return_sequences=1)
    return story_text[0]["generated_text"]  # Return the generated story

# Function to convert text to audio
def text2audio(story_text):
    # Load the text-to-speech model
    text_to_audio_model = pipeline("text-to-speech", model="facebook/mms-tts-eng")
    # Generate audio data from the story text
    audio_data = text_to_audio_model(story_text)
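    # The text-to-speech pipeline typically returns a dict with an "audio"
    # numpy array and its "sampling_rate"; both are used for playback below.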
    return audio_data  # Return the audio data

# Main part of the application
st.set_page_config(page_title="Your Image to Audio Story", page_icon="🦜")
st.header("Turn Your Image into an Audio Story")
uploaded_file = st.file_uploader("Select an Image...", type=["jpg", "jpeg", "png"])  # File uploader for images

if uploaded_file is not None:
    # Open and read the uploaded image
    image = Image.open(uploaded_file)  # Use PIL to open the uploaded image
    st.image(image, caption="Uploaded Image", use_container_width=True)  # Display the uploaded image

    # Stage 1: Image to Text
    st.text('Processing image to text...')
    scenario = img2text(image)  # Get the caption for the uploaded image
    st.write("Caption:", scenario)  # Display the generated caption

    # Stage 2: Text to Story
    st.text('Generating a story...')
    story = text2story(scenario)  # Generate a story based on the caption
    st.write("Story:", story)  # Display the generated story

    # Stage 3: Story to Audio data
    st.text('Generating audio data...')  # Inform the user about the audio generation stage
    audio_data = text2audio(story)  # Convert the generated story into audio

    # Play button for the audio
    if st.button("Play Audio"):  # Create a button to play the audio
        st.audio(audio_data['audio'], format="audio/wav", start_time=0, sample_rate=audio_data['sampling_rate'])  # Play the audio
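
# Note: each pipeline above is rebuilt on every Streamlit rerun (including the
# "Play Audio" button press), which is slow. A minimal sketch, assuming
# Streamlit >= 1.18 (which provides st.cache_resource): wrap the pipeline
# construction in a cached loader so the model is created only once per
# session. The loader name below is illustrative, not part of the original app.
#
# @st.cache_resource
# def load_captioner():
#     return pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
#
# scenario = load_captioner()(image)[0]["generated_text"]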