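# Dependencies (assumed): streamlit, transformers, Pillow, and a deep learning
# backend such as torch. The models below are downloaded from the Hugging Face
# Hub on first use. To launch (assuming this file is saved as app.py):
#   streamlit run app.py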
# Import necessary libraries
import streamlit as st  # Streamlit for creating the web application
from transformers import pipeline  # Pipeline for using Hugging Face models
from PIL import Image  # PIL for image processing

# Function to load models
@st.cache_resource  # Cache the pipelines so they are loaded only once per session, not on every rerun
def load_models():
    # Load the image-to-text (captioning) model
    caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
    # Load the text generation model
    story_model = pipeline("text-generation", model="gpt2")
    # Load the text-to-speech model
    tts_model = pipeline("text-to-speech", model="suno/bark")
    return caption_model, story_model, tts_model  # Return all three models

# Function to generate story from caption
def generate_story(caption, story_model):
    # Generate a story based on the caption
    story = story_model(caption, max_length=100, num_return_sequences=1)[0]['generated_text']  # Generate the story
    return story  # Return the generated story

# Function to convert text to audio
def text_to_audio(text, tts_model):
    audio = tts_model(text)  # The TTS pipeline returns a dict with the raw waveform ("audio") and its "sampling_rate"
    return audio  # Return the audio dict

# Function to process the uploaded image and generate a story
def process_image(image, caption_model, story_model):
    # Generate a caption from the uploaded image
    result = caption_model(image)  # Get the result from the model
    caption = result[0]['generated_text']  # Access the generated caption
    # Generate a story from the caption
    story = generate_story(caption, story_model)  # Call the story generation function
    return caption, story  # Return both caption and story

# Main part
def main():
    st.set_page_config(page_title="Storytelling Friend", page_icon="🦦")  # Configure the browser tab title and icon
    st.title("Storytelling Friend")  # Title of the application
    st.write("Upload an image to generate a story!")  # Instructions for the user

    # Upload image section
    uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])  # File uploader for images

    # Load models once
    caption_model, story_model, tts_model = load_models()  # Load models

    if uploaded_file is not None:
        # Open and read the uploaded image
        image = Image.open(uploaded_file)  # Open the uploaded image file
        st.image(image, caption="Uploaded Image", use_container_width=True)  # Display the uploaded image

        # Process the image and generate story
        caption, story = process_image(image, caption_model, story_model)  # Get caption and story
        st.subheader("Generated Caption:")  # Subheader for caption
        st.write(caption)  # Display the caption
        st.subheader("Generated Story:")  # Subheader for story
        st.write(story)  # Display the generated story

        # Convert story to audio and play it
        audio = text_to_audio(story, tts_model)  # Convert story to audio
        st.audio(audio["audio"], sample_rate=audio["sampling_rate"])  # Play the waveform at the model's sampling rate

# Run the app
if __name__ == "__main__":
    main()  # Call the main function to run the app