# Import necessary libraries
import streamlit as st  # Streamlit for creating the web application
from transformers import pipeline  # Pipeline for using Hugging Face models
from PIL import Image  # PIL for image processing

# Function to load models
@st.cache_resource  # Build the pipelines once per server process, not on every Streamlit rerun
def load_models():
    """Create the three Hugging Face pipelines used by the app.

    Streamlit re-executes the script on every user interaction, so the
    function is wrapped in ``st.cache_resource``: the pipelines are
    constructed on the first call and the same objects are handed back on
    later calls instead of being rebuilt each time.

    Returns:
        A ``(caption_model, story_model, tts_model)`` tuple holding the
        image-to-text, text-generation and text-to-speech pipelines.
    """
    # Image captioning pipeline
    caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
    # Text generation pipeline used to extend the caption into a story
    story_model = pipeline("text-generation", model="gpt2")
    # Text-to-speech pipeline used to narrate the story
    tts_model = pipeline("text-to-speech", model="suno/bark")
    return caption_model, story_model, tts_model

# Function to generate story from caption
def generate_story(caption, story_model):
    """Extend a caption into a short story.

    Args:
        caption: Prompt text handed to the generator.
        story_model: Callable invoked as
            ``story_model(caption, max_length=100, num_return_sequences=1)``;
            it must return a sequence whose first item has a
            ``'generated_text'`` entry.

    Returns:
        The ``'generated_text'`` value of the first returned sequence.
    """
    # Ask for exactly one continuation of the prompt
    candidates = story_model(caption, max_length=100, num_return_sequences=1)
    # Only the first (and only requested) candidate is used
    first_candidate = candidates[0]
    return first_candidate['generated_text']

# Function to convert text to audio
def text_to_audio(text, tts_model):
    """Run the text-to-speech model on ``text``.

    Args:
        text: The text to synthesize.
        tts_model: Callable invoked as ``tts_model(text)``.

    Returns:
        Whatever ``tts_model`` returns, passed through unchanged.
    """
    return tts_model(text)  # No post-processing: hand back the model output as-is

# Function to process the uploaded image and generate a story
def process_image(image, caption_model, story_model):
    """Caption an image, then turn that caption into a story.

    Args:
        image: The image passed straight to ``caption_model``.
        caption_model: Callable invoked as ``caption_model(image)``; it must
            return a sequence whose first item has a ``'generated_text'``
            entry.
        story_model: Text generator forwarded to ``generate_story``.

    Returns:
        A ``(caption, story)`` tuple.
    """
    # The first result's generated text is the caption
    caption = caption_model(image)[0]['generated_text']
    # The caption doubles as the prompt for the story generator
    return caption, generate_story(caption, story_model)

# Main part
def main():
    """Render the Streamlit page: upload an image, show caption, story and audio.

    Flow: configure the page, show the uploader, obtain the models, and, once
    a file has been uploaded, display the image, its generated caption, the
    generated story, and an audio player narrating the story.
    """
    st.set_page_config(page_title="Storytelling Friend", page_icon="🦦")  # Page title and icon
    st.write("Upload an image to generate a story!")  # Instructions for the user

    # Upload image section
    uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])

    # Obtain the captioning, story and text-to-speech models
    caption_model, story_model, tts_model = load_models()

    # Nothing more to render until the user has provided an image
    if uploaded_file is None:
        return

    # Open and display the uploaded image
    image = Image.open(uploaded_file)
    st.image(image, caption="Uploaded Image", use_container_width=True)

    # Caption the image and generate the story
    caption, story = process_image(image, caption_model, story_model)
    st.subheader("Generated Caption:")
    st.write(caption)
    st.subheader("Generated Story:")
    st.write(story)

    # Convert the story to audio and play it.
    # The text-to-speech pipeline returns a dict holding the raw waveform
    # ("audio") and its sample rate ("sampling_rate"); st.audio cannot play
    # the dict itself, so pass the waveform array together with its rate.
    audio = text_to_audio(story, tts_model)
    st.audio(audio["audio"], sample_rate=audio["sampling_rate"])

# Run the app
if __name__ == "__main__":
    main()  # Only run the app when this file is executed as a script, not when it is imported