"""Streamlit app that captions an uploaded image, writes a story, and narrates it."""

import os
import tempfile

import streamlit as st
from gtts import gTTS
from PIL import Image
from transformers import pipeline

CAPTION_MODEL = "Salesforce/blip-image-captioning-base"
STORY_MODEL = "databricks/dolly-v2-3b"
MAX_STORY_TOKENS = 300
NO_CAPTION_MESSAGE = "No caption generated."
NO_STORY_MESSAGE = "No story generated."


# Load models
@st.cache_resource
def load_models():
    """Build the captioning and story-generation pipelines.

    Streamlit re-executes the script on every widget interaction, so the
    pipelines are cached as a shared resource instead of being rebuilt
    on each rerun.

    Returns:
        A ``(image_to_text, storyteller)`` tuple of pipeline callables.
    """
    image_to_text = pipeline("image-to-text", model=CAPTION_MODEL)
    storyteller = pipeline(
        "text-generation",
        model=STORY_MODEL,
        max_new_tokens=MAX_STORY_TOKENS,
    )
    return image_to_text, storyteller


# Process image to text
def generate_caption(image, image_to_text):
    """Return the caption produced by ``image_to_text`` for ``image``.

    Args:
        image: The image to describe, passed straight to the pipeline.
        image_to_text: Callable returning a list of dicts that carry a
            ``"generated_text"`` key.

    Returns:
        The first generated caption, or a fallback message when the
        pipeline returns nothing.
    """
    result = image_to_text(image)
    return result[0]["generated_text"] if result else NO_CAPTION_MESSAGE


# Generate a narrative story
def generate_story(text, storyteller):
    """Return a short story inspired by the image description ``text``.

    Args:
        text: Image description used to seed the story prompt.
        storyteller: Callable taking the prompt plus generation keyword
            arguments and returning a list of dicts that carry a
            ``"generated_text"`` key.

    Returns:
        The generated story without the instruction prompt, or a fallback
        message when nothing usable was generated.
    """
    prompt = f"Write a short and engaging story inspired by this image description: {text}"
    outputs = storyteller(
        prompt,
        do_sample=True,
        temperature=0.7,
        max_new_tokens=MAX_STORY_TOKENS,
    )
    if not outputs:
        return NO_STORY_MESSAGE

    generated = outputs[0]["generated_text"]
    # Text-generation pipelines echo the prompt ahead of the continuation by
    # default; drop it so the instruction is not displayed or read aloud.
    if generated.startswith(prompt):
        generated = generated[len(prompt):]
    generated = generated.strip()
    return generated or NO_STORY_MESSAGE


# Convert text to speech
def text_to_speech(text, filename="output.mp3"):
    """Synthesize ``text`` with gTTS, save it to ``filename``, and return the path."""
    tts = gTTS(text)
    tts.save(filename)
    return filename


def _speech_bytes(text):
    """Return MP3 bytes for ``text`` using a private temporary file.

    A per-call temporary file keeps concurrent sessions from overwriting each
    other's audio and avoids leaving files behind in the working directory.
    """
    fd, path = tempfile.mkstemp(suffix=".mp3")
    os.close(fd)  # gTTS reopens the path itself; only the unique name is needed.
    try:
        text_to_speech(text, path)
        with open(path, "rb") as audio:
            return audio.read()
    finally:
        os.remove(path)


# Main Streamlit app
def main():
    """Render the page: upload an image, then show caption, story, and audio."""
    st.title("AI-Powered Image Captioning and Storytelling")
    image_to_text, storyteller = load_models()

    uploaded_file = st.file_uploader("Upload an image...", type=["jpg", "png", "jpeg"])
    if uploaded_file is None:
        return

    try:
        image = Image.open(uploaded_file)
    except OSError:
        # Pillow's "cannot identify image" error is an OSError subclass.
        st.error("The uploaded file could not be read as an image.")
        return
    st.image(image, caption="Uploaded Image", use_container_width=True)

    with st.spinner("Generating caption..."):
        caption = generate_caption(image, image_to_text)
    st.write("### Image Caption:")
    st.write(caption)

    with st.spinner("Generating story..."):
        story = generate_story(caption, storyteller)
    st.write("### Generated Story:")
    st.write(story)

    with st.spinner("Generating speech..."):
        audio_bytes = _speech_bytes(story)
    st.audio(audio_bytes, format="audio/mp3")


if __name__ == "__main__":
    main()