File size: 2,006 Bytes
8d2615a
5866ba2
 
 
3fceda9
5866ba2
8d2615a
 
5866ba2
8d2615a
 
5866ba2
 
 
8d2615a
5866ba2
42facc3
5350c35
 
5866ba2
 
8d2615a
5866ba2
3fceda9
 
 
 
5866ba2
 
8d2615a
 
5350c35
5866ba2
8d2615a
 
5866ba2
 
8d2615a
5866ba2
 
 
 
 
 
8d2615a
 
 
5350c35
5866ba2
 
 
 
5350c35
5866ba2
 
 
 
5350c35
5866ba2
3fceda9
5866ba2
 
 
ef7790c
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# import part
import streamlit as st
from transformers import pipeline
from gtts import gTTS
import io

# function part
# img2text
def img2text(url):
    """Generate a caption for an image with the BLIP captioning model.

    Args:
        url: Image reference handed straight to the ``image-to-text``
            pipeline (the script below passes a local file name).

    Returns:
        The ``generated_text`` field of the first pipeline result.
    """
    # The pipeline is constructed on every call.
    captioner = pipeline(
        "image-to-text",
        model="Salesforce/blip-image-captioning-base",
    )
    first_result = captioner(url)[0]
    return first_result["generated_text"]

# text2story
def text2story(text):
    """Expand a short prompt into a story with a text-generation pipeline.

    Args:
        text: Prompt passed to the text-generation pipeline.

    Returns:
        The ``generated_text`` field of the first pipeline result.
    """
    # The pipeline is constructed on every call; ``trust_remote_code=True``
    # is forwarded to the pipeline factory for this model.
    generator = pipeline(
        "text-generation",
        model="perplexity-ai/r1-1776",
        trust_remote_code=True,
    )
    outputs = generator(text, max_length=200)
    return outputs[0]['generated_text']

# text2audio
def text2audio(story_text):
    """Synthesize English speech for *story_text* into an in-memory buffer.

    Args:
        story_text: Text handed to gTTS.

    Returns:
        An ``io.BytesIO`` holding the audio written by gTTS, rewound to
        position 0 so it can be read from the start.
    """
    buffer = io.BytesIO()
    gTTS(text=story_text, lang='en').write_to_fp(buffer)
    # Rewind: write_to_fp leaves the cursor at the end of the data.
    buffer.seek(0)
    return buffer

# main part
st.set_page_config(page_title="Your Image to Audio Story",
                   page_icon="🦜")  # prepare configuration
st.header("Turn Your Image to Audio Story")

# Upload image
uploaded_file = st.file_uploader("Select an Image...")

# Nothing below runs until the user has supplied a file.
if uploaded_file is not None:
    print(uploaded_file)

    # Persist the upload to disk: img2text is called with a file name,
    # not with the in-memory upload object.
    bytes_data = uploaded_file.getvalue()
    with open(uploaded_file.name, "wb") as file:
        file.write(bytes_data)

    st.image(uploaded_file, caption="Uploaded Image",
             use_column_width=True)

    # Stage 1: Image to Text
    st.text('Processing img2text...')
    scenario = img2text(uploaded_file.name)
    st.write(scenario)

    # Stage 2: Text to Story
    st.text('Generating a story...')
    story = text2story(scenario)
    st.write(story)

    # Stage 3: Story to Audio data
    st.text('Generating audio data...')
    audio_data = text2audio(story)

    # Play button
    if st.button("Play Audio"):
        # text2audio returns a rewound io.BytesIO, not a dict, so the buffer
        # is passed directly. gTTS output is MP3, hence the MIME type.
        st.audio(audio_data, format="audio/mp3")