File size: 1,781 Bytes
8d2615a
5866ba2
 
67f1091
 
5866ba2
8d2615a
 
5866ba2
67f1091
5866ba2
 
 
8d2615a
5866ba2
bcc6534
67f1091
 
5866ba2
8d2615a
5866ba2
67f1091
 
 
 
 
 
 
 
5866ba2
8d2615a
67f1091
5866ba2
 
 
 
 
 
 
 
67f1091
8d2615a
67f1091
5866ba2
 
 
 
67f1091
5866ba2
 
 
 
67f1091
5866ba2
67f1091
5866ba2
 
 
67f1091
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# import part
import hashlib
import tempfile
from pathlib import Path

import numpy as np
import soundfile as sf
import streamlit as st
from transformers import pipeline

# function part
# img2text
def img2text(url):
    """Return a generated caption for the image referenced by *url*.

    Builds an ``image-to-text`` pipeline backed by
    ``Salesforce/blip-image-captioning-base``, runs it on *url* and returns
    the ``generated_text`` field of the first result.

    Args:
        url: Image reference handed straight to the pipeline. The script in
            this file passes the name of a local image file.

    Returns:
        The caption text produced for the image.
    """
    captioner = pipeline(
        "image-to-text",
        model="Salesforce/blip-image-captioning-base",
    )
    first_result = captioner(url)[0]
    return first_result["generated_text"]

# text2story
def text2story(text):
    """Expand *text* into a short story with a text-generation model.

    Builds a ``text-generation`` pipeline backed by
    ``meta-llama/Llama-3.1-8B``, calls it on *text* with ``max_length=150``
    and returns the ``generated_text`` field of the first result.

    Args:
        text: Prompt to continue; the script in this file passes the image
            caption.

    Returns:
        The generated story text.
    """
    generator = pipeline("text-generation", model="meta-llama/Llama-3.1-8B")
    outputs = generator(text, max_length=150)
    return outputs[0]["generated_text"]

# text2audio
def text2audio(story_text, model_id="facebook/mms-tts-eng"):
    """Synthesize speech for *story_text* and save it as a WAV file.

    Args:
        story_text: Text to be spoken.
        model_id: Hugging Face Hub id (or local path) of a checkpoint usable
            with the ``text-to-speech`` pipeline. The previous hard-coded
            value, ``tts_models/en/ljspeech/tacotron2``, is a Coqui-TTS model
            name rather than a ``namespace/name`` Hub id, so the pipeline
            could not load it from the Hub.

    Returns:
        The name of the WAV file that was written (``"story_audio.wav"``,
        relative to the current working directory).
    """
    tts_model = pipeline("text-to-speech", model=model_id)
    audio_data = tts_model(story_text)

    # soundfile interprets a 2-D array as (frames, channels). Dropping any
    # length-1 axis keeps a mono waveform that arrives with a leading
    # singleton axis from being written as one frame with many channels.
    waveform = np.squeeze(np.asarray(audio_data["audio"]))

    # Save audio to a file
    audio_filename = "story_audio.wav"
    sf.write(audio_filename, waveform, audio_data["sampling_rate"])

    return audio_filename

# main part
st.set_page_config(page_title="Your Image to Audio Story", page_icon="🦜")
st.header("Turn Your Image to Audio Story")
uploaded_file = st.file_uploader("Select an Image...")

if uploaded_file is not None:
    bytes_data = uploaded_file.getvalue()

    st.image(uploaded_file, caption="Uploaded Image", use_column_width=True)

    # Streamlit re-executes this script from the top on every widget
    # interaction, including the "Play Audio" click below. Stage results are
    # therefore kept in session state, keyed by a digest of the uploaded
    # bytes, so a rerun for the same image does not repeat the model calls.
    upload_digest = hashlib.sha256(bytes_data).hexdigest()
    results = st.session_state.get("story_results")
    if results is None or results.get("digest") != upload_digest:
        results = {"digest": upload_digest}
        st.session_state["story_results"] = results

    # Stage 1: Image to Text
    st.text('Processing img2text...')
    if "scenario" not in results:
        # Write the upload to a private temporary file instead of a file
        # named after the client-supplied name in the working directory,
        # which could overwrite an existing file there.
        suffix = Path(uploaded_file.name).suffix
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
            tmp.write(bytes_data)
        try:
            results["scenario"] = img2text(tmp.name)
        finally:
            Path(tmp.name).unlink(missing_ok=True)
    scenario = results["scenario"]
    st.write(scenario)

    # Stage 2: Text to Story
    st.text('Generating a story...')
    if "story" not in results:
        results["story"] = text2story(scenario)
    story = results["story"]
    st.write(story)

    # Stage 3: Story to Audio data
    st.text('Generating audio data...')
    if "audio" not in results:
        # Keep the WAV bytes in this session so that playback does not
        # depend on the file on disk still holding this session's audio.
        audio_filename = text2audio(story)
        results["audio"] = Path(audio_filename).read_bytes()

    # Play button
    if st.button("Play Audio"):
        st.audio(results["audio"], format="audio/wav")