File size: 4,082 Bytes
4a3b4d9
b48b44d
4a3b4d9
b48b44d
 
 
4a3b4d9
 
e00cf23
b48b44d
 
e00cf23
b48b44d
 
 
 
 
e00cf23
b48b44d
e00cf23
b48b44d
 
4a3b4d9
b48b44d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e00cf23
b48b44d
 
 
b9f8827
e00cf23
b48b44d
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import os
import tempfile

import imageio
import numpy as np
import soundfile as sf
import streamlit as st
import torch
from PIL import Image
from transformers import AutoProcessor, BlipForConditionalGeneration, MusicgenForConditionalGeneration

# Heading shown at the top of the app page
st.title("Video Sound Effect Generator")

# Upload widget restricted to MP4 files; yields None until the user picks a file
uploaded_file = st.file_uploader(
    label="Upload a short video (MP4, max 10 seconds, high resolution)",
    type=["mp4"],
)

# Number of frames sampled from the clip and fed to the captioning model.
NUM_FRAMES_TO_EXTRACT = 10
# Longest clip (in seconds) the app accepts.
MAX_DURATION_SECONDS = 10


@st.cache_resource
def load_blip_model():
    """Load the BLIP image-captioning processor and model.

    Decorated with ``st.cache_resource`` so the weights are loaded once and
    reused across Streamlit reruns.

    Returns:
        A ``(processor, model)`` tuple.
    """
    processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
    return processor, model


@st.cache_resource
def load_musicgen_model():
    """Load the MusicGen processor and model.

    Decorated with ``st.cache_resource`` so the weights are loaded once and
    reused across Streamlit reruns.

    Returns:
        A ``(processor, model)`` tuple.
    """
    processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
    model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
    return processor, model


def _evenly_spaced_indices(total, count):
    """Return up to *count* distinct indices spread evenly over ``range(total)``.

    The first and last index are always included when ``total`` > 1, so the
    sample covers the whole clip rather than only its beginning.
    """
    if total <= 0 or count <= 0:
        return []
    # num <= total guarantees a spacing >= 1, so truncation to int cannot
    # produce duplicate indices.
    return np.linspace(0, total - 1, num=min(count, total), dtype=int).tolist()


def _load_sampled_frames(video_path):
    """Measure the clip at *video_path* and sample frames from it.

    Returns:
        ``(duration_seconds, frames)`` where ``frames`` is a list of PIL
        images. ``frames`` is empty when the clip exceeds
        ``MAX_DURATION_SECONDS`` (nothing is extracted in that case).

    Raises:
        ValueError: if the clip reports no frame rate or has no frames.
    """
    # The context manager closes the reader so the file can be deleted later.
    with imageio.get_reader(video_path) as video:
        fps = video.get_meta_data()["fps"]
        # Count frames one at a time instead of materialising them all in a
        # list, which would hold the entire decoded video in memory.
        num_frames = sum(1 for _ in video.iter_data())
        if not fps or num_frames == 0:
            raise ValueError("The video contains no readable frames.")

        duration = num_frames / fps
        if duration > MAX_DURATION_SECONDS:
            return duration, []

        frames = [
            Image.fromarray(video.get_data(index))
            for index in _evenly_spaced_indices(num_frames, NUM_FRAMES_TO_EXTRACT)
        ]
    return duration, frames


def _describe_frames(frames):
    """Return one BLIP-generated caption per frame, in order."""
    processor, model = load_blip_model()
    descriptions = []
    for frame in frames:
        inputs = processor(images=frame, return_tensors="pt")
        out = model.generate(**inputs)
        descriptions.append(processor.decode(out[0], skip_special_tokens=True))
    return descriptions


def _generate_audio(text_prompt):
    """Generate audio for *text_prompt* with MusicGen.

    Returns:
        ``(waveform, sample_rate)`` where ``waveform`` is a 1-D NumPy array.
    """
    musicgen_processor, musicgen_model = load_musicgen_model()
    inputs = musicgen_processor(
        text=[text_prompt],
        padding=True,
        return_tensors="pt",
    )
    audio_values = musicgen_model.generate(**inputs, max_new_tokens=512)
    # Per the transformers MusicGen docs, generate() returns
    # (batch, channels, samples). Select the first item AND its first channel
    # so soundfile receives a 1-D array; a (1, samples) array would be read as
    # one frame with `samples` channels.
    waveform = audio_values[0, 0].cpu().numpy()
    sample_rate = musicgen_model.config.audio_encoder.sampling_rate
    return waveform, sample_rate


if uploaded_file is not None:
    try:
        # A private temporary directory avoids file-name collisions between
        # concurrent sessions and is removed automatically on exit, even when
        # an exception is raised.
        with tempfile.TemporaryDirectory() as workdir:
            video_path = os.path.join(workdir, "temp_video.mp4")
            with open(video_path, "wb") as f:
                f.write(uploaded_file.getbuffer())

            duration, frames = _load_sampled_frames(video_path)

            if duration > MAX_DURATION_SECONDS:
                st.error(
                    f"Video is too long. Please upload a video of maximum {MAX_DURATION_SECONDS} seconds."
                )
            else:
                st.success("Video uploaded successfully!")

                # Combine the per-frame captions into a single prompt
                text_prompt = ". ".join(_describe_frames(frames))
                st.write("Generated text prompt:", text_prompt)

                # Generate the sound effect and encode it as WAV
                waveform, sample_rate = _generate_audio(text_prompt)
                audio_path = os.path.join(workdir, "output.wav")
                sf.write(audio_path, waveform, sample_rate)

                # Read the bytes back so the widgets do not depend on the
                # temporary file, which disappears when this block exits.
                with open(audio_path, "rb") as audio_file:
                    wav_bytes = audio_file.read()

                # Provide audio playback and download options
                st.audio(wav_bytes, format="audio/wav")
                st.download_button(
                    label="Download Sound Effect",
                    data=wav_bytes,
                    file_name="sound_effect.wav",
                    mime="audio/wav",
                )

    except Exception as e:
        # Top-level UI boundary: surface any failure to the user instead of
        # crashing the app.
        st.error(f"An error occurred: {str(e)}")
        st.write("Please try uploading a different video or check your connection.")