Spaces:
Running
Running
File size: 4,082 Bytes
4a3b4d9 b48b44d 4a3b4d9 b48b44d 4a3b4d9 e00cf23 b48b44d e00cf23 b48b44d e00cf23 b48b44d e00cf23 b48b44d 4a3b4d9 b48b44d e00cf23 b48b44d b9f8827 e00cf23 b48b44d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
import io
import os
import tempfile

import imageio
import numpy as np
import soundfile as sf
import streamlit as st
import torch
from PIL import Image
from transformers import AutoProcessor, BlipForConditionalGeneration, MusicgenForConditionalGeneration
# Heading displayed at the top of the app page.
st.title("Video Sound Effect Generator")

# Upload widget restricted to MP4 files; the rest of the script only runs
# once this holds a file (it is compared against None below).
uploaded_file = st.file_uploader(
    label="Upload a short video (MP4, max 10 seconds, high resolution)",
    type=["mp4"],
)
# Longest accepted clip (seconds) and number of frames captioned per clip.
MAX_VIDEO_SECONDS = 10
NUM_FRAMES_TO_EXTRACT = 10


@st.cache_resource
def load_blip_model():
    """Return the BLIP captioning (processor, model) pair.

    Wrapped in ``st.cache_resource`` so script reruns reuse the loaded
    objects instead of reloading the weights.
    """
    processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
    return processor, model


@st.cache_resource
def load_musicgen_model():
    """Return the MusicGen (processor, model) pair, cached across reruns."""
    processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
    model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
    return processor, model


def _extract_frames(video_path, max_seconds, num_frames_to_extract):
    """Return evenly spaced frames of the video as PIL images.

    Args:
        video_path: Path of the video file to read.
        max_seconds: Longest accepted duration in seconds.
        num_frames_to_extract: Upper bound on the number of frames returned.

    Returns:
        A list of ``PIL.Image`` frames, or ``None`` when the video is longer
        than ``max_seconds``.

    Raises:
        ValueError: If the video reports no frame rate or has no frames.
    """
    video = imageio.get_reader(video_path)
    try:
        fps = video.get_meta_data().get("fps")
        # Count frames one at a time; materialising them in a list would hold
        # the whole decoded video in memory.
        num_frames = sum(1 for _ in video.iter_data())
        if not fps or num_frames == 0:
            raise ValueError("The video has no readable frames or frame rate.")

        if num_frames / fps > max_seconds:
            return None

        # linspace covers the clip from first to last frame; a fixed integer
        # step truncated to N items can miss the tail of the video.
        indices = np.linspace(
            0,
            num_frames - 1,
            num=min(num_frames_to_extract, num_frames),
            dtype=int,
        )
        return [Image.fromarray(video.get_data(int(i))) for i in indices]
    finally:
        # Release the reader so the temporary file can be deleted afterwards.
        video.close()


def _describe_frames(frames):
    """Return one BLIP caption string per frame, in order."""
    processor, model = load_blip_model()
    descriptions = []
    for frame in frames:
        inputs = processor(images=frame, return_tensors="pt")
        out = model.generate(**inputs)
        descriptions.append(processor.decode(out[0], skip_special_tokens=True))
    return descriptions


def _generate_sound_effect(text_prompt):
    """Generate audio for the prompt with MusicGen and return it as WAV bytes."""
    musicgen_processor, musicgen_model = load_musicgen_model()
    inputs = musicgen_processor(
        text=[text_prompt],
        padding=True,
        return_tensors="pt",
    )
    audio_values = musicgen_model.generate(**inputs, max_new_tokens=512)
    # Per the transformers MusicGen docs the output is (batch, channels,
    # samples); soundfile expects (samples, channels), hence the transpose.
    audio_array = audio_values[0].cpu().numpy().T
    sample_rate = musicgen_model.config.audio_encoder.sampling_rate

    # Encode in memory so concurrent sessions never share an output file.
    buffer = io.BytesIO()
    sf.write(buffer, audio_array, sample_rate, format="WAV")
    return buffer.getvalue()


if uploaded_file is not None:
    video_path = None
    try:
        # Save the upload to a uniquely named temporary file so that
        # simultaneous sessions do not overwrite each other's video.
        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
            video_path = tmp.name
            tmp.write(uploaded_file.getbuffer())

        frames = _extract_frames(video_path, MAX_VIDEO_SECONDS, NUM_FRAMES_TO_EXTRACT)
        if frames is None:
            st.error(
                f"Video is too long. Please upload a video of maximum {MAX_VIDEO_SECONDS} seconds."
            )
        else:
            st.success("Video uploaded successfully!")

            # Combine the per-frame captions into a single MusicGen prompt.
            text_prompt = ". ".join(_describe_frames(frames))
            st.write("Generated text prompt:", text_prompt)

            wav_bytes = _generate_sound_effect(text_prompt)

            # Provide audio playback and download options.
            st.audio(wav_bytes, format="audio/wav")
            st.download_button(
                label="Download Sound Effect",
                data=wav_bytes,
                file_name="sound_effect.wav",
                mime="audio/wav",
            )
    except Exception as e:
        # Top-level UI boundary: report the failure instead of crashing the page.
        st.error(f"An error occurred: {str(e)}")
        st.write("Please try uploading a different video or check your connection.")
    finally:
        # Clean up the temporary video file.
        if video_path is not None and os.path.exists(video_path):
            os.remove(video_path)
|