Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -7,14 +7,32 @@ import soundfile as sf
|
|
7 |
import torch
|
8 |
import os
|
9 |
import tempfile
|
|
|
10 |
|
11 |
# Set page title and instructions
|
12 |
-
st.title("Story Video Sound Effect Generator")
|
13 |
-
st.write("Upload an MP4 video to
|
14 |
|
15 |
# User-configurable settings
|
16 |
-
num_frames_to_extract = st.slider("Number of frames to analyze", 1,
|
17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
|
19 |
# File uploader for video
|
20 |
uploaded_file = st.file_uploader("Upload an MP4 video (high resolution)", type=["mp4"])
|
@@ -39,7 +57,7 @@ if uploaded_file is not None:
|
|
39 |
Image.fromarray(video.get_data(i))
|
40 |
for i in range(0, min(total_frames, num_frames_to_extract * step), step)
|
41 |
][:num_frames_to_extract]
|
42 |
-
progress_bar.progress(
|
43 |
|
44 |
# Load BLIP model
|
45 |
@st.cache_resource
|
@@ -61,16 +79,9 @@ if uploaded_file is not None:
|
|
61 |
inputs = {k: v.to("cuda") for k, v in inputs.items()}
|
62 |
out = model.generate(**inputs)
|
63 |
base_description = processor.decode(out[0], skip_special_tokens=True)
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
enhanced = f"{base_description} with ambient sounds like wind or rain"
|
68 |
-
elif sound_type == "Action (e.g., footsteps, crashes)":
|
69 |
-
enhanced = f"{base_description} with action sounds like footsteps or crashes"
|
70 |
-
else: # Dialogue
|
71 |
-
enhanced = f"{base_description} with background dialogue or crowd noise"
|
72 |
-
descriptions.append(enhanced)
|
73 |
-
progress_bar.progress(25 + int(25 * (i + 1) / len(frames)))
|
74 |
|
75 |
text_prompt = ". ".join(descriptions)
|
76 |
st.write("Enhanced text prompt:", text_prompt)
|
@@ -86,7 +97,7 @@ if uploaded_file is not None:
|
|
86 |
|
87 |
musicgen_processor, musicgen_model = load_musicgen_model()
|
88 |
|
89 |
-
# Generate sound effect (~
|
90 |
status_text.text("Generating sound effect...")
|
91 |
inputs = musicgen_processor(
|
92 |
text=[text_prompt],
|
@@ -95,46 +106,79 @@ if uploaded_file is not None:
|
|
95 |
)
|
96 |
if torch.cuda.is_available():
|
97 |
inputs = {k: v.to("cuda") for k, v in inputs.items()}
|
98 |
-
|
99 |
-
|
100 |
-
|
|
|
|
|
|
|
|
|
|
|
101 |
audio_array = audio_values[0].cpu().numpy()
|
102 |
if audio_array.ndim > 1:
|
103 |
audio_array = audio_array.flatten()
|
104 |
-
|
105 |
-
|
106 |
-
audio_array = audio_array / np.max(np.abs(audio_array)) * 0.9 # Normalize and slightly reduce clipping
|
107 |
-
audio_array = np.clip(audio_array, -1.0, 1.0) # Ensure bounds
|
108 |
sample_rate = 32000
|
109 |
-
progress_bar.progress(
|
110 |
|
111 |
-
# Save audio
|
112 |
-
status_text.text("Saving audio...")
|
113 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
|
114 |
sf.write(temp_audio.name, audio_array, sample_rate)
|
115 |
temp_audio_path = temp_audio.name
|
116 |
|
117 |
-
#
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
|
|
|
|
|
|
|
|
129 |
else:
|
130 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
131 |
|
132 |
except Exception as e:
|
133 |
st.error(f"An error occurred: {str(e)}")
|
134 |
-
st.write("Try
|
135 |
|
136 |
finally:
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
|
|
7 |
import torch
|
8 |
import os
|
9 |
import tempfile
|
10 |
+
from moviepy.editor import VideoFileClip, AudioFileClip, concatenate_audioclips
|
11 |
|
12 |
# Set page title and instructions
|
13 |
+
st.title("Story Video Sound Effect Sync Generator")
|
14 |
+
st.write("Upload an MP4 video to auto-generate and sync a high-quality sound effect.")
|
15 |
|
16 |
# User-configurable settings
|
17 |
+
num_frames_to_extract = st.slider("Number of frames to analyze", 1, 3, 1, help="Fewer frames = faster processing")
|
18 |
+
mix_original_audio = st.checkbox("Mix with original audio", value=False, help="Blend sound effect with video’s original sound")
|
19 |
+
|
20 |
+
# Prompt enhancement function
|
21 |
+
# Keyword stems checked in priority order; the first rule with a hit wins.
_SOUND_RULES = (
    (("walk", "run"), "crisp footsteps on a wooden floor"),
    (("car", "drive"), "the roar of an engine and tires screeching"),
    (("talk", "person"), "soft voices and background crowd murmur"),
    (("wind", "tree", "forest"), "gentle wind rustling through leaves"),
    (("crash", "fall"), "a loud crash and debris scattering"),
)
_DEFAULT_SOUND = "subtle ambient hum and faint echoes"
_PLAIN_SUFFIXES = ("", "s", "es", "ed", "ing", "er", "ers")
_STEM_CHANGING_SUFFIXES = ("ed", "ing", "er", "ers")


def _is_form_of(word, stem):
    """Return True if *word* is *stem* or a simple English inflection of it."""
    if word.startswith(stem):
        tail = word[len(stem):]
        if tail in _PLAIN_SUFFIXES:
            return True
        # Doubled final consonant: "run" -> "running", "runner".
        if tail[0] == stem[-1] and tail[1:] in _STEM_CHANGING_SUFFIXES:
            return True
    # Dropped silent "e": "drive" -> "driving", "driver".
    if stem.endswith("e") and word.startswith(stem[:-1]):
        return word[len(stem) - 1:] in _STEM_CHANGING_SUFFIXES
    return False


def enhance_prompt(base_description):
    """Enhance a caption with sound-specific details.

    The caption is lower-cased and scanned word by word against a
    prioritised list of keyword stems. The first rule whose stem (or a
    simple inflection such as "walking", "cars", "running") appears as a
    whole word decides which sound phrase is appended.

    Matching is done on whole words rather than raw substrings so that
    unrelated words do not trigger a rule: "street" no longer counts as
    "tree", "window" as "wind", or "scarf" as "car".

    Args:
        base_description: Caption text to enhance (a string).

    Returns:
        The lower-cased caption followed by " with <sound phrase>". When no
        rule matches, a generic ambient phrase is used.
    """
    base = base_description.lower()
    # Split on anything that is not a letter so punctuation and hyphens
    # cannot glue a keyword to a neighbouring token.
    words = "".join(ch if ch.isalpha() else " " for ch in base).split()
    for stems, sound in _SOUND_RULES:
        if any(_is_form_of(word, stem) for word in words for stem in stems):
            return f"{base} with {sound}"
    return f"{base} with {_DEFAULT_SOUND}"
|
36 |
|
37 |
# File uploader for video
|
38 |
uploaded_file = st.file_uploader("Upload an MP4 video (high resolution)", type=["mp4"])
|
|
|
57 |
Image.fromarray(video.get_data(i))
|
58 |
for i in range(0, min(total_frames, num_frames_to_extract * step), step)
|
59 |
][:num_frames_to_extract]
|
60 |
+
progress_bar.progress(20)
|
61 |
|
62 |
# Load BLIP model
|
63 |
@st.cache_resource
|
|
|
79 |
inputs = {k: v.to("cuda") for k, v in inputs.items()}
|
80 |
out = model.generate(**inputs)
|
81 |
base_description = processor.decode(out[0], skip_special_tokens=True)
|
82 |
+
enhanced_description = enhance_prompt(base_description)
|
83 |
+
descriptions.append(enhanced_description)
|
84 |
+
progress_bar.progress(20 + int(30 * (i + 1) / len(frames)))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
85 |
|
86 |
text_prompt = ". ".join(descriptions)
|
87 |
st.write("Enhanced text prompt:", text_prompt)
|
|
|
97 |
|
98 |
musicgen_processor, musicgen_model = load_musicgen_model()
|
99 |
|
100 |
+
# Generate sound effect (~8 seconds)
|
101 |
status_text.text("Generating sound effect...")
|
102 |
inputs = musicgen_processor(
|
103 |
text=[text_prompt],
|
|
|
106 |
)
|
107 |
if torch.cuda.is_available():
|
108 |
inputs = {k: v.to("cuda") for k, v in inputs.items()}
|
109 |
+
audio_values = musicgen_model.generate(
|
110 |
+
**inputs,
|
111 |
+
max_new_tokens=256,
|
112 |
+
do_sample=True,
|
113 |
+
guidance_scale=3.0,
|
114 |
+
top_k=50,
|
115 |
+
top_p=0.95
|
116 |
+
)
|
117 |
audio_array = audio_values[0].cpu().numpy()
|
118 |
if audio_array.ndim > 1:
|
119 |
audio_array = audio_array.flatten()
|
120 |
+
audio_array = audio_array / np.max(np.abs(audio_array)) * 0.9
|
121 |
+
audio_array = np.clip(audio_array, -1.0, 1.0)
|
|
|
|
|
122 |
sample_rate = 32000
|
123 |
+
progress_bar.progress(60)
|
124 |
|
125 |
+
# Save temporary audio
|
|
|
126 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
|
127 |
sf.write(temp_audio.name, audio_array, sample_rate)
|
128 |
temp_audio_path = temp_audio.name
|
129 |
|
130 |
+
# Synchronize with video
|
131 |
+
status_text.text("Syncing audio with video...")
|
132 |
+
video_clip = VideoFileClip(temp_video_path)
|
133 |
+
video_duration = video_clip.duration
|
134 |
+
audio_clip = AudioFileClip(temp_audio_path)
|
135 |
+
|
136 |
+
# Adjust audio length
|
137 |
+
if audio_clip.duration < video_duration:
|
138 |
+
loops_needed = int(np.ceil(video_duration / audio_clip.duration))
|
139 |
+
audio_clip = concatenate_audioclips([audio_clip] * loops_needed).subclip(0, video_duration)
|
140 |
+
else:
|
141 |
+
audio_clip = audio_clip.subclip(0, video_duration)
|
142 |
+
|
143 |
+
# Mix or replace audio
|
144 |
+
if mix_original_audio and video_clip.audio:
|
145 |
+
final_audio = video_clip.audio.volumex(0.5) + audio_clip.volumex(0.5)
|
146 |
else:
|
147 |
+
final_audio = audio_clip
|
148 |
+
|
149 |
+
# Set audio to video
|
150 |
+
final_video = video_clip.set_audio(final_audio)
|
151 |
+
|
152 |
+
# Save final video with faster preset
|
153 |
+
output_path = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
|
154 |
+
final_video.write_videofile(
|
155 |
+
output_path,
|
156 |
+
codec="libx264",
|
157 |
+
audio_codec="aac",
|
158 |
+
preset="ultrafast",
|
159 |
+
temp_audiofile="temp-audio.m4a",
|
160 |
+
remove_temp=True
|
161 |
+
)
|
162 |
+
progress_bar.progress(90)
|
163 |
+
|
164 |
+
# Provide playback and download
|
165 |
+
status_text.text("Done!")
|
166 |
+
st.video(output_path)
|
167 |
+
with open(output_path, "rb") as video_file:
|
168 |
+
st.download_button(
|
169 |
+
label="Download Synced Video",
|
170 |
+
data=video_file,
|
171 |
+
file_name="synced_story_video.mp4",
|
172 |
+
mime="video/mp4"
|
173 |
+
)
|
174 |
+
progress_bar.progress(100)
|
175 |
|
176 |
except Exception as e:
|
177 |
st.error(f"An error occurred: {str(e)}")
|
178 |
+
st.write("Try reducing frames or uploading a smaller video.")
|
179 |
|
180 |
finally:
|
181 |
+
# Clean up
|
182 |
+
# Look each name up instead of referencing it directly: if the pipeline
# failed early, the later temp paths were never assigned, and building a
# list of them eagerly would raise NameError from inside ``finally``.
for path_name in ("temp_video_path", "temp_audio_path", "output_path"):
    path = locals().get(path_name)
    # Skip names that were never bound (or are empty) and files already gone.
    if path and os.path.exists(path):
        os.remove(path)
|