garyuzair committed on
Commit
21a92e3
·
verified ·
1 Parent(s): 8d12ec8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +88 -44
app.py CHANGED
@@ -7,14 +7,32 @@ import soundfile as sf
7
  import torch
8
  import os
9
  import tempfile
 
10
 
11
  # Set page title and instructions
12
- st.title("Story Video Sound Effect Generator")
13
- st.write("Upload an MP4 video to create high-quality sound effects for your story clips.")
14
 
15
  # User-configurable settings
16
- num_frames_to_extract = st.slider("Number of frames to analyze", 1, 5, 3, help="Fewer frames = faster processing")
17
- sound_type = st.selectbox("Sound effect type", ["Ambient (e.g., wind, rain)", "Action (e.g., footsteps, crashes)", "Dialogue (e.g., crowd murmur)"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
  # File uploader for video
20
  uploaded_file = st.file_uploader("Upload an MP4 video (high resolution)", type=["mp4"])
@@ -39,7 +57,7 @@ if uploaded_file is not None:
39
  Image.fromarray(video.get_data(i))
40
  for i in range(0, min(total_frames, num_frames_to_extract * step), step)
41
  ][:num_frames_to_extract]
42
- progress_bar.progress(25)
43
 
44
  # Load BLIP model
45
  @st.cache_resource
@@ -61,16 +79,9 @@ if uploaded_file is not None:
61
  inputs = {k: v.to("cuda") for k, v in inputs.items()}
62
  out = model.generate(**inputs)
63
  base_description = processor.decode(out[0], skip_special_tokens=True)
64
-
65
- # Enhance prompt based on sound type
66
- if sound_type == "Ambient (e.g., wind, rain)":
67
- enhanced = f"{base_description} with ambient sounds like wind or rain"
68
- elif sound_type == "Action (e.g., footsteps, crashes)":
69
- enhanced = f"{base_description} with action sounds like footsteps or crashes"
70
- else: # Dialogue
71
- enhanced = f"{base_description} with background dialogue or crowd noise"
72
- descriptions.append(enhanced)
73
- progress_bar.progress(25 + int(25 * (i + 1) / len(frames)))
74
 
75
  text_prompt = ". ".join(descriptions)
76
  st.write("Enhanced text prompt:", text_prompt)
@@ -86,7 +97,7 @@ if uploaded_file is not None:
86
 
87
  musicgen_processor, musicgen_model = load_musicgen_model()
88
 
89
- # Generate sound effect (~10 seconds)
90
  status_text.text("Generating sound effect...")
91
  inputs = musicgen_processor(
92
  text=[text_prompt],
@@ -95,46 +106,79 @@ if uploaded_file is not None:
95
  )
96
  if torch.cuda.is_available():
97
  inputs = {k: v.to("cuda") for k, v in inputs.items()}
98
-
99
- # Increase to 10 seconds (320 tokens at 32kHz)
100
- audio_values = musicgen_model.generate(**inputs, max_new_tokens=320, do_sample=True, guidance_scale=3.0)
 
 
 
 
 
101
  audio_array = audio_values[0].cpu().numpy()
102
  if audio_array.ndim > 1:
103
  audio_array = audio_array.flatten()
104
-
105
- # Enhance audio quality
106
- audio_array = audio_array / np.max(np.abs(audio_array)) * 0.9 # Normalize and slightly reduce clipping
107
- audio_array = np.clip(audio_array, -1.0, 1.0) # Ensure bounds
108
  sample_rate = 32000
109
- progress_bar.progress(75)
110
 
111
- # Save audio
112
- status_text.text("Saving audio...")
113
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
114
  sf.write(temp_audio.name, audio_array, sample_rate)
115
  temp_audio_path = temp_audio.name
116
 
117
- # Playback and download
118
- progress_bar.progress(100)
119
- status_text.text("Done!")
120
- if os.path.exists(temp_audio_path):
121
- st.audio(temp_audio_path, format="audio/wav")
122
- with open(temp_audio_path, "rb") as audio_file:
123
- st.download_button(
124
- label="Download Sound Effect",
125
- data=audio_file,
126
- file_name="story_sound_effect.wav",
127
- mime="audio/wav"
128
- )
 
 
 
 
129
  else:
130
- st.error("Failed to generate the audio file.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
  except Exception as e:
133
  st.error(f"An error occurred: {str(e)}")
134
- st.write("Try adjusting settings or uploading a different video.")
135
 
136
  finally:
137
- if 'temp_video_path' in locals() and os.path.exists(temp_video_path):
138
- os.remove(temp_video_path)
139
- if 'temp_audio_path' in locals() and os.path.exists(temp_audio_path):
140
- os.remove(temp_audio_path)
 
7
  import torch
8
  import os
9
  import tempfile
10
+ from moviepy.editor import VideoFileClip, AudioFileClip, concatenate_audioclips
11
 
12
  # Set page title and instructions
13
+ st.title("Story Video Sound Effect Sync Generator")
14
+ st.write("Upload an MP4 video to auto-generate and sync a high-quality sound effect.")
15
 
16
  # User-configurable settings
17
+ num_frames_to_extract = st.slider("Number of frames to analyze", 1, 3, 1, help="Fewer frames = faster processing")
18
+ mix_original_audio = st.checkbox("Mix with original audio", value=False, help="Blend sound effect with video’s original sound")
19
+
20
+ # Prompt enhancement function
21
# Keyword groups in priority order: the first group that shares a whole word
# with the caption decides which sound detail is appended to the prompt.
# Inflected forms are listed explicitly so that matching can be done on whole
# words; substring matching misfires on captions such as "street" (contains
# "tree"), "window" (contains "wind") or "scarf" (contains "car").
_SOUND_CUES = (
    (
        frozenset({"walk", "walks", "walked", "walking", "run", "runs", "running"}),
        "with crisp footsteps on a wooden floor",
    ),
    (
        frozenset({"car", "cars", "drive", "drives", "driving"}),
        "with the roar of an engine and tires screeching",
    ),
    (
        frozenset({"talk", "talks", "talking", "person", "persons"}),
        "with soft voices and background crowd murmur",
    ),
    (
        frozenset({"wind", "winds", "windy", "tree", "trees", "forest", "forests"}),
        "with gentle wind rustling through leaves",
    ),
    (
        frozenset({"crash", "crashes", "crashed", "crashing",
                   "fall", "falls", "falling", "fallen"}),
        "with a loud crash and debris scattering",
    ),
)

# Appended when no keyword group matches the caption.
_DEFAULT_SOUND_CUE = "with subtle ambient hum and faint echoes"


def enhance_prompt(base_description):
    """Enhance a caption with sound-specific details.

    The caption is lower-cased and split into words (any non-alphanumeric
    character acts as a separator). The keyword groups are then tried in
    priority order: movement, vehicles, people, nature, impacts. The cue of
    the first group containing one of the caption's words is appended.

    Args:
        base_description: Caption text describing a video frame.

    Returns:
        The lower-cased caption followed by a space and a sound cue; a generic
        ambient cue is used when no keyword group matches.
    """
    base = base_description.lower()
    # Tokenize on anything that is not a letter or digit so punctuation
    # attached to a word ("car," / "forest.") does not prevent a match.
    words = set("".join(ch if ch.isalnum() else " " for ch in base).split())
    for keywords, cue in _SOUND_CUES:
        if not keywords.isdisjoint(words):
            return f"{base} {cue}"
    return f"{base} {_DEFAULT_SOUND_CUE}"
36
 
37
  # File uploader for video
38
  uploaded_file = st.file_uploader("Upload an MP4 video (high resolution)", type=["mp4"])
 
57
  Image.fromarray(video.get_data(i))
58
  for i in range(0, min(total_frames, num_frames_to_extract * step), step)
59
  ][:num_frames_to_extract]
60
+ progress_bar.progress(20)
61
 
62
  # Load BLIP model
63
  @st.cache_resource
 
79
  inputs = {k: v.to("cuda") for k, v in inputs.items()}
80
  out = model.generate(**inputs)
81
  base_description = processor.decode(out[0], skip_special_tokens=True)
82
+ enhanced_description = enhance_prompt(base_description)
83
+ descriptions.append(enhanced_description)
84
+ progress_bar.progress(20 + int(30 * (i + 1) / len(frames)))
 
 
 
 
 
 
 
85
 
86
  text_prompt = ". ".join(descriptions)
87
  st.write("Enhanced text prompt:", text_prompt)
 
97
 
98
  musicgen_processor, musicgen_model = load_musicgen_model()
99
 
100
+ # Generate sound effect (~8 seconds)
101
  status_text.text("Generating sound effect...")
102
  inputs = musicgen_processor(
103
  text=[text_prompt],
 
106
  )
107
  if torch.cuda.is_available():
108
  inputs = {k: v.to("cuda") for k, v in inputs.items()}
109
+ audio_values = musicgen_model.generate(
110
+ **inputs,
111
+ max_new_tokens=256,
112
+ do_sample=True,
113
+ guidance_scale=3.0,
114
+ top_k=50,
115
+ top_p=0.95
116
+ )
117
  audio_array = audio_values[0].cpu().numpy()
118
  if audio_array.ndim > 1:
119
  audio_array = audio_array.flatten()
120
+ audio_array = audio_array / np.max(np.abs(audio_array)) * 0.9
121
+ audio_array = np.clip(audio_array, -1.0, 1.0)
 
 
122
  sample_rate = 32000
123
+ progress_bar.progress(60)
124
 
125
+ # Save temporary audio
 
126
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
127
  sf.write(temp_audio.name, audio_array, sample_rate)
128
  temp_audio_path = temp_audio.name
129
 
130
+ # Synchronize with video
131
+ status_text.text("Syncing audio with video...")
132
+ video_clip = VideoFileClip(temp_video_path)
133
+ video_duration = video_clip.duration
134
+ audio_clip = AudioFileClip(temp_audio_path)
135
+
136
+ # Adjust audio length
137
+ if audio_clip.duration < video_duration:
138
+ loops_needed = int(np.ceil(video_duration / audio_clip.duration))
139
+ audio_clip = concatenate_audioclips([audio_clip] * loops_needed).subclip(0, video_duration)
140
+ else:
141
+ audio_clip = audio_clip.subclip(0, video_duration)
142
+
143
+ # Mix or replace audio
144
+ if mix_original_audio and video_clip.audio:
145
+ final_audio = video_clip.audio.volumex(0.5) + audio_clip.volumex(0.5)
146
  else:
147
+ final_audio = audio_clip
148
+
149
+ # Set audio to video
150
+ final_video = video_clip.set_audio(final_audio)
151
+
152
+ # Save final video with faster preset
153
+ output_path = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
154
+ final_video.write_videofile(
155
+ output_path,
156
+ codec="libx264",
157
+ audio_codec="aac",
158
+ preset="ultrafast",
159
+ temp_audiofile="temp-audio.m4a",
160
+ remove_temp=True
161
+ )
162
+ progress_bar.progress(90)
163
+
164
+ # Provide playback and download
165
+ status_text.text("Done!")
166
+ st.video(output_path)
167
+ with open(output_path, "rb") as video_file:
168
+ st.download_button(
169
+ label="Download Synced Video",
170
+ data=video_file,
171
+ file_name="synced_story_video.mp4",
172
+ mime="video/mp4"
173
+ )
174
+ progress_bar.progress(100)
175
 
176
  except Exception as e:
177
  st.error(f"An error occurred: {str(e)}")
178
+ st.write("Try reducing frames or uploading a smaller video.")
179
 
180
  finally:
181
+ # Clean up
182
+ for path in [temp_video_path, temp_audio_path, output_path]:
183
+ if 'path' in locals() and os.path.exists(path):
184
+ os.remove(path)