Spaces:

garyuzair
/

Video-To-SoundFX

Running

App Files Files Community

garyuzair committed on Mar 11

Commit

c6fb5b9

verified ·

1 Parent(s): 8237612

Update app.py

Browse files

Files changed (1) hide show

app.py +78 -58

app.py CHANGED Viewed

@@ -6,89 +6,109 @@ from transformers import AutoProcessor, BlipForConditionalGeneration, MusicgenFo
 import soundfile as sf
 import torch
 import os
-# Set page title
 st.title("Video Sound Effect Generator")
 # File uploader for video
-uploaded_file = st.file_uploader(
-    "Upload a short video (MP4, high resolution)",
-    type=["mp4"]
-)
 if uploaded_file is not None:
     try:
-        # Save the uploaded video temporarily
-        with open("temp_video.mp4", "wb") as f:
-            f.write(uploaded_file.getbuffer())
-        # Extract frames using ffmpeg backend
-        video = imageio.get_reader("temp_video.mp4", "ffmpeg")
-        num_frames = len(list(video.iter_data()))
-        # Extract 10 evenly spaced frames
-        num_frames_to_extract = 10
-        step = max(1, num_frames // num_frames_to_extract)
         frames = [
             Image.fromarray(video.get_data(i))
-            for i in range(0, num_frames, step)
         ][:num_frames_to_extract]
-        # Load BLIP model for image captioning
         @st.cache_resource
         def load_blip_model():
             processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
             model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
             return processor, model
         processor, model = load_blip_model()
-        # Generate text descriptions for each frame
         descriptions = []
-        for frame in frames:
             inputs = processor(images=frame, return_tensors="pt")
             out = model.generate(**inputs)
             description = processor.decode(out[0], skip_special_tokens=True)
             descriptions.append(description)
-        # Combine descriptions into a single prompt
         text_prompt = ". ".join(descriptions)
         st.write("Generated text prompt:", text_prompt)
-        # Load MusicGen model for sound generation
         @st.cache_resource
         def load_musicgen_model():
             processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
             model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
             return processor, model
         musicgen_processor, musicgen_model = load_musicgen_model()
-        # Generate sound effect
-        with st.spinner("Generating sound effect..."):
-            inputs = musicgen_processor(
-                text=[text_prompt],
-                padding=True,
-                return_tensors="pt",
-            )
-            audio_values = musicgen_model.generate(**inputs, max_new_tokens=512)
-            # Convert audio_values to a 1D NumPy array and normalize
-            audio_array = audio_values[0].cpu().numpy()  # Move to CPU and convert to NumPy
-            if audio_array.ndim > 1:  # Ensure it’s 1D
-                audio_array = audio_array.flatten()
-            audio_array = audio_array / np.max(np.abs(audio_array))  # Normalize to [-1, 1]
-            # Define sample rate (MusicGen small uses 32kHz)
-            sample_rate = 32000
-            # Save audio to WAV file
-            sf.write("output.wav", audio_array, sample_rate)
-        # Verify file exists and provide playback/download
-        if os.path.exists("output.wav"):
-            st.audio("output.wav", format="audio/wav")
-            with open("output.wav", "rb") as audio_file:
                 st.download_button(
                     label="Download Sound Effect",
                     data=audio_file,
@@ -97,14 +117,14 @@ if uploaded_file is not None:
                 )
         else:
             st.error("Failed to generate the audio file.")
     except Exception as e:
         st.error(f"An error occurred: {str(e)}")
-        st.write("Please try uploading a different video or check your connection.")
     finally:
         # Clean up temporary files
-        if os.path.exists("temp_video.mp4"):
-            os.remove("temp_video.mp4")
-        if os.path.exists("output.wav"):
-            os.remove("output.wav")

 import soundfile as sf
 import torch
 import os
+import tempfile
+import time
+# Set page title and instructions
 st.title("Video Sound Effect Generator")
+st.write("Upload an MP4 video to generate a sound effect based on its content.")
+# User-configurable settings
+num_frames_to_extract = st.slider("Number of frames to analyze", 1, 10, 3, help="Fewer frames = faster processing")
 # File uploader for video
+uploaded_file = st.file_uploader("Upload an MP4 video (high resolution)", type=["mp4"])
 if uploaded_file is not None:
     try:
+        # Use a temporary file for video
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_video:
+            temp_video.write(uploaded_file.getbuffer())
+            temp_video_path = temp_video.name
+        # Progress bar setup
+        progress_bar = st.progress(0)
+        status_text = st.empty()
+        # Extract frames
+        status_text.text("Extracting frames...")
+        video = imageio.get_reader(temp_video_path, "ffmpeg")
+        total_frames = len(list(video.iter_data()))
+        step = max(1, total_frames // num_frames_to_extract)
         frames = [
             Image.fromarray(video.get_data(i))
+            for i in range(0, min(total_frames, num_frames_to_extract * step), step)
         ][:num_frames_to_extract]
+        progress_bar.progress(25)
+        # Load BLIP model with FP16 if GPU available
         @st.cache_resource
         def load_blip_model():
             processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
             model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
+            if torch.cuda.is_available():
+                model = model.half().to("cuda")
             return processor, model
         processor, model = load_blip_model()
+        # Generate text descriptions
+        status_text.text("Analyzing frames with BLIP...")
         descriptions = []
+        for i, frame in enumerate(frames):
             inputs = processor(images=frame, return_tensors="pt")
+            if torch.cuda.is_available():
+                inputs = {k: v.to("cuda") for k, v in inputs.items()}
             out = model.generate(**inputs)
             description = processor.decode(out[0], skip_special_tokens=True)
             descriptions.append(description)
+            progress_bar.progress(25 + int(25 * (i + 1) / len(frames)))
         text_prompt = ". ".join(descriptions)
         st.write("Generated text prompt:", text_prompt)
+        # Load MusicGen model with FP16 if GPU available
         @st.cache_resource
         def load_musicgen_model():
             processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
             model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
+            if torch.cuda.is_available():
+                model = model.half().to("cuda")
             return processor, model
         musicgen_processor, musicgen_model = load_musicgen_model()
+        # Generate sound effect (limit to ~3 seconds)
+        status_text.text("Generating sound effect with MusicGen...")
+        inputs = musicgen_processor(
+            text=[text_prompt],
+            padding=True,
+            return_tensors="pt",
+        )
+        if torch.cuda.is_available():
+            inputs = {k: v.to("cuda") for k, v in inputs.items()}
+        # max_new_tokens = 160 (~3 seconds of 32kHz audio; MusicGen generates ~50 tokens per second, so ~5 seconds would need ~256)
+        audio_values = musicgen_model.generate(**inputs, max_new_tokens=160)
+        audio_array = audio_values[0].cpu().numpy()
+        if audio_array.ndim > 1:
+            audio_array = audio_array.flatten()
+        audio_array = audio_array / np.max(np.abs(audio_array))  # Normalize
+        sample_rate = 32000  # MusicGen small uses 32kHz
+        progress_bar.progress(75)
+        # Save audio to temporary file
+        status_text.text("Saving audio...")
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
+            sf.write(temp_audio.name, audio_array, sample_rate)
+            temp_audio_path = temp_audio.name
+        # Provide playback and download
+        progress_bar.progress(100)
+        status_text.text("Done!")
+        if os.path.exists(temp_audio_path):
+            st.audio(temp_audio_path, format="audio/wav")
+            with open(temp_audio_path, "rb") as audio_file:
                 st.download_button(
                     label="Download Sound Effect",
                     data=audio_file,
                 )
         else:
             st.error("Failed to generate the audio file.")
     except Exception as e:
         st.error(f"An error occurred: {str(e)}")
+        st.write("Try reducing the number of frames or uploading a smaller video.")
     finally:
         # Clean up temporary files
+        if os.path.exists(temp_video_path):
+            os.remove(temp_video_path)
+        if os.path.exists(temp_audio_path):
+            os.remove(temp_audio_path)