Update app.py
app.py CHANGED
@@ -1,4 +1,3 @@
-# Import necessary libraries
 import os
 import re
 import time
@@ -9,21 +8,16 @@ import shutil
 import torch
 import numpy as np
 import soundfile as sf
-from PIL import Image, ImageDraw, ImageFont
 from pydub import AudioSegment
 from gtts import gTTS
-import whisper
+import whisper  # Ensure this is openai-whisper in requirements.txt
 import gradio as gr
 import requests
 import json
 from moviepy.editor import (
     VideoFileClip, concatenate_videoclips, AudioFileClip,
-    CompositeVideoClip, TextClip, CompositeAudioClip
+    CompositeVideoClip, TextClip, CompositeAudioClip, ColorClip
 )
-import subprocess
-import cv2
-import moviepy.config as mpy_config
-import moviepy.video.fx.all as vfx
 import logging

 # Set up logging
@@ -31,18 +25,15 @@ logging.basicConfig(level=logging.INFO,
                     format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)

-# Configure moviepy
-mpy_config.change_settings({"IMAGEMAGICK_BINARY": "convert"})
-
 # Global Configuration Variables
 OPENROUTER_API_KEY = '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'
 OPENROUTER_MODEL = "google/gemini-2.0-flash-exp:free"
-TARGET_RESOLUTION = (1080, 1920)  #
+TARGET_RESOLUTION = (1080, 1920)  # Vertical format for shorts
 OUTPUT_VIDEO_FILENAME = "final_video.mp4"
 TEMP_FOLDER = None
 CAPTION_COLOR = "white"

-# Additional global variables for
+# Additional global variables for Gradio interface
 selected_voice = 'en_us_001'  # Default voice
 voice_speed = 1.0  # Default voice speed
 font_size = 45  # Default font size
@@ -50,7 +41,7 @@ bg_music_volume = 0.08  # Default background music volume
 fps = 30  # Default FPS
 preset = "veryfast"  # Default preset

-# Initialize whisper model globally
+# Initialize whisper model globally
 whisper_model = None

 def load_whisper_model():
@@ -65,7 +56,6 @@ def load_whisper_model():
         logger.error(f"Failed to load Whisper model: {e}")
         return False

-# Helper Functions
 def generate_script(user_input):
     """Generate documentary script using OpenRouter API."""
     headers = {
@@ -94,31 +84,23 @@ Only output the script. No extra comments or text.
 Example:

 [Ocean]
-
 The ocean covers over seventy percent of the Earth's surface.

 [Currents]
-
 Ocean currents distribute heat and regulate global climate patterns.

 [Coral Reefs]
-
 These ecosystems support over one million species of marine life.

 [Pollution]
-
 Plastic waste threatens marine biodiversity and food chains.

 [Climate Impact]
-
 Rising temperatures are causing coral bleaching and habitat loss.

 [Subscribe]
-
 Follow to explore more about the changing planet we live on.

-
-
 Now here is the Topic: {user_input}
 """

@@ -147,7 +129,6 @@ Now here is the Topic: {user_input}
         else:
             logger.error(f"API Error {response.status_code}: {response.text}")
             return None
-
     except Exception as e:
         logger.error(f"Request failed: {str(e)}")
         return None
@@ -182,7 +163,7 @@ def parse_script(script_text):

         media_element = {"type": "media", "prompt": title, "effects": "fade-in"}
         words = narration.split()
-        duration = max(3, len(words) * 0.5)  # Estimate duration
+        duration = max(3, len(words) * 0.5)  # Estimate duration
         tts_element = {"type": "tts", "text": narration, "voice": "en", "duration": duration}
         elements.append(media_element)
         elements.append(tts_element)
@@ -205,7 +186,6 @@ def generate_tts(text, voice="en"):

         # Convert MP3 to WAV
         audio = AudioSegment.from_mp3(mp3_path)
-        # Adjust speed if needed
         if voice_speed != 1.0:
             audio = audio._spawn(audio.raw_data, overrides={
                 "frame_rate": int(audio.frame_rate * voice_speed)
@@ -220,7 +200,7 @@ def generate_tts(text, voice="en"):
         return generate_silent_audio(duration=max(3, len(text.split()) * 0.5))

 def generate_silent_audio(duration, sample_rate=24000):
-    """Generate a silent WAV audio file
+    """Generate a silent WAV audio file."""
     num_samples = int(duration * sample_rate)
     silence = np.zeros(num_samples, dtype=np.float32)
     silent_path = os.path.join(TEMP_FOLDER, f"silent_{int(time.time())}.wav")
@@ -229,20 +209,14 @@ def generate_silent_audio(duration, sample_rate=24000):
     return silent_path

 def analyze_audio_with_whisper(audio_path):
-    """
-    Use Whisper to transcribe audio and generate word-level timestamps.
-    Returns a list of dictionaries with word, start_time, and end_time.
-    """
+    """Use Whisper to generate word-level timestamps."""
     try:
         if whisper_model is None:
             load_whisper_model()

         logger.info(f"Analyzing audio with Whisper: {audio_path}")
-
-        # Transcribe the audio file
         result = whisper_model.transcribe(audio_path, word_timestamps=True)

-        # Extract word-level segments
         word_segments = []
         for segment in result["segments"]:
             for word in segment["words"]:
@@ -259,24 +233,19 @@ def analyze_audio_with_whisper(audio_path):
         return []

 def get_video_clip_segment(video_path, start_time, duration):
-    """
-    Extract a segment from the video file starting at a random position,
-    but ensuring the segment is at least 'duration' seconds long.
-    """
+    """Extract a random video segment."""
     try:
         video = VideoFileClip(video_path)
         video_duration = video.duration

         if duration > video_duration:
-            logger.warning(f"Requested duration ({duration}s) exceeds video length ({video_duration}s).
+            logger.warning(f"Requested duration ({duration}s) exceeds video length ({video_duration}s).")
             return video

-        # Calculate a random start time ensuring we have enough duration left
         max_start_time = video_duration - duration
         if start_time is None or start_time > max_start_time:
             start_time = random.uniform(0, max_start_time)

-        # Extract the segment
         clip = video.subclip(start_time, start_time + duration)
         logger.info(f"Extracted video segment: {start_time:.2f}s to {start_time + duration:.2f}s")
         return clip
@@ -285,13 +254,9 @@ def get_video_clip_segment(video_path, start_time, duration):
         return None

 def create_word_level_subtitles(clip, words_data, font_size=45):
-    """
-    Create subtitles that highlight words as they are spoken.
-    Takes a list of word dictionaries with timing information.
-    """
+    """Create synchronized subtitles without ImageMagick."""
     try:
         logger.info("Creating word-level synchronized subtitles")
-        # Group words into chunks of approximately 5 words
         chunks = []
         current_chunk = []
         current_chunk_words = []
@@ -310,7 +275,6 @@ def create_word_level_subtitles(clip, words_data, font_size=45):
                 current_chunk = []
                 current_chunk_words = []

-        # Add any remaining words
         if current_chunk_words:
             chunks.append({
                 "text": " ".join(current_chunk_words),
@@ -319,25 +283,26 @@ def create_word_level_subtitles(clip, words_data, font_size=45):
                 "end": current_chunk[-1]["end"]
             })

-        # Create subtitle clips for each chunk
         subtitle_clips = []
-
         for chunk in chunks:
             txt_clip = TextClip(
                 chunk["text"],
                 fontsize=font_size,
-                font='Arial-Bold',
                 color=CAPTION_COLOR,
-            )
+                method='label'
+            )
+
+            bg_clip = ColorClip(
+                size=(txt_clip.w + 20, txt_clip.h + 10),
+                color=(0, 0, 0, 128)  # Semi-transparent black
+            )

+            subtitle_clip = CompositeVideoClip([
+                bg_clip.set_position('center'),
+                txt_clip.set_position('center')
+            ])
+            subtitle_clip = subtitle_clip.set_start(chunk["start"]).set_end(chunk["end"]).set_position(('center', TARGET_RESOLUTION[1] * 0.85))
+            subtitle_clips.append(subtitle_clip)

         logger.info(f"Created {len(subtitle_clips)} subtitle chunks")
         return subtitle_clips
@@ -346,7 +311,7 @@ def create_word_level_subtitles(clip, words_data, font_size=45):
         return []

 def add_background_music(final_video, bg_music_volume=0.08):
-    """Add background music to the
+    """Add background music to the video."""
     try:
         bg_music_path = "music.mp3"
         if bg_music_path and os.path.exists(bg_music_path):
@@ -367,61 +332,54 @@ def add_background_music(final_video, bg_music_volume=0.08):
         return final_video
     except Exception as e:
         logger.error(f"Error adding background music: {e}")
-        logger.info("Continuing without background music")
         return final_video

 def create_clip(tts_path, narration_text, segment_index=0):
-    """
-    Create a video clip with synchronized subtitles using whisper timestamps.
-    Uses a random segment from video.mp4 matching the audio duration.
-    """
+    """Create a video clip with synchronized subtitles."""
     try:
         logger.info(f"Creating clip #{segment_index} with TTS: {tts_path}")
         if not os.path.exists(tts_path) or not os.path.exists("video.mp4"):
             logger.error("Missing video or TTS file")
             return None

-        # Get audio duration
         audio_clip = AudioFileClip(tts_path)
         audio_duration = audio_clip.duration
-        target_duration = audio_duration + 0.5
+        target_duration = audio_duration + 0.5

-        # Get a random segment from the main video
         video_clip = get_video_clip_segment("video.mp4", None, target_duration)
         if video_clip is None:
             logger.error("Failed to extract video segment")
             return None

-        # Resize to target resolution
         video_clip = video_clip.resize(height=TARGET_RESOLUTION[1], width=TARGET_RESOLUTION[0])
-
-        # Set the audio
         video_clip = video_clip.set_audio(audio_clip)

-        # Generate word-level timestamps with Whisper
         word_data = analyze_audio_with_whisper(tts_path)

         if word_data:
-            # Create word-level subtitles
             subtitle_clips = create_word_level_subtitles(video_clip, word_data, font_size)
             if subtitle_clips:
-                # Combine video with subtitles
                 video_clip = CompositeVideoClip([video_clip] + subtitle_clips)
         else:
-            # Fallback to basic subtitle if whisper fails
             logger.warning("Falling back to basic subtitles")
             txt_clip = TextClip(
                 narration_text,
                 fontsize=font_size,
-                font='Arial-Bold',
                 color=CAPTION_COLOR,
+                method='label'
+            )
+
+            bg_clip = ColorClip(
+                size=(txt_clip.w + 20, txt_clip.h + 10),
+                color=(0, 0, 0, 128)
+            )

+            subtitle_clip = CompositeVideoClip([
+                bg_clip.set_position('center'),
+                txt_clip.set_position('center')
+            ])
+            subtitle_clip = subtitle_clip.set_duration(video_clip.duration).set_position(('center', TARGET_RESOLUTION[1] * 0.85))
+            video_clip = CompositeVideoClip([video_clip, subtitle_clip])

         logger.info(f"Clip created: {video_clip.duration:.1f}s")
         return video_clip
@@ -429,125 +387,82 @@ def create_clip(tts_path, narration_text, segment_index=0):
         logger.error(f"Error in create_clip: {str(e)}")
         return None

-# Main Video Generation Function
 def generate_video(user_input, resolution, caption_option):
-    """Generate a video based on user input
+    """Generate a video based on user input."""
     global TEMP_FOLDER, CAPTION_COLOR

-    # Set caption color based on option
     CAPTION_COLOR = "white" if caption_option == "Yes" else "transparent"
-
-    # Create a unique temporary folder
     TEMP_FOLDER = tempfile.mkdtemp()
     logger.info(f"Created temporary folder: {TEMP_FOLDER}")

-    # Check if video.mp4 exists
     if not os.path.exists("video.mp4"):
-        logger.error("video.mp4 not found
+        logger.error("video.mp4 not found")
         return "Error: video.mp4 not found. Please upload a video file named 'video.mp4'."

-    # Load Whisper model
     load_whisper_model()
-
-    # Generate script
-    logger.info("Generating script from API...")
     script = generate_script(user_input)
     if not script:
-        logger.error("Failed to generate script.")
         shutil.rmtree(TEMP_FOLDER)
-        return "Failed to generate script.
+        return "Failed to generate script."

     logger.info("Generated Script:\n" + script)
-
-    # Parse script into elements
     elements = parse_script(script)
     if not elements:
-        logger.error("Failed to parse script into elements.")
         shutil.rmtree(TEMP_FOLDER)
-        return "Failed to parse script.
+        return "Failed to parse script."

     logger.info(f"Parsed {len(elements)//2} script segments.")
-
-
-    paired_elements = []
-    for i in range(0, len(elements), 2):
-        if i + 1 < len(elements):
-            paired_elements.append((elements[i], elements[i + 1]))
-
+    paired_elements = [(elements[i], elements[i + 1]) for i in range(0, len(elements), 2)]
+
     if not paired_elements:
-        logger.error("No valid script segments found.")
         shutil.rmtree(TEMP_FOLDER)
-        return "No valid script segments
-
-    # Create video clips for each segment
+        return "No valid script segments generated."
+
     clips = []
     for idx, (media_elem, tts_elem) in enumerate(paired_elements):
         logger.info(f"\nProcessing segment {idx+1}/{len(paired_elements)} with prompt: '{media_elem['prompt']}'")
-
-        # Generate TTS for the segment
         tts_path = generate_tts(tts_elem['text'], tts_elem['voice'])
         if not tts_path:
-            logger.error(f"Skipping segment {idx+1} due to TTS generation failure.")
             continue

-
-        clip = create_clip(
-            tts_path=tts_path,
-            narration_text=tts_elem['text'],
-            segment_index=idx
-        )
-
+        clip = create_clip(tts_path, tts_elem['text'], idx)
         if clip:
             clips.append(clip)
-
-            logger.error(f"Clip creation failed for segment {idx+1}.")
-
+
     if not clips:
-        logger.error("No clips were successfully created.")
         shutil.rmtree(TEMP_FOLDER)
-        return "Failed to create any video clips.
-
-    # Concatenate all clips
+        return "Failed to create any video clips."
+
     logger.info("\nConcatenating clips...")
     final_video = concatenate_videoclips(clips, method="compose")
-
-    # Add background music if available
     final_video = add_background_music(final_video, bg_music_volume=bg_music_volume)
-
-    # Export final video
+
     logger.info(f"Exporting final video to {OUTPUT_VIDEO_FILENAME}...")
     final_video.write_videofile(OUTPUT_VIDEO_FILENAME, codec='libx264', fps=fps, preset=preset)
     logger.info(f"Final video saved as {OUTPUT_VIDEO_FILENAME}")
-
-    # Clean up
-    logger.info("Cleaning up temporary files...")
+
     shutil.rmtree(TEMP_FOLDER)
     logger.info("Temporary files removed.")
-
     return OUTPUT_VIDEO_FILENAME

-# Gradio Interface Setup
 def generate_video_with_options(user_input, caption_option, music_file, bg_vol, video_fps, video_preset, v_speed, caption_size):
+    """Generate video with Gradio options."""
     global voice_speed, font_size, bg_music_volume, fps, preset

-    # Update global variables with user selections
     voice_speed = v_speed
     font_size = caption_size
     bg_music_volume = bg_vol
     fps = video_fps
     preset = video_preset

-    # Handle music upload
     if music_file is not None:
-        logger.info(f"Uploaded music saved as: {target_path}")
+        shutil.copy(music_file.name, "music.mp3")
+        logger.info(f"Uploaded music saved as: music.mp3")

-    # Generate the video (always using vertical resolution)
     return generate_video(user_input, "Short", caption_option)

-# Create the Gradio interface
 def create_interface():
+    """Create Gradio interface."""
     iface = gr.Interface(
         fn=generate_video_with_options,
         inputs=[
@@ -570,16 +485,13 @@ def create_interface():
     3. Adjust settings as needed
     4. Click submit and wait for video generation

-    NOTE: You must upload a file named 'video.mp4' to your Hugging Face Space
+    NOTE: You must upload a file named 'video.mp4' to your Hugging Face Space.
     """
     )
     return iface

-# Launch the application
 if __name__ == "__main__":
-    # Create interface and launch
     demo = create_interface()
     demo.launch()
 else:
-    # For importing as a module
     demo = create_interface()
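
The speed control in generate_tts relies on pydub's frame-rate override idiom, and the hunk above shows only the override itself. Below is a minimal sketch of the complete idiom, assuming pydub's AudioSegment API; change_speed is a hypothetical helper, not a function in app.py.

from pydub import AudioSegment

def change_speed(audio: AudioSegment, speed: float) -> AudioSegment:
    """Resample-based speed change; pitch shifts along with tempo."""
    # Rewriting frame_rate makes players read the same samples faster or slower.
    stretched = audio._spawn(audio.raw_data, overrides={
        "frame_rate": int(audio.frame_rate * speed)
    })
    # Convert back to the original rate so exports and mixing see a
    # standard frame rate.
    return stretched.set_frame_rate(audio.frame_rate)

Note that this changes pitch along with tempo; a pitch-preserving stretch would need something like ffmpeg's atempo filter instead.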
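
The new subtitle path composites each TextClip over a semi-transparent ColorClip plate rather than styling captions through ImageMagick options. Here is a sketch of the same pattern, assuming MoviePy 1.x; boxed_caption is a hypothetical helper, and it uses set_opacity on an RGB plate because not every MoviePy version interprets a 4-tuple color as alpha.

from moviepy.editor import ColorClip, CompositeVideoClip, TextClip

def boxed_caption(text, start, end, video_size, font_size=45):
    """One caption chunk on a translucent plate, pinned near the bottom."""
    txt = TextClip(text, fontsize=font_size, color="white", method="label")
    # 50%-opaque black plate, slightly larger than the text it backs.
    plate = ColorClip(size=(txt.w + 20, txt.h + 10), color=(0, 0, 0)).set_opacity(0.5)
    boxed = CompositeVideoClip(
        [plate.set_position("center"), txt.set_position("center")],
        size=plate.size,
    )
    # Show the chunk only while its words are spoken, at 85% of frame height.
    return (boxed.set_start(start).set_end(end)
                 .set_position(("center", int(video_size[1] * 0.85))))

Each clip returned by a helper like this can be appended to subtitle_clips and composited over the video, as the diff does for its whisper-timed chunks.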
|