testdeep123 committed
Commit 0e69295 · verified · 1 Parent(s): 731f02e

Update app.py

Files changed (1):
  1. app.py +552 -191

app.py CHANGED
@@ -1,224 +1,585 @@
  import os
  import re
  import math
- import random
  import tempfile
  import shutil
- import requests
  import numpy as np
- from kokoro import KPipeline
  import soundfile as sf
  from pydub import AudioSegment
  from gtts import gTTS
  import gradio as gr
  from moviepy.editor import (
-     VideoFileClip, AudioFileClip, concatenate_audioclips,
-     CompositeAudioClip, CompositeVideoClip, TextClip
  )
-
- # ────────── GLOBAL CONFIG ──────────
- OPENROUTER_API_KEY = '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'  # ← your key here
- OPENROUTER_MODEL = "google/gemini-2.0-flash-exp:free"
- SOURCE_VIDEO_PATH = "video.mp4"     # your 13 min source
- OUTPUT_VIDEO_PATH = "final_video.mp4"
- TARGET_RESOLUTION = (1080, 1920)    # vertical
- VOICE_SPEED = 0.9
- CAPTION_FONT_SIZE = 45
- BG_MUSIC_VOLUME = 0.08
-
- # Kokoro TTS pipeline (American English)
- pipeline = KPipeline(lang_code='a')
-
- # ────────── UTILS ──────────
- def generate_script(topic: str) -> str:
-     """Ask the LLM to produce a tagged, one-sentence-per-scene script."""
      headers = {
          'Authorization': f'Bearer {OPENROUTER_API_KEY}',
          'X-Title': 'AI Documentary Maker'
      }
-     prompt = f"""
-     You're a professional documentary narrator.
-     Break your script into scenes with [Tags], one sentence each (≤12 words).
-     No slang or numbers. End with [Subscribe] + formal reason.
-     Topic: {topic}
      """
-     payload = {
          'model': OPENROUTER_MODEL,
-         'messages': [{'role': 'user', 'content': prompt}],
-         'temperature': 0.4,
-         'max_tokens': 5000
      }
-     r = requests.post('https://openrouter.ai/api/v1/chat/completions',
-                       headers=headers, json=payload, timeout=30)
-     r.raise_for_status()
-     return r.json()['choices'][0]['message']['content']

- def parse_script(script_text: str):
-     """
-     Return list of (tag, sentence), skipping empties.
-     If [Subscribe] has no sentence, fill a default call-to-action.
-     """
-     sections = []
-     current = None
-     for line in script_text.splitlines():
-         m = re.match(r'^\[(.+?)\]\s*(.*)$', line)
-         if m:
-             if current:
-                 sections.append(tuple(current))
-             current = [m.group(1).strip(), m.group(2).strip()]
-         elif current and line.strip():
-             current[1] += ' ' + line.strip()
-     if current:
-         sections.append(tuple(current))
-
-     # filter & fix
-     cleaned = []
-     for tag, sentence in sections:
-         if not sentence:
-             if tag.lower() == 'subscribe':
-                 sentence = "Follow to explore more on this topic."
              else:
                  continue
-         cleaned.append((tag, sentence))
-     return cleaned

- def generate_tts_audio(text: str, voice_code: str, dirpath: str) -> str:
      """
-     Try Kokoro → fallback to gTTS. Returns a .wav path.
      """
-     safe = re.sub(r'[^\w]', '_', text[:10]).strip()
-     out_wav = os.path.join(dirpath, f"tts_{safe}.wav")
-     if os.path.exists(out_wav):
-         return out_wav

-     # 1) Kokoro
      try:
-         segments = pipeline(text, voice=voice_code, speed=VOICE_SPEED, split_pattern=r'\n+')
-         arrays = [audio for _, _, audio in segments]
-         audio = np.concatenate(arrays, axis=0) if len(arrays) > 1 else arrays[0]
-         sf.write(out_wav, audio, 24000)
-         return out_wav
-     except Exception:
-         # 2) fallback to gTTS
-         mp3 = os.path.join(dirpath, f"{safe}.mp3")
-         gTTS(text=text, lang='en').save(mp3)
-         wav_seg = AudioSegment.from_mp3(mp3)
-         wav_seg.export(out_wav, format="wav")
-         os.remove(mp3)
-         return out_wav
-
- def add_pillow_subtitles(video_clip, sections):
      """
-     Break each sentence into ~5-word chunks and overlay as timed subtitles,
-     all via Pillow (no ImageMagick).
      """
-     subs = []
-     total_words = sum(len(s.split()) for _, s in sections)
-     cum_time = 0.0
-
-     for _, sentence in sections:
-         words = sentence.split()
-         seg_words = len(words)
-         seg_dur = video_clip.duration * (seg_words / total_words)
-         chunk_dur = seg_dur / max(1, math.ceil(seg_words / 5))
-
-         # group into 5-word chunks
-         for i in range(0, seg_words, 5):
-             chunk = " ".join(words[i:i+5])
-             txt_clip = (
-                 TextClip(
-                     chunk,
-                     fontsize=CAPTION_FONT_SIZE,
-                     font='Arial-Bold',
-                     color='white',
-                     bg_color='rgba(0,0,0,0.3)',
-                     size=(TARGET_RESOLUTION[0]*0.9, None),
-                     method='pillow'
-                 )
-                 .set_start(cum_time + (i//5)*chunk_dur)
-                 .set_duration(chunk_dur)
-                 .set_position(('center', int(TARGET_RESOLUTION[1]*0.8)))
-             )
-             subs.append(txt_clip)
-         cum_time += seg_dur
-
-     return subs
-
- # ────────── MAIN RENDER FUNCTION ──────────
- VOICE_MAP = {
-     'Emma (Female)': 'af_heart',
-     'Bella (Female)': 'af_bella',
-     'Nicole (Female)': 'af_nicole',
-     # … add your full list here …
- }
-
- def generate_video(topic, include_captions, music_file, voice_choice):
-     # 1) Script → sections
-     script = generate_script(topic)
-     sections = parse_script(script)
-
-     # 2) TTS each sentence
-     tmpdir = tempfile.mkdtemp()
-     tts_paths = []
-     voice_code = VOICE_MAP.get(voice_choice, 'af_heart')
-
-     for _, sentence in sections:
-         tts_paths.append(generate_tts_audio(sentence, voice_code, tmpdir))
-
-     # 3) Concatenate narration
-     aud_clips = [AudioFileClip(p) for p in tts_paths]
-     narration = concatenate_audioclips(aud_clips)
-     narration = narration.set_fps(24000)
-
-     # 4) Pick one random video subclip
-     src = VideoFileClip(SOURCE_VIDEO_PATH)
-     max_start = max(0, src.duration - narration.duration)
-     start = random.uniform(0, max_start)
-     vid = src.subclip(start, start + narration.duration).resize(TARGET_RESOLUTION)
-     src.close()
-
-     # 5) Overlay narration audio
-     vid = vid.set_audio(narration)
-
-     # 6) Add captions if requested
-     if include_captions:
-         subs = add_pillow_subtitles(vid, sections)
-         vid = CompositeVideoClip([vid, *subs])
-
-     # 7) Add background music
-     if music_file:
-         bg = AudioFileClip(music_file.name)
-         loops = math.ceil(vid.duration / bg.duration)
-         bg_full = concatenate_audioclips([bg]*loops).subclip(0, vid.duration)
-         bg_full = bg_full.volumex(BG_MUSIC_VOLUME)
-         vid = vid.set_audio(CompositeAudioClip([vid.audio, bg_full]))
-
-     # 8) Export
-     vid.write_videofile(
-         OUTPUT_VIDEO_PATH,
-         codec='libx264',
-         fps=30,
-         preset='veryfast',
-         audio_codec='aac'
-     )

-     # Cleanup
-     shutil.rmtree(tmpdir)
-     return OUTPUT_VIDEO_PATH
-
- # ────────── GRADIO UI ──────────
- iface = gr.Interface(
-     fn=generate_video,
-     inputs=[
-         gr.Textbox(label="Video Concept", placeholder="Enter your topic…"),
-         gr.Checkbox(label="Include Captions"),
-         gr.File(label="Background Music (MP3)", file_types=[".mp3"]),
-         gr.Dropdown(list(VOICE_MAP.keys()), label="Voice", value="Emma (Female)")
-     ],
-     outputs=gr.Video(label="Generated Video"),
-     title="AI Documentary Video Generator",
-     description="Cuts one ~64 s clip from your video, adds AI narration & TikTok-style subtitles."
- )

  if __name__ == "__main__":
-     iface.launch(share=True)
+ # Import necessary libraries
  import os
  import re
+ import time
  import math
  import tempfile
+ import random
  import shutil
+ import torch
  import numpy as np
  import soundfile as sf
+ from PIL import Image, ImageDraw, ImageFont
  from pydub import AudioSegment
  from gtts import gTTS
+ import whisper
  import gradio as gr
+ import requests
+ import json
  from moviepy.editor import (
+     VideoFileClip, concatenate_videoclips, concatenate_audioclips,
+     AudioFileClip, CompositeVideoClip, TextClip, CompositeAudioClip
  )
+ import subprocess
+ import cv2
+ import moviepy.config as mpy_config
+ import moviepy.video.fx.all as vfx
+ import logging
+
+ # Set up logging
+ logging.basicConfig(level=logging.INFO,
+                     format='%(asctime)s - %(levelname)s - %(message)s')
+ logger = logging.getLogger(__name__)

+ # Configure moviepy
+ mpy_config.change_settings({"IMAGEMAGICK_BINARY": "convert"})
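+ # Note: TextClip with method='caption' renders through ImageMagick, so the
+ # 'convert' binary configured above must be installed on the host (for a
+ # Hugging Face Space, e.g. via an apt package; this is an assumption about
+ # the runtime, not something the code checks).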
+
+ # Global Configuration Variables
+ OPENROUTER_API_KEY = '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'
+ OPENROUTER_MODEL = "google/gemini-2.0-flash-exp:free"
+ TARGET_RESOLUTION = (1080, 1920)  # Fixed to vertical format for shorts
+ OUTPUT_VIDEO_FILENAME = "final_video.mp4"
+ TEMP_FOLDER = None
+ CAPTION_COLOR = "white"
+
+ # Additional global variables for the Gradio interface
+ selected_voice = 'en_us_001'  # Default voice
+ voice_speed = 1.0             # Default voice speed
+ font_size = 45                # Default font size
+ bg_music_volume = 0.08        # Default background music volume
+ fps = 30                      # Default FPS
+ preset = "veryfast"           # Default preset
+
+ # Initialize the Whisper model globally to avoid reloading
+ whisper_model = None
+
+ def load_whisper_model():
+     """Load the Whisper model."""
+     global whisper_model
+     try:
+         logger.info("Loading Whisper model...")
+         whisper_model = whisper.load_model("tiny")  # Using tiny for CPU efficiency
+         logger.info("Whisper model loaded successfully")
+         return True
+     except Exception as e:
+         logger.error(f"Failed to load Whisper model: {e}")
+         return False
+
+ # Helper Functions
+ def generate_script(user_input):
+     """Generate a documentary script using the OpenRouter API."""
      headers = {
          'Authorization': f'Bearer {OPENROUTER_API_KEY}',
+         'HTTP-Referer': 'https://huggingface.co/spaces',
          'X-Title': 'AI Documentary Maker'
      }
+
+     prompt = f"""You're a professional documentary narrator. Your job is to write a serious, natural, and informative video script based on one topic.
+
+ The script should sound like a real human voiceover from a TV show or documentary — clear, factual, and engaging, like something you'd hear on National Geographic or a news report.
+
+ Structure:
+ - Break the script into scenes using [Tags]. Each tag is a short title (1–2 words) that describes the scene.
+ - Under each tag, write one sentence (max 12 words) that fits the tag and continues the topic.
+ - The full script should make sense as one connected narration — no randomness.
+ - Use natural, formal English. No slang, no fake AI language, and no robotic tone.
+ - Do not use humor, sarcasm, or casual language. This is a serious narration.
+ - No emotion-sound words like "aww," "eww," "whoa," etc.
+ - Do not use numbers like 1, 2, 3 — write them out as one, two, three.
+ - Make the total narration about 1 minute long (around 150-200 words total).
+ - At the end, add a [Subscribe] tag with a formal or respectful reason to follow or subscribe.
+
+ Only output the script. No extra comments or text.
+
+ Example:
+
+ [Ocean]
+
+ The ocean covers over seventy percent of the Earth's surface.
+
+ [Currents]
+
+ Ocean currents distribute heat and regulate global climate patterns.
+
+ [Coral Reefs]
+
+ These ecosystems support over one million species of marine life.
+
+ [Pollution]
+
+ Plastic waste threatens marine biodiversity and food chains.
+
+ [Climate Impact]
+
+ Rising temperatures are causing coral bleaching and habitat loss.
+
+ [Subscribe]
+
+ Follow to explore more about the changing planet we live on.
+
+ Now here is the Topic: {user_input}
      """
+
+     data = {
          'model': OPENROUTER_MODEL,
+         'messages': [{'role': 'user', 'content': prompt}],
+         'temperature': 0.4,
+         'max_tokens': 2000
      }

+     try:
+         response = requests.post(
+             'https://openrouter.ai/api/v1/chat/completions',
+             headers=headers,
+             json=data,
+             timeout=30
+         )
+
+         if response.status_code == 200:
+             response_data = response.json()
+             if 'choices' in response_data and len(response_data['choices']) > 0:
+                 return response_data['choices'][0]['message']['content']
              else:
+                 logger.error(f"Unexpected response format: {response_data}")
+                 return None
+         else:
+             logger.error(f"API Error {response.status_code}: {response.text}")
+             return None
+
+     except Exception as e:
+         logger.error(f"Request failed: {str(e)}")
+         return None
+
+ def parse_script(script_text):
+     """Parse the generated script into a list of elements."""
+     sections = {}
+     current_title = None
+     current_text = ""
+
+     try:
+         for line in script_text.splitlines():
+             line = line.strip()
+             if line.startswith("[") and "]" in line:
+                 bracket_start = line.find("[")
+                 bracket_end = line.find("]", bracket_start)
+                 if bracket_start != -1 and bracket_end != -1:
+                     if current_title is not None:
+                         sections[current_title] = current_text.strip()
+                     current_title = line[bracket_start+1:bracket_end]
+                     current_text = line[bracket_end+1:].strip()
+             elif current_title:
+                 current_text += line + " "
+
+         if current_title:
+             sections[current_title] = current_text.strip()
+
+         elements = []
+         for title, narration in sections.items():
+             if not title or not narration:
                  continue

+             media_element = {"type": "media", "prompt": title, "effects": "fade-in"}
+             words = narration.split()
+             duration = max(3, len(words) * 0.5)  # Estimate duration from word count
+             tts_element = {"type": "tts", "text": narration, "voice": "en", "duration": duration}
+             elements.append(media_element)
+             elements.append(tts_element)
+
+         return elements
+     except Exception as e:
+         logger.error(f"Error parsing script: {e}")
+         return []
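+ # For reference, parse_script produces alternating media/tts elements; with
+ # illustrative values, the first pair looks like:
+ #   [{"type": "media", "prompt": "Ocean", "effects": "fade-in"},
+ #    {"type": "tts", "text": "The ocean covers ...", "voice": "en", "duration": 4.5}]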
+
+ def generate_tts(text, voice="en"):
+     """Generate TTS audio using gTTS."""
+     safe_text = re.sub(r'[^\w\s-]', '', text[:10]).strip().replace(' ', '_')
+     file_path = os.path.join(TEMP_FOLDER, f"tts_{safe_text}.wav")
+
+     try:
+         logger.info(f"Generating TTS for: {text[:30]}...")
+         tts = gTTS(text=text, lang='en', slow=False)
+         mp3_path = os.path.join(TEMP_FOLDER, f"tts_{safe_text}.mp3")
+         tts.save(mp3_path)
+
+         # Convert MP3 to WAV
+         audio = AudioSegment.from_mp3(mp3_path)
+         # Adjust speed if needed
+         if voice_speed != 1.0:
+             audio = audio._spawn(audio.raw_data, overrides={
+                 "frame_rate": int(audio.frame_rate * voice_speed)
+             })
+ })
213
+ audio.export(file_path, format="wav")
214
+ os.remove(mp3_path)
215
+
216
+ logger.info(f"TTS saved to {file_path}")
217
+ return file_path
218
+ except Exception as e:
219
+ logger.error(f"TTS generation error: {e}")
220
+ return generate_silent_audio(duration=max(3, len(text.split()) * 0.5))
221
+
222
+ def generate_silent_audio(duration, sample_rate=24000):
223
+ """Generate a silent WAV audio file lasting 'duration' seconds."""
224
+ num_samples = int(duration * sample_rate)
225
+ silence = np.zeros(num_samples, dtype=np.float32)
226
+ silent_path = os.path.join(TEMP_FOLDER, f"silent_{int(time.time())}.wav")
227
+ sf.write(silent_path, silence, sample_rate)
228
+ logger.info(f"Silent audio generated: {silent_path}")
229
+ return silent_path
230
+
231
+ def analyze_audio_with_whisper(audio_path):
232
  """
233
+ Use Whisper to transcribe audio and generate word-level timestamps.
234
+ Returns a list of dictionaries with word, start_time, and end_time.
235
  """
236
+ try:
237
+ if whisper_model is None:
238
+ load_whisper_model()
239
+
240
+ logger.info(f"Analyzing audio with Whisper: {audio_path}")
241
+
242
+ # Transcribe the audio file
243
+ result = whisper_model.transcribe(audio_path, word_timestamps=True)
244
+
245
+ # Extract word-level segments
246
+ word_segments = []
247
+ for segment in result["segments"]:
248
+ for word in segment["words"]:
249
+ word_segments.append({
250
+ "word": word["word"].strip(),
251
+ "start": word["start"],
252
+ "end": word["end"]
253
+ })
254
+
255
+ logger.info(f"Extracted {len(word_segments)} word segments")
256
+ return word_segments
257
+ except Exception as e:
258
+ logger.error(f"Whisper analysis error: {e}")
259
+ return []
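+ # For reference, openai-whisper's transcribe(..., word_timestamps=True) returns
+ # a dict shaped roughly like:
+ #   {"segments": [{"words": [{"word": " The", "start": 0.0, "end": 0.18}, ...]}]}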

+ def get_video_clip_segment(video_path, start_time, duration):
+     """
+     Extract a segment from the video file starting at a random position,
+     but ensuring the segment is at least 'duration' seconds long.
+     """
      try:
+         video = VideoFileClip(video_path)
+         video_duration = video.duration
+
+         if duration > video_duration:
+             logger.warning(f"Requested duration ({duration}s) exceeds video length ({video_duration}s). Using full video.")
+             return video
+
+         # Calculate a random start time, ensuring enough duration remains
+         max_start_time = video_duration - duration
+         if start_time is None or start_time > max_start_time:
+             start_time = random.uniform(0, max_start_time)
+
+         # Extract the segment
+         clip = video.subclip(start_time, start_time + duration)
+         logger.info(f"Extracted video segment: {start_time:.2f}s to {start_time + duration:.2f}s")
+         return clip
+     except Exception as e:
+         logger.error(f"Error extracting video segment: {e}")
+         return None
+
+ def create_word_level_subtitles(clip, words_data, font_size=45):
      """
+     Create subtitles that highlight words as they are spoken.
+     Takes a list of word dictionaries with timing information.
      """
+     try:
+         logger.info("Creating word-level synchronized subtitles")
+         # Group words into chunks of approximately 5 words
+         chunks = []
+         current_chunk = []
+         current_chunk_words = []
+
+         for word_data in words_data:
+             current_chunk_words.append(word_data["word"])
+             current_chunk.append(word_data)
+
+             if len(current_chunk_words) >= 5:
+                 chunks.append({
+                     "text": " ".join(current_chunk_words),
+                     "words": current_chunk,
+                     "start": current_chunk[0]["start"],
+                     "end": current_chunk[-1]["end"]
+                 })
+                 current_chunk = []
+                 current_chunk_words = []
+
+         # Add any remaining words
+         if current_chunk_words:
+             chunks.append({
+                 "text": " ".join(current_chunk_words),
+                 "words": current_chunk,
+                 "start": current_chunk[0]["start"],
+                 "end": current_chunk[-1]["end"]
+             })
+
+         # Create subtitle clips for each chunk
+         subtitle_clips = []
+
+         for chunk in chunks:
+             txt_clip = TextClip(
+                 chunk["text"],
+                 fontsize=font_size,
+                 font='Arial-Bold',
+                 color=CAPTION_COLOR,
+                 bg_color='rgba(0, 0, 0, 0.5)',
+                 method='caption',
+                 align='center',
+                 stroke_width=2,
+                 stroke_color='black',
+                 size=(TARGET_RESOLUTION[0] * 0.9, None)
+             ).set_start(chunk["start"]).set_end(chunk["end"])
+
+             txt_clip = txt_clip.set_position(('center', TARGET_RESOLUTION[1] * 0.85))
+             subtitle_clips.append(txt_clip)
+
+         logger.info(f"Created {len(subtitle_clips)} subtitle chunks")
+         return subtitle_clips
+     except Exception as e:
+         logger.error(f"Error creating subtitles: {e}")
+         return []
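+ # Each chunk assembled above is a dict like (illustrative values):
+ #   {"text": "the ocean covers over seventy", "words": [...], "start": 0.0, "end": 1.92}
+ # so a caption is shown exactly while its roughly five words are spoken.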

+ def add_background_music(final_video, bg_music_volume=0.08):
+     """Add background music to the final video."""
+     try:
+         bg_music_path = "music.mp3"
+         if bg_music_path and os.path.exists(bg_music_path):
+             logger.info(f"Adding background music from: {bg_music_path}")
+             bg_music = AudioFileClip(bg_music_path)
+             if bg_music.duration < final_video.duration:
+                 loops_needed = math.ceil(final_video.duration / bg_music.duration)
+                 bg_segments = [bg_music] * loops_needed
+                 bg_music = concatenate_audioclips(bg_segments)
+             bg_music = bg_music.subclip(0, final_video.duration)
+             bg_music = bg_music.volumex(bg_music_volume)
+             video_audio = final_video.audio
+             mixed_audio = CompositeAudioClip([video_audio, bg_music])
+             final_video = final_video.set_audio(mixed_audio)
+             logger.info("Background music added successfully")
+         else:
+             logger.info("No music file found, skipping background music")
+         return final_video
+     except Exception as e:
+         logger.error(f"Error adding background music: {e}")
+         logger.info("Continuing without background music")
+         return final_video
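+ # Design note: CompositeAudioClip mixes clips that all start at time zero,
+ # which suits layering narration under music; looping a short track end to
+ # end is what concatenate_audioclips is for.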

+ def create_clip(tts_path, narration_text, segment_index=0):
+     """
+     Create a video clip with synchronized subtitles using whisper timestamps.
+     Uses a random segment from video.mp4 matching the audio duration.
+     """
+     try:
+         logger.info(f"Creating clip #{segment_index} with TTS: {tts_path}")
+         if not os.path.exists(tts_path) or not os.path.exists("video.mp4"):
+             logger.error("Missing video or TTS file")
+             return None
+
+         # Get audio duration
+         audio_clip = AudioFileClip(tts_path)
+         audio_duration = audio_clip.duration
+         target_duration = audio_duration + 0.5  # Add a small buffer
+
+         # Get a random segment from the main video
+         video_clip = get_video_clip_segment("video.mp4", None, target_duration)
+         if video_clip is None:
+             logger.error("Failed to extract video segment")
+             return None
+
+         # Resize to the target (width, height); resize() ignores the width
+         # keyword when height is also given, so pass the full size tuple
+         video_clip = video_clip.resize(TARGET_RESOLUTION)
+
+         # Set the audio
+         video_clip = video_clip.set_audio(audio_clip)
+
+         # Generate word-level timestamps with Whisper
+         word_data = analyze_audio_with_whisper(tts_path)
+
+         if word_data:
+             # Create word-level subtitles
+             subtitle_clips = create_word_level_subtitles(video_clip, word_data, font_size)
+             if subtitle_clips:
+                 # Combine video with subtitles
+                 video_clip = CompositeVideoClip([video_clip] + subtitle_clips)
+         else:
+             # Fall back to a single static subtitle if Whisper returns nothing
+             logger.warning("Falling back to basic subtitles")
+             txt_clip = TextClip(
+                 narration_text,
+                 fontsize=font_size,
+                 font='Arial-Bold',
+                 color=CAPTION_COLOR,
+                 bg_color='rgba(0, 0, 0, 0.5)',
+                 method='caption',
+                 align='center',
+                 size=(TARGET_RESOLUTION[0] * 0.9, None)
+             ).set_position(('center', TARGET_RESOLUTION[1] * 0.85)).set_duration(video_clip.duration)
+
+             video_clip = CompositeVideoClip([video_clip, txt_clip])
+
+         logger.info(f"Clip created: {video_clip.duration:.1f}s")
+         return video_clip
+     except Exception as e:
+         logger.error(f"Error in create_clip: {str(e)}")
+         return None
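+ # Standalone usage sketch (hypothetical file names; assumes TEMP_FOLDER is
+ # set and video.mp4 plus a narration WAV exist beside the script):
+ #   TEMP_FOLDER = tempfile.mkdtemp()
+ #   clip = create_clip("narration.wav", "The ocean covers the planet.", 0)
+ #   if clip:
+ #       clip.write_videofile("segment.mp4", codec="libx264", fps=30)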

+ # Main Video Generation Function
+ def generate_video(user_input, resolution, caption_option):
+     """Generate a video based on user input via Gradio."""
+     global TEMP_FOLDER, CAPTION_COLOR
+
+     # Set caption color based on option
+     CAPTION_COLOR = "white" if caption_option == "Yes" else "transparent"
+
+     # Create a unique temporary folder
+     TEMP_FOLDER = tempfile.mkdtemp()
+     logger.info(f"Created temporary folder: {TEMP_FOLDER}")
+
+     # Check if video.mp4 exists
+     if not os.path.exists("video.mp4"):
+         logger.error("video.mp4 not found in the current directory")
+         return "Error: video.mp4 not found. Please upload a video file named 'video.mp4'."
+
+     # Load Whisper model
+     load_whisper_model()
+
+     # Generate script
+     logger.info("Generating script from API...")
+     script = generate_script(user_input)
+     if not script:
+         logger.error("Failed to generate script.")
+         shutil.rmtree(TEMP_FOLDER)
+         return "Failed to generate script. Please try again."
+
+     logger.info("Generated Script:\n" + script)
+
+     # Parse script into elements
+     elements = parse_script(script)
+     if not elements:
+         logger.error("Failed to parse script into elements.")
+         shutil.rmtree(TEMP_FOLDER)
+         return "Failed to parse script. Please try again."
+
+     logger.info(f"Parsed {len(elements)//2} script segments.")
+
+     # Group elements into pairs (media prompt + TTS)
+     paired_elements = []
+     for i in range(0, len(elements), 2):
+         if i + 1 < len(elements):
+             paired_elements.append((elements[i], elements[i + 1]))
+
+     if not paired_elements:
+         logger.error("No valid script segments found.")
+         shutil.rmtree(TEMP_FOLDER)
+         return "No valid script segments were generated."
+
+     # Create video clips for each segment
+     clips = []
+     for idx, (media_elem, tts_elem) in enumerate(paired_elements):
+         logger.info(f"\nProcessing segment {idx+1}/{len(paired_elements)} with prompt: '{media_elem['prompt']}'")
+
+         # Generate TTS for the segment
+         tts_path = generate_tts(tts_elem['text'], tts_elem['voice'])
+         if not tts_path:
+             logger.error(f"Skipping segment {idx+1} due to TTS generation failure.")
+             continue
+
+         # Create video clip with subtitles
+         clip = create_clip(
+             tts_path=tts_path,
+             narration_text=tts_elem['text'],
+             segment_index=idx
+         )
+
+         if clip:
+             clips.append(clip)
+         else:
+             logger.error(f"Clip creation failed for segment {idx+1}.")
+
+     if not clips:
+         logger.error("No clips were successfully created.")
+         shutil.rmtree(TEMP_FOLDER)
+         return "Failed to create any video clips. Please try again."
+
+     # Concatenate all clips
+     logger.info("\nConcatenating clips...")
+     final_video = concatenate_videoclips(clips, method="compose")
+
+     # Add background music if available
+     final_video = add_background_music(final_video, bg_music_volume=bg_music_volume)
+
+     # Export final video
+     logger.info(f"Exporting final video to {OUTPUT_VIDEO_FILENAME}...")
+     final_video.write_videofile(OUTPUT_VIDEO_FILENAME, codec='libx264', fps=fps, preset=preset, audio_codec='aac')
+     logger.info(f"Final video saved as {OUTPUT_VIDEO_FILENAME}")
+
+     # Clean up
+     logger.info("Cleaning up temporary files...")
+     shutil.rmtree(TEMP_FOLDER)
+     logger.info("Temporary files removed.")
+
+     return OUTPUT_VIDEO_FILENAME

+ # Gradio Interface Setup
+ def generate_video_with_options(user_input, caption_option, music_file, bg_vol, video_fps, video_preset, v_speed, caption_size):
+     global voice_speed, font_size, bg_music_volume, fps, preset
+
+     # Update global variables with user selections
+     voice_speed = v_speed
+     font_size = caption_size
+     bg_music_volume = bg_vol
+     fps = video_fps
+     preset = video_preset
+
+     # Handle music upload
+     if music_file is not None:
+         target_path = "music.mp3"
+         shutil.copy(music_file.name, target_path)
+         logger.info(f"Uploaded music saved as: {target_path}")
+
+     # Generate the video (always using vertical resolution)
+     return generate_video(user_input, "Short", caption_option)
+
+ # Create the Gradio interface
+ def create_interface():
+     iface = gr.Interface(
+         fn=generate_video_with_options,
+         inputs=[
+             gr.Textbox(label="Video Concept", placeholder="Enter your video concept here..."),
+             gr.Radio(["Yes", "No"], label="Show Captions", value="Yes"),
+             gr.File(label="Upload Background Music (MP3)", file_types=[".mp3"]),
+             gr.Slider(0.0, 1.0, value=0.08, step=0.01, label="Background Music Volume"),
+             gr.Slider(10, 60, value=30, step=1, label="Video FPS"),
+             gr.Dropdown(choices=["ultrafast", "superfast", "veryfast", "faster", "fast", "medium", "slow"],
+                         value="veryfast", label="Export Preset"),
+             gr.Slider(0.75, 1.25, value=1.0, step=0.05, label="Voice Speed"),
+             gr.Slider(20, 100, value=45, step=1, label="Caption Font Size")
+         ],
+         outputs=gr.Video(label="Generated Video"),
+         title="AI Documentary Video Generator",
+         description="""
+         Create short documentary videos with AI narration and synchronized captions.
+         1. Enter a topic or concept for your documentary
+         2. Optionally upload background music
+         3. Adjust settings as needed
+         4. Click submit and wait for video generation
+
+         NOTE: You must upload a file named 'video.mp4' to your Hugging Face Space for this app to work.
+         """
+     )
+     return iface

+ # Launch the application
  if __name__ == "__main__":
+     # Create interface and launch
+     demo = create_interface()
+     demo.launch()
+ else:
+     # When imported as a module
+     demo = create_interface()