Update app.py

app.py CHANGED
@@ -1,497 +1,114 @@
-import os
-import re
-import time
-import math
 import tempfile
-import random
-import shutil
-import torch
-import numpy as np
-import soundfile as sf
 from pydub import AudioSegment
-
-import whisper # Ensure this is openai-whisper in requirements.txt
-import gradio as gr
-import requests
 import json
-
-from moviepy.editor import (
-    VideoFileClip, concatenate_videoclips, AudioFileClip,
-    CompositeVideoClip, TextClip, CompositeAudioClip, ColorClip
-)
-import logging
-
-# Set up logging
-logging.basicConfig(level=logging.INFO,
-                    format='%(asctime)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(__name__)
 
-#
 OPENROUTER_API_KEY = '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'
 OPENROUTER_MODEL = "google/gemini-2.0-flash-exp:free"
-TARGET_RESOLUTION = (1080, 1920)
 OUTPUT_VIDEO_FILENAME = "final_video.mp4"
-TEMP_FOLDER = None
 CAPTION_COLOR = "white"
 
-#
-
-from gtts import gTTS
-
-whisper_model = None
-
-
-
-
-
-
-def load_whisper_model():
-    """Load the Whisper model."""
-    global whisper_model
-    try:
-        logger.info("Loading Whisper model...")
-        whisper_model = whisper.load_model("tiny") # Using tiny for CPU efficiency
-        logger.info("Whisper model loaded successfully")
-        return True
-    except Exception as e:
-        logger.error(f"Failed to load Whisper model: {e}")
-        return False
-
-def generate_script(user_input):
-    """Generate documentary script using OpenRouter API."""
-    headers = {
-        'Authorization': f'Bearer {OPENROUTER_API_KEY}',
-        'HTTP-Referer': 'https://huggingface.co/spaces',
-        'X-Title': 'AI Documentary Maker'
-    }
-
-    prompt = f"""You're a professional documentary narrator. Your job is to write a serious, natural, and informative video script based on one topic.
-
-The script should sound like a real human voiceover from a TV show or documentary — clear, factual, and engaging, like something you'd hear on National Geographic or a news report.
-
-Structure:
-- Break the script into scenes using [Tags]. Each tag is a short title (1–2 words) that describes the scene.
-- Under each tag, write one sentence (max 12 words) that fits the tag and continues the topic.
-- The full script should make sense as one connected narration — no randomness.
-- Use natural, formal English. No slang, no fake AI language, and no robotic tone.
-- Do not use humor, sarcasm, or casual language. This is a serious narration.
-- No emotion-sound words like "aww," "eww," "whoa," etc.
-- Do not use numbers like 1, 2, 3 — write them out as one, two, three.
-- Make the total narration about 1 minute long (around 150-200 words total).
-- At the end, add a [Subscribe] tag with a formal or respectful reason to follow or subscribe.
-
-Only output the script. No extra comments or text.
-
-Example:
-
-[Ocean]
-The ocean covers over seventy percent of the Earth's surface.
-
-[Currents]
-Ocean currents distribute heat and regulate global climate patterns.
-
-[Coral Reefs]
-These ecosystems support over one million species of marine life.
-
-[Pollution]
-Plastic waste threatens marine biodiversity and food chains.
-
-[Climate Impact]
-Rising temperatures are causing coral bleaching and habitat loss.
-
-[Subscribe]
-Follow to explore more about the changing planet we live on.
-
-Now here is the Topic: {user_input}
-"""
-
-    data = {
-        'model': OPENROUTER_MODEL,
-        'messages': [{'role': 'user', 'content': prompt}],
-        'temperature': 0.4,
-        'max_tokens': 2000
-    }
-
-    try:
-        response = requests.post(
-            'https://openrouter.ai/api/v1/chat/completions',
-            headers=headers,
-            json=data,
-            timeout=30
-        )
-
-        if response.status_code == 200:
-            response_data = response.json()
-            if 'choices' in response_data and len(response_data['choices']) > 0:
-                return response_data['choices'][0]['message']['content']
-            else:
-                logger.error(f"Unexpected response format: {response_data}")
-                return None
-        else:
-            logger.error(f"API Error {response.status_code}: {response.text}")
-            return None
-    except Exception as e:
-        logger.error(f"Request failed: {str(e)}")
-        return None
-
-def parse_script(script_text):
-    """Parse the generated script into a list of elements."""
-    sections = {}
-    current_title = None
-    current_text = ""
-
-    try:
-        for line in script_text.splitlines():
-            line = line.strip()
-            if line.startswith("[") and "]" in line:
-                bracket_start = line.find("[")
-                bracket_end = line.find("]", bracket_start)
-                if bracket_start != -1 and bracket_end != -1:
-                    if current_title is not None:
-                        sections[current_title] = current_text.strip()
-                    current_title = line[bracket_start+1:bracket_end]
-                    current_text = line[bracket_end+1:].strip()
-            elif current_title:
-                current_text += line + " "
-
-        if current_title:
-            sections[current_title] = current_text.strip()
-
-        elements = []
-        for title, narration in sections.items():
-            if not title or not narration:
-                continue
-
-            media_element = {"type": "media", "prompt": title, "effects": "fade-in"}
-            words = narration.split()
-            duration = max(3, len(words) * 0.5) # Estimate duration
-            tts_element = {"type": "tts", "text": narration, "voice": "en", "duration": duration}
-            elements.append(media_element)
-            elements.append(tts_element)
-
-        return elements
-    except Exception as e:
-        logger.error(f"Error parsing script: {e}")
-        return []
-
-def generate_tts(text, voice="en"):
-    """Generate TTS audio using gTTS."""
-    safe_text = re.sub(r'[^\w\s-]', '', text[:10]).strip().replace(' ', '_')
-    file_path = os.path.join(TEMP_FOLDER, f"tts_{safe_text}.wav")
-
-    try:
-        logger.info(f"Generating TTS for: {text[:30]}...")
-        tts = gTTS(text=text, lang='en', slow=False)
-        mp3_path = os.path.join(TEMP_FOLDER, f"tts_{safe_text}.mp3")
-        tts.save(mp3_path)
-
-        # Convert MP3 to WAV
-        audio = AudioSegment.from_mp3(mp3_path)
-        if voice_speed != 1.0:
-            audio = audio._spawn(audio.raw_data, overrides={
-                "frame_rate": int(audio.frame_rate * voice_speed)
-            })
-        audio.export(file_path, format="wav")
-        os.remove(mp3_path)
-
-        logger.info(f"TTS saved to {file_path}")
-        return file_path
-    except Exception as e:
-        logger.error(f"TTS generation error: {e}")
-        return generate_silent_audio(duration=max(3, len(text.split()) * 0.5))
-
-def generate_silent_audio(duration, sample_rate=24000):
-    """Generate a silent WAV audio file."""
-    num_samples = int(duration * sample_rate)
-    silence = np.zeros(num_samples, dtype=np.float32)
-    silent_path = os.path.join(TEMP_FOLDER, f"silent_{int(time.time())}.wav")
-    sf.write(silent_path, silence, sample_rate)
-    logger.info(f"Silent audio generated: {silent_path}")
-    return silent_path
-
-def analyze_audio_with_whisper(audio_path):
-    """Use Whisper to generate word-level timestamps."""
-    try:
-        if whisper_model is None:
-            load_whisper_model()
-
-        logger.info(f"Analyzing audio with Whisper: {audio_path}")
-        result = whisper_model.transcribe(audio_path, word_timestamps=True)
-
-        word_segments = []
-        for segment in result["segments"]:
-            for word in segment["words"]:
-                word_segments.append({
-                    "word": word["word"].strip(),
-                    "start": word["start"],
-                    "end": word["end"]
-                })
-
-        logger.info(f"Extracted {len(word_segments)} word segments")
-        return word_segments
-    except Exception as e:
-        logger.error(f"Whisper analysis error: {e}")
-        return []
-
-def get_video_clip_segment(video_path, start_time, duration):
-    """Extract a random video segment."""
-    try:
-        video = VideoFileClip(video_path)
-        video_duration = video.duration
-
-        if duration > video_duration:
-            logger.warning(f"Requested duration ({duration}s) exceeds video length ({video_duration}s).")
-            return video
-
-        max_start_time = video_duration - duration
-        if start_time is None or start_time > max_start_time:
-            start_time = random.uniform(0, max_start_time)
-
-        clip = video.subclip(start_time, start_time + duration)
-        logger.info(f"Extracted video segment: {start_time:.2f}s to {start_time + duration:.2f}s")
-        return clip
-    except Exception as e:
-        logger.error(f"Error extracting video segment: {e}")
-        return None
-
-def create_word_level_subtitles(clip, words_data, font_size=45):
-    """Create synchronized subtitles without ImageMagick."""
-    try:
-        logger.info("Creating word-level synchronized subtitles")
-        chunks = []
-        current_chunk = []
-        current_chunk_words = []
-
-        for word_data in words_data:
-            current_chunk_words.append(word_data["word"])
-            current_chunk.append(word_data)
-
-            if len(current_chunk_words) >= 5:
-                chunks.append({
-                    "text": " ".join(current_chunk_words),
-                    "words": current_chunk,
-                    "start": current_chunk[0]["start"],
-                    "end": current_chunk[-1]["end"]
-                })
-                current_chunk = []
-                current_chunk_words = []
-
-        if current_chunk_words:
-            chunks.append({
-                "text": " ".join(current_chunk_words),
-                "words": current_chunk,
-                "start": current_chunk[0]["start"],
-                "end": current_chunk[-1]["end"]
-            })
-
-        subtitle_clips = []
-        for chunk in chunks:
-            txt_clip = TextClip(
-                chunk["text"],
-                fontsize=font_size,
-                color=CAPTION_COLOR,
-                method='label'
-            )
-
-            bg_clip = ColorClip(
-                size=(txt_clip.w + 20, txt_clip.h + 10),
-                color=(0, 0, 0, 128) # Semi-transparent black
-            )
-
-            subtitle_clip = CompositeVideoClip([
-                bg_clip.set_position('center'),
-                txt_clip.set_position('center')
-            ])
-            subtitle_clip = subtitle_clip.set_start(chunk["start"]).set_end(chunk["end"]).set_position(('center', TARGET_RESOLUTION[1] * 0.85))
-            subtitle_clips.append(subtitle_clip)
-
-        logger.info(f"Created {len(subtitle_clips)} subtitle chunks")
-        return subtitle_clips
-    except Exception as e:
-        logger.error(f"Error creating subtitles: {e}")
-        return []
-
-def add_background_music(final_video, bg_music_volume=0.08):
-    """Add background music to the video."""
-    try:
-        bg_music_path = "music.mp3"
-        if bg_music_path and os.path.exists(bg_music_path):
-            logger.info(f"Adding background music from: {bg_music_path}")
-            bg_music = AudioFileClip(bg_music_path)
-            if bg_music.duration < final_video.duration:
-                loops_needed = math.ceil(final_video.duration / bg_music.duration)
-                bg_segments = [bg_music] * loops_needed
-                bg_music = CompositeAudioClip(bg_segments)
-            bg_music = bg_music.subclip(0, final_video.duration)
-            bg_music = bg_music.volumex(bg_music_volume)
-            video_audio = final_video.audio
-            mixed_audio = CompositeAudioClip([video_audio, bg_music])
-            final_video = final_video.set_audio(mixed_audio)
-            logger.info("Background music added successfully")
-        else:
-            logger.info("No music file found, skipping background music")
-        return final_video
-    except Exception as e:
-        logger.error(f"Error adding background music: {e}")
-        return final_video
-
-def create_clip(tts_path, narration_text, segment_index=0):
-    """Create a video clip with synchronized subtitles."""
-    try:
-        logger.info(f"Creating clip #{segment_index} with TTS: {tts_path}")
-        if not os.path.exists(tts_path) or not os.path.exists("video.mp4"):
-            logger.error("Missing video or TTS file")
-            return None
-
-        audio_clip = AudioFileClip(tts_path)
-        audio_duration = audio_clip.duration
-        target_duration = audio_duration + 0.5
-
-        video_clip = get_video_clip_segment("video.mp4", None, target_duration)
-        if video_clip is None:
-            logger.error("Failed to extract video segment")
-            return None
-
-        video_clip = video_clip.resize(height=TARGET_RESOLUTION[1], width=TARGET_RESOLUTION[0])
-        video_clip = video_clip.set_audio(audio_clip)
-
-        word_data = analyze_audio_with_whisper(tts_path)
-
-        if word_data:
-            subtitle_clips = create_word_level_subtitles(video_clip, word_data, font_size)
-            if subtitle_clips:
-                video_clip = CompositeVideoClip([video_clip] + subtitle_clips)
-        else:
-            logger.warning("Falling back to basic subtitles")
-            txt_clip = TextClip(
-                narration_text,
-                fontsize=font_size,
-                color=CAPTION_COLOR,
-                method='label'
-            )
-
-            bg_clip = ColorClip(
-                size=(txt_clip.w + 20, txt_clip.h + 10),
-                color=(0, 0, 0, 128)
-            )
-
-            subtitle_clip = CompositeVideoClip([
-                bg_clip.set_position('center'),
-                txt_clip.set_position('center')
-            ])
-            subtitle_clip = subtitle_clip.set_duration(video_clip.duration).set_position(('center', TARGET_RESOLUTION[1] * 0.85))
-            video_clip = CompositeVideoClip([video_clip, subtitle_clip])
-
-        logger.info(f"Clip created: {video_clip.duration:.1f}s")
-        return video_clip
-    except Exception as e:
-        logger.error(f"Error in create_clip: {str(e)}")
-        return None
-
-def generate_video(user_input, resolution, caption_option):
-    """Generate a video based on user input."""
-    global TEMP_FOLDER, CAPTION_COLOR
-
-    CAPTION_COLOR = "white" if caption_option == "Yes" else "transparent"
-    TEMP_FOLDER = tempfile.mkdtemp()
-    logger.info(f"Created temporary folder: {TEMP_FOLDER}")
-
-    if not os.path.exists("video.mp4"):
-        logger.error("video.mp4 not found")
-        return "Error: video.mp4 not found. Please upload a video file named 'video.mp4'."
-
-    load_whisper_model()
-    script = generate_script(user_input)
-    if not script:
-        shutil.rmtree(TEMP_FOLDER)
-        return "Failed to generate script."
-
-    logger.info("Generated Script:\n" + script)
-    elements = parse_script(script)
-    if not elements:
-        shutil.rmtree(TEMP_FOLDER)
-        return "Failed to parse script."
-
-    logger.info(f"Parsed {len(elements)//2} script segments.")
-    paired_elements = [(elements[i], elements[i + 1]) for i in range(0, len(elements), 2)]
-
-    if not paired_elements:
-        shutil.rmtree(TEMP_FOLDER)
-        return "No valid script segments generated."
-
-    clips = []
-    for idx, (media_elem, tts_elem) in enumerate(paired_elements):
-        logger.info(f"\nProcessing segment {idx+1}/{len(paired_elements)} with prompt: '{media_elem['prompt']}'")
-        tts_path = generate_tts(tts_elem['text'], tts_elem['voice'])
-        if not tts_path:
-            continue
-
-        clip = create_clip(tts_path, tts_elem['text'], idx)
-        if clip:
-            clips.append(clip)
-
-    if not clips:
-        shutil.rmtree(TEMP_FOLDER)
-        return "Failed to create any video clips."
-
-    logger.info("\nConcatenating clips...")
-    final_video = concatenate_videoclips(clips, method="compose")
-    final_video = add_background_music(final_video, bg_music_volume=bg_music_volume)
-
-    logger.info(f"Exporting final video to {OUTPUT_VIDEO_FILENAME}...")
-    final_video.write_videofile(OUTPUT_VIDEO_FILENAME, codec='libx264', fps=fps, preset=preset)
-    logger.info(f"Final video saved as {OUTPUT_VIDEO_FILENAME}")
-
-    shutil.rmtree(TEMP_FOLDER)
-    logger.info("Temporary files removed.")
-    return OUTPUT_VIDEO_FILENAME
-
-def generate_video_with_options(user_input, caption_option, music_file, bg_vol, video_fps, video_preset, v_speed, caption_size):
-    """Generate video with Gradio options."""
-    global voice_speed, font_size, bg_music_volume, fps, preset
-
-    voice_speed = v_speed
-    font_size = caption_size
-    bg_music_volume = bg_vol
-    fps = video_fps
-    preset = video_preset
-
-    if music_file is not None:
-        shutil.copy(music_file.name, "music.mp3")
-        logger.info(f"Uploaded music saved as: music.mp3")
-
-    return generate_video(user_input, "Short", caption_option)
-
-def create_interface():
-    """Create Gradio interface."""
-    iface = gr.Interface(
-        fn=generate_video_with_options,
-        inputs=[
-            gr.Textbox(label="Video Concept", placeholder="Enter your video concept here..."),
-            gr.Radio(["Yes", "No"], label="Show Captions", value="Yes"),
-            gr.File(label="Upload Background Music (MP3)", file_types=[".mp3"]),
-            gr.Slider(0.0, 1.0, value=0.08, step=0.01, label="Background Music Volume"),
-            gr.Slider(10, 60, value=30, step=1, label="Video FPS"),
-            gr.Dropdown(choices=["ultrafast", "superfast", "veryfast", "faster", "fast", "medium", "slow"],
-                        value="veryfast", label="Export Preset"),
-            gr.Slider(0.75, 1.25, value=1.0, step=0.05, label="Voice Speed"),
-            gr.Slider(20, 100, value=45, step=1, label="Caption Font Size")
-        ],
-        outputs=gr.Video(label="Generated Video"),
-        title="AI Documentary Video Generator",
-        description="""
-        Create short documentary videos with AI narration and synchronized captions.
-        1. Enter a topic or concept for your documentary
-        2. Optionally upload background music
-        3. Adjust settings as needed
-        4. Click submit and wait for video generation
-
-        NOTE: You must upload a file named 'video.mp4' to your Hugging Face Space.
-        """
-    )
-
 
 if __name__ == "__main__":
-
-    demo.launch()
-else:
-    demo = create_interface()
+import gradio as gr
 import tempfile
+import os
+from moviepy.editor import *
 from pydub import AudioSegment
+import whisper
 import json
+import requests
 
+# Configuration
 OPENROUTER_API_KEY = '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'
 OPENROUTER_MODEL = "google/gemini-2.0-flash-exp:free"
+TARGET_RESOLUTION = (1080, 1920)
 OUTPUT_VIDEO_FILENAME = "final_video.mp4"
 CAPTION_COLOR = "white"
 
+# Placeholder for Kokoro TTS
+def kokoro_tts(text):
+    # TODO: Replace with actual Kokoro TTS implementation
+    # Should return path to generated audio file
+    return "dummy_audio.wav"
+
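The committed kokoro_tts is still a stub that returns a hard-coded path. As a minimal stand-in until Kokoro is wired in (my sketch, not part of this commit), the gTTS-plus-pydub path from the removed app.py could back the placeholder:

    import tempfile
    from gtts import gTTS
    from pydub import AudioSegment

    def kokoro_tts(text):
        # Stand-in, not Kokoro: synthesize with gTTS (as the old app.py did), then convert to WAV.
        mp3_path = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False).name
        wav_path = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name
        gTTS(text=text, lang="en").save(mp3_path)
        AudioSegment.from_mp3(mp3_path).export(wav_path, format="wav")
        return wav_path
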
+def generate_script(topic):
+    prompt = f"Generate a script about {topic} divided into parts, and output it as a JSON array of strings."
+    response = requests.post(
+        "https://api.openrouter.com/v1/completions",
+        headers={"Authorization": f"Bearer {OPENROUTER_API_KEY}"},
+        json={"model": OPENROUTER_MODEL, "prompt": prompt}
+    )
+    script_json = response.json()["choices"][0]["text"]
+    return json.loads(script_json)
+
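Worth noting (my observation, not part of the diff): the removed version called OpenRouter's chat-completions endpoint at openrouter.ai, while the new generate_script posts a bare prompt to https://api.openrouter.com/v1/completions and reads choices[0]["text"]. If that request shape turns out not to match, a sketch closer to the old call, still asking for a JSON array of strings, might look like this (assuming the same model, key, and a chat-style response):

    def generate_script(topic):
        # Sketch only: chat-completions request in the style of the removed code.
        prompt = f"Generate a script about {topic} divided into parts, and output it as a JSON array of strings."
        response = requests.post(
            "https://openrouter.ai/api/v1/chat/completions",
            headers={"Authorization": f"Bearer {OPENROUTER_API_KEY}"},
            json={"model": OPENROUTER_MODEL, "messages": [{"role": "user", "content": prompt}]},
            timeout=30,
        )
        return json.loads(response.json()["choices"][0]["message"]["content"])
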
+def generate_audio(script_parts, temp_folder):
+    full_audio = AudioSegment.empty()
+    for part in script_parts:
+        audio_file = kokoro_tts(part)
+        audio_segment = AudioSegment.from_file(audio_file)
+        silence = AudioSegment.silent(duration=300)  # 0.3s gap
+        full_audio += audio_segment + silence
+    full_audio = full_audio[:-300]  # Remove last silence
+    audio_path = os.path.join(temp_folder, "full_audio.wav")
+    full_audio.export(audio_path, format="wav")
+    return audio_path
+
+def generate_subtitles(audio_path):
+    model = whisper.load_model("base")
+    result = model.transcribe(audio_path, word_timestamps=True)
+    return result['segments']
+
+def process_background_video(audio_duration):
+    background = VideoFileClip("video.mp4")
+    background = background.resize(height=1920)
+    if background.w > 1080:
+        background = background.crop(x_center=background.w/2, width=1080)
+    required_duration = audio_duration + 0.5
+    if background.duration < required_duration:
+        n_loops = int(required_duration / background.duration) + 1
+        background = concatenate_videoclips([background] * n_loops)
+    return background.set_duration(required_duration)
+
+def create_subtitle_clips(segments, video_height=1920, font_size=24, color='white', highlight_color='yellow'):
+    subtitle_y = video_height - 200
+    all_words = [word for segment in segments for word in segment['words']]
+    chunks = [all_words[i:i+5] for i in range(0, len(all_words), 5)]
+    subtitle_clips = []
+    for chunk in chunks:
+        for i, word in enumerate(chunk):
+            line_clip = create_text_line(chunk, i, font_size, color, highlight_color)
+            line_clip = line_clip.set_start(word['start']).set_end(word['end']).set_pos(('center', subtitle_y))
+            subtitle_clips.append(line_clip)
+    return subtitle_clips
+
+def create_text_line(words, highlighted_index, font_size, color, highlight_color):
+    space_clip = TextClip(" ", fontsize=font_size, color=color)
+    space_width = space_clip.w
+    text_clips = []
+    total_width = 0
+    for i, word in enumerate(words):
+        c = highlight_color if i == highlighted_index else color
+        text_clip = TextClip(word['word'], fontsize=font_size, color=c)
+        text_clips.append(text_clip)
+        total_width += text_clip.w + (space_width if i < len(words) - 1 else 0)
+    current_x = -total_width / 2
+    positioned_clips = []
+    for clip in text_clips:
+        positioned_clips.append(clip.set_pos((current_x, 0)))
+        current_x += clip.w + space_width
+    return CompositeVideoClip(positioned_clips, size=(total_width, text_clips[0].h))
+
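One thing to check when testing (my reading of the MoviePy call, not something stated in the commit): positions inside a CompositeVideoClip with size=(total_width, h) are measured from the composite's top-left corner, so starting current_x at -total_width / 2 pushes the first words off the left edge of the line; the whole line is already centered later by set_pos(('center', subtitle_y)). A sketch of the usual layout, starting from zero:

    current_x = 0
    positioned_clips = []
    for clip in text_clips:
        # Lay the word clips out left to right inside the (total_width, h) canvas.
        positioned_clips.append(clip.set_pos((current_x, 0)))
        current_x += clip.w + space_width
    return CompositeVideoClip(positioned_clips, size=(int(total_width), text_clips[0].h))
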
+def generate_video(topic):
+    with tempfile.TemporaryDirectory() as temp_folder:
+        script_parts = generate_script(topic)
+        audio_path = generate_audio(script_parts, temp_folder)
+        audio_duration = AudioSegment.from_file(audio_path).duration_seconds
+        segments = generate_subtitles(audio_path)
+        background = process_background_video(audio_duration)
+        subtitle_clips = create_subtitle_clips(segments)
+        audio_clip = AudioFileClip(audio_path)
+        final_video = background.set_audio(audio_clip)
+        final_video = CompositeVideoClip([final_video] + subtitle_clips)
+        output_path = os.path.join(temp_folder, OUTPUT_VIDEO_FILENAME)
+        final_video.write_videofile(output_path, codec="libx264", audio_codec="aac")
+        return output_path
+
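Another point worth flagging (my note, not part of the commit): generate_video returns a path inside the with tempfile.TemporaryDirectory() block, and that directory is deleted as soon as the function returns, so Gradio may find nothing at output_path. A minimal sketch that writes the final file outside the temporary folder instead:

    def generate_video(topic):
        # Keep intermediate files in the temp dir, but write the result to the working directory
        # so it still exists after the context manager cleans up.
        with tempfile.TemporaryDirectory() as temp_folder:
            ...
            final_video.write_videofile(OUTPUT_VIDEO_FILENAME, codec="libx264", audio_codec="aac")
        return OUTPUT_VIDEO_FILENAME
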
+# Gradio UI
+iface = gr.Interface(
+    fn=generate_video,
+    inputs=gr.Textbox(label="Topic"),
+    outputs=gr.Video(label="Generated YouTube Short"),
+    title="YouTube Short Creator"
+)
 
 if __name__ == "__main__":
+    iface.launch()