import os
import re
import time
import math
import tempfile
import random
import shutil
import torch
import numpy as np
import soundfile as sf
from pydub import AudioSegment
from gtts import gTTS
import whisper  # Ensure this is openai-whisper in requirements.txt
import gradio as gr
import requests
import json
from moviepy.editor import (
    VideoFileClip, concatenate_videoclips, concatenate_audioclips,
    AudioFileClip, CompositeVideoClip, TextClip, CompositeAudioClip, ColorClip
)
import logging

# Set up logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
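# Assumed requirements for this Space (package names only; versions are illustrative):
#   gradio, moviepy==1.0.3, openai-whisper, gTTS, pydub, soundfile, numpy, torch, requests
# pydub and moviepy additionally need an ffmpeg binary available on the system path.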
# Global Configuration Variables
# Read the API key from the environment; never hardcode secrets in source.
OPENROUTER_API_KEY = os.environ.get('OPENROUTER_API_KEY', '')
OPENROUTER_MODEL = "google/gemini-2.0-flash-exp:free"
TARGET_RESOLUTION = (1080, 1920)  # Vertical format for shorts
OUTPUT_VIDEO_FILENAME = "final_video.mp4"
TEMP_FOLDER = None
CAPTION_COLOR = "white"

# Additional global variables for Gradio interface
selected_voice = 'en_us_001'  # Default voice
voice_speed = 1.0  # Default voice speed
font_size = 45  # Default font size
bg_music_volume = 0.08  # Default background music volume
fps = 30  # Default FPS
preset = "veryfast"  # Default preset

# Initialize whisper model globally
whisper_model = None
def load_whisper_model():
    """Load the Whisper model."""
    global whisper_model
    try:
        logger.info("Loading Whisper model...")
        whisper_model = whisper.load_model("tiny")  # Using tiny for CPU efficiency
        logger.info("Whisper model loaded successfully")
        return True
    except Exception as e:
        logger.error(f"Failed to load Whisper model: {e}")
        return False
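# Whisper checkpoints trade speed for accuracy: "tiny" (~39M parameters) is the
# fastest; "base", "small", "medium", and "large" are progressively more accurate
# but slower, which matters on CPU-only Spaces hardware.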
def generate_script(user_input):
    """Generate a documentary script using the OpenRouter API."""
    headers = {
        'Authorization': f'Bearer {OPENROUTER_API_KEY}',
        'HTTP-Referer': 'https://huggingface.co/spaces',
        'X-Title': 'AI Documentary Maker'
    }
    prompt = f"""You're a professional documentary narrator. Your job is to write a serious, natural, and informative video script based on one topic.
The script should sound like a real human voiceover from a TV show or documentary: clear, factual, and engaging, like something you'd hear on National Geographic or a news report.
Structure:
- Break the script into scenes using [Tags]. Each tag is a short title (1-2 words) that describes the scene.
- Under each tag, write one sentence (max 12 words) that fits the tag and continues the topic.
- The full script should make sense as one connected narration, with no randomness.
- Use natural, formal English. No slang, no fake AI language, and no robotic tone.
- Do not use humor, sarcasm, or casual language. This is a serious narration.
- No emotion-sound words like "aww," "eww," "whoa," etc.
- Do not use numbers like 1, 2, 3; write them out as one, two, three.
- Make the total narration about 1 minute long (around 150-200 words total).
- At the end, add a [Subscribe] tag with a formal or respectful reason to follow or subscribe.
Only output the script. No extra comments or text.
Example:
[Ocean]
The ocean covers over seventy percent of the Earth's surface.
[Currents]
Ocean currents distribute heat and regulate global climate patterns.
[Coral Reefs]
These ecosystems support over one million species of marine life.
[Pollution]
Plastic waste threatens marine biodiversity and food chains.
[Climate Impact]
Rising temperatures are causing coral bleaching and habitat loss.
[Subscribe]
Follow to explore more about the changing planet we live on.
Now here is the Topic: {user_input}
"""
    data = {
        'model': OPENROUTER_MODEL,
        'messages': [{'role': 'user', 'content': prompt}],
        'temperature': 0.4,
        'max_tokens': 2000
    }
    try:
        response = requests.post(
            'https://openrouter.ai/api/v1/chat/completions',
            headers=headers,
            json=data,
            timeout=30
        )
        if response.status_code == 200:
            response_data = response.json()
            if 'choices' in response_data and len(response_data['choices']) > 0:
                return response_data['choices'][0]['message']['content']
            else:
                logger.error(f"Unexpected response format: {response_data}")
                return None
        else:
            logger.error(f"API Error {response.status_code}: {response.text}")
            return None
    except Exception as e:
        logger.error(f"Request failed: {str(e)}")
        return None
def parse_script(script_text):
    """Parse the generated script into a list of media/TTS elements."""
    sections = {}
    current_title = None
    current_text = ""
    try:
        for line in script_text.splitlines():
            line = line.strip()
            if line.startswith("[") and "]" in line:
                bracket_start = line.find("[")
                bracket_end = line.find("]", bracket_start)
                if bracket_start != -1 and bracket_end != -1:
                    if current_title is not None:
                        sections[current_title] = current_text.strip()
                    current_title = line[bracket_start + 1:bracket_end]
                    current_text = line[bracket_end + 1:].strip()
            elif current_title:
                current_text += line + " "
        if current_title:
            sections[current_title] = current_text.strip()
        elements = []
        for title, narration in sections.items():
            if not title or not narration:
                continue
            media_element = {"type": "media", "prompt": title, "effects": "fade-in"}
            words = narration.split()
            duration = max(3, len(words) * 0.5)  # Rough estimate: 0.5 s per word, 3 s minimum
            tts_element = {"type": "tts", "text": narration, "voice": "en", "duration": duration}
            elements.append(media_element)
            elements.append(tts_element)
        return elements
    except Exception as e:
        logger.error(f"Error parsing script: {e}")
        return []
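# Example parse_script() output for a single scene (illustrative values):
#   [{"type": "media", "prompt": "Ocean", "effects": "fade-in"},
#    {"type": "tts", "text": "The ocean covers over seventy percent of the Earth's surface.",
#     "voice": "en", "duration": 5.0}]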
def generate_tts(text, voice="en"):
    """Generate TTS audio using gTTS and convert it to WAV."""
    safe_text = re.sub(r'[^\w\s-]', '', text[:10]).strip().replace(' ', '_')
    file_path = os.path.join(TEMP_FOLDER, f"tts_{safe_text}.wav")
    try:
        logger.info(f"Generating TTS for: {text[:30]}...")
        tts = gTTS(text=text, lang='en', slow=False)
        mp3_path = os.path.join(TEMP_FOLDER, f"tts_{safe_text}.mp3")
        tts.save(mp3_path)
        # Convert MP3 to WAV
        audio = AudioSegment.from_mp3(mp3_path)
        if voice_speed != 1.0:
            # Speed change via frame-rate override: raising the frame rate makes
            # playback faster (and pitch slightly higher); lowering it does the opposite.
            audio = audio._spawn(audio.raw_data, overrides={
                "frame_rate": int(audio.frame_rate * voice_speed)
            })
        audio.export(file_path, format="wav")
        os.remove(mp3_path)
        logger.info(f"TTS saved to {file_path}")
        return file_path
    except Exception as e:
        logger.error(f"TTS generation error: {e}")
        # Fall back to silence so the rest of the pipeline can continue
        return generate_silent_audio(duration=max(3, len(text.split()) * 0.5))
def generate_silent_audio(duration, sample_rate=24000):
    """Generate a silent WAV audio file of the given duration in seconds."""
    num_samples = int(duration * sample_rate)
    silence = np.zeros(num_samples, dtype=np.float32)
    silent_path = os.path.join(TEMP_FOLDER, f"silent_{int(time.time())}.wav")
    sf.write(silent_path, silence, sample_rate)
    logger.info(f"Silent audio generated: {silent_path}")
    return silent_path
def analyze_audio_with_whisper(audio_path):
    """Use Whisper to generate word-level timestamps."""
    try:
        # Guard against a failed model load instead of crashing on None
        if whisper_model is None and not load_whisper_model():
            logger.error("Whisper model unavailable; skipping word timestamps")
            return []
        logger.info(f"Analyzing audio with Whisper: {audio_path}")
        result = whisper_model.transcribe(audio_path, word_timestamps=True)
        word_segments = []
        for segment in result["segments"]:
            for word in segment["words"]:
                word_segments.append({
                    "word": word["word"].strip(),
                    "start": word["start"],
                    "end": word["end"]
                })
        logger.info(f"Extracted {len(word_segments)} word segments")
        return word_segments
    except Exception as e:
        logger.error(f"Whisper analysis error: {e}")
        return []
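# Each word segment is a dict of the form (times in seconds, values illustrative):
#   {"word": "ocean", "start": 0.42, "end": 0.78}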
def get_video_clip_segment(video_path, start_time, duration):
    """Extract a video segment, choosing a random start if none is given."""
    try:
        video = VideoFileClip(video_path)
        video_duration = video.duration
        if duration > video_duration:
            # Source is shorter than requested; return it whole
            logger.warning(f"Requested duration ({duration}s) exceeds video length ({video_duration}s).")
            return video
        max_start_time = video_duration - duration
        if start_time is None or start_time > max_start_time:
            start_time = random.uniform(0, max_start_time)
        clip = video.subclip(start_time, start_time + duration)
        logger.info(f"Extracted video segment: {start_time:.2f}s to {start_time + duration:.2f}s")
        return clip
    except Exception as e:
        logger.error(f"Error extracting video segment: {e}")
        return None
def create_word_level_subtitles(clip, words_data, font_size=45):
    """Create synchronized subtitles without ImageMagick."""
    try:
        logger.info("Creating word-level synchronized subtitles")
        # Group words into chunks of five so each caption stays readable
        chunks = []
        current_chunk = []
        current_chunk_words = []
        for word_data in words_data:
            current_chunk_words.append(word_data["word"])
            current_chunk.append(word_data)
            if len(current_chunk_words) >= 5:
                chunks.append({
                    "text": " ".join(current_chunk_words),
                    "words": current_chunk,
                    "start": current_chunk[0]["start"],
                    "end": current_chunk[-1]["end"]
                })
                current_chunk = []
                current_chunk_words = []
        if current_chunk_words:
            chunks.append({
                "text": " ".join(current_chunk_words),
                "words": current_chunk,
                "start": current_chunk[0]["start"],
                "end": current_chunk[-1]["end"]
            })
        subtitle_clips = []
        for chunk in chunks:
            txt_clip = TextClip(
                chunk["text"],
                fontsize=font_size,
                color=CAPTION_COLOR,
                method='label'
            )
            # Semi-transparent black backing box: use an RGB color plus opacity,
            # since a 4-tuple RGBA color is not reliable with ColorClip
            bg_clip = ColorClip(
                size=(txt_clip.w + 20, txt_clip.h + 10),
                color=(0, 0, 0)
            ).set_opacity(0.5)
            subtitle_clip = CompositeVideoClip([
                bg_clip.set_position('center'),
                txt_clip.set_position('center')
            ])
            subtitle_clip = (subtitle_clip
                             .set_start(chunk["start"])
                             .set_end(chunk["end"])
                             .set_position(('center', TARGET_RESOLUTION[1] * 0.85)))
            subtitle_clips.append(subtitle_clip)
        logger.info(f"Created {len(subtitle_clips)} subtitle chunks")
        return subtitle_clips
    except Exception as e:
        logger.error(f"Error creating subtitles: {e}")
        return []
def add_background_music(final_video, bg_music_volume=0.08):
    """Add background music to the video."""
    try:
        bg_music_path = "music.mp3"
        if os.path.exists(bg_music_path):
            logger.info(f"Adding background music from: {bg_music_path}")
            bg_music = AudioFileClip(bg_music_path)
            if bg_music.duration < final_video.duration:
                # Loop the track end-to-end until it covers the video.
                # (CompositeAudioClip would stack the copies at t=0, not loop them.)
                loops_needed = math.ceil(final_video.duration / bg_music.duration)
                bg_music = concatenate_audioclips([bg_music] * loops_needed)
            bg_music = bg_music.subclip(0, final_video.duration)
            bg_music = bg_music.volumex(bg_music_volume)
            video_audio = final_video.audio
            mixed_audio = CompositeAudioClip([video_audio, bg_music])
            final_video = final_video.set_audio(mixed_audio)
            logger.info("Background music added successfully")
        else:
            logger.info("No music file found, skipping background music")
        return final_video
    except Exception as e:
        logger.error(f"Error adding background music: {e}")
        return final_video
def create_clip(tts_path, narration_text, segment_index=0):
    """Create a video clip with synchronized subtitles."""
    try:
        logger.info(f"Creating clip #{segment_index} with TTS: {tts_path}")
        if not os.path.exists(tts_path) or not os.path.exists("video.mp4"):
            logger.error("Missing video or TTS file")
            return None
        audio_clip = AudioFileClip(tts_path)
        audio_duration = audio_clip.duration
        target_duration = audio_duration + 0.5
        video_clip = get_video_clip_segment("video.mp4", None, target_duration)
        if video_clip is None:
            logger.error("Failed to extract video segment")
            return None
        # Fill the vertical frame: scale up so both dimensions cover the target,
        # then center-crop to exactly TARGET_RESOLUTION. (resize() ignores width
        # when height is also given, so resizing alone cannot set both.)
        scale = max(TARGET_RESOLUTION[0] / video_clip.w, TARGET_RESOLUTION[1] / video_clip.h)
        video_clip = video_clip.resize(scale)
        video_clip = video_clip.crop(x_center=video_clip.w / 2, y_center=video_clip.h / 2,
                                     width=TARGET_RESOLUTION[0], height=TARGET_RESOLUTION[1])
        video_clip = video_clip.set_audio(audio_clip)
        word_data = analyze_audio_with_whisper(tts_path)
        if word_data:
            subtitle_clips = create_word_level_subtitles(video_clip, word_data, font_size)
            if subtitle_clips:
                video_clip = CompositeVideoClip([video_clip] + subtitle_clips)
        else:
            logger.warning("Falling back to basic subtitles")
            txt_clip = TextClip(
                narration_text,
                fontsize=font_size,
                color=CAPTION_COLOR,
                method='label'
            )
            bg_clip = ColorClip(
                size=(txt_clip.w + 20, txt_clip.h + 10),
                color=(0, 0, 0)
            ).set_opacity(0.5)
            subtitle_clip = CompositeVideoClip([
                bg_clip.set_position('center'),
                txt_clip.set_position('center')
            ])
            subtitle_clip = (subtitle_clip
                             .set_duration(video_clip.duration)
                             .set_position(('center', TARGET_RESOLUTION[1] * 0.85)))
            video_clip = CompositeVideoClip([video_clip, subtitle_clip])
        logger.info(f"Clip created: {video_clip.duration:.1f}s")
        return video_clip
    except Exception as e:
        logger.error(f"Error in create_clip: {str(e)}")
        return None
def generate_video(user_input, resolution, caption_option):
    """Generate a video based on user input."""
    global TEMP_FOLDER, CAPTION_COLOR
    CAPTION_COLOR = "white" if caption_option == "Yes" else "transparent"
    TEMP_FOLDER = tempfile.mkdtemp()
    logger.info(f"Created temporary folder: {TEMP_FOLDER}")
    if not os.path.exists("video.mp4"):
        logger.error("video.mp4 not found")
        return "Error: video.mp4 not found. Please upload a video file named 'video.mp4'."
    load_whisper_model()
    script = generate_script(user_input)
    if not script:
        shutil.rmtree(TEMP_FOLDER)
        return "Failed to generate script."
    logger.info("Generated Script:\n" + script)
    elements = parse_script(script)
    if not elements:
        shutil.rmtree(TEMP_FOLDER)
        return "Failed to parse script."
    logger.info(f"Parsed {len(elements) // 2} script segments.")
    paired_elements = [(elements[i], elements[i + 1]) for i in range(0, len(elements), 2)]
    if not paired_elements:
        shutil.rmtree(TEMP_FOLDER)
        return "No valid script segments generated."
    clips = []
    for idx, (media_elem, tts_elem) in enumerate(paired_elements):
        logger.info(f"\nProcessing segment {idx + 1}/{len(paired_elements)} with prompt: '{media_elem['prompt']}'")
        tts_path = generate_tts(tts_elem['text'], tts_elem['voice'])
        if not tts_path:
            continue
        clip = create_clip(tts_path, tts_elem['text'], idx)
        if clip:
            clips.append(clip)
    if not clips:
        shutil.rmtree(TEMP_FOLDER)
        return "Failed to create any video clips."
    logger.info("\nConcatenating clips...")
    final_video = concatenate_videoclips(clips, method="compose")
    final_video = add_background_music(final_video, bg_music_volume=bg_music_volume)
    logger.info(f"Exporting final video to {OUTPUT_VIDEO_FILENAME}...")
    # AAC audio for broad MP4 player compatibility
    final_video.write_videofile(OUTPUT_VIDEO_FILENAME, codec='libx264',
                                audio_codec='aac', fps=fps, preset=preset)
    logger.info(f"Final video saved as {OUTPUT_VIDEO_FILENAME}")
    shutil.rmtree(TEMP_FOLDER)
    logger.info("Temporary files removed.")
    return OUTPUT_VIDEO_FILENAME
def generate_video_with_options(user_input, caption_option, music_file, bg_vol, video_fps, video_preset, v_speed, caption_size):
    """Generate video with Gradio options."""
    global voice_speed, font_size, bg_music_volume, fps, preset
    voice_speed = v_speed
    font_size = caption_size
    bg_music_volume = bg_vol
    fps = video_fps
    preset = video_preset
    if music_file is not None:
        shutil.copy(music_file.name, "music.mp3")
        logger.info("Uploaded music saved as: music.mp3")
    return generate_video(user_input, "Short", caption_option)
def create_interface():
    """Create the Gradio interface."""
    iface = gr.Interface(
        fn=generate_video_with_options,
        inputs=[
            gr.Textbox(label="Video Concept", placeholder="Enter your video concept here..."),
            gr.Radio(["Yes", "No"], label="Show Captions", value="Yes"),
            gr.File(label="Upload Background Music (MP3)", file_types=[".mp3"]),
            gr.Slider(0.0, 1.0, value=0.08, step=0.01, label="Background Music Volume"),
            gr.Slider(10, 60, value=30, step=1, label="Video FPS"),
            gr.Dropdown(choices=["ultrafast", "superfast", "veryfast", "faster", "fast", "medium", "slow"],
                        value="veryfast", label="Export Preset"),
            gr.Slider(0.75, 1.25, value=1.0, step=0.05, label="Voice Speed"),
            gr.Slider(20, 100, value=45, step=1, label="Caption Font Size")
        ],
        outputs=gr.Video(label="Generated Video"),
        title="AI Documentary Video Generator",
        description="""
        Create short documentary videos with AI narration and synchronized captions.
        1. Enter a topic or concept for your documentary
        2. Optionally upload background music
        3. Adjust settings as needed
        4. Click submit and wait for video generation
        NOTE: You must upload a file named 'video.mp4' to your Hugging Face Space.
        """
    )
    return iface
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()
else:
    # Expose a module-level `demo` so a host (e.g. a Hugging Face Space) can serve it on import
    demo = create_interface()
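# To run locally (assuming this file is saved as app.py, the Spaces convention,
# with video.mp4 alongside it):
#   python app.py
# then open the local URL that Gradio prints.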