# NOTE(review): the original paste carried Hugging Face Spaces page residue
# ("Spaces: Running Running") here — kept only as this comment so the file parses.
import os
import re
import math
import random
import shutil
import tempfile

import requests
import numpy as np
import soundfile as sf
import gradio as gr
from gtts import gTTS
from kokoro import KPipeline
from pydub import AudioSegment
from moviepy.editor import (
    VideoFileClip, AudioFileClip, concatenate_audioclips,
    CompositeAudioClip, CompositeVideoClip, TextClip
)

# ────────── GLOBAL CONFIG ──────────
# Read the OpenRouter key from the environment rather than hard-coding a
# secret in source (the original embedded an 'sk-or-v1-...' literal).
OPENROUTER_API_KEY = os.getenv('OPENROUTER_API_KEY', '')
OPENROUTER_MODEL = "google/gemini-2.0-flash-exp:free"
SOURCE_VIDEO_PATH = "video.mp4"        # 13-min source footage
OUTPUT_VIDEO_PATH = "final_video.mp4"
TARGET_RESOLUTION = (1080, 1920)       # vertical TikTok-style frame (w, h)
VOICE_SPEED = 0.9                      # Kokoro narration speed multiplier
CAPTION_FONT_SIZE = 45
BG_MUSIC_VOLUME = 0.08                 # background music gain (0..1)

# Initialize the Kokoro TTS pipeline once at import time.
pipeline = KPipeline(lang_code='a')    # 'a' = American English
# ────────── SCRIPT GENERATION ──────────
def generate_script(topic: str) -> str:
    """Ask the OpenRouter chat API for a scene-tagged documentary script.

    Args:
        topic: Subject of the documentary short.

    Returns:
        Raw script text: '[Tag] sentence' lines, ending with a [Subscribe] scene.

    Raises:
        requests.HTTPError: If the API responds with a non-2xx status.
    """
    headers = {
        'Authorization': f'Bearer {OPENROUTER_API_KEY}',
        'HTTP-Referer': 'https://your-domain.com',
        'X-Title': 'AI Documentary Maker',
    }
    # NOTE: the original prompt text was mojibake'd ("Youβre", "β€12");
    # restored to the intended characters.
    prompt = f"""You're a professional documentary narrator.
Break your script into scenes with [Tags], one sentence each (<=12 words).
No slang or numbers. At the end, include [Subscribe] with a formal reason.
Topic: {topic}
"""
    payload = {
        'model': OPENROUTER_MODEL,
        'messages': [{'role': 'user', 'content': prompt}],
        'temperature': 0.4,
        'max_tokens': 5000,
    }
    r = requests.post('https://openrouter.ai/api/v1/chat/completions',
                      headers=headers, json=payload, timeout=30)
    r.raise_for_status()
    return r.json()['choices'][0]['message']['content']
def parse_script(script_text: str):
    """Split a '[Tag] sentence' script into sections.

    Lines matching '[Title] text' start a new section; untagged non-blank
    lines are appended to the current section's text. Lines before the
    first tag are ignored.

    Args:
        script_text: Raw script as returned by ``generate_script``.

    Returns:
        List of two-element lists ``[scene_title, sentence_text]``.
    """
    sections = []
    current = None
    for line in script_text.splitlines():
        m = re.match(r'^\[(.+?)\]\s*(.*)$', line)
        if m:
            if current:
                sections.append(current)
            current = [m.group(1), m.group(2)]
        elif current and line.strip():
            # Continuation line: fold into the current scene's text.
            current[1] += ' ' + line.strip()
    if current:
        sections.append(current)
    return sections
# ────────── TTS ──────────
def generate_tts_audio(text: str, voice_code: str, dirpath: str) -> str:
    """Synthesize *text* to a WAV file, using Kokoro with a gTTS fallback.

    Args:
        text: Sentence to narrate.
        voice_code: Kokoro voice identifier (e.g. 'af_heart').
        dirpath: Directory in which to place the generated file.

    Returns:
        Path to the WAV file (cached: reused if already generated this run).
    """
    import hashlib

    # Key the cache file on a digest of the FULL text. The original used
    # only the first 10 characters, so distinct sentences sharing a prefix
    # collided and the cache returned the wrong audio.
    digest = hashlib.md5(text.encode('utf-8')).hexdigest()[:12]
    out_path = os.path.join(dirpath, f"tts_{digest}.wav")
    if os.path.exists(out_path):
        return out_path
    try:
        # Kokoro yields (graphemes, phonemes, audio) tuples; keep the audio.
        segments = pipeline(text, voice=voice_code, speed=VOICE_SPEED,
                            split_pattern=r'\n+')
        arrays = [seg_audio for _, _, seg_audio in segments]
        if not arrays:
            # No audio produced — fall through to the gTTS path below.
            raise RuntimeError("Kokoro produced no audio segments")
        combined = np.concatenate(arrays, axis=0)
        sf.write(out_path, combined, 24000)  # Kokoro outputs 24 kHz audio
    except Exception:
        # Best-effort fallback: gTTS mp3, converted to wav via pydub.
        mp3 = os.path.join(dirpath, f"tts_{digest}.mp3")
        gTTS(text=text, lang='en').save(mp3)
        AudioSegment.from_mp3(mp3).export(out_path, format="wav")
        os.remove(mp3)
    return out_path
# ────────── VIDEO + SUBTITLES ──────────
def add_pillow_subtitles(clip, sections):
    """Build timed subtitle TextClips for *clip* using Pillow (no ImageMagick).

    Screen time is allotted to each sentence proportionally to its word
    count, then split evenly across 5-word caption chunks.

    Args:
        clip: The narrated video clip (its ``duration`` drives the timing).
        sections: ``[title, sentence]`` pairs from ``parse_script``.

    Returns:
        List of positioned, timed TextClip overlays (possibly empty).
    """
    subtitles = []
    # Hoisted out of the loop; also guards the division below.
    total_words = sum(len(s.split()) for _, s in sections)
    if total_words == 0:
        return subtitles
    cum_time = 0.0
    for _title, sentence in sections:
        words = sentence.split()
        # 5-word caption chunks, TikTok-style.
        chunks = [words[i:i + 5] for i in range(0, len(words), 5)]
        # This sentence's share of the clip, by word count.
        seg_dur = clip.duration * (len(words) / total_words)
        chunk_dur = seg_dur / len(chunks) if chunks else seg_dur
        for i, chunk in enumerate(chunks):
            txt_clip = (
                TextClip(' '.join(chunk), fontsize=CAPTION_FONT_SIZE,
                         font='Arial-Bold', color='white',
                         bg_color='rgba(0,0,0,0.3)',
                         size=(int(TARGET_RESOLUTION[0] * 0.9), None),
                         method='pillow')
                .set_start(cum_time + i * chunk_dur)
                .set_duration(chunk_dur)
                .set_position(('center', int(TARGET_RESOLUTION[1] * 0.8)))
            )
            subtitles.append(txt_clip)
        cum_time += seg_dur
    return subtitles
def generate_video(topic, include_captions, music_file, voice_choice):
    """Full pipeline: script -> TTS -> random source subclip -> captions/music -> mp4.

    Args:
        topic: Video concept passed to the script generator.
        include_captions: Whether to overlay TikTok-style subtitles.
        music_file: Optional path to a background-music MP3 (or falsy).
        voice_choice: Kokoro voice code for narration.

    Returns:
        Path to the rendered output video.

    Raises:
        FileNotFoundError: If the source video is missing.
    """
    if not os.path.exists(SOURCE_VIDEO_PATH):
        raise FileNotFoundError(f"{SOURCE_VIDEO_PATH} not found.")
    # 1) script & sections
    script = generate_script(topic)
    sections = parse_script(script)
    # 2-3) TTS each sentence into a temp dir, then concatenate.
    tmp = tempfile.mkdtemp()
    try:
        tts_paths = [
            generate_tts_audio(sentence, voice_choice, tmp)
            for _, sentence in sections
        ]
        clips_audio = [AudioFileClip(p) for p in tts_paths]
        narration = concatenate_audioclips(clips_audio)
        total_dur = narration.duration
        # 4) pick one random subclip; clamp the end so we never read past
        # the source's duration when the narration is longer than it.
        src = VideoFileClip(SOURCE_VIDEO_PATH)
        start = random.uniform(0, max(0, src.duration - total_dur))
        end = min(start + total_dur, src.duration)
        video = src.subclip(start, end).resize(TARGET_RESOLUTION)
        # 5) overlay narration
        video = video.set_audio(narration)
        # 6) optional subtitles
        if include_captions:
            subs = add_pillow_subtitles(video, sections)
            video = CompositeVideoClip([video, *subs])
        # 7) optional background music, looped/trimmed to the video length
        if music_file:
            bg = AudioFileClip(music_file)
            loops = math.ceil(video.duration / bg.duration)
            bg = (concatenate_audioclips([bg] * loops)
                  .subclip(0, video.duration)
                  .volumex(BG_MUSIC_VOLUME))
            video = video.set_audio(CompositeAudioClip([video.audio, bg]))
        # 8) export. Close the source only AFTER rendering: moviepy reads
        # frames lazily, so closing it first can break write_videofile.
        video.write_videofile(OUTPUT_VIDEO_PATH, codec='libx264', fps=30,
                              preset='veryfast')
        src.close()
    finally:
        # Clean the temp dir even when any step above raises.
        shutil.rmtree(tmp, ignore_errors=True)
    return OUTPUT_VIDEO_PATH
# ────────── GRADIO UI ──────────
# Display name -> Kokoro voice code.
VOICE_MAP = {
    'Emma (Female)': 'af_heart',
    'Bella (Female)': 'af_bella',
    # ... add others as needed ...
}

iface = gr.Interface(
    fn=generate_video,
    inputs=[
        gr.Textbox(label="Video Concept"),
        gr.Checkbox(label="Include Captions"),
        gr.File(label="Background Music (MP3)", file_types=[".mp3"]),
        gr.Dropdown(list(VOICE_MAP.keys()), label="Voice", value="Emma (Female)"),
    ],
    outputs=gr.Video(label="Generated Video"),
    title="AI Documentary Video Generator",
    description="Cuts one ~64 s clip from your 13 min video, adds AI narration & TikTok-style subtitles.",
)

if __name__ == "__main__":
    iface.launch(share=True)