Spaces:
Running
Running
File size: 6,939 Bytes
878d3d4 c25723f 3ea8b5d c25723f 878d3d4 c25723f fb20f92 c25723f 3ea8b5d c25723f 878d3d4 c25723f 3ea8b5d c25723f 878d3d4 c25723f fb20f92 c25723f 878d3d4 fb20f92 878d3d4 c25723f 878d3d4 c25723f 878d3d4 c25723f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 |
import hashlib
import math
import os
import random
import re
import shutil
import tempfile

import gradio as gr
import numpy as np
import requests
import soundfile as sf
from gtts import gTTS
from kokoro import KPipeline
from moviepy.editor import (
    VideoFileClip, AudioFileClip, concatenate_audioclips,
    CompositeAudioClip, CompositeVideoClip, TextClip
)
from pydub import AudioSegment
# ---------- GLOBAL CONFIG ----------
# Read the API key from the environment so the secret does not have to live
# in source control; the original hard-coded placeholder remains as fallback.
OPENROUTER_API_KEY = os.environ.get('OPENROUTER_API_KEY', 'sk-or-v1-β¦')
OPENROUTER_MODEL = "google/gemini-2.0-flash-exp:free"
SOURCE_VIDEO_PATH = "video.mp4"        # 13-min source footage to cut clips from
OUTPUT_VIDEO_PATH = "final_video.mp4"
TARGET_RESOLUTION = (1080, 1920)       # vertical 9:16 TikTok-style frame
VOICE_SPEED = 0.9                      # Kokoro speaking-rate multiplier
CAPTION_FONT_SIZE = 45
BG_MUSIC_VOLUME = 0.08                 # background-music gain applied via volumex
# Initialize the Kokoro TTS pipeline once at import time.
pipeline = KPipeline(lang_code='a')    # 'a' = American English
# ---------- SCRIPT GENERATION ----------
def generate_script(topic: str) -> str:
    """Ask the OpenRouter chat API for a scene-tagged documentary script.

    Returns the raw script text of the model's first choice.
    Raises requests.HTTPError on a non-2xx response.
    """
    prompt = f"""Youβre a professional documentary narrator.
Break your script into scenes with [Tags], one sentence each (β€12 words).
No slang or numbers. At the end, include [Subscribe] with a formal reason.
Topic: {topic}
"""
    request_headers = {
        'Authorization': f'Bearer {OPENROUTER_API_KEY}',
        'HTTP-Referer': 'https://your-domain.com',
        'X-Title': 'AI Documentary Maker',
    }
    body = {
        'model': OPENROUTER_MODEL,
        'messages': [{'role': 'user', 'content': prompt}],
        'temperature': 0.4,
        'max_tokens': 5000,
    }
    response = requests.post(
        'https://openrouter.ai/api/v1/chat/completions',
        headers=request_headers,
        json=body,
        timeout=30,
    )
    response.raise_for_status()
    return response.json()['choices'][0]['message']['content']
def parse_script(script_text: str):
    """Parse a scene-tagged script into a list of (scene_title, sentence_text).

    Lines of the form "[Title] text" open a new scene; following non-empty,
    untagged lines are appended to the current scene's text. Text appearing
    before the first tag is ignored. Returns tuples, matching the documented
    contract (the previous implementation returned mutable 2-lists).
    """
    sections = []
    title = None
    body = ''
    for line in script_text.splitlines():
        m = re.match(r'^\[(.+?)\]\s*(.*)$', line)
        if m:
            if title is not None:
                sections.append((title, body))
            title, body = m.group(1), m.group(2)
        elif title is not None and line.strip():
            body += ' ' + line.strip()
    if title is not None:
        sections.append((title, body))
    return sections
# ---------- TTS ----------
def generate_tts_audio(text: str, voice_code: str, dirpath: str) -> str:
    """Synthesize *text* to a WAV file in *dirpath* and return its path.

    Tries the Kokoro pipeline first; on any failure falls back to gTTS
    (MP3 converted to WAV via pydub). Output files are cached per unique
    text so an identical sentence is synthesized only once.
    """
    # Hash the FULL text for the cache file name. The previous scheme used
    # only the first 10 characters, so two different sentences sharing a
    # prefix collided and the second silently reused the first's audio.
    digest = hashlib.md5(text.encode('utf-8')).hexdigest()[:16]
    out_path = os.path.join(dirpath, f"tts_{digest}.wav")
    if os.path.exists(out_path):
        return out_path
    try:
        # Kokoro yields (graphemes, phonemes, audio) triples; keep the audio.
        segments = pipeline(text, voice=voice_code, speed=VOICE_SPEED, split_pattern=r'\n+')
        arrays = [seg_audio for _, _, seg_audio in segments]
        combined = np.concatenate(arrays, axis=0)
        sf.write(out_path, combined, 24000)  # Kokoro output sample rate
    except Exception:
        # Deliberate best-effort fallback when Kokoro is unavailable or fails.
        mp3 = os.path.join(dirpath, f"tts_{digest}.mp3")
        gTTS(text=text, lang='en').save(mp3)
        AudioSegment.from_mp3(mp3).export(out_path, format="wav")
        os.remove(mp3)
    return out_path
# ---------- VIDEO + SUBTITLES ----------
def add_pillow_subtitles(clip, sections):
    """Build timed TextClip subtitle overlays for each scene sentence.

    Each sentence is split into 5-word chunks; the clip's duration is
    apportioned to scenes by their share of the total word count, then
    evenly across a scene's chunks. Returns a list of positioned
    TextClips ready to composite over *clip*.
    """
    subtitles = []
    # Hoist the total word count out of the loop (was recomputed per scene)
    # and guard against division by zero when every sentence is empty.
    total_words = sum(len(s.split()) for _, s in sections)
    if total_words == 0:
        return subtitles
    cum_time = 0.0
    for _title, sentence in sections:
        words = sentence.split()
        chunks = [words[i:i + 5] for i in range(0, len(words), 5)]
        # Scene duration proportional to its share of all words.
        seg_dur = clip.duration * (len(words) / total_words)
        chunk_dur = seg_dur / len(chunks) if chunks else seg_dur
        for i, chunk in enumerate(chunks):
            txt = ' '.join(chunk)
            # NOTE(review): method='pillow' — confirm this is a valid TextClip
            # rendering method for the installed moviepy version.
            txt_clip = (
                TextClip(txt, fontsize=CAPTION_FONT_SIZE, font='Arial-Bold',
                         color='white', bg_color='rgba(0,0,0,0.3)',
                         size=(int(TARGET_RESOLUTION[0] * 0.9), None),
                         method='pillow')
                .set_start(cum_time + i * chunk_dur)
                .set_duration(chunk_dur)
                .set_position(('center', int(TARGET_RESOLUTION[1] * 0.8)))
            )
            subtitles.append(txt_clip)
        cum_time += seg_dur
    return subtitles
def generate_video(topic, include_captions, music_file, voice_choice):
    """End-to-end pipeline: script -> TTS -> random subclip -> captions/music -> MP4.

    Returns the output video path.
    Raises FileNotFoundError when the source footage is missing.
    """
    if not os.path.exists(SOURCE_VIDEO_PATH):
        raise FileNotFoundError(f"{SOURCE_VIDEO_PATH} not found.")
    # 1) generate and parse the narration script
    script = generate_script(topic)
    sections = parse_script(script)
    tmp = tempfile.mkdtemp()
    try:
        # 2) synthesize one TTS file per sentence
        tts_paths = [
            generate_tts_audio(sentence, voice_choice, tmp)
            for _, sentence in sections
        ]
        # 3) concatenate into one continuous narration track
        clips_audio = [AudioFileClip(p) for p in tts_paths]
        narration = concatenate_audioclips(clips_audio)
        total_dur = narration.duration
        # 4) pick one random subclip; clamp so we never read past the end
        #    of a source shorter than the narration (previously crashed).
        src = VideoFileClip(SOURCE_VIDEO_PATH)
        usable = min(total_dur, src.duration)
        start = random.uniform(0, max(0, src.duration - usable))
        video = src.subclip(start, start + usable).resize(TARGET_RESOLUTION)
        src.close()
        # 5) attach narration
        video = video.set_audio(narration)
        # 6) optional timed subtitles
        if include_captions:
            subs = add_pillow_subtitles(video, sections)
            video = CompositeVideoClip([video, *subs])
        # 7) optional background music, looped/trimmed and ducked under narration
        if music_file:
            # gr.File may hand over a file-like object; fall back to the raw value.
            music_path = getattr(music_file, 'name', music_file)
            bg = AudioFileClip(music_path)
            loops = math.ceil(video.duration / bg.duration)
            bg = concatenate_audioclips([bg] * loops).subclip(0, video.duration).volumex(BG_MUSIC_VOLUME)
            video = video.set_audio(CompositeAudioClip([video.audio, bg]))
        # 8) export
        video.write_videofile(OUTPUT_VIDEO_PATH, codec='libx264', fps=30, preset='veryfast')
    finally:
        # Always remove temp TTS files, even when a step above raises
        # (previously the temp dir leaked on any error).
        shutil.rmtree(tmp, ignore_errors=True)
    return OUTPUT_VIDEO_PATH
# ---------- GRADIO UI ----------
# Display name -> Kokoro voice code for the dropdown.
VOICE_MAP = {
    'Emma (Female)': 'af_heart',
    'Bella (Female)': 'af_bella',
    # ... add more voices here as needed ...
}

# Declarative Gradio front-end: widgets map 1:1 to generate_video's parameters.
ui_inputs = [
    gr.Textbox(label="Video Concept"),
    gr.Checkbox(label="Include Captions"),
    gr.File(label="Background Music (MP3)", file_types=[".mp3"]),
    gr.Dropdown(list(VOICE_MAP.keys()), label="Voice", value="Emma (Female)"),
]

iface = gr.Interface(
    fn=generate_video,
    inputs=ui_inputs,
    outputs=gr.Video(label="Generated Video"),
    title="AI Documentary Video Generator",
    description="Cuts one ~64 s clip from your 13 min video, adds AI narration & TikTok-style subtitles.",
)

if __name__ == "__main__":
    iface.launch(share=True)
|