# sda / app.py
import os
import re
import math
import random
import tempfile
import shutil
import requests
import numpy as np
from kokoro import KPipeline
import soundfile as sf
from pydub import AudioSegment
from gtts import gTTS
import gradio as gr
from moviepy.editor import (
VideoFileClip, AudioFileClip, concatenate_audioclips,
CompositeAudioClip, CompositeVideoClip, TextClip
)
# ────────── GLOBAL CONFIG ──────────
OPENROUTER_API_KEY = 'sk-or-v1-…'
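# NOTE: in production this is better read from the environment, e.g. os.environ.get("OPENROUTER_API_KEY", "")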
OPENROUTER_MODEL = "google/gemini-2.0-flash-exp:free"
SOURCE_VIDEO_PATH = "video.mp4" # 13-min source
OUTPUT_VIDEO_PATH = "final_video.mp4"
TARGET_RESOLUTION = (1080, 1920) # Vertical TikTok style
VOICE_SPEED = 0.9
CAPTION_FONT_SIZE = 45
BG_MUSIC_VOLUME = 0.08
# Initialize Kokoro TTS
pipeline = KPipeline(lang_code='a') # American English
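# (lang_code 'a' pairs with the American-English voices, e.g. the af_* IDs in VOICE_MAP below)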
# ────────── SCRIPT GENERATION ──────────
def generate_script(topic: str) -> str:
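    """Request a short, scene-tagged documentary script for the topic from OpenRouter."""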
headers = {
'Authorization': f'Bearer {OPENROUTER_API_KEY}',
'HTTP-Referer': 'https://your-domain.com',
'X-Title': 'AI Documentary Maker'
}
prompt = f"""You’re a professional documentary narrator.
Break your script into scenes with [Tags], one sentence each (≀12 words).
No slang or numbers. At the end, include [Subscribe] with a formal reason.
Topic: {topic}
"""
payload = {
'model': OPENROUTER_MODEL,
'messages': [{'role':'user','content':prompt}],
'temperature':0.4,
'max_tokens':5000
}
r = requests.post('https://openrouter.ai/api/v1/chat/completions',
headers=headers, json=payload, timeout=30)
r.raise_for_status()
return r.json()['choices'][0]['message']['content']
def parse_script(script_text: str):
"""Return list of (scene_title, sentence_text)."""
sections = []
current = None
for line in script_text.splitlines():
m = re.match(r'^\[(.+?)\]\s*(.*)$', line)
if m:
if current:
sections.append(current)
current = [m.group(1), m.group(2)]
elif current and line.strip():
current[1] += ' ' + line.strip()
if current:
sections.append(current)
return sections
# ────────── TTS ──────────
def generate_tts_audio(text: str, voice_code: str, dirpath: str) -> str:
"""Produce a WAV file, using Kokoro then gTTS fallback."""
safe = re.sub(r'[^\w]', '_', text[:10]).strip()
out_path = os.path.join(dirpath, f"tts_{safe}.wav")
if os.path.exists(out_path):
return out_path
try:
        # Kokoro yields (graphemes, phonemes, audio) tuples; keep only the audio arrays
segments = pipeline(text, voice=voice_code, speed=VOICE_SPEED, split_pattern=r'\n+')
arrays = [seg_audio for _, _, seg_audio in segments]
combined = np.concatenate(arrays, axis=0)
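        # Kokoro's native sample rate is 24 kHz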
sf.write(out_path, combined, 24000)
except Exception:
# fallback to gTTS
mp3 = os.path.join(dirpath, f"{safe}.mp3")
gTTS(text=text, lang='en').save(mp3)
wav = AudioSegment.from_mp3(mp3)
wav.export(out_path, format="wav")
os.remove(mp3)
return out_path
# ────────── VIDEO + SUBTITLES ──────────
def add_pillow_subtitles(clip, sections):
"""Overlay each sentence as timed subtitles using Pillow (no ImageMagick)."""
subtitles = []
cum_time = 0
    # total word count across all scenes, used to apportion screen time
    total_words = sum(len(s.split()) for _, s in sections)
    for _title, sentence in sections:
        # break the sentence into 5-word chunks so each caption stays short
        words = sentence.split()
        chunks = [words[i:i+5] for i in range(0, len(words), 5)]
        # give each sentence screen time proportional to its share of the words
        seg_dur = clip.duration * (len(words) / total_words) if total_words else 0
        # approximate each chunk's duration within that window
        chunk_dur = seg_dur / len(chunks) if chunks else seg_dur
for i, chunk in enumerate(chunks):
txt = ' '.join(chunk)
txt_clip = (
                TextClip(txt, fontsize=CAPTION_FONT_SIZE, font='Arial-Bold',
                         color='white', bg_color='rgba(0,0,0,0.3)',
                         size=(int(TARGET_RESOLUTION[0] * 0.9), None),
                         method='caption')  # 'caption' wraps text; requires ImageMagick in MoviePy 1.x
.set_start(cum_time + i*chunk_dur)
.set_duration(chunk_dur)
.set_position(('center', int(TARGET_RESOLUTION[1]*0.8)))
)
subtitles.append(txt_clip)
cum_time += seg_dur
return subtitles
def generate_video(topic, include_captions, music_file, voice_choice):
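    """Full pipeline: script -> per-sentence TTS -> random source subclip -> optional captions and music -> MP4."""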
if not os.path.exists(SOURCE_VIDEO_PATH):
raise FileNotFoundError(f"{SOURCE_VIDEO_PATH} not found.")
# 1) get script & sections
script = generate_script(topic)
sections = parse_script(script)
# 2) TTS each sentence
tmp = tempfile.mkdtemp()
tts_paths = [
generate_tts_audio(sentence, voice_choice, tmp)
for _, sentence in sections
]
# 3) concatenate all TTS audios
clips_audio = [AudioFileClip(p) for p in tts_paths]
narration = concatenate_audioclips(clips_audio)
total_dur = narration.duration
# 4) pick one random subclip
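    #    (assumes the 13-min source is at least as long as the narration)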
src = VideoFileClip(SOURCE_VIDEO_PATH)
start = random.uniform(0, max(0, src.duration - total_dur))
video = src.subclip(start, start + total_dur).resize(TARGET_RESOLUTION)
src.close()
# 5) overlay narration
video = video.set_audio(narration)
# 6) optional subtitles
if include_captions:
subs = add_pillow_subtitles(video, sections)
video = CompositeVideoClip([video, *subs])
# 7) optional background music
    if music_file:
        # Gradio may hand back either a filepath string or a file object with .name
        music_path = music_file if isinstance(music_file, str) else music_file.name
        bg = AudioFileClip(music_path)
        # loop the music to cover the video, trim to length, then duck it under the narration
        loops = math.ceil(video.duration / bg.duration)
        bg = (concatenate_audioclips([bg] * loops)
              .subclip(0, video.duration)
              .volumex(BG_MUSIC_VOLUME))
mixed = CompositeAudioClip([video.audio, bg])
video = video.set_audio(mixed)
# 8) export
    video.write_videofile(OUTPUT_VIDEO_PATH, codec='libx264', audio_codec='aac',
                          fps=30, preset='veryfast')
# cleanup
shutil.rmtree(tmp)
return OUTPUT_VIDEO_PATH
# ────────── GRADIO UI ──────────
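# Display name shown in the dropdown -> Kokoro voice identifier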
VOICE_MAP = {
'Emma (Female)': 'af_heart',
'Bella (Female)': 'af_bella',
# … add others as needed …
}
iface = gr.Interface(
fn=generate_video,
inputs=[
gr.Textbox(label="Video Concept"),
gr.Checkbox(label="Include Captions"),
gr.File(label="Background Music (MP3)", file_types=[".mp3"]),
gr.Dropdown(list(VOICE_MAP.keys()), label="Voice", value="Emma (Female)")
],
outputs=gr.Video(label="Generated Video"),
title="AI Documentary Video Generator",
    description="Cuts a narration-length clip from the bundled ~13 min source video, adds AI narration and TikTok-style subtitles."
)
if __name__ == "__main__":
iface.launch(share=True)