import os
import re
import math
import random
import hashlib
import tempfile
import shutil
import requests
import numpy as np
from kokoro import KPipeline
import soundfile as sf
from pydub import AudioSegment
from gtts import gTTS
import gradio as gr
from moviepy.editor import (
    VideoFileClip, AudioFileClip, concatenate_audioclips,
    CompositeAudioClip, CompositeVideoClip, TextClip
)

# ────────── GLOBAL CONFIG ──────────
OPENROUTER_API_KEY = os.getenv('OPENROUTER_API_KEY', 'sk-or-v1-…')  # prefer an env var over a hardcoded key
OPENROUTER_MODEL   = "google/gemini-2.0-flash-exp:free"
SOURCE_VIDEO_PATH  = "video.mp4"   # 13-min source
OUTPUT_VIDEO_PATH  = "final_video.mp4"
TARGET_RESOLUTION  = (1080, 1920)  # vertical 9:16, TikTok style
VOICE_SPEED        = 0.9
CAPTION_FONT_SIZE  = 45
BG_MUSIC_VOLUME    = 0.08

# Initialize Kokoro TTS
pipeline = KPipeline(lang_code='a')  # American English

# ────────── SCRIPT GENERATION ──────────
def generate_script(topic: str) -> str:
    headers = {
        'Authorization': f'Bearer {OPENROUTER_API_KEY}',
        'HTTP-Referer': 'https://your-domain.com',
        'X-Title': 'AI Documentary Maker'
    }
    prompt = f"""You’re a professional documentary narrator.
Break your script into scenes with [Tags], one sentence each (≤12 words).
No slang or numbers. At the end, include [Subscribe] with a formal reason.

Topic: {topic}
"""
    payload = {
        'model': OPENROUTER_MODEL,
        'messages': [{'role':'user','content':prompt}],
        'temperature':0.4,
        'max_tokens':5000
    }
    r = requests.post('https://openrouter.ai/api/v1/chat/completions',
                      headers=headers, json=payload, timeout=30)
    r.raise_for_status()
    return r.json()['choices'][0]['message']['content']
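
# OpenRouter responses follow the OpenAI chat-completions shape, roughly
# (illustrative):
#   {"choices": [{"message": {"role": "assistant", "content": "[Intro] ..."}}]}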

def parse_script(script_text: str):
    """Return list of (scene_title, sentence_text)."""
    sections = []
    current = None
    for line in script_text.splitlines():
        m = re.match(r'^\[(.+?)\]\s*(.*)$', line)
        if m:
            if current:
                sections.append(current)
            current = [m.group(1), m.group(2)]
        elif current and line.strip():
            current[1] += ' ' + line.strip()
    if current:
        sections.append(current)
    return sections
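
# Illustrative parse, assuming the [Tag] format requested in the prompt:
#   parse_script("[Intro] Rome rises.\n[Subscribe] Stay for more history.")
#   -> [['Intro', 'Rome rises.'], ['Subscribe', 'Stay for more history.']]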

# ────────── TTS ──────────
def generate_tts_audio(text: str, voice_code: str, dirpath: str) -> str:
    """Produce a WAV file, using Kokoro then gTTS fallback."""
    # key the cache on the full text so different sentences never collide
    safe = hashlib.md5(text.encode('utf-8')).hexdigest()[:12]
    out_path = os.path.join(dirpath, f"tts_{safe}.wav")
    if os.path.exists(out_path):
        return out_path

    try:
        # Kokoro pipeline returns a sequence of numpy audio arrays
        segments = pipeline(text, voice=voice_code, speed=VOICE_SPEED, split_pattern=r'\n+')
        arrays = [seg_audio for _, _, seg_audio in segments]
        combined = np.concatenate(arrays, axis=0)
        sf.write(out_path, combined, 24000)
    except Exception:
        # fallback to gTTS
        mp3 = os.path.join(dirpath, f"{safe}.mp3")
        gTTS(text=text, lang='en').save(mp3)
        wav = AudioSegment.from_mp3(mp3)
        wav.export(out_path, format="wav")
        os.remove(mp3)
    return out_path
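
# Example call (illustrative names): generate_tts_audio("Rome rises.", 'af_heart', tmp_dir)
# returns a path like '<tmp_dir>/tts_<hash>.wav', reusing the file if it already exists.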

# ────────── VIDEO + SUBTITLES ──────────
def add_timed_subtitles(clip, sections):
    """Overlay each sentence as timed subtitle chunks.

    Uses TextClip's 'caption' method; on MoviePy 1.x this requires
    ImageMagick to be installed.
    """
    subtitles = []
    total_words = sum(len(s.split()) for _, s in sections)
    cum_time = 0
    for title, sentence in sections:
        # split the sentence into 5-word caption chunks
        words = sentence.split()
        chunks = [words[i:i+5] for i in range(0, len(words), 5)]
        # each section's share of the clip is proportional to its word count
        seg_dur = clip.duration * (len(words) / total_words)
        # approximate each chunk duration
        chunk_dur = seg_dur / len(chunks) if chunks else seg_dur
        for i, chunk in enumerate(chunks):
            txt = ' '.join(chunk)
            txt_clip = (
                TextClip(txt, fontsize=CAPTION_FONT_SIZE, font='Arial-Bold',
                         color='white', bg_color='rgba(0,0,0,0.3)',
                         size=(int(TARGET_RESOLUTION[0] * 0.9), None),
                         method='caption')
                .set_start(cum_time + i * chunk_dur)
                .set_duration(chunk_dur)
                .set_position(('center', int(TARGET_RESOLUTION[1] * 0.8)))
            )
            subtitles.append(txt_clip)
        cum_time += seg_dur
    return subtitles
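
# Timing heuristic, worked through with illustrative numbers: given sections of
# 6 and 3 words over a 9 s clip, the 6-word section gets 9 * 6/9 = 6 s and the
# 3-word one 3 s; the 6 words split into chunks of 5 and 1, each shown for 3 s.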

def generate_video(topic, include_captions, music_file, voice_choice):
    if not os.path.exists(SOURCE_VIDEO_PATH):
        raise FileNotFoundError(f"{SOURCE_VIDEO_PATH} not found.")
    # 1) get script & sections
    script = generate_script(topic)
    sections = parse_script(script)
    # 2) TTS each sentence
    tmp = tempfile.mkdtemp()
    tts_paths = [
        generate_tts_audio(sentence, voice_choice, tmp)
        for _, sentence in sections
    ]
    # 3) concatenate all TTS audios
    clips_audio = [AudioFileClip(p) for p in tts_paths]
    narration = concatenate_audioclips(clips_audio)
    total_dur = narration.duration

    # 4) pick one random subclip (keep `src` open: the subclip shares its reader)
    src = VideoFileClip(SOURCE_VIDEO_PATH)
    if src.duration < total_dur:
        raise ValueError("Source video is shorter than the narration.")
    start = random.uniform(0, src.duration - total_dur)
    # note: resize() stretches to 9:16; crop first if you need to preserve aspect
    video = src.subclip(start, start + total_dur).resize(TARGET_RESOLUTION)

    # 5) overlay narration
    video = video.set_audio(narration)

    # 6) optional subtitles
    if include_captions:
        subs = add_timed_subtitles(video, sections)
        video = CompositeVideoClip([video, *subs])

    # 7) optional background music
    if music_file:
        # gr.File may hand back a path string or a tempfile-like object
        music_path = music_file if isinstance(music_file, str) else music_file.name
        bg = AudioFileClip(music_path)
        # loop or trim to match the video length
        loops = math.ceil(video.duration / bg.duration)
        bg = concatenate_audioclips([bg] * loops).subclip(0, video.duration).volumex(BG_MUSIC_VOLUME)
        mixed = CompositeAudioClip([video.audio, bg])
        video = video.set_audio(mixed)

    # 8) export (explicit AAC audio so the MP4 plays in most players)
    video.write_videofile(OUTPUT_VIDEO_PATH, codec='libx264', audio_codec='aac',
                          fps=30, preset='veryfast')

    # cleanup
    src.close()
    shutil.rmtree(tmp)
    return OUTPUT_VIDEO_PATH

# ────────── GRADIO UI ──────────
VOICE_MAP = {
    'Emma (Female)': 'af_heart',
    'Bella (Female)': 'af_bella',
    # … add others as needed …
}

iface = gr.Interface(
    fn=generate_video,
    inputs=[
        gr.Textbox(label="Video Concept"),
        gr.Checkbox(label="Include Captions"),
        gr.File(label="Background Music (MP3)", file_types=[".mp3"]),
        gr.Dropdown(list(VOICE_MAP.keys()), label="Voice", value="Emma (Female)")
    ],
    outputs=gr.Video(label="Generated Video"),
    title="AI Documentary Video Generator",
    description="Cuts a narration-length clip from your 13 min source video, adds AI narration & TikTok-style subtitles."
)

if __name__ == "__main__":
    iface.launch(share=True)