Update app.py
app.py
CHANGED
@@ -1,224 +1,585 @@
import os
import re
import math
-import random
import tempfile
import shutil
-import
import numpy as np
-from kokoro import KPipeline
import soundfile as sf
from pydub import AudioSegment
from gtts import gTTS
import gradio as gr
from moviepy.editor import (
-    VideoFileClip,
)

-#
-#
headers = {
    'Authorization': f'Bearer {OPENROUTER_API_KEY}',
    'X-Title': 'AI Documentary Maker'
}
-You
"""
    'model': OPENROUTER_MODEL,
-    'messages': [{'role':'user','content':prompt}],
-    'temperature':0.4,
-    'max_tokens':
}
-    r = requests.post('https://openrouter.ai/api/v1/chat/completions',
-                      headers=headers, json=payload, timeout=30)
-    r.raise_for_status()
-    return r.json()['choices'][0]['message']['content']

-if
-        current = [m.group(1).strip(), m.group(2).strip()]
-    elif current and line.strip():
-        current[1] += ' ' + line.strip()
-    if current:
-        sections.append(tuple(current))

-    # filter & fix
-    cleaned = []
-    for tag, sentence in sections:
-        if not sentence:
-            if tag.lower() == 'subscribe':
-                sentence = "Follow to explore more on this topic."
            else:
                continue
-        cleaned.append((tag, sentence))
-    return cleaned

"""
"""

try:

"""
"""
-    .
-    narration = concatenate_audioclips(aud_clips)
-    narration = narration.set_fps(24000)

-    # 4) Pick one random video subclip
-    src = VideoFileClip(SOURCE_VIDEO_PATH)
-    max_start = max(0, src.duration - narration.duration)
-    start = random.uniform(0, max_start)
-    vid = src.subclip(start, start + narration.duration).resize(TARGET_RESOLUTION)
-    src.close()

-    # 5) Overlay narration audio
-    vid = vid.set_audio(narration)

-    # 6) Add captions if requested
-    if include_captions:
-        subs = add_pillow_subtitles(vid, sections)
-        vid = CompositeVideoClip([vid, *subs])

-    # 7) Add background music
-    if music_file:
-        bg = AudioFileClip(music_file.name)
-        loops = math.ceil(vid.duration / bg.duration)
-        bg_full = concatenate_audioclips([bg]*loops).subclip(0, vid.duration)
-        bg_full = bg_full.volumex(BG_MUSIC_VOLUME)
-        vid = vid.set_audio(CompositeAudioClip([vid.audio, bg_full]))

-    # 8) Export
-    vid.write_videofile(
-        OUTPUT_VIDEO_PATH,
-        codec='libx264',
-        fps=30,
-        preset='veryfast',
-        audio_codec='aac'
-    )
-    )

if __name__ == "__main__":
-
# Import necessary libraries
import os
import re
import time
import math
import tempfile
import random
import shutil
import torch
import numpy as np
import soundfile as sf
from PIL import Image, ImageDraw, ImageFont
from pydub import AudioSegment
from gtts import gTTS
import whisper
import gradio as gr
import requests
import json
from moviepy.editor import (
    VideoFileClip, concatenate_videoclips, AudioFileClip,
    CompositeVideoClip, TextClip, CompositeAudioClip
)
import subprocess
import cv2
import moviepy.config as mpy_config
import moviepy.video.fx.all as vfx
import logging

# Set up logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Configure moviepy
mpy_config.change_settings({"IMAGEMAGICK_BINARY": "convert"})

# Global Configuration Variables
OPENROUTER_API_KEY = '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'
OPENROUTER_MODEL = "google/gemini-2.0-flash-exp:free"
TARGET_RESOLUTION = (1080, 1920)  # Fixed to vertical format for shorts
OUTPUT_VIDEO_FILENAME = "final_video.mp4"
TEMP_FOLDER = None
CAPTION_COLOR = "white"

# Additional global variables for the Gradio interface
selected_voice = 'en_us_001'  # Default voice
voice_speed = 1.0  # Default voice speed
font_size = 45  # Default font size
bg_music_volume = 0.08  # Default background music volume
fps = 30  # Default FPS
preset = "veryfast"  # Default preset
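The key above is committed in plain text; on a Hugging Face Space it would more typically live in a repository secret and be read from the environment. A minimal sketch, assuming a secret named OPENROUTER_API_KEY is configured:

    # Hypothetical: read the key from the environment instead of hardcoding it.
    OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY", "")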
# Initialize whisper model globally to avoid reloading
whisper_model = None

def load_whisper_model():
    """Load the Whisper model."""
    global whisper_model
    try:
        logger.info("Loading Whisper model...")
        whisper_model = whisper.load_model("tiny")  # Using tiny for CPU efficiency
        logger.info("Whisper model loaded successfully")
        return True
    except Exception as e:
        logger.error(f"Failed to load Whisper model: {e}")
        return False
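The "tiny" checkpoint keeps a CPU-only Space responsive at the cost of timestamp accuracy. A hedged sketch of choosing a larger standard checkpoint when a GPU is available (the size choice is an assumption, not part of the commit):

    # Hypothetical variant: prefer a larger model only when CUDA is present.
    def load_whisper_model():
        global whisper_model
        size = "small" if torch.cuda.is_available() else "tiny"
        whisper_model = whisper.load_model(size)
        return whisper_model is not None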
# Helper Functions
def generate_script(user_input):
    """Generate documentary script using OpenRouter API."""
    headers = {
        'Authorization': f'Bearer {OPENROUTER_API_KEY}',
        'HTTP-Referer': 'https://huggingface.co/spaces',
        'X-Title': 'AI Documentary Maker'
    }

    prompt = f"""You're a professional documentary narrator. Your job is to write a serious, natural, and informative video script based on one topic.

The script should sound like a real human voiceover from a TV show or documentary: clear, factual, and engaging, like something you'd hear on National Geographic or a news report.

Structure:
- Break the script into scenes using [Tags]. Each tag is a short title (1-2 words) that describes the scene.
- Under each tag, write one sentence (max 12 words) that fits the tag and continues the topic.
- The full script should make sense as one connected narration; no randomness.
- Use natural, formal English. No slang, no fake AI language, and no robotic tone.
- Do not use humor, sarcasm, or casual language. This is a serious narration.
- No emotion-sound words like "aww," "eww," "whoa," etc.
- Do not use numbers like 1, 2, 3; write them out as one, two, three.
- Make the total narration about 1 minute long (around 150-200 words total).
- At the end, add a [Subscribe] tag with a formal or respectful reason to follow or subscribe.

Only output the script. No extra comments or text.

Example:

[Ocean]

The ocean covers over seventy percent of the Earth's surface.

[Currents]

Ocean currents distribute heat and regulate global climate patterns.

[Coral Reefs]

These ecosystems support over one million species of marine life.

[Pollution]

Plastic waste threatens marine biodiversity and food chains.

[Climate Impact]

Rising temperatures are causing coral bleaching and habitat loss.

[Subscribe]

Follow to explore more about the changing planet we live on.


Now here is the Topic: {user_input}
"""

    data = {
        'model': OPENROUTER_MODEL,
        'messages': [{'role': 'user', 'content': prompt}],
        'temperature': 0.4,
        'max_tokens': 2000
    }

    try:
        response = requests.post(
            'https://openrouter.ai/api/v1/chat/completions',
            headers=headers,
            json=data,
            timeout=30
        )

        if response.status_code == 200:
            response_data = response.json()
            if 'choices' in response_data and len(response_data['choices']) > 0:
                return response_data['choices'][0]['message']['content']
            else:
                logger.error(f"Unexpected response format: {response_data}")
                return None
        else:
            logger.error(f"API Error {response.status_code}: {response.text}")
            return None

    except Exception as e:
        logger.error(f"Request failed: {str(e)}")
        return None
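A quick way to eyeball the round trip is to call the generator directly; a minimal sketch, assuming network access and a valid key:

    # Hypothetical smoke test: print the raw script for one topic.
    script = generate_script("deep sea volcanoes")
    if script:
        print(script)  # Expect [Tag] headers, each followed by one short sentence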
def parse_script(script_text):
    """Parse the generated script into a list of elements."""
    sections = {}
    current_title = None
    current_text = ""

    try:
        for line in script_text.splitlines():
            line = line.strip()
            if line.startswith("[") and "]" in line:
                bracket_start = line.find("[")
                bracket_end = line.find("]", bracket_start)
                if bracket_start != -1 and bracket_end != -1:
                    if current_title is not None:
                        sections[current_title] = current_text.strip()
                    current_title = line[bracket_start+1:bracket_end]
                    current_text = line[bracket_end+1:].strip()
            elif current_title:
                current_text += line + " "

        if current_title:
            sections[current_title] = current_text.strip()

        elements = []
        for title, narration in sections.items():
            if not title or not narration:
                continue

            media_element = {"type": "media", "prompt": title, "effects": "fade-in"}
            words = narration.split()
            duration = max(3, len(words) * 0.5)  # Estimate duration based on word count
            tts_element = {"type": "tts", "text": narration, "voice": "en", "duration": duration}
            elements.append(media_element)
            elements.append(tts_element)

        return elements
    except Exception as e:
        logger.error(f"Error parsing script: {e}")
        return []
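A small worked example of the element list this produces (the values follow directly from the code above):

    sample = "[Ocean]\nThe ocean covers most of the planet."
    elements = parse_script(sample)
    # elements[0] == {"type": "media", "prompt": "Ocean", "effects": "fade-in"}
    # elements[1]["type"] == "tts" and elements[1]["duration"] == 3.5  # max(3, 7 * 0.5)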
def generate_tts(text, voice="en"):
    """Generate TTS audio using gTTS."""
    safe_text = re.sub(r'[^\w\s-]', '', text[:10]).strip().replace(' ', '_')
    file_path = os.path.join(TEMP_FOLDER, f"tts_{safe_text}.wav")

    try:
        logger.info(f"Generating TTS for: {text[:30]}...")
        tts = gTTS(text=text, lang='en', slow=False)
        mp3_path = os.path.join(TEMP_FOLDER, f"tts_{safe_text}.mp3")
        tts.save(mp3_path)

        # Convert MP3 to WAV
        audio = AudioSegment.from_mp3(mp3_path)
        # Adjust speed if needed
        if voice_speed != 1.0:
            audio = audio._spawn(audio.raw_data, overrides={
                "frame_rate": int(audio.frame_rate * voice_speed)
            })
        audio.export(file_path, format="wav")
        os.remove(mp3_path)

        logger.info(f"TTS saved to {file_path}")
        return file_path
    except Exception as e:
        logger.error(f"TTS generation error: {e}")
        return generate_silent_audio(duration=max(3, len(text.split()) * 0.5))

def generate_silent_audio(duration, sample_rate=24000):
    """Generate a silent WAV audio file lasting 'duration' seconds."""
    num_samples = int(duration * sample_rate)
    silence = np.zeros(num_samples, dtype=np.float32)
    silent_path = os.path.join(TEMP_FOLDER, f"silent_{int(time.time())}.wav")
    sf.write(silent_path, silence, sample_rate)
    logger.info(f"Silent audio generated: {silent_path}")
    return silent_path
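Note that the `_spawn` frame-rate override changes pitch along with speed. A pitch-preserving alternative, sketched with ffmpeg's atempo filter (assumes ffmpeg is on PATH; atempo accepts factors from 0.5 to 2.0, which covers the slider range exposed below):

    # Hypothetical variant: retime the MP3 without shifting pitch.
    subprocess.run([
        "ffmpeg", "-y", "-i", mp3_path,
        "-filter:a", f"atempo={voice_speed}", file_path
    ], check=True)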
def analyze_audio_with_whisper(audio_path):
    """
    Use Whisper to transcribe audio and generate word-level timestamps.
    Returns a list of dictionaries with word, start_time, and end_time.
    """
    try:
        if whisper_model is None:
            load_whisper_model()

        logger.info(f"Analyzing audio with Whisper: {audio_path}")

        # Transcribe the audio file
        result = whisper_model.transcribe(audio_path, word_timestamps=True)

        # Extract word-level segments
        word_segments = []
        for segment in result["segments"]:
            for word in segment["words"]:
                word_segments.append({
                    "word": word["word"].strip(),
                    "start": word["start"],
                    "end": word["end"]
                })

        logger.info(f"Extracted {len(word_segments)} word segments")
        return word_segments
    except Exception as e:
        logger.error(f"Whisper analysis error: {e}")
        return []
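The returned list mirrors Whisper's word-level output after stripping whitespace; a representative value (timings illustrative):

    # word_segments might look like:
    # [{"word": "The", "start": 0.0, "end": 0.18},
    #  {"word": "ocean", "start": 0.18, "end": 0.52}, ...]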
def get_video_clip_segment(video_path, start_time, duration):
    """
    Extract a segment from the video file starting at a random position,
    but ensuring the segment is at least 'duration' seconds long.
    """
    try:
        video = VideoFileClip(video_path)
        video_duration = video.duration

        if duration > video_duration:
            logger.warning(f"Requested duration ({duration}s) exceeds video length ({video_duration}s). Using full video.")
            return video

        # Calculate a random start time ensuring we have enough duration left
        max_start_time = video_duration - duration
        if start_time is None or start_time > max_start_time:
            start_time = random.uniform(0, max_start_time)

        # Extract the segment
        clip = video.subclip(start_time, start_time + duration)
        logger.info(f"Extracted video segment: {start_time:.2f}s to {start_time + duration:.2f}s")
        return clip
    except Exception as e:
        logger.error(f"Error extracting video segment: {e}")
        return None
def create_word_level_subtitles(clip, words_data, font_size=45):
    """
    Create subtitles that highlight words as they are spoken.
    Takes a list of word dictionaries with timing information.
    """
    try:
        logger.info("Creating word-level synchronized subtitles")
        # Group words into chunks of approximately 5 words
        chunks = []
        current_chunk = []
        current_chunk_words = []

        for word_data in words_data:
            current_chunk_words.append(word_data["word"])
            current_chunk.append(word_data)

            if len(current_chunk_words) >= 5:
                chunks.append({
                    "text": " ".join(current_chunk_words),
                    "words": current_chunk,
                    "start": current_chunk[0]["start"],
                    "end": current_chunk[-1]["end"]
                })
                current_chunk = []
                current_chunk_words = []

        # Add any remaining words
        if current_chunk_words:
            chunks.append({
                "text": " ".join(current_chunk_words),
                "words": current_chunk,
                "start": current_chunk[0]["start"],
                "end": current_chunk[-1]["end"]
            })

        # Create subtitle clips for each chunk
        subtitle_clips = []

        for chunk in chunks:
            txt_clip = TextClip(
                chunk["text"],
                fontsize=font_size,
                font='Arial-Bold',
                color=CAPTION_COLOR,
                bg_color='rgba(0, 0, 0, 0.5)',
                method='caption',
                align='center',
                stroke_width=2,
                stroke_color='black',
                size=(int(TARGET_RESOLUTION[0] * 0.9), None)  # integer width for ImageMagick
            ).set_start(chunk["start"]).set_end(chunk["end"])

            txt_clip = txt_clip.set_position(('center', TARGET_RESOLUTION[1] * 0.85))
            subtitle_clips.append(txt_clip)

        logger.info(f"Created {len(subtitle_clips)} subtitle chunks")
        return subtitle_clips
    except Exception as e:
        logger.error(f"Error creating subtitles: {e}")
        return []
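Both TextClip call sites use method='caption', which renders text through ImageMagick; that is why IMAGEMAGICK_BINARY is pointed at "convert" at the top of the file. A minimal sanity check, useful when captions come out blank (a hypothetical snippet, not part of the commit):

    # If this raises, the ImageMagick binary or its policy needs fixing.
    TextClip("test", fontsize=40, color="white", method="caption", size=(400, None)).close()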
def add_background_music(final_video, bg_music_volume=0.08):
    """Add background music to the final video."""
    try:
        bg_music_path = "music.mp3"
        if bg_music_path and os.path.exists(bg_music_path):
            logger.info(f"Adding background music from: {bg_music_path}")
            bg_music = AudioFileClip(bg_music_path)
            if bg_music.duration < final_video.duration:
                loops_needed = math.ceil(final_video.duration / bg_music.duration)
                # Offset each copy so the loops play back to back instead of overlapping
                bg_segments = [bg_music.set_start(i * bg_music.duration)
                               for i in range(loops_needed)]
                bg_music = CompositeAudioClip(bg_segments)
            bg_music = bg_music.subclip(0, final_video.duration)
            bg_music = bg_music.volumex(bg_music_volume)
            video_audio = final_video.audio
            mixed_audio = CompositeAudioClip([video_audio, bg_music])
            final_video = final_video.set_audio(mixed_audio)
            logger.info("Background music added successfully")
        else:
            logger.info("No music file found, skipping background music")
        return final_video
    except Exception as e:
        logger.error(f"Error adding background music: {e}")
        logger.info("Continuing without background music")
        return final_video
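Giving each copy of the track a set_start offset before compositing makes CompositeAudioClip behave like end-to-end concatenation; without the offsets, every copy would begin at time zero and the mix would fall silent after a single pass of the track.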
def create_clip(tts_path, narration_text, segment_index=0):
    """
    Create a video clip with synchronized subtitles using whisper timestamps.
    Uses a random segment from video.mp4 matching the audio duration.
    """
    try:
        logger.info(f"Creating clip #{segment_index} with TTS: {tts_path}")
        if not os.path.exists(tts_path) or not os.path.exists("video.mp4"):
            logger.error("Missing video or TTS file")
            return None

        # Get audio duration
        audio_clip = AudioFileClip(tts_path)
        audio_duration = audio_clip.duration
        target_duration = audio_duration + 0.5  # Add a small buffer

        # Get a random segment from the main video
        video_clip = get_video_clip_segment("video.mp4", None, target_duration)
        if video_clip is None:
            logger.error("Failed to extract video segment")
            return None

        # Resize to the full (width, height) target frame
        video_clip = video_clip.resize(newsize=TARGET_RESOLUTION)

        # Set the audio
        video_clip = video_clip.set_audio(audio_clip)

        # Generate word-level timestamps with Whisper
        word_data = analyze_audio_with_whisper(tts_path)

        if word_data:
            # Create word-level subtitles
            subtitle_clips = create_word_level_subtitles(video_clip, word_data, font_size)
            if subtitle_clips:
                # Combine video with subtitles
                video_clip = CompositeVideoClip([video_clip] + subtitle_clips)
        else:
            # Fallback to basic subtitle if whisper fails
            logger.warning("Falling back to basic subtitles")
            txt_clip = TextClip(
                narration_text,
                fontsize=font_size,
                font='Arial-Bold',
                color=CAPTION_COLOR,
                bg_color='rgba(0, 0, 0, 0.5)',
                method='caption',
                align='center',
                size=(int(TARGET_RESOLUTION[0] * 0.9), None)
            ).set_position(('center', TARGET_RESOLUTION[1] * 0.85)).set_duration(video_clip.duration)

            video_clip = CompositeVideoClip([video_clip, txt_clip])

        logger.info(f"Clip created: {video_clip.duration:.1f}s")
        return video_clip
    except Exception as e:
        logger.error(f"Error in create_clip: {str(e)}")
        return None
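A minimal end-to-end sketch for a single segment, assuming a video.mp4 beside the script and the hypothetical output path shown (TEMP_FOLDER must be set before generate_tts runs):

    # Hypothetical single-segment run.
    TEMP_FOLDER = tempfile.mkdtemp()
    tts = generate_tts("The ocean covers most of the planet.", "en")
    clip = create_clip(tts, "The ocean covers most of the planet.", segment_index=0)
    if clip:
        clip.write_videofile("segment0.mp4", codec="libx264", fps=30, audio_codec="aac")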
# Main Video Generation Function
def generate_video(user_input, resolution, caption_option):
    """Generate a video based on user input via Gradio."""
    global TEMP_FOLDER, CAPTION_COLOR

    # Set caption color based on option
    CAPTION_COLOR = "white" if caption_option == "Yes" else "transparent"

    # Create a unique temporary folder
    TEMP_FOLDER = tempfile.mkdtemp()
    logger.info(f"Created temporary folder: {TEMP_FOLDER}")

    # Check if video.mp4 exists
    if not os.path.exists("video.mp4"):
        logger.error("video.mp4 not found in the current directory")
        return "Error: video.mp4 not found. Please upload a video file named 'video.mp4'."

    # Load Whisper model
    load_whisper_model()

    # Generate script
    logger.info("Generating script from API...")
    script = generate_script(user_input)
    if not script:
        logger.error("Failed to generate script.")
        shutil.rmtree(TEMP_FOLDER)
        return "Failed to generate script. Please try again."

    logger.info("Generated Script:\n" + script)

    # Parse script into elements
    elements = parse_script(script)
    if not elements:
        logger.error("Failed to parse script into elements.")
        shutil.rmtree(TEMP_FOLDER)
        return "Failed to parse script. Please try again."

    logger.info(f"Parsed {len(elements)//2} script segments.")

    # Group elements into pairs (media prompt + TTS)
    paired_elements = []
    for i in range(0, len(elements), 2):
        if i + 1 < len(elements):
            paired_elements.append((elements[i], elements[i + 1]))

    if not paired_elements:
        logger.error("No valid script segments found.")
        shutil.rmtree(TEMP_FOLDER)
        return "No valid script segments were generated."

    # Create video clips for each segment
    clips = []
    for idx, (media_elem, tts_elem) in enumerate(paired_elements):
        logger.info(f"\nProcessing segment {idx+1}/{len(paired_elements)} with prompt: '{media_elem['prompt']}'")

        # Generate TTS for the segment
        tts_path = generate_tts(tts_elem['text'], tts_elem['voice'])
        if not tts_path:
            logger.error(f"Skipping segment {idx+1} due to TTS generation failure.")
            continue

        # Create video clip with subtitles
        clip = create_clip(
            tts_path=tts_path,
            narration_text=tts_elem['text'],
            segment_index=idx
        )

        if clip:
            clips.append(clip)
        else:
            logger.error(f"Clip creation failed for segment {idx+1}.")

    if not clips:
        logger.error("No clips were successfully created.")
        shutil.rmtree(TEMP_FOLDER)
        return "Failed to create any video clips. Please try again."

    # Concatenate all clips
    logger.info("\nConcatenating clips...")
    final_video = concatenate_videoclips(clips, method="compose")

    # Add background music if available
    final_video = add_background_music(final_video, bg_music_volume=bg_music_volume)

    # Export final video
    logger.info(f"Exporting final video to {OUTPUT_VIDEO_FILENAME}...")
    final_video.write_videofile(OUTPUT_VIDEO_FILENAME, codec='libx264', fps=fps,
                                preset=preset, audio_codec='aac')
    logger.info(f"Final video saved as {OUTPUT_VIDEO_FILENAME}")

    # Clean up
    logger.info("Cleaning up temporary files...")
    shutil.rmtree(TEMP_FOLDER)
    logger.info("Temporary files removed.")

    return OUTPUT_VIDEO_FILENAME
# Gradio Interface Setup
def generate_video_with_options(user_input, caption_option, music_file, bg_vol, video_fps, video_preset, v_speed, caption_size):
    global voice_speed, font_size, bg_music_volume, fps, preset

    # Update global variables with user selections
    voice_speed = v_speed
    font_size = caption_size
    bg_music_volume = bg_vol
    fps = video_fps
    preset = video_preset

    # Handle music upload
    if music_file is not None:
        target_path = "music.mp3"
        shutil.copy(music_file.name, target_path)
        logger.info(f"Uploaded music saved as: {target_path}")

    # Generate the video (always using vertical resolution)
    return generate_video(user_input, "Short", caption_option)

# Create the Gradio interface
def create_interface():
    iface = gr.Interface(
        fn=generate_video_with_options,
        inputs=[
            gr.Textbox(label="Video Concept", placeholder="Enter your video concept here..."),
            gr.Radio(["Yes", "No"], label="Show Captions", value="Yes"),
            gr.File(label="Upload Background Music (MP3)", file_types=[".mp3"]),
            gr.Slider(0.0, 1.0, value=0.08, step=0.01, label="Background Music Volume"),
            gr.Slider(10, 60, value=30, step=1, label="Video FPS"),
            gr.Dropdown(choices=["ultrafast", "superfast", "veryfast", "faster", "fast", "medium", "slow"],
                        value="veryfast", label="Export Preset"),
            gr.Slider(0.75, 1.25, value=1.0, step=0.05, label="Voice Speed"),
            gr.Slider(20, 100, value=45, step=1, label="Caption Font Size")
        ],
        outputs=gr.Video(label="Generated Video"),
        title="AI Documentary Video Generator",
        description="""
        Create short documentary videos with AI narration and synchronized captions.
        1. Enter a topic or concept for your documentary
        2. Optionally upload background music
        3. Adjust settings as needed
        4. Click submit and wait for video generation

        NOTE: You must upload a file named 'video.mp4' to your Hugging Face Space for this app to work.
        """
    )
    return iface

# Launch the application
if __name__ == "__main__":
    # Create interface and launch
    demo = create_interface()
    demo.launch()
else:
    # For importing as a module
    demo = create_interface()
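Because the else branch also builds the interface, the app can be imported rather than executed; a hedged sketch of host-side use:

    # Hypothetical: reuse the interface from another process.
    from app import demo
    demo.launch(server_name="0.0.0.0")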