Update app.py

app.py CHANGED
@@ -1,497 +1,114 @@
-import os
-import re
-import time
-import math
 import tempfile
-import random
-import shutil
-import torch
-import numpy as np
-import soundfile as sf
 from pydub import AudioSegment
-
-import whisper # Ensure this is openai-whisper in requirements.txt
-import gradio as gr
-import requests
 import json
-
-from moviepy.editor import (
-    VideoFileClip, concatenate_videoclips, AudioFileClip,
-    CompositeVideoClip, TextClip, CompositeAudioClip, ColorClip
-)
-import logging
-
-# Set up logging
-logging.basicConfig(level=logging.INFO,
-                    format='%(asctime)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(__name__)
 
-#
 OPENROUTER_API_KEY = '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'
 OPENROUTER_MODEL = "google/gemini-2.0-flash-exp:free"
-TARGET_RESOLUTION = (1080, 1920)
 OUTPUT_VIDEO_FILENAME = "final_video.mp4"
-TEMP_FOLDER = None
 CAPTION_COLOR = "white"
 
-#
-
-from gtts import gTTS
-
-whisper_model = None
-
-
-
-
-
-
-def load_whisper_model():
-    """Load the Whisper model."""
-    global whisper_model
-    try:
-        logger.info("Loading Whisper model...")
-        whisper_model = whisper.load_model("tiny") # Using tiny for CPU efficiency
-        logger.info("Whisper model loaded successfully")
-        return True
-    except Exception as e:
-        logger.error(f"Failed to load Whisper model: {e}")
-        return False
-
-def generate_script(user_input):
-    """Generate documentary script using OpenRouter API."""
-    headers = {
-        'Authorization': f'Bearer {OPENROUTER_API_KEY}',
-        'HTTP-Referer': 'https://huggingface.co/spaces',
-        'X-Title': 'AI Documentary Maker'
-    }
-
-    prompt = f"""You're a professional documentary narrator. Your job is to write a serious, natural, and informative video script based on one topic.
-
-The script should sound like a real human voiceover from a TV show or documentary — clear, factual, and engaging, like something you'd hear on National Geographic or a news report.
-
-Structure:
-- Break the script into scenes using [Tags]. Each tag is a short title (1–2 words) that describes the scene.
-- Under each tag, write one sentence (max 12 words) that fits the tag and continues the topic.
-- The full script should make sense as one connected narration — no randomness.
-- Use natural, formal English. No slang, no fake AI language, and no robotic tone.
-- Do not use humor, sarcasm, or casual language. This is a serious narration.
-- No emotion-sound words like "aww," "eww," "whoa," etc.
-- Do not use numbers like 1, 2, 3 — write them out as one, two, three.
-- Make the total narration about 1 minute long (around 150-200 words total).
-- At the end, add a [Subscribe] tag with a formal or respectful reason to follow or subscribe.
-
-Only output the script. No extra comments or text.
-
-Example:
-
-[Ocean]
-The ocean covers over seventy percent of the Earth's surface.
-
-[Currents]
-Ocean currents distribute heat and regulate global climate patterns.
-
-[Coral Reefs]
-These ecosystems support over one million species of marine life.
-
-[Pollution]
-Plastic waste threatens marine biodiversity and food chains.
-
-[Climate Impact]
-Rising temperatures are causing coral bleaching and habitat loss.
-
-[Subscribe]
-Follow to explore more about the changing planet we live on.
-
-Now here is the Topic: {user_input}
-"""
-
-    data = {
-        'model': OPENROUTER_MODEL,
-        'messages': [{'role': 'user', 'content': prompt}],
-        'temperature': 0.4,
-        'max_tokens': 2000
-    }
-
-    try:
-        response = requests.post(
-            'https://openrouter.ai/api/v1/chat/completions',
-            headers=headers,
-            json=data,
-            timeout=30
-        )
-
-        if response.status_code == 200:
-            response_data = response.json()
-            if 'choices' in response_data and len(response_data['choices']) > 0:
-                return response_data['choices'][0]['message']['content']
-            else:
-                logger.error(f"Unexpected response format: {response_data}")
-                return None
-        else:
-            logger.error(f"API Error {response.status_code}: {response.text}")
-            return None
-    except Exception as e:
-        logger.error(f"Request failed: {str(e)}")
-        return None
-
-def parse_script(script_text):
-    """Parse the generated script into a list of elements."""
-    sections = {}
-    current_title = None
-    current_text = ""
-
-    try:
-        for line in script_text.splitlines():
-            line = line.strip()
-            if line.startswith("[") and "]" in line:
-                bracket_start = line.find("[")
-                bracket_end = line.find("]", bracket_start)
-                if bracket_start != -1 and bracket_end != -1:
-                    if current_title is not None:
-                        sections[current_title] = current_text.strip()
-                    current_title = line[bracket_start+1:bracket_end]
-                    current_text = line[bracket_end+1:].strip()
-            elif current_title:
-                current_text += line + " "
-
-        if current_title:
-            sections[current_title] = current_text.strip()
-
-        elements = []
-        for title, narration in sections.items():
-            if not title or not narration:
-                continue
-
-            media_element = {"type": "media", "prompt": title, "effects": "fade-in"}
-            words = narration.split()
-            duration = max(3, len(words) * 0.5) # Estimate duration
-            tts_element = {"type": "tts", "text": narration, "voice": "en", "duration": duration}
-            elements.append(media_element)
-            elements.append(tts_element)
-
-        return elements
-    except Exception as e:
-        logger.error(f"Error parsing script: {e}")
-        return []
-
-def generate_tts(text, voice="en"):
-    """Generate TTS audio using gTTS."""
-    safe_text = re.sub(r'[^\w\s-]', '', text[:10]).strip().replace(' ', '_')
-    file_path = os.path.join(TEMP_FOLDER, f"tts_{safe_text}.wav")
-
-    try:
-        logger.info(f"Generating TTS for: {text[:30]}...")
-        tts = gTTS(text=text, lang='en', slow=False)
-        mp3_path = os.path.join(TEMP_FOLDER, f"tts_{safe_text}.mp3")
-        tts.save(mp3_path)
-
-        # Convert MP3 to WAV
-        audio = AudioSegment.from_mp3(mp3_path)
-        if voice_speed != 1.0:
-            audio = audio._spawn(audio.raw_data, overrides={
-                "frame_rate": int(audio.frame_rate * voice_speed)
-            })
-        audio.export(file_path, format="wav")
-        os.remove(mp3_path)
-
-        logger.info(f"TTS saved to {file_path}")
-        return file_path
-    except Exception as e:
-        logger.error(f"TTS generation error: {e}")
-        return generate_silent_audio(duration=max(3, len(text.split()) * 0.5))
-
-def generate_silent_audio(duration, sample_rate=24000):
-    """Generate a silent WAV audio file."""
-    num_samples = int(duration * sample_rate)
-    silence = np.zeros(num_samples, dtype=np.float32)
-    silent_path = os.path.join(TEMP_FOLDER, f"silent_{int(time.time())}.wav")
-    sf.write(silent_path, silence, sample_rate)
-    logger.info(f"Silent audio generated: {silent_path}")
-    return silent_path
-
-def analyze_audio_with_whisper(audio_path):
-    """Use Whisper to generate word-level timestamps."""
-    try:
-        if whisper_model is None:
-            load_whisper_model()
-
-        logger.info(f"Analyzing audio with Whisper: {audio_path}")
-        result = whisper_model.transcribe(audio_path, word_timestamps=True)
-
-        word_segments = []
-        for segment in result["segments"]:
-            for word in segment["words"]:
-                word_segments.append({
-                    "word": word["word"].strip(),
-                    "start": word["start"],
-                    "end": word["end"]
-                })
-
-        logger.info(f"Extracted {len(word_segments)} word segments")
-        return word_segments
-    except Exception as e:
-        logger.error(f"Whisper analysis error: {e}")
-        return []
-
-def get_video_clip_segment(video_path, start_time, duration):
-    """Extract a random video segment."""
-    try:
-        video = VideoFileClip(video_path)
-        video_duration = video.duration
-
-        if duration > video_duration:
-            logger.warning(f"Requested duration ({duration}s) exceeds video length ({video_duration}s).")
-            return video
-
-        max_start_time = video_duration - duration
-        if start_time is None or start_time > max_start_time:
-            start_time = random.uniform(0, max_start_time)
-
-        clip = video.subclip(start_time, start_time + duration)
-        logger.info(f"Extracted video segment: {start_time:.2f}s to {start_time + duration:.2f}s")
-        return clip
-    except Exception as e:
-        logger.error(f"Error extracting video segment: {e}")
-        return None
-
-def create_word_level_subtitles(clip, words_data, font_size=45):
-    """Create synchronized subtitles without ImageMagick."""
-    try:
-        logger.info("Creating word-level synchronized subtitles")
-        chunks = []
-        current_chunk = []
-        current_chunk_words = []
-
-        for word_data in words_data:
-            current_chunk_words.append(word_data["word"])
-            current_chunk.append(word_data)
-
-            if len(current_chunk_words) >= 5:
-                chunks.append({
-                    "text": " ".join(current_chunk_words),
-                    "words": current_chunk,
-                    "start": current_chunk[0]["start"],
-                    "end": current_chunk[-1]["end"]
-                })
-                current_chunk = []
-                current_chunk_words = []
-
-        if current_chunk_words:
-            chunks.append({
-                "text": " ".join(current_chunk_words),
-                "words": current_chunk,
-                "start": current_chunk[0]["start"],
-                "end": current_chunk[-1]["end"]
-            })
-
-        subtitle_clips = []
-        for chunk in chunks:
-            txt_clip = TextClip(
-                chunk["text"],
-                fontsize=font_size,
-                color=CAPTION_COLOR,
-                method='label'
-            )
-
-            bg_clip = ColorClip(
-                size=(txt_clip.w + 20, txt_clip.h + 10),
-                color=(0, 0, 0, 128) # Semi-transparent black
-            )
-
-            subtitle_clip = CompositeVideoClip([
-                bg_clip.set_position('center'),
-                txt_clip.set_position('center')
-            ])
-            subtitle_clip = subtitle_clip.set_start(chunk["start"]).set_end(chunk["end"]).set_position(('center', TARGET_RESOLUTION[1] * 0.85))
-            subtitle_clips.append(subtitle_clip)
-
-        logger.info(f"Created {len(subtitle_clips)} subtitle chunks")
-        return subtitle_clips
-    except Exception as e:
-        logger.error(f"Error creating subtitles: {e}")
-        return []
-
-def add_background_music(final_video, bg_music_volume=0.08):
-    """Add background music to the video."""
-    try:
-        bg_music_path = "music.mp3"
-        if bg_music_path and os.path.exists(bg_music_path):
-            logger.info(f"Adding background music from: {bg_music_path}")
-            bg_music = AudioFileClip(bg_music_path)
-            if bg_music.duration < final_video.duration:
-                loops_needed = math.ceil(final_video.duration / bg_music.duration)
-                bg_segments = [bg_music] * loops_needed
-                bg_music = CompositeAudioClip(bg_segments)
-            bg_music = bg_music.subclip(0, final_video.duration)
-            bg_music = bg_music.volumex(bg_music_volume)
-            video_audio = final_video.audio
-            mixed_audio = CompositeAudioClip([video_audio, bg_music])
-            final_video = final_video.set_audio(mixed_audio)
-            logger.info("Background music added successfully")
-        else:
-            logger.info("No music file found, skipping background music")
-        return final_video
-    except Exception as e:
-        logger.error(f"Error adding background music: {e}")
-        return final_video
-
-def create_clip(tts_path, narration_text, segment_index=0):
-    """Create a video clip with synchronized subtitles."""
-    try:
-        logger.info(f"Creating clip #{segment_index} with TTS: {tts_path}")
-        if not os.path.exists(tts_path) or not os.path.exists("video.mp4"):
-            logger.error("Missing video or TTS file")
-            return None
-
-        audio_clip = AudioFileClip(tts_path)
-        audio_duration = audio_clip.duration
-        target_duration = audio_duration + 0.5
-
-        video_clip = get_video_clip_segment("video.mp4", None, target_duration)
-        if video_clip is None:
-            logger.error("Failed to extract video segment")
-            return None
-
-        video_clip = video_clip.resize(height=TARGET_RESOLUTION[1], width=TARGET_RESOLUTION[0])
-        video_clip = video_clip.set_audio(audio_clip)
-
-        word_data = analyze_audio_with_whisper(tts_path)
-
-        if word_data:
-            subtitle_clips = create_word_level_subtitles(video_clip, word_data, font_size)
-            if subtitle_clips:
-                video_clip = CompositeVideoClip([video_clip] + subtitle_clips)
-        else:
-            logger.warning("Falling back to basic subtitles")
-            txt_clip = TextClip(
-                narration_text,
-                fontsize=font_size,
-                color=CAPTION_COLOR,
-                method='label'
-            )
-
-            bg_clip = ColorClip(
-                size=(txt_clip.w + 20, txt_clip.h + 10),
-                color=(0, 0, 0, 128)
-            )
-
-            subtitle_clip = CompositeVideoClip([
-                bg_clip.set_position('center'),
-                txt_clip.set_position('center')
-            ])
-            subtitle_clip = subtitle_clip.set_duration(video_clip.duration).set_position(('center', TARGET_RESOLUTION[1] * 0.85))
-            video_clip = CompositeVideoClip([video_clip, subtitle_clip])
-
-        logger.info(f"Clip created: {video_clip.duration:.1f}s")
-        return video_clip
-    except Exception as e:
-        logger.error(f"Error in create_clip: {str(e)}")
-        return None
-
-def generate_video(user_input, resolution, caption_option):
-    """Generate a video based on user input."""
-    global TEMP_FOLDER, CAPTION_COLOR
-
-    CAPTION_COLOR = "white" if caption_option == "Yes" else "transparent"
-    TEMP_FOLDER = tempfile.mkdtemp()
-    logger.info(f"Created temporary folder: {TEMP_FOLDER}")
-
-    if not os.path.exists("video.mp4"):
-        logger.error("video.mp4 not found")
-        return "Error: video.mp4 not found. Please upload a video file named 'video.mp4'."
-
-    load_whisper_model()
-    script = generate_script(user_input)
-    if not script:
-        shutil.rmtree(TEMP_FOLDER)
-        return "Failed to generate script."
-
-    logger.info("Generated Script:\n" + script)
-    elements = parse_script(script)
-    if not elements:
-        shutil.rmtree(TEMP_FOLDER)
-        return "Failed to parse script."
-
-    logger.info(f"Parsed {len(elements)//2} script segments.")
-    paired_elements = [(elements[i], elements[i + 1]) for i in range(0, len(elements), 2)]
-
-    if not paired_elements:
-        shutil.rmtree(TEMP_FOLDER)
-        return "No valid script segments generated."
-
-    clips = []
-    for idx, (media_elem, tts_elem) in enumerate(paired_elements):
-        logger.info(f"\nProcessing segment {idx+1}/{len(paired_elements)} with prompt: '{media_elem['prompt']}'")
-        tts_path = generate_tts(tts_elem['text'], tts_elem['voice'])
-        if not tts_path:
-            continue
-
-        clip = create_clip(tts_path, tts_elem['text'], idx)
-        if clip:
-            clips.append(clip)
-
-    if not clips:
-        shutil.rmtree(TEMP_FOLDER)
-        return "Failed to create any video clips."
-
-    logger.info("\nConcatenating clips...")
-    final_video = concatenate_videoclips(clips, method="compose")
-    final_video = add_background_music(final_video, bg_music_volume=bg_music_volume)
-
-    logger.info(f"Exporting final video to {OUTPUT_VIDEO_FILENAME}...")
-    final_video.write_videofile(OUTPUT_VIDEO_FILENAME, codec='libx264', fps=fps, preset=preset)
-    logger.info(f"Final video saved as {OUTPUT_VIDEO_FILENAME}")
-
-    shutil.rmtree(TEMP_FOLDER)
-    logger.info("Temporary files removed.")
-    return OUTPUT_VIDEO_FILENAME
-
-def generate_video_with_options(user_input, caption_option, music_file, bg_vol, video_fps, video_preset, v_speed, caption_size):
-    """Generate video with Gradio options."""
-    global voice_speed, font_size, bg_music_volume, fps, preset
-
-    voice_speed = v_speed
-    font_size = caption_size
-    bg_music_volume = bg_vol
-    fps = video_fps
-    preset = video_preset
-
-    if music_file is not None:
-        shutil.copy(music_file.name, "music.mp3")
-        logger.info(f"Uploaded music saved as: music.mp3")
-
-    return generate_video(user_input, "Short", caption_option)
-
-def create_interface():
-    """Create Gradio interface."""
-    iface = gr.Interface(
-        fn=generate_video_with_options,
-        inputs=[
-            gr.Textbox(label="Video Concept", placeholder="Enter your video concept here..."),
-            gr.Radio(["Yes", "No"], label="Show Captions", value="Yes"),
-            gr.File(label="Upload Background Music (MP3)", file_types=[".mp3"]),
-            gr.Slider(0.0, 1.0, value=0.08, step=0.01, label="Background Music Volume"),
-            gr.Slider(10, 60, value=30, step=1, label="Video FPS"),
-            gr.Dropdown(choices=["ultrafast", "superfast", "veryfast", "faster", "fast", "medium", "slow"],
-                        value="veryfast", label="Export Preset"),
-            gr.Slider(0.75, 1.25, value=1.0, step=0.05, label="Voice Speed"),
-            gr.Slider(20, 100, value=45, step=1, label="Caption Font Size")
-        ],
-        outputs=gr.Video(label="Generated Video"),
-        title="AI Documentary Video Generator",
-        description="""
-        Create short documentary videos with AI narration and synchronized captions.
-        1. Enter a topic or concept for your documentary
-        2. Optionally upload background music
-        3. Adjust settings as needed
-        4. Click submit and wait for video generation
-
-        NOTE: You must upload a file named 'video.mp4' to your Hugging Face Space.
-        """
-    )
-
 
 if __name__ == "__main__":
-
-    demo.launch()
-else:
-    demo = create_interface()
+import gradio as gr
 import tempfile
+import os
+from moviepy.editor import *
 from pydub import AudioSegment
+import whisper
 import json
+import requests
 
+# Configuration
 OPENROUTER_API_KEY = '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'
 OPENROUTER_MODEL = "google/gemini-2.0-flash-exp:free"
+TARGET_RESOLUTION = (1080, 1920)
 OUTPUT_VIDEO_FILENAME = "final_video.mp4"
 CAPTION_COLOR = "white"
 
+# Placeholder for Kokoro TTS
+def kokoro_tts(text):
+    # TODO: Replace with actual Kokoro TTS implementation
+    # Should return path to generated audio file
+    return "dummy_audio.wav"
+
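The committed kokoro_tts is still a stub that returns a hard-coded path. As a minimal stand-in until Kokoro is wired in (my sketch, not part of this commit), the gTTS-plus-pydub path from the removed app.py could back the placeholder:

    import tempfile
    from gtts import gTTS
    from pydub import AudioSegment

    def kokoro_tts(text):
        # Stand-in, not Kokoro: synthesize with gTTS (as the old app.py did), then convert to WAV.
        mp3_path = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False).name
        wav_path = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name
        gTTS(text=text, lang="en").save(mp3_path)
        AudioSegment.from_mp3(mp3_path).export(wav_path, format="wav")
        return wav_path
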
+def generate_script(topic):
+    prompt = f"Generate a script about {topic} divided into parts, and output it as a JSON array of strings."
+    response = requests.post(
+        "https://api.openrouter.com/v1/completions",
+        headers={"Authorization": f"Bearer {OPENROUTER_API_KEY}"},
+        json={"model": OPENROUTER_MODEL, "prompt": prompt}
+    )
+    script_json = response.json()["choices"][0]["text"]
+    return json.loads(script_json)
+
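Worth noting (my observation, not part of the diff): the removed version called OpenRouter's chat-completions endpoint at openrouter.ai, while the new generate_script posts a bare prompt to https://api.openrouter.com/v1/completions and reads choices[0]["text"]. If that request shape turns out not to match, a sketch closer to the old call, still asking for a JSON array of strings, might look like this (assuming the same model, key, and a chat-style response):

    def generate_script(topic):
        # Sketch only: chat-completions request in the style of the removed code.
        prompt = f"Generate a script about {topic} divided into parts, and output it as a JSON array of strings."
        response = requests.post(
            "https://openrouter.ai/api/v1/chat/completions",
            headers={"Authorization": f"Bearer {OPENROUTER_API_KEY}"},
            json={"model": OPENROUTER_MODEL, "messages": [{"role": "user", "content": prompt}]},
            timeout=30,
        )
        return json.loads(response.json()["choices"][0]["message"]["content"])
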
+def generate_audio(script_parts, temp_folder):
+    full_audio = AudioSegment.empty()
+    for part in script_parts:
+        audio_file = kokoro_tts(part)
+        audio_segment = AudioSegment.from_file(audio_file)
+        silence = AudioSegment.silent(duration=300)  # 0.3s gap
+        full_audio += audio_segment + silence
+    full_audio = full_audio[:-300]  # Remove last silence
+    audio_path = os.path.join(temp_folder, "full_audio.wav")
+    full_audio.export(audio_path, format="wav")
+    return audio_path
+
+def generate_subtitles(audio_path):
+    model = whisper.load_model("base")
+    result = model.transcribe(audio_path, word_timestamps=True)
+    return result['segments']
+
+def process_background_video(audio_duration):
+    background = VideoFileClip("video.mp4")
+    background = background.resize(height=1920)
+    if background.w > 1080:
+        background = background.crop(x_center=background.w/2, width=1080)
+    required_duration = audio_duration + 0.5
+    if background.duration < required_duration:
+        n_loops = int(required_duration / background.duration) + 1
+        background = concatenate_videoclips([background] * n_loops)
+    return background.set_duration(required_duration)
+
+def create_subtitle_clips(segments, video_height=1920, font_size=24, color='white', highlight_color='yellow'):
+    subtitle_y = video_height - 200
+    all_words = [word for segment in segments for word in segment['words']]
+    chunks = [all_words[i:i+5] for i in range(0, len(all_words), 5)]
+    subtitle_clips = []
+    for chunk in chunks:
+        for i, word in enumerate(chunk):
+            line_clip = create_text_line(chunk, i, font_size, color, highlight_color)
+            line_clip = line_clip.set_start(word['start']).set_end(word['end']).set_pos(('center', subtitle_y))
+            subtitle_clips.append(line_clip)
+    return subtitle_clips
+
+def create_text_line(words, highlighted_index, font_size, color, highlight_color):
+    space_clip = TextClip(" ", fontsize=font_size, color=color)
+    space_width = space_clip.w
+    text_clips = []
+    total_width = 0
+    for i, word in enumerate(words):
+        c = highlight_color if i == highlighted_index else color
+        text_clip = TextClip(word['word'], fontsize=font_size, color=c)
+        text_clips.append(text_clip)
+        total_width += text_clip.w + (space_width if i < len(words) - 1 else 0)
+    current_x = -total_width / 2
+    positioned_clips = []
+    for clip in text_clips:
+        positioned_clips.append(clip.set_pos((current_x, 0)))
+        current_x += clip.w + space_width
+    return CompositeVideoClip(positioned_clips, size=(total_width, text_clips[0].h))
+
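One thing to check when testing (my reading of the MoviePy call, not something stated in the commit): positions inside a CompositeVideoClip with size=(total_width, h) are measured from the composite's top-left corner, so starting current_x at -total_width / 2 pushes the first words off the left edge of the line; the whole line is already centered later by set_pos(('center', subtitle_y)). A sketch of the usual layout, starting from zero:

    current_x = 0
    positioned_clips = []
    for clip in text_clips:
        # Lay the word clips out left to right inside the (total_width, h) canvas.
        positioned_clips.append(clip.set_pos((current_x, 0)))
        current_x += clip.w + space_width
    return CompositeVideoClip(positioned_clips, size=(int(total_width), text_clips[0].h))
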
+def generate_video(topic):
+    with tempfile.TemporaryDirectory() as temp_folder:
+        script_parts = generate_script(topic)
+        audio_path = generate_audio(script_parts, temp_folder)
+        audio_duration = AudioSegment.from_file(audio_path).duration_seconds
+        segments = generate_subtitles(audio_path)
+        background = process_background_video(audio_duration)
+        subtitle_clips = create_subtitle_clips(segments)
+        audio_clip = AudioFileClip(audio_path)
+        final_video = background.set_audio(audio_clip)
+        final_video = CompositeVideoClip([final_video] + subtitle_clips)
+        output_path = os.path.join(temp_folder, OUTPUT_VIDEO_FILENAME)
+        final_video.write_videofile(output_path, codec="libx264", audio_codec="aac")
+        return output_path
+
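Another point worth flagging (my note, not part of the commit): generate_video returns a path inside the with tempfile.TemporaryDirectory() block, and that directory is deleted as soon as the function returns, so Gradio may find nothing at output_path. A minimal sketch that writes the final file outside the temporary folder instead:

    def generate_video(topic):
        # Keep intermediate files in the temp dir, but write the result to the working directory
        # so it still exists after the context manager cleans up.
        with tempfile.TemporaryDirectory() as temp_folder:
            ...
            final_video.write_videofile(OUTPUT_VIDEO_FILENAME, codec="libx264", audio_codec="aac")
        return OUTPUT_VIDEO_FILENAME
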
+# Gradio UI
+iface = gr.Interface(
+    fn=generate_video,
+    inputs=gr.Textbox(label="Topic"),
+    outputs=gr.Video(label="Generated YouTube Short"),
+    title="YouTube Short Creator"
+)
 
 if __name__ == "__main__":
+    iface.launch()