Update app.py
app.py CHANGED
@@ -1,4 +1,3 @@
-# Import necessary libraries
 import os
 import re
 import time
@@ -9,21 +8,16 @@ import shutil
 import torch
 import numpy as np
 import soundfile as sf
-from PIL import Image, ImageDraw, ImageFont
 from pydub import AudioSegment
 from gtts import gTTS
-import whisper
+import whisper  # Ensure this is openai-whisper in requirements.txt
 import gradio as gr
 import requests
 import json
 from moviepy.editor import (
     VideoFileClip, concatenate_videoclips, AudioFileClip,
-    CompositeVideoClip, TextClip, CompositeAudioClip
+    CompositeVideoClip, TextClip, CompositeAudioClip, ColorClip
 )
-import subprocess
-import cv2
-import moviepy.config as mpy_config
-import moviepy.video.fx.all as vfx
 import logging

 # Set up logging
@@ -31,18 +25,15 @@ logging.basicConfig(level=logging.INFO,
                     format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)

-# Configure moviepy
-mpy_config.change_settings({"IMAGEMAGICK_BINARY": "convert"})
-
 # Global Configuration Variables
 OPENROUTER_API_KEY = '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'
 OPENROUTER_MODEL = "google/gemini-2.0-flash-exp:free"
-TARGET_RESOLUTION = (1080, 1920)  #
+TARGET_RESOLUTION = (1080, 1920)  # Vertical format for shorts
 OUTPUT_VIDEO_FILENAME = "final_video.mp4"
 TEMP_FOLDER = None
 CAPTION_COLOR = "white"

-# Additional global variables for
+# Additional global variables for Gradio interface
 selected_voice = 'en_us_001'  # Default voice
 voice_speed = 1.0  # Default voice speed
 font_size = 45  # Default font size
@@ -50,7 +41,7 @@ bg_music_volume = 0.08  # Default background music volume
 fps = 30  # Default FPS
 preset = "veryfast"  # Default preset

-# Initialize whisper model globally
+# Initialize whisper model globally
 whisper_model = None

 def load_whisper_model():
@@ -65,7 +56,6 @@ def load_whisper_model():
         logger.error(f"Failed to load Whisper model: {e}")
         return False

-# Helper Functions
 def generate_script(user_input):
     """Generate documentary script using OpenRouter API."""
     headers = {
@@ -94,31 +84,23 @@ Only output the script. No extra comments or text.
 Example:

 [Ocean]
-
 The ocean covers over seventy percent of the Earth's surface.

 [Currents]
-
 Ocean currents distribute heat and regulate global climate patterns.

 [Coral Reefs]
-
 These ecosystems support over one million species of marine life.

 [Pollution]
-
 Plastic waste threatens marine biodiversity and food chains.

 [Climate Impact]
-
 Rising temperatures are causing coral bleaching and habitat loss.

 [Subscribe]
-
 Follow to explore more about the changing planet we live on.

-
-
 Now here is the Topic: {user_input}
 """

@@ -147,7 +129,6 @@ Now here is the Topic: {user_input}
         else:
             logger.error(f"API Error {response.status_code}: {response.text}")
             return None
-
     except Exception as e:
         logger.error(f"Request failed: {str(e)}")
         return None
@@ -182,7 +163,7 @@ def parse_script(script_text):

         media_element = {"type": "media", "prompt": title, "effects": "fade-in"}
         words = narration.split()
-        duration = max(3, len(words) * 0.5)  # Estimate duration
+        duration = max(3, len(words) * 0.5)  # Estimate duration
         tts_element = {"type": "tts", "text": narration, "voice": "en", "duration": duration}
         elements.append(media_element)
         elements.append(tts_element)
@@ -205,7 +186,6 @@ def generate_tts(text, voice="en"):

         # Convert MP3 to WAV
         audio = AudioSegment.from_mp3(mp3_path)
-        # Adjust speed if needed
         if voice_speed != 1.0:
             audio = audio._spawn(audio.raw_data, overrides={
                 "frame_rate": int(audio.frame_rate * voice_speed)
@@ -220,7 +200,7 @@ def generate_tts(text, voice="en"):
         return generate_silent_audio(duration=max(3, len(text.split()) * 0.5))

 def generate_silent_audio(duration, sample_rate=24000):
-    """Generate a silent WAV audio file
+    """Generate a silent WAV audio file."""
     num_samples = int(duration * sample_rate)
     silence = np.zeros(num_samples, dtype=np.float32)
     silent_path = os.path.join(TEMP_FOLDER, f"silent_{int(time.time())}.wav")
@@ -229,20 +209,14 @@ def generate_silent_audio(duration, sample_rate=24000):
     return silent_path

 def analyze_audio_with_whisper(audio_path):
-    """
-    Use Whisper to transcribe audio and generate word-level timestamps.
-    Returns a list of dictionaries with word, start_time, and end_time.
-    """
+    """Use Whisper to generate word-level timestamps."""
     try:
         if whisper_model is None:
             load_whisper_model()

         logger.info(f"Analyzing audio with Whisper: {audio_path}")
-
-        # Transcribe the audio file
         result = whisper_model.transcribe(audio_path, word_timestamps=True)

-        # Extract word-level segments
         word_segments = []
         for segment in result["segments"]:
             for word in segment["words"]:
@@ -259,24 +233,19 @@ def analyze_audio_with_whisper(audio_path):
         return []

 def get_video_clip_segment(video_path, start_time, duration):
-    """
-    Extract a segment from the video file starting at a random position,
-    but ensuring the segment is at least 'duration' seconds long.
-    """
+    """Extract a random video segment."""
     try:
         video = VideoFileClip(video_path)
         video_duration = video.duration

         if duration > video_duration:
-            logger.warning(f"Requested duration ({duration}s) exceeds video length ({video_duration}s).
+            logger.warning(f"Requested duration ({duration}s) exceeds video length ({video_duration}s).")
             return video

-        # Calculate a random start time ensuring we have enough duration left
         max_start_time = video_duration - duration
         if start_time is None or start_time > max_start_time:
             start_time = random.uniform(0, max_start_time)

-        # Extract the segment
         clip = video.subclip(start_time, start_time + duration)
         logger.info(f"Extracted video segment: {start_time:.2f}s to {start_time + duration:.2f}s")
         return clip
@@ -285,13 +254,9 @@ def get_video_clip_segment(video_path, start_time, duration):
         return None

 def create_word_level_subtitles(clip, words_data, font_size=45):
-    """
-    Create subtitles that highlight words as they are spoken.
-    Takes a list of word dictionaries with timing information.
-    """
+    """Create synchronized subtitles without ImageMagick."""
     try:
         logger.info("Creating word-level synchronized subtitles")
-        # Group words into chunks of approximately 5 words
         chunks = []
         current_chunk = []
         current_chunk_words = []
@@ -310,7 +275,6 @@ def create_word_level_subtitles(clip, words_data, font_size=45):
                 current_chunk = []
                 current_chunk_words = []

-        # Add any remaining words
         if current_chunk_words:
             chunks.append({
                 "text": " ".join(current_chunk_words),
@@ -319,25 +283,26 @@ def create_word_level_subtitles(clip, words_data, font_size=45):
                 "end": current_chunk[-1]["end"]
             })

-        # Create subtitle clips for each chunk
         subtitle_clips = []
-
         for chunk in chunks:
             txt_clip = TextClip(
                 chunk["text"],
                 fontsize=font_size,
-                font='Arial-Bold',
                 color=CAPTION_COLOR,
-            )
+                method='label'
+            )
+
+            bg_clip = ColorClip(
+                size=(txt_clip.w + 20, txt_clip.h + 10),
+                color=(0, 0, 0, 128)  # Semi-transparent black
+            )

+            subtitle_clip = CompositeVideoClip([
+                bg_clip.set_position('center'),
+                txt_clip.set_position('center')
+            ])
+            subtitle_clip = subtitle_clip.set_start(chunk["start"]).set_end(chunk["end"]).set_position(('center', TARGET_RESOLUTION[1] * 0.85))
+            subtitle_clips.append(subtitle_clip)

         logger.info(f"Created {len(subtitle_clips)} subtitle chunks")
         return subtitle_clips
@@ -346,7 +311,7 @@ def create_word_level_subtitles(clip, words_data, font_size=45):
         return []

 def add_background_music(final_video, bg_music_volume=0.08):
-    """Add background music to the
+    """Add background music to the video."""
     try:
         bg_music_path = "music.mp3"
         if bg_music_path and os.path.exists(bg_music_path):
@@ -367,61 +332,54 @@ def add_background_music(final_video, bg_music_volume=0.08):
         return final_video
     except Exception as e:
         logger.error(f"Error adding background music: {e}")
-        logger.info("Continuing without background music")
         return final_video

 def create_clip(tts_path, narration_text, segment_index=0):
-    """
-    Create a video clip with synchronized subtitles using whisper timestamps.
-    Uses a random segment from video.mp4 matching the audio duration.
-    """
+    """Create a video clip with synchronized subtitles."""
     try:
         logger.info(f"Creating clip #{segment_index} with TTS: {tts_path}")
         if not os.path.exists(tts_path) or not os.path.exists("video.mp4"):
             logger.error("Missing video or TTS file")
             return None

-        # Get audio duration
         audio_clip = AudioFileClip(tts_path)
         audio_duration = audio_clip.duration
-        target_duration = audio_duration + 0.5
+        target_duration = audio_duration + 0.5

-        # Get a random segment from the main video
         video_clip = get_video_clip_segment("video.mp4", None, target_duration)
         if video_clip is None:
             logger.error("Failed to extract video segment")
             return None

-        # Resize to target resolution
         video_clip = video_clip.resize(height=TARGET_RESOLUTION[1], width=TARGET_RESOLUTION[0])
-
-        # Set the audio
         video_clip = video_clip.set_audio(audio_clip)

-        # Generate word-level timestamps with Whisper
         word_data = analyze_audio_with_whisper(tts_path)

         if word_data:
-            # Create word-level subtitles
             subtitle_clips = create_word_level_subtitles(video_clip, word_data, font_size)
             if subtitle_clips:
-                # Combine video with subtitles
                 video_clip = CompositeVideoClip([video_clip] + subtitle_clips)
         else:
-            # Fallback to basic subtitle if whisper fails
             logger.warning("Falling back to basic subtitles")
             txt_clip = TextClip(
                 narration_text,
                 fontsize=font_size,
-                font='Arial-Bold',
                 color=CAPTION_COLOR,
+                method='label'
+            )
+
+            bg_clip = ColorClip(
+                size=(txt_clip.w + 20, txt_clip.h + 10),
+                color=(0, 0, 0, 128)
+            )

+            subtitle_clip = CompositeVideoClip([
+                bg_clip.set_position('center'),
+                txt_clip.set_position('center')
+            ])
+            subtitle_clip = subtitle_clip.set_duration(video_clip.duration).set_position(('center', TARGET_RESOLUTION[1] * 0.85))
+            video_clip = CompositeVideoClip([video_clip, subtitle_clip])

         logger.info(f"Clip created: {video_clip.duration:.1f}s")
         return video_clip
@@ -429,125 +387,82 @@ def create_clip(tts_path, narration_text, segment_index=0):
         logger.error(f"Error in create_clip: {str(e)}")
         return None

-# Main Video Generation Function
 def generate_video(user_input, resolution, caption_option):
-    """Generate a video based on user input
+    """Generate a video based on user input."""
     global TEMP_FOLDER, CAPTION_COLOR

-    # Set caption color based on option
     CAPTION_COLOR = "white" if caption_option == "Yes" else "transparent"
-
-    # Create a unique temporary folder
     TEMP_FOLDER = tempfile.mkdtemp()
     logger.info(f"Created temporary folder: {TEMP_FOLDER}")

-    # Check if video.mp4 exists
     if not os.path.exists("video.mp4"):
-        logger.error("video.mp4 not found
+        logger.error("video.mp4 not found")
         return "Error: video.mp4 not found. Please upload a video file named 'video.mp4'."

-    # Load Whisper model
     load_whisper_model()
-
-    # Generate script
-    logger.info("Generating script from API...")
     script = generate_script(user_input)
     if not script:
-        logger.error("Failed to generate script.")
         shutil.rmtree(TEMP_FOLDER)
-        return "Failed to generate script.
+        return "Failed to generate script."

     logger.info("Generated Script:\n" + script)
-
-    # Parse script into elements
     elements = parse_script(script)
     if not elements:
-        logger.error("Failed to parse script into elements.")
         shutil.rmtree(TEMP_FOLDER)
-        return "Failed to parse script.
+        return "Failed to parse script."

     logger.info(f"Parsed {len(elements)//2} script segments.")
-
-
-    paired_elements = []
-    for i in range(0, len(elements), 2):
-        if i + 1 < len(elements):
-            paired_elements.append((elements[i], elements[i + 1]))
-
+    paired_elements = [(elements[i], elements[i + 1]) for i in range(0, len(elements), 2)]
+
     if not paired_elements:
-        logger.error("No valid script segments found.")
         shutil.rmtree(TEMP_FOLDER)
-        return "No valid script segments
-
-    # Create video clips for each segment
+        return "No valid script segments generated."
+
     clips = []
     for idx, (media_elem, tts_elem) in enumerate(paired_elements):
         logger.info(f"\nProcessing segment {idx+1}/{len(paired_elements)} with prompt: '{media_elem['prompt']}'")
-
-        # Generate TTS for the segment
         tts_path = generate_tts(tts_elem['text'], tts_elem['voice'])
         if not tts_path:
-            logger.error(f"Skipping segment {idx+1} due to TTS generation failure.")
             continue

-
-        clip = create_clip(
-            tts_path=tts_path,
-            narration_text=tts_elem['text'],
-            segment_index=idx
-        )
-
+        clip = create_clip(tts_path, tts_elem['text'], idx)
         if clip:
             clips.append(clip)
-
-            logger.error(f"Clip creation failed for segment {idx+1}.")
-
+
     if not clips:
-        logger.error("No clips were successfully created.")
         shutil.rmtree(TEMP_FOLDER)
-        return "Failed to create any video clips.
-
-    # Concatenate all clips
+        return "Failed to create any video clips."
+
     logger.info("\nConcatenating clips...")
     final_video = concatenate_videoclips(clips, method="compose")
-
-    # Add background music if available
     final_video = add_background_music(final_video, bg_music_volume=bg_music_volume)
-
-    # Export final video
+
     logger.info(f"Exporting final video to {OUTPUT_VIDEO_FILENAME}...")
     final_video.write_videofile(OUTPUT_VIDEO_FILENAME, codec='libx264', fps=fps, preset=preset)
     logger.info(f"Final video saved as {OUTPUT_VIDEO_FILENAME}")
-
-    # Clean up
-    logger.info("Cleaning up temporary files...")
+
     shutil.rmtree(TEMP_FOLDER)
     logger.info("Temporary files removed.")
-
     return OUTPUT_VIDEO_FILENAME

-# Gradio Interface Setup
 def generate_video_with_options(user_input, caption_option, music_file, bg_vol, video_fps, video_preset, v_speed, caption_size):
+    """Generate video with Gradio options."""
     global voice_speed, font_size, bg_music_volume, fps, preset

-    # Update global variables with user selections
     voice_speed = v_speed
     font_size = caption_size
     bg_music_volume = bg_vol
     fps = video_fps
     preset = video_preset

-    # Handle music upload
     if music_file is not None:
-        logger.info(f"Uploaded music saved as: {target_path}")
+        shutil.copy(music_file.name, "music.mp3")
+        logger.info(f"Uploaded music saved as: music.mp3")

-    # Generate the video (always using vertical resolution)
     return generate_video(user_input, "Short", caption_option)

-# Create the Gradio interface
 def create_interface():
+    """Create Gradio interface."""
     iface = gr.Interface(
         fn=generate_video_with_options,
         inputs=[
@@ -570,16 +485,13 @@ def create_interface():
     3. Adjust settings as needed
     4. Click submit and wait for video generation

-    NOTE: You must upload a file named 'video.mp4' to your Hugging Face Space
+    NOTE: You must upload a file named 'video.mp4' to your Hugging Face Space.
     """
     )
     return iface

-# Launch the application
 if __name__ == "__main__":
-    # Create interface and launch
     demo = create_interface()
     demo.launch()
 else:
-    # For importing as a module
     demo = create_interface()
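
The speed control in generate_tts relies on pydub's frame-rate override idiom, and the hunk above shows only the override itself. Below is a minimal sketch of the complete idiom, assuming pydub's AudioSegment API; change_speed is a hypothetical helper, not a function in app.py.

from pydub import AudioSegment

def change_speed(audio: AudioSegment, speed: float) -> AudioSegment:
    """Resample-based speed change; pitch shifts along with tempo."""
    # Rewriting frame_rate makes players read the same samples faster or slower.
    stretched = audio._spawn(audio.raw_data, overrides={
        "frame_rate": int(audio.frame_rate * speed)
    })
    # Convert back to the original rate so exports and mixing see a
    # standard frame rate.
    return stretched.set_frame_rate(audio.frame_rate)

Note that this changes pitch along with tempo; a pitch-preserving stretch would need something like ffmpeg's atempo filter instead.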
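
The new subtitle path composites each TextClip over a semi-transparent ColorClip plate rather than styling captions through ImageMagick options. Here is a sketch of the same pattern, assuming MoviePy 1.x; boxed_caption is a hypothetical helper, and it uses set_opacity on an RGB plate because not every MoviePy version interprets a 4-tuple color as alpha.

from moviepy.editor import ColorClip, CompositeVideoClip, TextClip

def boxed_caption(text, start, end, video_size, font_size=45):
    """One caption chunk on a translucent plate, pinned near the bottom."""
    txt = TextClip(text, fontsize=font_size, color="white", method="label")
    # 50%-opaque black plate, slightly larger than the text it backs.
    plate = ColorClip(size=(txt.w + 20, txt.h + 10), color=(0, 0, 0)).set_opacity(0.5)
    boxed = CompositeVideoClip(
        [plate.set_position("center"), txt.set_position("center")],
        size=plate.size,
    )
    # Show the chunk only while its words are spoken, at 85% of frame height.
    return (boxed.set_start(start).set_end(end)
                 .set_position(("center", int(video_size[1] * 0.85))))

Each clip returned by a helper like this can be appended to subtitle_clips and composited over the video, as the diff does for its whisper-timed chunks.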
|