Update app.py
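This commit adds optimization-based timing for dubbed audio: a new solve_optimal_alignment() helper uses cvxpy to re-place generated speech segments near their original midpoints, process_entry() now returns each clip unplaced together with its actual duration (the silence-padding logic is removed), and add_transcript_voiceover() applies the solved start/end times before compositing.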
app.py CHANGED
@@ -1,4 +1,5 @@
 import numpy as np
+import cvxpy as cp
 import re
 import concurrent.futures
 import gradio as gr
@@ -382,6 +383,46 @@ def create_subtitle_clip_pil(text, start_time, end_time, video_width, video_height
         logger.error(f"\u274c Failed to create subtitle clip: {e}")
         return None
 
+
+
+def solve_optimal_alignment(original_segments, generated_durations, total_duration):
+    """
+    Robust version: aligns generated speech segments, falling back to greedy allocation if the solver fails.
+    Modifies and returns original_segments with updated 'start' and 'end'.
+    """
+    N = len(original_segments)
+    d = np.array(generated_durations)
+    m = np.array([(seg['start'] + seg['end']) / 2 for seg in original_segments])
+
+    try:
+        s = cp.Variable(N)
+        objective = cp.Minimize(cp.sum_squares(s + d / 2 - m))
+
+        constraints = [s[0] >= 0]
+        for i in range(N - 1):
+            constraints.append(s[i] + d[i] <= s[i + 1])
+        constraints.append(s[N - 1] + d[N - 1] == total_duration)
+
+        problem = cp.Problem(objective, constraints)
+        problem.solve()
+
+        if s.value is None:
+            raise ValueError("Solver failed")
+
+        for i in range(N):
+            original_segments[i]['start'] = round(s.value[i], 3)
+            original_segments[i]['end'] = round(s.value[i] + d[i], 3)
+
+    except Exception as e:
+        print(f"⚠️ Optimization failed: {e}, falling back to greedy alignment.")
+
+        current_time = 0.0
+        for i in range(N):
+            original_segments[i]['start'] = round(current_time, 3)
+            original_segments[i]['end'] = round(current_time + generated_durations[i], 3)
+            current_time += generated_durations[i]
+
+    return original_segments
 def process_entry(entry, i, tts_model, video_width, video_height, process_mode, target_language, font_path, speaker_sample_paths=None):
     logger.debug(f"Processing entry {i}: {entry}")
     error_message = None
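The added solve_optimal_alignment() is a small convex quadratic program: it picks start times s so that each dubbed segment's midpoint s[i] + d[i]/2 stays close to the original midpoint m[i], while segments never overlap and the last one ends exactly at the video's end. A minimal standalone sketch of the same formulation, with made-up toy durations:

    import numpy as np
    import cvxpy as cp

    d = np.array([2.0, 3.5, 1.5])   # dubbed durations in seconds (toy values)
    m = np.array([1.5, 5.0, 8.5])   # midpoints of the original segments
    total = 10.0                    # video duration

    s = cp.Variable(3)              # start times to solve for
    objective = cp.Minimize(cp.sum_squares(s + d / 2 - m))
    constraints = [s[0] >= 0,                  # no segment before t = 0
                   s[0] + d[0] <= s[1],        # segments must not overlap
                   s[1] + d[1] <= s[2],
                   s[2] + d[2] == total]       # last segment ends with the video
    cp.Problem(objective, constraints).solve()
    print(np.round(s.value, 2))     # ≈ [0.5, 3.25, 8.5] for this toy instance

Because the problem is convex, the solver returns the global optimum whenever one exists; the greedy fallback in the function simply packs segments back to back from t = 0.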
@@ -394,6 +435,7 @@ def process_entry(entry, i, tts_model, video_width, video_height, process_mode, target_language, font_path, speaker_sample_paths=None):
     txt_clip = None
 
     audio_segment = None
+    actual_duration = 0.0
     if process_mode > 1:
         try:
             segment_audio_path = f"segment_{i}_voiceover.wav"
@@ -402,10 +444,9 @@ def process_entry(entry, i, tts_model, video_width, video_height, process_mode, target_language, font_path, speaker_sample_paths=None):
 
             speaker = entry.get("speaker", "default")
             speaker_wav_path = f"speaker_{speaker}_sample.wav"
-
-            # Assume this is the list of supported languages for the TTS model
+
             supported_languages = tts_model.synthesizer.tts_model.language_manager.name_to_id.keys()
-
+
             if process_mode > 2 and speaker_wav_path and os.path.exists(speaker_wav_path) and target_language in supported_languages:
                 generate_voiceover_clone(entry['translated'], tts_model, desired_speed, target_language, speaker_wav_path, segment_audio_path)
             else:
@@ -415,14 +456,9 @@ def process_entry(entry, i, tts_model, video_width, video_height, process_mode, target_language, font_path, speaker_sample_paths=None):
                 raise FileNotFoundError(f"Voiceover file not generated at: {segment_audio_path}")
 
             audio_clip = AudioFileClip(segment_audio_path)
-
+            actual_duration = audio_clip.duration
 
-
-            silence_duration = desired_duration - audio_clip.duration
-            audio_clip = concatenate_audioclips([audio_clip, silence(duration=silence_duration)])
-            logger.info(f"Padded audio with {silence_duration} seconds of silence.")
-
-            audio_segment = audio_clip.set_start(entry["start"]).set_duration(desired_duration)
+            audio_segment = audio_clip  # Do not set start here, alignment happens later
 
         except Exception as e:
             err = f"❌ Failed to generate audio segment for entry {i}: {e}"
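With the silence padding removed, process_entry() now hands back the raw AudioFileClip together with its measured length, and timing is applied only after the solver has run. Under moviepy 1.x that deferred placement looks roughly like this sketch (toy numbers):

    from moviepy.editor import AudioFileClip

    clip = AudioFileClip("segment_0_voiceover.wav")   # actual_duration = clip.duration
    placed = clip.set_start(0.5).set_duration(2.75)   # start/end chosen by the solver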
@@ -430,28 +466,31 @@ def process_entry(entry, i, tts_model, video_width, video_height, process_mode, target_language, font_path, speaker_sample_paths=None):
             error_message = error_message + " | " + err if error_message else err
             audio_segment = None
 
-    return i, txt_clip, audio_segment, error_message
-
+    return i, txt_clip, audio_segment, actual_duration, error_message
+
+
 def add_transcript_voiceover(video_path, translated_json, output_path, process_mode, target_language="en", speaker_sample_paths=None, background_audio_path="background_segments.wav"):
+
     video = VideoFileClip(video_path)
     font_path = "./NotoSansSC-Regular.ttf"
 
     text_clips = []
     audio_segments = []
+    actual_durations = []
     error_messages = []
-
+
     if process_mode == 3:
         global tts_model
         if tts_model is None:
             try:
                 print("🔄 Loading XTTS model...")
+                from TTS.api import TTS
                 tts_model = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts")
                 print("✅ XTTS model loaded successfully.")
             except Exception as e:
                 print("❌ Error loading XTTS model:")
                 traceback.print_exc()
                 return f"Error loading XTTS model: {e}"
-    ## Need to implmenet backup option.
 
     with concurrent.futures.ThreadPoolExecutor() as executor:
         futures = [executor.submit(process_entry, entry, i, tts_model, video.w, video.h, process_mode, target_language, font_path, speaker_sample_paths)
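Moving from TTS.api import TTS inside the try block is a lazy-import pattern: the heavy Coqui TTS dependency is only imported when process_mode == 3 actually runs, and an import failure lands in the same error handler as a model-load failure. A self-contained sketch of the same idea:

    import traceback

    def load_tts_model():
        try:
            from TTS.api import TTS  # deferred: only paid for when voice cloning is requested
            return TTS(model_name="tts_models/multilingual/multi-dataset/your_tts")
        except Exception:
            traceback.print_exc()
            return None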
@@ -460,50 +499,47 @@ def add_transcript_voiceover(video_path, translated_json, output_path, process_mode, target_language="en", speaker_sample_paths=None, background_audio_path="background_segments.wav"):
     results = []
     for future in concurrent.futures.as_completed(futures):
         try:
-            i, txt_clip, audio_segment, error = future.result()
-            results.append((i, txt_clip, audio_segment))
+            i, txt_clip, audio_segment, actual_duration, error = future.result()
+            results.append((i, txt_clip, audio_segment, actual_duration))
             if error:
                 error_messages.append(f"[Entry {i}] {error}")
         except Exception as e:
             err = f"❌ Unexpected error in future result: {e}"
-            logger.error(err)
             error_messages.append(err)
 
-    # Sort by entry index to ensure order
     results.sort(key=lambda x: x[0])
-    text_clips = [clip for _, clip, _ in results if clip]
-    if
-
+    text_clips = [clip for _, clip, _, _ in results if clip]
+    generated_durations = [dur for _, _, _, dur in results if dur > 0]
+
+    # Align using optimization (modifies translated_json in-place)
+    translated_json = solve_optimal_alignment(translated_json, generated_durations, video.duration)
+
+    # Set aligned timings
+    audio_segments = []
+    for i, entry in enumerate(translated_json):
+        segment = results[i][2]  # AudioFileClip
+        if segment:
+            segment = segment.set_start(entry['start']).set_duration(entry['end'] - entry['start'])
+            audio_segments.append(segment)
 
     final_video = CompositeVideoClip([video] + text_clips)
 
-    if process_mode>1 and audio_segments:
+    if process_mode > 1 and audio_segments:
         try:
             voice_audio = CompositeAudioClip(audio_segments).set_duration(video.duration)
 
             if background_audio_path and os.path.exists(background_audio_path):
                 background_audio = AudioFileClip(background_audio_path).set_duration(video.duration)
                 final_audio = CompositeAudioClip([voice_audio, background_audio])
-                # final_audio = voice_audio
-                logger.info("✅ Background audio loaded and merged with voiceover.")
             else:
                 final_audio = voice_audio
-                logger.info("⚠️ No background audio found. Using voiceover only.")
 
             final_video = final_video.set_audio(final_audio)
 
         except Exception as e:
-
-
-    logger.info(f"Saving the final video to: {output_path}")
-    final_video.write_videofile(output_path, codec="libx264", audio_codec="aac")
+            print(f"❌ Failed to set audio: {e}")
 
-
-
-    if error_messages:
-        logger.warning("⚠️ Errors encountered during processing:")
-        for msg in error_messages:
-            logger.warning(msg)
+    final_video.write_videofile(output_path, codec="libx264", audio_codec="aac")
 
     return error_messages
 
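One caveat worth flagging for a follow-up: generated_durations keeps only entries with dur > 0, so a single failed entry makes its length differ from len(translated_json), and the d and m arrays inside solve_optimal_alignment() would then disagree in size (the newly added actual_durations list is also initialized but never populated, as far as this diff shows). A defensive variant (a sketch, not part of this commit) keeps one duration per entry so indices stay aligned:

    # Sketch: one duration per entry, zeros kept, so N == len(d) always holds.
    generated_durations = [dur for _, _, _, dur in results]
    translated_json = solve_optimal_alignment(translated_json, generated_durations, video.duration)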