qqwjq1981 committed on
Commit
d301d25
·
verified ·
1 Parent(s): dd18209

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +71 -35
app.py CHANGED
@@ -1,4 +1,5 @@
1
  import numpy as np
 
2
  import re
3
  import concurrent.futures
4
  import gradio as gr
@@ -382,6 +383,46 @@ def create_subtitle_clip_pil(text, start_time, end_time, video_width, video_heig
382
  logger.error(f"\u274c Failed to create subtitle clip: {e}")
383
  return None
384
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
385
  def process_entry(entry, i, tts_model, video_width, video_height, process_mode, target_language, font_path, speaker_sample_paths=None):
386
  logger.debug(f"Processing entry {i}: {entry}")
387
  error_message = None
@@ -394,6 +435,7 @@ def process_entry(entry, i, tts_model, video_width, video_height, process_mode,
394
  txt_clip = None
395
 
396
  audio_segment = None
 
397
  if process_mode > 1:
398
  try:
399
  segment_audio_path = f"segment_{i}_voiceover.wav"
@@ -402,10 +444,9 @@ def process_entry(entry, i, tts_model, video_width, video_height, process_mode,
402
 
403
  speaker = entry.get("speaker", "default")
404
  speaker_wav_path = f"speaker_{speaker}_sample.wav"
405
-
406
- # Assume this is the list of supported languages for the TTS model
407
  supported_languages = tts_model.synthesizer.tts_model.language_manager.name_to_id.keys()
408
-
409
  if process_mode > 2 and speaker_wav_path and os.path.exists(speaker_wav_path) and target_language in supported_languages:
410
  generate_voiceover_clone(entry['translated'], tts_model, desired_speed, target_language, speaker_wav_path, segment_audio_path)
411
  else:
@@ -415,14 +456,9 @@ def process_entry(entry, i, tts_model, video_width, video_height, process_mode,
415
  raise FileNotFoundError(f"Voiceover file not generated at: {segment_audio_path}")
416
 
417
  audio_clip = AudioFileClip(segment_audio_path)
418
- logger.debug(f"Audio clip duration: {audio_clip.duration}, Desired duration: {desired_duration}")
419
 
420
- if audio_clip.duration < desired_duration:
421
- silence_duration = desired_duration - audio_clip.duration
422
- audio_clip = concatenate_audioclips([audio_clip, silence(duration=silence_duration)])
423
- logger.info(f"Padded audio with {silence_duration} seconds of silence.")
424
-
425
- audio_segment = audio_clip.set_start(entry["start"]).set_duration(desired_duration)
426
 
427
  except Exception as e:
428
  err = f"❌ Failed to generate audio segment for entry {i}: {e}"
@@ -430,28 +466,31 @@ def process_entry(entry, i, tts_model, video_width, video_height, process_mode,
430
  error_message = error_message + " | " + err if error_message else err
431
  audio_segment = None
432
 
433
- return i, txt_clip, audio_segment, error_message
434
-
 
435
  def add_transcript_voiceover(video_path, translated_json, output_path, process_mode, target_language="en", speaker_sample_paths=None, background_audio_path="background_segments.wav"):
 
436
  video = VideoFileClip(video_path)
437
  font_path = "./NotoSansSC-Regular.ttf"
438
 
439
  text_clips = []
440
  audio_segments = []
 
441
  error_messages = []
442
-
443
  if process_mode == 3:
444
  global tts_model
445
  if tts_model is None:
446
  try:
447
  print("πŸ”„ Loading XTTS model...")
 
448
  tts_model = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts")
449
  print("βœ… XTTS model loaded successfully.")
450
  except Exception as e:
451
  print("❌ Error loading XTTS model:")
452
  traceback.print_exc()
453
  return f"Error loading XTTS model: {e}"
454
- ## Need to implmenet backup option.
455
 
456
  with concurrent.futures.ThreadPoolExecutor() as executor:
457
  futures = [executor.submit(process_entry, entry, i, tts_model, video.w, video.h, process_mode, target_language, font_path, speaker_sample_paths)
@@ -460,50 +499,47 @@ def add_transcript_voiceover(video_path, translated_json, output_path, process_m
460
  results = []
461
  for future in concurrent.futures.as_completed(futures):
462
  try:
463
- i, txt_clip, audio_segment, error = future.result()
464
- results.append((i, txt_clip, audio_segment))
465
  if error:
466
  error_messages.append(f"[Entry {i}] {error}")
467
  except Exception as e:
468
  err = f"❌ Unexpected error in future result: {e}"
469
- logger.error(err)
470
  error_messages.append(err)
471
 
472
- # Sort by entry index to ensure order
473
  results.sort(key=lambda x: x[0])
474
- text_clips = [clip for _, clip, _ in results if clip]
475
- if process_mode>1:
476
- audio_segments = [segment for _, _, segment in results if segment]
 
 
 
 
 
 
 
 
 
 
477
 
478
  final_video = CompositeVideoClip([video] + text_clips)
479
 
480
- if process_mode>1 and audio_segments:
481
  try:
482
  voice_audio = CompositeAudioClip(audio_segments).set_duration(video.duration)
483
 
484
  if background_audio_path and os.path.exists(background_audio_path):
485
  background_audio = AudioFileClip(background_audio_path).set_duration(video.duration)
486
  final_audio = CompositeAudioClip([voice_audio, background_audio])
487
- # final_audio = voice_audio
488
- logger.info("βœ… Background audio loaded and merged with voiceover.")
489
  else:
490
  final_audio = voice_audio
491
- logger.info("⚠️ No background audio found. Using voiceover only.")
492
 
493
  final_video = final_video.set_audio(final_audio)
494
 
495
  except Exception as e:
496
- logger.error(f"❌ Failed to set audio: {e}")
497
-
498
- logger.info(f"Saving the final video to: {output_path}")
499
- final_video.write_videofile(output_path, codec="libx264", audio_codec="aac")
500
 
501
- logger.info("Video processing completed successfully.")
502
-
503
- if error_messages:
504
- logger.warning("⚠️ Errors encountered during processing:")
505
- for msg in error_messages:
506
- logger.warning(msg)
507
 
508
  return error_messages
509
 
 
1
  import numpy as np
2
+ import cvxpy as cp
3
  import re
4
  import concurrent.futures
5
  import gradio as gr
 
383
  logger.error(f"\u274c Failed to create subtitle clip: {e}")
384
  return None
385
 
386
+
387
+
388
def solve_optimal_alignment(original_segments, generated_durations, total_duration):
    """Assign non-overlapping start/end times to generated speech segments.

    Solves a least-squares problem (via cvxpy) that keeps each generated
    segment's midpoint close to the midpoint of the corresponding original
    segment, subject to: the first segment starts at or after 0, segments do
    not overlap, and the last segment ends exactly at ``total_duration``.
    If the solver is unavailable or fails, falls back to a greedy
    back-to-back layout starting at 0 (NOTE: the fallback ignores
    ``total_duration``).

    Modifies ``original_segments`` in place (times rounded to 3 decimals)
    and returns it.

    Parameters
    ----------
    original_segments : list[dict]
        Segments with numeric 'start' and 'end' keys, in chronological order.
    generated_durations : list[float]
        Duration (seconds) of each generated audio clip, one per segment.
    total_duration : float
        Total duration of the target video, in seconds.
    """
    N = len(original_segments)
    if N == 0:
        # Nothing to align; avoid indexing an empty cvxpy Variable below.
        return original_segments

    # Guard against a caller passing fewer durations than segments
    # (e.g. when zero-length clips were filtered out upstream).
    usable = min(N, len(generated_durations))

    try:
        if usable != N:
            raise ValueError(
                f"duration/segment count mismatch: {len(generated_durations)} vs {N}"
            )

        d = np.asarray(generated_durations, dtype=float)
        # Midpoints of the original timings we try to stay close to.
        m = np.array([(seg['start'] + seg['end']) / 2 for seg in original_segments])

        s = cp.Variable(N)
        objective = cp.Minimize(cp.sum_squares(s + d / 2 - m))

        constraints = [s[0] >= 0]
        for i in range(N - 1):
            # Keep chronological order and forbid overlap.
            constraints.append(s[i] + d[i] <= s[i + 1])
        constraints.append(s[N - 1] + d[N - 1] == total_duration)

        problem = cp.Problem(objective, constraints)
        problem.solve()

        if s.value is None:
            raise ValueError("Solver failed")

        for i in range(N):
            # Cast to plain float so the dicts stay JSON-serializable.
            original_segments[i]['start'] = round(float(s.value[i]), 3)
            original_segments[i]['end'] = round(float(s.value[i] + d[i]), 3)

    except Exception as e:
        print(f"⚠️ Optimization failed: {e}, falling back to greedy alignment.")

        current_time = 0.0
        # Only lay out segments we actually have a duration for; any extra
        # segments keep their original timings instead of raising IndexError.
        for i in range(usable):
            original_segments[i]['start'] = round(current_time, 3)
            original_segments[i]['end'] = round(current_time + generated_durations[i], 3)
            current_time += generated_durations[i]

    return original_segments
426
  def process_entry(entry, i, tts_model, video_width, video_height, process_mode, target_language, font_path, speaker_sample_paths=None):
427
  logger.debug(f"Processing entry {i}: {entry}")
428
  error_message = None
 
435
  txt_clip = None
436
 
437
  audio_segment = None
438
+ actual_duration = 0.0
439
  if process_mode > 1:
440
  try:
441
  segment_audio_path = f"segment_{i}_voiceover.wav"
 
444
 
445
  speaker = entry.get("speaker", "default")
446
  speaker_wav_path = f"speaker_{speaker}_sample.wav"
447
+
 
448
  supported_languages = tts_model.synthesizer.tts_model.language_manager.name_to_id.keys()
449
+
450
  if process_mode > 2 and speaker_wav_path and os.path.exists(speaker_wav_path) and target_language in supported_languages:
451
  generate_voiceover_clone(entry['translated'], tts_model, desired_speed, target_language, speaker_wav_path, segment_audio_path)
452
  else:
 
456
  raise FileNotFoundError(f"Voiceover file not generated at: {segment_audio_path}")
457
 
458
  audio_clip = AudioFileClip(segment_audio_path)
459
+ actual_duration = audio_clip.duration
460
 
461
+ audio_segment = audio_clip # Do not set start here, alignment happens later
 
 
 
 
 
462
 
463
  except Exception as e:
464
  err = f"❌ Failed to generate audio segment for entry {i}: {e}"
 
466
  error_message = error_message + " | " + err if error_message else err
467
  audio_segment = None
468
 
469
+ return i, txt_clip, audio_segment, actual_duration, error_message
470
+
471
+
472
  def add_transcript_voiceover(video_path, translated_json, output_path, process_mode, target_language="en", speaker_sample_paths=None, background_audio_path="background_segments.wav"):
473
+
474
  video = VideoFileClip(video_path)
475
  font_path = "./NotoSansSC-Regular.ttf"
476
 
477
  text_clips = []
478
  audio_segments = []
479
+ actual_durations = []
480
  error_messages = []
481
+
482
  if process_mode == 3:
483
  global tts_model
484
  if tts_model is None:
485
  try:
486
  print("πŸ”„ Loading XTTS model...")
487
+ from TTS.api import TTS
488
  tts_model = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts")
489
  print("βœ… XTTS model loaded successfully.")
490
  except Exception as e:
491
  print("❌ Error loading XTTS model:")
492
  traceback.print_exc()
493
  return f"Error loading XTTS model: {e}"
 
494
 
495
  with concurrent.futures.ThreadPoolExecutor() as executor:
496
  futures = [executor.submit(process_entry, entry, i, tts_model, video.w, video.h, process_mode, target_language, font_path, speaker_sample_paths)
 
499
  results = []
500
  for future in concurrent.futures.as_completed(futures):
501
  try:
502
+ i, txt_clip, audio_segment, actual_duration, error = future.result()
503
+ results.append((i, txt_clip, audio_segment, actual_duration))
504
  if error:
505
  error_messages.append(f"[Entry {i}] {error}")
506
  except Exception as e:
507
  err = f"❌ Unexpected error in future result: {e}"
 
508
  error_messages.append(err)
509
 
 
510
  results.sort(key=lambda x: x[0])
511
+ text_clips = [clip for _, clip, _, _ in results if clip]
512
+ generated_durations = [dur for _, _, _, dur in results if dur > 0]
513
+
514
+ # Align using optimization (modifies translated_json in-place)
515
+ translated_json = solve_optimal_alignment(translated_json, generated_durations, video.duration)
516
+
517
+ # Set aligned timings
518
+ audio_segments = []
519
+ for i, entry in enumerate(translated_json):
520
+ segment = results[i][2] # AudioFileClip
521
+ if segment:
522
+ segment = segment.set_start(entry['start']).set_duration(entry['end'] - entry['start'])
523
+ audio_segments.append(segment)
524
 
525
  final_video = CompositeVideoClip([video] + text_clips)
526
 
527
+ if process_mode > 1 and audio_segments:
528
  try:
529
  voice_audio = CompositeAudioClip(audio_segments).set_duration(video.duration)
530
 
531
  if background_audio_path and os.path.exists(background_audio_path):
532
  background_audio = AudioFileClip(background_audio_path).set_duration(video.duration)
533
  final_audio = CompositeAudioClip([voice_audio, background_audio])
 
 
534
  else:
535
  final_audio = voice_audio
 
536
 
537
  final_video = final_video.set_audio(final_audio)
538
 
539
  except Exception as e:
540
+ print(f"❌ Failed to set audio: {e}")
 
 
 
541
 
542
+ final_video.write_videofile(output_path, codec="libx264", audio_codec="aac")
 
 
 
 
 
543
 
544
  return error_messages
545