qqwjq1981 committed on
Commit
4e27cf9
·
verified ·
1 Parent(s): 5f6148c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -23
app.py CHANGED
@@ -409,11 +409,11 @@ def create_subtitle_clip_pil(text, start_time, end_time, video_width, video_heig
409
  logger.error(f"\u274c Failed to create subtitle clip: {e}")
410
  return None
411
 
412
-
413
  def solve_optimal_alignment(original_segments, generated_durations, total_duration):
414
  """
415
  Aligns speech segments using quadratic programming. If optimization fails,
416
  applies greedy fallback: center shorter segments, stretch longer ones.
 
417
  """
418
  N = len(original_segments)
419
  d = np.array(generated_durations)
@@ -437,9 +437,13 @@ def solve_optimal_alignment(original_segments, generated_durations, total_duration):
437
  for i in range(N):
438
  original_segments[i]['start'] = round(s.value[i], 3)
439
  original_segments[i]['end'] = round(s.value[i] + d[i], 3)
 
 
 
 
440
 
441
  except Exception as e:
442
- print(f"⚠️ Optimization failed: {e}, falling back to greedy alignment.")
443
 
444
  for i in range(N):
445
  orig_start = original_segments[i]['start']
@@ -456,12 +460,10 @@ def solve_optimal_alignment(original_segments, generated_durations, total_duration):
456
  new_start = orig_start - extra
457
  new_end = orig_end + extra
458
 
459
- # Prevent overlap with previous
460
  if i > 0:
461
  prev_end = original_segments[i - 1]['end']
462
  new_start = max(new_start, prev_end + 0.01)
463
 
464
- # Prevent overlap with next
465
  if i < N - 1:
466
  next_start = original_segments[i + 1]['start']
467
  new_end = min(new_end, next_start - 0.01)
@@ -473,6 +475,11 @@ def solve_optimal_alignment(original_segments, generated_durations, total_duration):
473
  original_segments[i]['start'] = round(new_start, 3)
474
  original_segments[i]['end'] = round(new_end, 3)
475
 
 
 
 
 
 
476
  return original_segments
477
 
478
  def get_frame_image_bytes(video, t):
@@ -706,7 +713,10 @@ def generate_voiceover_clone(full_text, tts_model, desired_speed, target_language
706
  speed=desired_speed,
707
  split_sentences=True
708
  )
709
- msg = "✅ Voice cloning completed successfully."
 
 
 
710
  logger.info(msg)
711
  return output_audio_path, msg, None
712
 
@@ -718,9 +728,9 @@ def generate_voiceover_clone(full_text, tts_model, desired_speed, target_languag
718
 
719
  def apply_adaptive_speed(translated_json_raw, source_language, target_language, k=3.0, default_prior_speed=5.0):
720
  """
721
- Adds a `speed` (chars/sec) and `target_duration` (sec) field to each segment
722
- using shrinkage-based estimation and language stretch ratios.
723
- Optionally modulates based on tone or style tags (e.g., "dramatic", "urgent").
724
  """
725
  translated_json = copy.deepcopy(translated_json_raw)
726
 
@@ -744,8 +754,8 @@ def apply_adaptive_speed(translated_json_raw, source_language, target_language, k=3.0, default_prior_speed=5.0):
744
 
745
  # Optional style modulation factor
746
  style_modifiers = {
747
- "dramatic": 0.9, # slower
748
- "urgent": 1.1, # faster
749
  "neutral": 1.0
750
  }
751
 
@@ -758,38 +768,45 @@ def apply_adaptive_speed(translated_json_raw, source_language, target_language, k=3.0, default_prior_speed=5.0):
758
  category = entry.get("category", "drama")
759
  source_lang = source_language
760
  target_lang = target_language
761
- style = entry.get("style", "neutral") # Optional field like "dramatic"
762
 
763
  # Observed speed from original
764
  base_text = original_text or translated_text
765
  obs_speed = len(base_text) / duration
766
 
767
- # Prior speed from category + language
768
  prior_speed = priors.get((category, target_lang), default_prior_speed)
769
 
770
- # Shrinkage estimate
771
  shrink_speed = (duration * obs_speed + k * prior_speed) / (duration + k)
772
 
773
- # Adjust for language-specific pacing
774
  ratio = lang_ratio.get((source_lang, target_lang), 1.0)
775
  adjusted_speed = shrink_speed * ratio
776
 
777
- # Optional tone/style modulation (if available)
778
- mod = style_modifiers.get(style.lower(), 1.0)
779
  adjusted_speed *= mod
780
 
781
- # Final estimated duration for synthesized segment
 
 
 
 
 
 
782
  target_chars = len(translated_text)
783
- target_duration = round(target_chars / adjusted_speed, 2)
784
 
785
- # Logging for debugging
786
  logger.info(
787
- f"Segment {idx}: dur={duration:.2f}s, obs={obs_speed:.2f}, "
788
- f"prior={prior_speed:.2f}, shrink={shrink_speed:.2f}, "
789
- f"final_speed={adjusted_speed:.2f}, target_dur={target_duration:.2f}s"
 
790
  )
791
 
792
- entry["speed"] = round(adjusted_speed, 3)
793
  entry["target_duration"] = target_duration
794
 
795
  return translated_json
 
409
  logger.error(f"\u274c Failed to create subtitle clip: {e}")
410
  return None
411
 
 
412
  def solve_optimal_alignment(original_segments, generated_durations, total_duration):
413
  """
414
  Aligns speech segments using quadratic programming. If optimization fails,
415
  applies greedy fallback: center shorter segments, stretch longer ones.
416
+ Logs alignment results for traceability.
417
  """
418
  N = len(original_segments)
419
  d = np.array(generated_durations)
 
437
  for i in range(N):
438
  original_segments[i]['start'] = round(s.value[i], 3)
439
  original_segments[i]['end'] = round(s.value[i] + d[i], 3)
440
+ logger.info(
441
+ f"[OPT] Segment {i}: duration={d[i]:.2f}s | start={original_segments[i]['start']:.2f}s | "
442
+ f"end={original_segments[i]['end']:.2f}s | mid={m[i]:.2f}s"
443
+ )
444
 
445
  except Exception as e:
446
+ logger.warning(f"⚠️ Optimization failed: {e}, falling back to greedy alignment.")
447
 
448
  for i in range(N):
449
  orig_start = original_segments[i]['start']
 
460
  new_start = orig_start - extra
461
  new_end = orig_end + extra
462
 
 
463
  if i > 0:
464
  prev_end = original_segments[i - 1]['end']
465
  new_start = max(new_start, prev_end + 0.01)
466
 
 
467
  if i < N - 1:
468
  next_start = original_segments[i + 1]['start']
469
  new_end = min(new_end, next_start - 0.01)
 
475
  original_segments[i]['start'] = round(new_start, 3)
476
  original_segments[i]['end'] = round(new_end, 3)
477
 
478
+ logger.info(
479
+ f"[FALLBACK] Segment {i}: duration={gen_duration:.2f}s | start={new_start:.2f}s | "
480
+ f"end={new_end:.2f}s | original_mid={orig_mid:.2f}s"
481
+ )
482
+
483
  return original_segments
484
 
485
  def get_frame_image_bytes(video, t):
 
713
  speed=desired_speed,
714
  split_sentences=True
715
  )
716
+ msg = (
717
+ f"✅ Voice cloning completed successfully. "
718
+ f"[Speaker Wav: {speaker_wav_path}] [Speed: {desired_speed}]"
719
+ )
720
  logger.info(msg)
721
  return output_audio_path, msg, None
722
 
 
728
 
729
  def apply_adaptive_speed(translated_json_raw, source_language, target_language, k=3.0, default_prior_speed=5.0):
730
  """
731
+ Adds `speed` (relative, 1.0 = normal speed) and `target_duration` (sec) to each segment
732
+ using shrinkage-based estimation, language stretch ratios, and optional style modifiers.
733
+ Speeds are clamped to [0.85, 1.7] to avoid unnatural TTS behavior.
734
  """
735
  translated_json = copy.deepcopy(translated_json_raw)
736
 
 
754
 
755
  # Optional style modulation factor
756
  style_modifiers = {
757
+ "dramatic": 0.9,
758
+ "urgent": 1.1,
759
  "neutral": 1.0
760
  }
761
 
 
768
  category = entry.get("category", "drama")
769
  source_lang = source_language
770
  target_lang = target_language
771
+ style = entry.get("style", "neutral").lower()
772
 
773
  # Observed speed from original
774
  base_text = original_text or translated_text
775
  obs_speed = len(base_text) / duration
776
 
777
+ # Prior speed
778
  prior_speed = priors.get((category, target_lang), default_prior_speed)
779
 
780
+ # Shrinkage
781
  shrink_speed = (duration * obs_speed + k * prior_speed) / (duration + k)
782
 
783
+ # Language pacing adjustment
784
  ratio = lang_ratio.get((source_lang, target_lang), 1.0)
785
  adjusted_speed = shrink_speed * ratio
786
 
787
+ # Style modulation
788
+ mod = style_modifiers.get(style, 1.0)
789
  adjusted_speed *= mod
790
 
791
+ # Final relative speed (normalized to prior)
792
+ relative_speed = adjusted_speed / prior_speed
793
+
794
+ # Clamp relative speed to [0.85, 1.7]
795
+ relative_speed = max(0.85, min(1.7, relative_speed))
796
+
797
+ # Compute target duration for synthesis
798
  target_chars = len(translated_text)
799
+ target_duration = round(target_chars / (prior_speed * relative_speed), 2)
800
 
801
+ # Logging
802
  logger.info(
803
+ f"[Segment {idx}] dur={duration:.2f}s | obs_speed={obs_speed:.2f} | prior={prior_speed:.2f} | "
804
+ f"shrinked={shrink_speed:.2f} | lang_ratio={ratio} | style_mod={mod} | "
805
+ f"adj_speed={adjusted_speed:.2f} | rel_speed={relative_speed:.2f} | "
806
+ f"target_dur={target_duration:.2f}s"
807
  )
808
 
809
+ entry["speed"] = round(relative_speed, 3)
810
  entry["target_duration"] = target_duration
811
 
812
  return translated_json