Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -409,11 +409,11 @@ def create_subtitle_clip_pil(text, start_time, end_time, video_width, video_heig
|
|
409 |
logger.error(f"\u274c Failed to create subtitle clip: {e}")
|
410 |
return None
|
411 |
|
412 |
-
|
413 |
def solve_optimal_alignment(original_segments, generated_durations, total_duration):
|
414 |
"""
|
415 |
Aligns speech segments using quadratic programming. If optimization fails,
|
416 |
applies greedy fallback: center shorter segments, stretch longer ones.
|
|
|
417 |
"""
|
418 |
N = len(original_segments)
|
419 |
d = np.array(generated_durations)
|
@@ -437,9 +437,13 @@ def solve_optimal_alignment(original_segments, generated_durations, total_durati
|
|
437 |
for i in range(N):
|
438 |
original_segments[i]['start'] = round(s.value[i], 3)
|
439 |
original_segments[i]['end'] = round(s.value[i] + d[i], 3)
|
|
|
|
|
|
|
|
|
440 |
|
441 |
except Exception as e:
|
442 |
-
|
443 |
|
444 |
for i in range(N):
|
445 |
orig_start = original_segments[i]['start']
|
@@ -456,12 +460,10 @@ def solve_optimal_alignment(original_segments, generated_durations, total_durati
|
|
456 |
new_start = orig_start - extra
|
457 |
new_end = orig_end + extra
|
458 |
|
459 |
-
# Prevent overlap with previous
|
460 |
if i > 0:
|
461 |
prev_end = original_segments[i - 1]['end']
|
462 |
new_start = max(new_start, prev_end + 0.01)
|
463 |
|
464 |
-
# Prevent overlap with next
|
465 |
if i < N - 1:
|
466 |
next_start = original_segments[i + 1]['start']
|
467 |
new_end = min(new_end, next_start - 0.01)
|
@@ -473,6 +475,11 @@ def solve_optimal_alignment(original_segments, generated_durations, total_durati
|
|
473 |
original_segments[i]['start'] = round(new_start, 3)
|
474 |
original_segments[i]['end'] = round(new_end, 3)
|
475 |
|
|
|
|
|
|
|
|
|
|
|
476 |
return original_segments
|
477 |
|
478 |
def get_frame_image_bytes(video, t):
|
@@ -706,7 +713,10 @@ def generate_voiceover_clone(full_text, tts_model, desired_speed, target_languag
|
|
706 |
speed=desired_speed,
|
707 |
split_sentences=True
|
708 |
)
|
709 |
-
msg =
|
|
|
|
|
|
|
710 |
logger.info(msg)
|
711 |
return output_audio_path, msg, None
|
712 |
|
@@ -718,9 +728,9 @@ def generate_voiceover_clone(full_text, tts_model, desired_speed, target_languag
|
|
718 |
|
719 |
def apply_adaptive_speed(translated_json_raw, source_language, target_language, k=3.0, default_prior_speed=5.0):
|
720 |
"""
|
721 |
-
Adds
|
722 |
-
using shrinkage-based estimation
|
723 |
-
|
724 |
"""
|
725 |
translated_json = copy.deepcopy(translated_json_raw)
|
726 |
|
@@ -744,8 +754,8 @@ def apply_adaptive_speed(translated_json_raw, source_language, target_language,
|
|
744 |
|
745 |
# Optional style modulation factor
|
746 |
style_modifiers = {
|
747 |
-
"dramatic": 0.9,
|
748 |
-
"urgent": 1.1,
|
749 |
"neutral": 1.0
|
750 |
}
|
751 |
|
@@ -758,38 +768,45 @@ def apply_adaptive_speed(translated_json_raw, source_language, target_language,
|
|
758 |
category = entry.get("category", "drama")
|
759 |
source_lang = source_language
|
760 |
target_lang = target_language
|
761 |
-
style = entry.get("style", "neutral")
|
762 |
|
763 |
# Observed speed from original
|
764 |
base_text = original_text or translated_text
|
765 |
obs_speed = len(base_text) / duration
|
766 |
|
767 |
-
# Prior speed
|
768 |
prior_speed = priors.get((category, target_lang), default_prior_speed)
|
769 |
|
770 |
-
# Shrinkage
|
771 |
shrink_speed = (duration * obs_speed + k * prior_speed) / (duration + k)
|
772 |
|
773 |
-
#
|
774 |
ratio = lang_ratio.get((source_lang, target_lang), 1.0)
|
775 |
adjusted_speed = shrink_speed * ratio
|
776 |
|
777 |
-
#
|
778 |
-
mod = style_modifiers.get(style
|
779 |
adjusted_speed *= mod
|
780 |
|
781 |
-
# Final
|
|
|
|
|
|
|
|
|
|
|
|
|
782 |
target_chars = len(translated_text)
|
783 |
-
target_duration = round(target_chars /
|
784 |
|
785 |
-
# Logging
|
786 |
logger.info(
|
787 |
-
f"Segment {idx}
|
788 |
-
f"
|
789 |
-
f"
|
|
|
790 |
)
|
791 |
|
792 |
-
entry["speed"] = round(
|
793 |
entry["target_duration"] = target_duration
|
794 |
|
795 |
return translated_json
|
|
|
409 |
logger.error(f"\u274c Failed to create subtitle clip: {e}")
|
410 |
return None
|
411 |
|
|
|
412 |
def solve_optimal_alignment(original_segments, generated_durations, total_duration):
|
413 |
"""
|
414 |
Aligns speech segments using quadratic programming. If optimization fails,
|
415 |
applies greedy fallback: center shorter segments, stretch longer ones.
|
416 |
+
Logs alignment results for traceability.
|
417 |
"""
|
418 |
N = len(original_segments)
|
419 |
d = np.array(generated_durations)
|
|
|
437 |
for i in range(N):
|
438 |
original_segments[i]['start'] = round(s.value[i], 3)
|
439 |
original_segments[i]['end'] = round(s.value[i] + d[i], 3)
|
440 |
+
logger.info(
|
441 |
+
f"[OPT] Segment {i}: duration={d[i]:.2f}s | start={original_segments[i]['start']:.2f}s | "
|
442 |
+
f"end={original_segments[i]['end']:.2f}s | mid={m[i]:.2f}s"
|
443 |
+
)
|
444 |
|
445 |
except Exception as e:
|
446 |
+
logger.warning(f"⚠️ Optimization failed: {e}, falling back to greedy alignment.")
|
447 |
|
448 |
for i in range(N):
|
449 |
orig_start = original_segments[i]['start']
|
|
|
460 |
new_start = orig_start - extra
|
461 |
new_end = orig_end + extra
|
462 |
|
|
|
463 |
if i > 0:
|
464 |
prev_end = original_segments[i - 1]['end']
|
465 |
new_start = max(new_start, prev_end + 0.01)
|
466 |
|
|
|
467 |
if i < N - 1:
|
468 |
next_start = original_segments[i + 1]['start']
|
469 |
new_end = min(new_end, next_start - 0.01)
|
|
|
475 |
original_segments[i]['start'] = round(new_start, 3)
|
476 |
original_segments[i]['end'] = round(new_end, 3)
|
477 |
|
478 |
+
logger.info(
|
479 |
+
f"[FALLBACK] Segment {i}: duration={gen_duration:.2f}s | start={new_start:.2f}s | "
|
480 |
+
f"end={new_end:.2f}s | original_mid={orig_mid:.2f}s"
|
481 |
+
)
|
482 |
+
|
483 |
return original_segments
|
484 |
|
485 |
def get_frame_image_bytes(video, t):
|
|
|
713 |
speed=desired_speed,
|
714 |
split_sentences=True
|
715 |
)
|
716 |
+
msg = (
|
717 |
+
f"✅ Voice cloning completed successfully. "
|
718 |
+
f"[Speaker Wav: {speaker_wav_path}] [Speed: {desired_speed}]"
|
719 |
+
)
|
720 |
logger.info(msg)
|
721 |
return output_audio_path, msg, None
|
722 |
|
|
|
728 |
|
729 |
def apply_adaptive_speed(translated_json_raw, source_language, target_language, k=3.0, default_prior_speed=5.0):
|
730 |
"""
|
731 |
+
Adds `speed` (relative, 1.0 = normal speed) and `target_duration` (sec) to each segment
|
732 |
+
using shrinkage-based estimation, language stretch ratios, and optional style modifiers.
|
733 |
+
Speeds are clamped to [0.85, 1.7] to avoid unnatural TTS behavior.
|
734 |
"""
|
735 |
translated_json = copy.deepcopy(translated_json_raw)
|
736 |
|
|
|
754 |
|
755 |
# Optional style modulation factor
|
756 |
style_modifiers = {
|
757 |
+
"dramatic": 0.9,
|
758 |
+
"urgent": 1.1,
|
759 |
"neutral": 1.0
|
760 |
}
|
761 |
|
|
|
768 |
category = entry.get("category", "drama")
|
769 |
source_lang = source_language
|
770 |
target_lang = target_language
|
771 |
+
style = entry.get("style", "neutral").lower()
|
772 |
|
773 |
# Observed speed from original
|
774 |
base_text = original_text or translated_text
|
775 |
obs_speed = len(base_text) / duration
|
776 |
|
777 |
+
# Prior speed
|
778 |
prior_speed = priors.get((category, target_lang), default_prior_speed)
|
779 |
|
780 |
+
# Shrinkage
|
781 |
shrink_speed = (duration * obs_speed + k * prior_speed) / (duration + k)
|
782 |
|
783 |
+
# Language pacing adjustment
|
784 |
ratio = lang_ratio.get((source_lang, target_lang), 1.0)
|
785 |
adjusted_speed = shrink_speed * ratio
|
786 |
|
787 |
+
# Style modulation
|
788 |
+
mod = style_modifiers.get(style, 1.0)
|
789 |
adjusted_speed *= mod
|
790 |
|
791 |
+
# Final relative speed (normalized to prior)
|
792 |
+
relative_speed = adjusted_speed / prior_speed
|
793 |
+
|
794 |
+
# Clamp relative speed to [0.85, 1.7]
|
795 |
+
relative_speed = max(0.85, min(1.7, relative_speed))
|
796 |
+
|
797 |
+
# Compute target duration for synthesis
|
798 |
target_chars = len(translated_text)
|
799 |
+
target_duration = round(target_chars / (prior_speed * relative_speed), 2)
|
800 |
|
801 |
+
# Logging
|
802 |
logger.info(
|
803 |
+
f"[Segment {idx}] dur={duration:.2f}s | obs_speed={obs_speed:.2f} | prior={prior_speed:.2f} | "
|
804 |
+
f"shrinked={shrink_speed:.2f} | lang_ratio={ratio} | style_mod={mod} | "
|
805 |
+
f"adj_speed={adjusted_speed:.2f} | rel_speed={relative_speed:.2f} | "
|
806 |
+
f"target_dur={target_duration:.2f}s"
|
807 |
)
|
808 |
|
809 |
+
entry["speed"] = round(relative_speed, 3)
|
810 |
entry["target_duration"] = target_duration
|
811 |
|
812 |
return translated_json
|