Update app.py

app.py CHANGED
@@ -1,6 +1,7 @@
 import numpy as np
 import cvxpy as cp
 import re
+import copy
 import concurrent.futures
 import gradio as gr
 from datetime import datetime
@@ -556,7 +557,7 @@ def process_entry(entry, i, tts_model, video_width, video_height, process_mode,
     try:
         segment_audio_path = f"segment_{i}_voiceover.wav"
         desired_duration = entry["end"] - entry["start"]
-        desired_speed = calibrated_speed(entry['translated'], desired_duration)
+        desired_speed = entry['speed']  # calibrated_speed(entry['translated'], desired_duration)
 
         speaker = entry.get("speaker", "default")
         speaker_wav_path = f"speaker_{speaker}_sample.wav"
@@ -608,7 +609,7 @@ def add_transcript_voiceover(video_path, translated_json, output_path, process_m
 
     with concurrent.futures.ThreadPoolExecutor() as executor:
         futures = [executor.submit(process_entry, entry, i, tts_model, video.w, video.h, process_mode, target_language, font_path, speaker_sample_paths)
-                   for i, entry in enumerate(translated_json)]
+                   for i, entry in enumerate(translated_json_withspeed)]
 
     results = []
     for future in concurrent.futures.as_completed(futures):
@@ -715,6 +716,84 @@ def generate_voiceover_clone(full_text, tts_model, desired_speed, target_languag
         logger.error(traceback.format_exc())
         return None, err_msg, err_msg
 
+def apply_adaptive_speed(translated_json_raw, source_language, target_language, k=3.0, default_prior_speed=5.0):
+    """
+    Adds a `speed` (chars/sec) and `target_duration` (sec) field to each segment
+    using shrinkage-based estimation and language stretch ratios.
+    Optionally modulates based on tone or style tags (e.g., "dramatic", "urgent").
+    """
+    translated_json = copy.deepcopy(translated_json_raw)
+
+    # Prior average speech speeds by (category, target language)
+    priors = {
+        ("drama", "en"): 5.0,
+        ("drama", "zh"): 4.5,
+        ("tutorial", "en"): 5.2,
+        ("tutorial", "zh"): 4.8,
+        ("shortplay", "en"): 5.1,
+        ("shortplay", "zh"): 4.7,
+    }
+
+    # Adjustment ratio based on language pair (source → target)
+    lang_ratio = {
+        ("zh", "en"): 0.85,
+        ("en", "zh"): 1.15,
+        ("zh", "jp"): 1.05,
+        ("en", "ja"): 0.9,
+    }
+
+    # Optional style modulation factor
+    style_modifiers = {
+        "dramatic": 0.9,  # slower
+        "urgent": 1.1,    # faster
+        "neutral": 1.0
+    }
+
+    for idx, entry in enumerate(translated_json):
+        start, end = float(entry.get("start", 0)), float(entry.get("end", 0))
+        duration = max(0.1, end - start)
+
+        original_text = entry.get("original", "")
+        translated_text = entry.get("translated", "")
+        category = entry.get("category", "drama")
+        source_lang = source_language
+        target_lang = target_language
+        style = entry.get("style", "neutral")  # Optional field like "dramatic"
+
+        # Observed speed from original
+        base_text = original_text or translated_text
+        obs_speed = len(base_text) / duration
+
+        # Prior speed from category + language
+        prior_speed = priors.get((category, target_lang), default_prior_speed)
+
+        # Shrinkage estimate
+        shrink_speed = (duration * obs_speed + k * prior_speed) / (duration + k)
+
+        # Adjust for language-specific pacing
+        ratio = lang_ratio.get((source_lang, target_lang), 1.0)
+        adjusted_speed = shrink_speed * ratio
+
+        # Optional tone/style modulation (if available)
+        mod = style_modifiers.get(style.lower(), 1.0)
+        adjusted_speed *= mod
+
+        # Final estimated duration for synthesized segment
+        target_chars = len(translated_text)
+        target_duration = round(target_chars / adjusted_speed, 2)
+
+        # Logging for debugging
+        logger.info(
+            f"Segment {idx}: dur={duration:.2f}s, obs={obs_speed:.2f}, "
+            f"prior={prior_speed:.2f}, shrink={shrink_speed:.2f}, "
+            f"final_speed={adjusted_speed:.2f}, target_dur={target_duration:.2f}s"
+        )
+
+        entry["speed"] = round(adjusted_speed, 3)
+        entry["target_duration"] = target_duration
+
+    return translated_json
+
 def calibrated_speed(text, desired_duration):
     """
     Compute a speed factor to help TTS fit audio into desired duration,
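For intuition, here is a standalone sketch of the shrinkage step above. It is not part of the commit; the segment numbers are invented, and k = 3.0 matches the function's default. A short segment offers little evidence, so its estimate is pulled toward the prior, while a long segment mostly keeps its observed pace.

    # Illustration only: the shrinkage estimate as used in apply_adaptive_speed.
    # Durations and character counts are invented, not from the app.
    k, prior_speed = 3.0, 5.0
    for duration, chars in [(0.5, 8), (10.0, 160)]:
        obs_speed = chars / duration  # observed chars/sec
        shrink_speed = (duration * obs_speed + k * prior_speed) / (duration + k)
        print(f"dur={duration}s obs={obs_speed:.2f} shrink={shrink_speed:.2f}")
    # dur=0.5s obs=16.00 shrink=6.57    (pulled hard toward the prior)
    # dur=10.0s obs=16.00 shrink=13.46  (dominated by the observation)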
@@ -757,14 +836,15 @@ def upload_and_manage(file, target_language, process_mode):
 
     # Step 2: Translate the transcription
     logger.info(f"Translating transcription from {source_language} to {target_language}...")
-    translated_json = translate_text(transcription_json, source_language, target_language)
+    translated_json_raw = translate_text(transcription_json, source_language, target_language)
     logger.info(f"Translation completed. Number of translated segments: {len(translated_json)}")
 
     # translated_json = post_edit_translated_segments(translated_json, file.name)
-
+    translated_json = apply_adaptive_speed(translated_json_raw, source_language, target_language)
+
     # Step 3: Add transcript to video based on timestamps
     logger.info("Adding translated transcript to video...")
-    add_transcript_voiceover(file.name, translated_json, output_video_path, process_mode, target_language)
+    add_transcript_voiceover(file.name, translated_json_speedcontrol, output_video_path, process_mode, target_language)
     logger.info(f"Transcript added to video. Output video saved at {output_video_path}")
 
     # Convert translated JSON into a format for the editable table
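Taken together, the commit moves speed estimation out of process_entry (which previously called calibrated_speed per segment) into a single apply_adaptive_speed pass, whose per-entry `speed` field process_entry now reads. Below is a minimal usage sketch, assuming it runs in app.py's context (the function uses the module-level logger and the new copy import) and assuming segments shaped like the fields the function reads; all values are invented for illustration.

    # Hypothetical input segments carrying only the fields apply_adaptive_speed reads.
    segments = [
        {"start": 0.0, "end": 2.4, "original": "你好，欢迎收看本期教程。",
         "translated": "Hello, welcome to this tutorial.", "category": "tutorial"},
        {"start": 2.4, "end": 4.0, "original": "快跑！", "translated": "Run!",
         "style": "urgent"},
    ]
    with_speed = apply_adaptive_speed(segments, source_language="zh", target_language="en")
    for seg in with_speed:
        print(seg["speed"], seg["target_duration"])  # fields consumed downstream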