qqwjq1981 committed
Commit e35aecd · verified · 1 Parent(s): 966f4e1

Update app.py

Files changed (1)
  1. app.py +85 -5
app.py CHANGED
@@ -1,6 +1,7 @@
 import numpy as np
 import cvxpy as cp
 import re
+import copy
 import concurrent.futures
 import gradio as gr
 from datetime import datetime
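
Note: the added import backs the copy.deepcopy call in apply_adaptive_speed below; annotating a deep copy leaves the caller's raw segment list untouched. A quick illustration with a made-up segment:

    import copy

    raw = [{"translated": "Hello", "start": 0.0, "end": 1.0}]
    annotated = copy.deepcopy(raw)
    annotated[0]["speed"] = 5.4   # hypothetical annotation

    print(raw[0])        # original stays without a 'speed' key
    print(annotated[0])  # the copy carries the annotation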
@@ -556,7 +557,7 @@ def process_entry(entry, i, tts_model, video_width, video_height, process_mode,
     try:
         segment_audio_path = f"segment_{i}_voiceover.wav"
         desired_duration = entry["end"] - entry["start"]
-        desired_speed = calibrated_speed(entry['translated'], desired_duration)
+        desired_speed = entry['speed'] #calibrated_speed(entry['translated'], desired_duration)
 
         speaker = entry.get("speaker", "default")
         speaker_wav_path = f"speaker_{speaker}_sample.wav"
@@ -608,7 +609,7 @@ def add_transcript_voiceover(video_path, translated_json, output_path, process_m
 
     with concurrent.futures.ThreadPoolExecutor() as executor:
         futures = [executor.submit(process_entry, entry, i, tts_model, video.w, video.h, process_mode, target_language, font_path, speaker_sample_paths)
-                   for i, entry in enumerate(translated_json)]
+                   for i, entry in enumerate(translated_json_withspeed)]
 
     results = []
     for future in concurrent.futures.as_completed(futures):
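
Note: concurrent.futures.as_completed yields futures in completion order, not submission order, so downstream code has to restore timeline order by segment index. A minimal self-contained sketch of the pattern; render_segment and its (index, path) return shape are hypothetical stand-ins for process_entry:

    import concurrent.futures

    def render_segment(i, entry):
        # Hypothetical stand-in for process_entry: return the index so
        # results can be re-sorted into timeline order afterwards.
        return i, f"segment_{i}_voiceover.wav"

    segments = [{"start": 0.0, "end": 2.0}, {"start": 2.0, "end": 5.5}]

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(render_segment, i, entry)
                   for i, entry in enumerate(segments)]
        results = [f.result() for f in concurrent.futures.as_completed(futures)]

    results.sort(key=lambda pair: pair[0])  # back to timeline order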
 
@@ -715,6 +716,84 @@ def generate_voiceover_clone(full_text, tts_model, desired_speed, target_languag
         logger.error(traceback.format_exc())
         return None, err_msg, err_msg
 
+def apply_adaptive_speed(translated_json_raw, source_language, target_language, k=3.0, default_prior_speed=5.0):
+    """
+    Adds a `speed` (chars/sec) and `target_duration` (sec) field to each segment
+    using shrinkage-based estimation and language stretch ratios.
+    Optionally modulates based on tone or style tags (e.g., "dramatic", "urgent").
+    """
+    translated_json = copy.deepcopy(translated_json_raw)
+
+    # Prior average speech speeds by (category, target language)
+    priors = {
+        ("drama", "en"): 5.0,
+        ("drama", "zh"): 4.5,
+        ("tutorial", "en"): 5.2,
+        ("tutorial", "zh"): 4.8,
+        ("shortplay", "en"): 5.1,
+        ("shortplay", "zh"): 4.7,
+    }
+
+    # Adjustment ratio based on language pair (source → target)
+    lang_ratio = {
+        ("zh", "en"): 0.85,
+        ("en", "zh"): 1.15,
+        ("zh", "jp"): 1.05,
+        ("en", "ja"): 0.9,
+    }
+
+    # Optional style modulation factor
+    style_modifiers = {
+        "dramatic": 0.9,  # slower
+        "urgent": 1.1,    # faster
+        "neutral": 1.0
+    }
+
+    for idx, entry in enumerate(translated_json):
+        start, end = float(entry.get("start", 0)), float(entry.get("end", 0))
+        duration = max(0.1, end - start)
+
+        original_text = entry.get("original", "")
+        translated_text = entry.get("translated", "")
+        category = entry.get("category", "drama")
+        source_lang = source_language
+        target_lang = target_language
+        style = entry.get("style", "neutral")  # Optional field like "dramatic"
+
+        # Observed speed from original
+        base_text = original_text or translated_text
+        obs_speed = len(base_text) / duration
+
+        # Prior speed from category + language
+        prior_speed = priors.get((category, target_lang), default_prior_speed)
+
+        # Shrinkage estimate
+        shrink_speed = (duration * obs_speed + k * prior_speed) / (duration + k)
+
+        # Adjust for language-specific pacing
+        ratio = lang_ratio.get((source_lang, target_lang), 1.0)
+        adjusted_speed = shrink_speed * ratio
+
+        # Optional tone/style modulation (if available)
+        mod = style_modifiers.get(style.lower(), 1.0)
+        adjusted_speed *= mod
+
+        # Final estimated duration for synthesized segment
+        target_chars = len(translated_text)
+        target_duration = round(target_chars / adjusted_speed, 2)
+
+        # Logging for debugging
+        logger.info(
+            f"Segment {idx}: dur={duration:.2f}s, obs={obs_speed:.2f}, "
+            f"prior={prior_speed:.2f}, shrink={shrink_speed:.2f}, "
+            f"final_speed={adjusted_speed:.2f}, target_dur={target_duration:.2f}s"
+        )
+
+        entry["speed"] = round(adjusted_speed, 3)
+        entry["target_duration"] = target_duration
+
+    return translated_json
+
 def calibrated_speed(text, desired_duration):
     """
     Compute a speed factor to help TTS fit audio into desired duration,
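
Note: the shrinkage line blends each segment's observed speaking rate with a category prior, weighted by segment duration against k: a short (hence noisy) segment is pulled toward the prior, while a long segment mostly keeps its own rate. For example, a 2 s segment observed at 6 chars/s with a 5 chars/s prior and k=3 gives (2·6 + 3·5)/(2+3) = 5.4 chars/s. A standalone sketch with illustrative numbers:

    def shrink(duration, obs_speed, prior_speed, k=3.0):
        # Duration-weighted blend of observed and prior speed (chars/sec).
        return (duration * obs_speed + k * prior_speed) / (duration + k)

    print(shrink(2.0, 6.0, 5.0))   # 5.4   -> short segment leans on the prior
    print(shrink(20.0, 6.0, 5.0))  # ~5.87 -> long segment trusts its own rate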
 
@@ -757,14 +836,15 @@ def upload_and_manage(file, target_language, process_mode):
 
     # Step 2: Translate the transcription
     logger.info(f"Translating transcription from {source_language} to {target_language}...")
-    translated_json = translate_text(transcription_json, source_language, target_language)
+    translated_json_raw = translate_text(transcription_json, source_language, target_language)
     logger.info(f"Translation completed. Number of translated segments: {len(translated_json)}")
 
     # translated_json = post_edit_translated_segments(translated_json, file.name)
-
+    translated_json = apply_adaptive_speed(translated_json_raw, source_language, target_language)
+
     # Step 3: Add transcript to video based on timestamps
     logger.info("Adding translated transcript to video...")
-    add_transcript_voiceover(file.name, translated_json, output_video_path, process_mode, target_language)
+    add_transcript_voiceover(file.name, translated_json_speedcontrol, output_video_path, process_mode, target_language)
     logger.info(f"Transcript added to video. Output video saved at {output_video_path}")
 
     # Convert translated JSON into a format for the editable table
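
Note: end to end, the flow this commit sets up is translate → apply_adaptive_speed → synthesize, and the speed-annotated list is what add_transcript_voiceover is meant to consume. A minimal usage sketch, assuming apply_adaptive_speed is importable from app.py (with its module-level logger configured) and using made-up segments:

    from app import apply_adaptive_speed  # assumes app.py is on the path

    segments = [
        {"start": 0.0, "end": 2.0, "original": "你好，欢迎收看。",
         "translated": "Hello, welcome to the show.", "category": "drama"},
        {"start": 2.0, "end": 6.5, "original": "今天我们介绍短剧配音。",
         "translated": "Today we introduce short-drama dubbing.", "style": "urgent"},
    ]

    annotated = apply_adaptive_speed(segments, source_language="zh", target_language="en")
    for seg in annotated:
        print(seg["speed"], seg["target_duration"])
    # Each entry now carries `speed` (chars/sec) and `target_duration` (sec);
    # this annotated list is the one downstream synthesis should receive.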