siddhartharyaai committed on
Commit
a37cfc6
·
verified ·
1 Parent(s): 4af0354

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +52 -21
utils.py CHANGED
@@ -409,13 +409,13 @@ def generate_audio_mp3(text: str, speaker: str) -> str:
409
  try:
410
  print(f"[LOG] Generating audio for speaker: {speaker}")
411
 
412
- # Preprocess text
413
- processed_text = _preprocess_text_for_tts(text)
414
 
415
  # Deepgram TTS endpoint
416
  deepgram_api_url = "https://api.deepgram.com/v1/speak"
417
  params = {
418
- "model": "aura-asteria-en", # default
419
  }
420
  if speaker == "John":
421
  params["model"] = "aura-zeus-en"
@@ -468,10 +468,11 @@ def transcribe_youtube_video_OLD_YTDLP(video_url: str) -> str:
468
  # ---------------------------------------------------------------------
469
  # TEXT PRE-PROCESSING FOR NATURAL TTS (punctuation, abbreviations, etc.)
470
  # ---------------------------------------------------------------------
471
- def _preprocess_text_for_tts(text: str) -> str:
472
  """
473
  Enhances text for natural-sounding TTS by handling abbreviations,
474
  punctuation, and intelligent filler insertion.
 
475
  """
476
  # 1) Hyphens -> spaces
477
  text = re.sub(r"-", " ", text)
@@ -513,23 +514,23 @@ def _preprocess_text_for_tts(text: str) -> str:
513
  # text = re.sub(r",(\s|$)", r",...\1", text)
514
  # text = re.sub(r"\?(\s|$)", r"?...\1", text)
515
 
516
- # 5) Intelligent filler insertion after specific keywords
517
- def insert_thinking_pause(m):
518
- word = m.group(1)
519
- # Decide randomly whether to insert a filler
520
- if random.random() < 0.3: # 30% chance
521
- filler = random.choice(['hmm,', 'well,', 'let me see,'])
522
- return f"{word}..., {filler}"
523
- else:
524
- return f"{word}...,"
525
-
526
- keywords_pattern = r"\b(important|significant|crucial|point|topic)\b"
527
- text = re.sub(keywords_pattern, insert_thinking_pause, text, flags=re.IGNORECASE)
528
-
529
- # 6) Insert dynamic pauses within sentences (e.g., after conjunctions)
530
- # This adds natural pauses without overusing fillers
531
- conjunctions_pattern = r"\b(and|but|so|because|however)\b"
532
- text = re.sub(conjunctions_pattern, lambda m: f"{m.group()}...", text, flags=re.IGNORECASE)
533
 
534
  # 7) Remove any unintended random fillers (safeguard)
535
  text = re.sub(r"\b(uh|um|ah)\b", "", text, flags=re.IGNORECASE)
@@ -559,3 +560,33 @@ def _spell_digits(d: str) -> str:
559
  '9': 'nine'
560
  }
561
  return " ".join(digit_map[ch] for ch in d if ch in digit_map)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
409
  try:
410
  print(f"[LOG] Generating audio for speaker: {speaker}")
411
 
412
+ # Preprocess text with speaker context
413
+ processed_text = _preprocess_text_for_tts(text, speaker)
414
 
415
  # Deepgram TTS endpoint
416
  deepgram_api_url = "https://api.deepgram.com/v1/speak"
417
  params = {
418
+ "model": "aura-luna-en", # default
419
  }
420
  if speaker == "John":
421
  params["model"] = "aura-zeus-en"
 
468
  # ---------------------------------------------------------------------
469
  # TEXT PRE-PROCESSING FOR NATURAL TTS (punctuation, abbreviations, etc.)
470
  # ---------------------------------------------------------------------
471
+ def _preprocess_text_for_tts(text: str, speaker: str) -> str:
472
  """
473
  Enhances text for natural-sounding TTS by handling abbreviations,
474
  punctuation, and intelligent filler insertion.
475
+ Adjustments are made based on the speaker to optimize output quality.
476
  """
477
  # 1) Hyphens -> spaces
478
  text = re.sub(r"-", " ", text)
 
514
  # text = re.sub(r",(\s|$)", r",...\1", text)
515
  # text = re.sub(r"\?(\s|$)", r"?...\1", text)
516
 
517
+ # 5) Intelligent filler insertion after specific keywords (skip for Jane)
518
+ if speaker != "Jane":
519
+ def insert_thinking_pause(m):
520
+ word = m.group(1)
521
+ # Decide randomly whether to insert a filler
522
+ if random.random() < 0.3: # 30% chance
523
+ filler = random.choice(['hmm,', 'well,', 'let me see,'])
524
+ return f"{word}..., {filler}"
525
+ else:
526
+ return f"{word}...,"
527
+ keywords_pattern = r"\b(important|significant|crucial|point|topic)\b"
528
+ text = re.sub(keywords_pattern, insert_thinking_pause, text, flags=re.IGNORECASE)
529
+
530
+ # 6) Insert dynamic pauses within sentences (e.g., after conjunctions) for non-Jane speakers
531
+ if speaker != "Jane":
532
+ conjunctions_pattern = r"\b(and|but|so|because|however)\b"
533
+ text = re.sub(conjunctions_pattern, lambda m: f"{m.group()}...", text, flags=re.IGNORECASE)
534
 
535
  # 7) Remove any unintended random fillers (safeguard)
536
  text = re.sub(r"\b(uh|um|ah)\b", "", text, flags=re.IGNORECASE)
 
560
  '9': 'nine'
561
  }
562
  return " ".join(digit_map[ch] for ch in d if ch in digit_map)
563
+
564
def mix_with_bg_music(
    spoken: AudioSegment,
    *,
    bg_music_path: str = "bg_music.mp3",
    intro_ms: int = 2000,
    music_attenuation_db: float = 18.0,
) -> AudioSegment:
    """
    Mix 'spoken' on top of a background-music track.

    1) The music plays alone for `intro_ms` milliseconds before speech begins.
    2) The music is repeated if it is shorter than the final audio length.
    3) The music volume is lowered so the speech stays clear.

    Args:
        spoken: The speech audio to place over the music.
        bg_music_path: Path of the MP3 used as background music. The default
            is a relative path, so it is resolved against the current
            working directory.
        intro_ms: Length of the music-only lead-in, in milliseconds.
        music_attenuation_db: How many dB to subtract from the music volume.

    Returns:
        The mixed audio, `len(spoken) + intro_ms` milliseconds long. If the
        music cannot be loaded or is empty, `spoken` is returned unchanged.
    """
    # Best-effort: a missing or undecodable music file must not break audio
    # generation, so any load failure falls back to the plain speech.
    try:
        bg_music = AudioSegment.from_file(bg_music_path, format="mp3")
    except Exception as e:
        print("[ERROR] Failed to load background music:", e)
        return spoken

    # A zero-length track can never fill the target duration; bail out
    # instead of trying to loop it forever.
    music_len_ms = len(bg_music)
    if music_len_ms == 0:
        print("[ERROR] Background music is empty; returning speech without music.")
        return spoken

    # Reduce background music volume so it sits underneath the speech.
    bg_music = bg_music - music_attenuation_db

    total_length_ms = len(spoken) + intro_ms

    # Repeat the track enough times to cover the whole mix (ceiling division),
    # then trim to the exact target length.
    repeats = max(1, -(-total_length_ms // music_len_ms))
    looped_music = (bg_music * repeats)[:total_length_ms]

    # Overlay the speech after the lead-in so the music plays alone first.
    return looped_music.overlay(spoken, position=intro_ms)