MyPod_10

Running

App Files Community

siddhartharyaai committed on Jan 7

Commit

a37cfc6

verified ·

1 Parent(s): 4af0354

Update utils.py

Browse files

Files changed (1) hide show

utils.py +52 -21

utils.py CHANGED Viewed

@@ -409,13 +409,13 @@ def generate_audio_mp3(text: str, speaker: str) -> str:
     try:
         print(f"[LOG] Generating audio for speaker: {speaker}")
-        # Preprocess text
-        processed_text = _preprocess_text_for_tts(text)
         # Deepgram TTS endpoint
         deepgram_api_url = "https://api.deepgram.com/v1/speak"
         params = {
-            "model": "aura-asteria-en",  # default
         }
         if speaker == "John":
             params["model"] = "aura-zeus-en"
@@ -468,10 +468,11 @@ def transcribe_youtube_video_OLD_YTDLP(video_url: str) -> str:
 # ---------------------------------------------------------------------
 # TEXT PRE-PROCESSING FOR NATURAL TTS (punctuation, abbreviations, etc.)
 # ---------------------------------------------------------------------
-def _preprocess_text_for_tts(text: str) -> str:
     """
     Enhances text for natural-sounding TTS by handling abbreviations,
     punctuation, and intelligent filler insertion.
     """
     # 1) Hyphens -> spaces
     text = re.sub(r"-", " ", text)
@@ -513,23 +514,23 @@ def _preprocess_text_for_tts(text: str) -> str:
     # text = re.sub(r",(\s|$)", r",...\1", text)
     # text = re.sub(r"\?(\s|$)", r"?...\1", text)
-    # 5) Intelligent filler insertion after specific keywords
-    def insert_thinking_pause(m):
-        word = m.group(1)
-        # Decide randomly whether to insert a filler
-        if random.random() < 0.3:  # 30% chance
-            filler = random.choice(['hmm,', 'well,', 'let me see,'])
-            return f"{word}..., {filler}"
-        else:
-            return f"{word}...,"
-    keywords_pattern = r"\b(important|significant|crucial|point|topic)\b"
-    text = re.sub(keywords_pattern, insert_thinking_pause, text, flags=re.IGNORECASE)
-    # 6) Insert dynamic pauses within sentences (e.g., after conjunctions)
-    # This adds natural pauses without overusing fillers
-    conjunctions_pattern = r"\b(and|but|so|because|however)\b"
-    text = re.sub(conjunctions_pattern, lambda m: f"{m.group()}...", text, flags=re.IGNORECASE)
     # 7) Remove any unintended random fillers (safeguard)
     text = re.sub(r"\b(uh|um|ah)\b", "", text, flags=re.IGNORECASE)
@@ -559,3 +560,33 @@ def _spell_digits(d: str) -> str:
         '9': 'nine'
     }
     return " ".join(digit_map[ch] for ch in d if ch in digit_map)

     try:
         print(f"[LOG] Generating audio for speaker: {speaker}")
+        # Preprocess text with speaker context
+        processed_text = _preprocess_text_for_tts(text, speaker)
         # Deepgram TTS endpoint
         deepgram_api_url = "https://api.deepgram.com/v1/speak"
         params = {
+            "model": "aura-luna-en",  # default
         }
         if speaker == "John":
             params["model"] = "aura-zeus-en"
 # ---------------------------------------------------------------------
 # TEXT PRE-PROCESSING FOR NATURAL TTS (punctuation, abbreviations, etc.)
 # ---------------------------------------------------------------------
+def _preprocess_text_for_tts(text: str, speaker: str) -> str:
     """
     Enhances text for natural-sounding TTS by handling abbreviations,
     punctuation, and intelligent filler insertion.
+    Adjustments are made based on the speaker to optimize output quality.
     """
     # 1) Hyphens -> spaces
     text = re.sub(r"-", " ", text)
     # text = re.sub(r",(\s|$)", r",...\1", text)
     # text = re.sub(r"\?(\s|$)", r"?...\1", text)
+    # 5) Intelligent filler insertion after specific keywords (skip for Jane)
+    if speaker != "Jane":
+        def insert_thinking_pause(m):
+            word = m.group(1)
+            # Decide randomly whether to insert a filler
+            if random.random() < 0.3:  # 30% chance
+                filler = random.choice(['hmm,', 'well,', 'let me see,'])
+                return f"{word}..., {filler}"
+            else:
+                return f"{word}...,"
+        keywords_pattern = r"\b(important|significant|crucial|point|topic)\b"
+        text = re.sub(keywords_pattern, insert_thinking_pause, text, flags=re.IGNORECASE)
+    # 6) Insert dynamic pauses within sentences (e.g., after conjunctions) for non-Jane speakers
+    if speaker != "Jane":
+        conjunctions_pattern = r"\b(and|but|so|because|however)\b"
+        text = re.sub(conjunctions_pattern, lambda m: f"{m.group()}...", text, flags=re.IGNORECASE)
     # 7) Remove any unintended random fillers (safeguard)
     text = re.sub(r"\b(uh|um|ah)\b", "", text, flags=re.IGNORECASE)
         '9': 'nine'
     }
     return " ".join(digit_map[ch] for ch in d if ch in digit_map)
+def mix_with_bg_music(spoken: AudioSegment) -> AudioSegment:
+    """
+    Mixes 'spoken' with bg_music.mp3 and returns the combined audio:
+    1) Start with 2 seconds of music alone before speech begins.
+    2) Loop the music if it's shorter than the final audio length.
+    3) Lower the music volume so the speech is clear.
+
+    The final audio length is len(spoken) + 2000, i.e. the speech plus
+    the 2-second musical lead-in; the music is trimmed to exactly that
+    length before the speech is overlaid.
+
+    If the background music cannot be loaded for any reason, the error is
+    printed and 'spoken' is returned unchanged (no lead-in, no music).
+    """
+    # Relative path: resolved against the process's current working
+    # directory (expected to be the project root folder).
+    bg_music_path = "bg_music.mp3"  # in root folder
+    try:
+        bg_music = AudioSegment.from_file(bg_music_path, format="mp3")
+    except Exception as e:
+        # Best-effort: missing or unreadable music must not break audio output.
+        print("[ERROR] Failed to load background music:", e)
+        return spoken
+    # Reduce background music volume so it sits underneath the speech
+    bg_music = bg_music - 18.0  # Lower volume (e.g. -18 dB)
+    # Speech length plus the 2000 ms music-only lead-in
+    total_length_ms = len(spoken) + 2000
+    # Repeat the track until it covers the whole mix, then trim the excess.
+    # NOTE(review): if the loaded music has zero length, this loop never
+    # terminates because looped_music stops growing - confirm the asset is
+    # never empty, or add a guard.
+    looped_music = AudioSegment.empty()
+    while len(looped_music) < total_length_ms:
+        looped_music += bg_music
+    looped_music = looped_music[:total_length_ms]
+    # Overlay spoken at 2000ms so we get 2s of music first
+    final_mix = looped_music.overlay(spoken, position=2000)
+    return final_mix