MyPod_10

Running

App Files Files Community

siddhartharyaai committed on Jan 6

Commit

faddf89

verified ·

1 Parent(s): 31ff046

Update utils.py

Browse files

Files changed (1) hide show

utils.py +26 -19

utils.py CHANGED Viewed

@@ -284,14 +284,15 @@ def generate_script(system_prompt: str, input_text: str, tone: str, target_lengt
     return Dialogue(**data)
 # --------------------------------------------------------------
-# TTS Preprocessing to handle decimals, hyphens, and selective fillers
 # --------------------------------------------------------------
 def _preprocess_text_for_tts(text: str) -> str:
     """
     1) Convert decimals to spelled-out words ("3.14" -> "three point one four").
-    2) Replace hyphens with spaces.
-    3) Insert filler words only in certain contexts (like "I think", or after '?').
     """
     # 1) Convert decimals
     def convert_decimal(m):
         number_str = m.group()  # e.g. "3.14"
@@ -302,26 +303,31 @@ def _preprocess_text_for_tts(text: str) -> str:
     text = re.sub(r"\d+\.\d+", convert_decimal, text)
-    # 2) Hyphens -> spaces
     text = re.sub(r"-", " ", text)
-    # 3) Targeted filler insertion
-    # a) Insert "uh" after "I think" or "I'm not sure", etc. (very naive approach)
-    text = re.sub(
-        r"(I think|I'm not sure|I guess)([,.]?\s)",
-        r"\1, uh,\2",
-        text,
-        flags=re.IGNORECASE
-    )
-    # b) If there's a "?" then sometimes insert "um," right after it
-    text = text.replace("?", "?<QMARK>")
-    def insert_filler_qmark(m):
         if random.random() < 0.5:
-            return "? um,"
         else:
-            return "?"
-    text = re.sub(r"\?<QMARK>", insert_filler_qmark, text)
     return text.strip()
@@ -339,11 +345,12 @@ def _spell_digits(d: str) -> str:
 def generate_audio_mp3(text: str, speaker: str) -> str:
     """
     Main TTS function, calls Deepgram with preprocessed text.
     """
     try:
         print(f"[LOG] Generating audio for speaker: {speaker}")
-        # Preprocess text (decimal/hyphen/fillers)
         processed_text = _preprocess_text_for_tts(text)
         # Define Deepgram API endpoint

     return Dialogue(**data)
 # --------------------------------------------------------------
+# TTS Preprocessing to handle decimals, hyphens, short thinking pauses, etc.
 # --------------------------------------------------------------
 def _preprocess_text_for_tts(text: str) -> str:
     """
     1) Convert decimals to spelled-out words ("3.14" -> "three point one four").
+    2) Replace hyphens with spaces (so TTS doesn't say 'dash').
+    3) Insert filler words or '...' for natural-sounding pauses at significant points.
     """
     # 1) Convert decimals
     def convert_decimal(m):
         number_str = m.group()  # e.g. "3.14"
     text = re.sub(r"\d+\.\d+", convert_decimal, text)
+    # 2) Replace hyphens with spaces
+    #    e.g. "mother-in-law" -> "mother in law"
     text = re.sub(r"-", " ", text)
+    # 3) Insert natural-sounding short pauses:
+    #    a) After exclamation points or question marks, always append "..."
+    #    b) Random small "thinking" filler for major statements
+    # Step 3a: Exclamations / questions
+    text = re.sub(r"(!+)", r"\1...", text)  # e.g. "Wow!" -> "Wow!..."
+    text = re.sub(r"(\?+)", r"\1...", text) # e.g. "Really?" -> "Really?..."
+    # Step 3b: Insert small breaks for "thinking"
+    # We'll define some keywords that might indicate a "significant point."
+    # e.g. "important", "significant", "crucial", "point", "topic"
+    # Then we always insert '..., hmm,' or '..., well,' afterwards (chosen at random, 50/50).
+    def insert_thinking_pause(m):
+        word = m.group(1)
         if random.random() < 0.5:
+            return f"{word}..., hmm,"
         else:
+            return f"{word}..., well,"
+    keywords_pattern = r"\b(important|significant|crucial|point|topic)\b"
+    text = re.sub(keywords_pattern, insert_thinking_pause, text, flags=re.IGNORECASE)
     return text.strip()
 def generate_audio_mp3(text: str, speaker: str) -> str:
     """
     Main TTS function, calls Deepgram with preprocessed text.
+    Returns path to a temporary MP3 file.
     """
     try:
         print(f"[LOG] Generating audio for speaker: {speaker}")
+        # Preprocess text (decimal/hyphen/pause insertion)
         processed_text = _preprocess_text_for_tts(text)
         # Define Deepgram API endpoint