siddhartharyaai committed on
Commit
faddf89
·
verified ·
1 Parent(s): 31ff046

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +26 -19
utils.py CHANGED
@@ -284,14 +284,15 @@ def generate_script(system_prompt: str, input_text: str, tone: str, target_lengt
284
  return Dialogue(**data)
285
 
286
  # --------------------------------------------------------------
287
- # TTS Preprocessing to handle decimals, hyphens, and selective fillers
288
  # --------------------------------------------------------------
289
  def _preprocess_text_for_tts(text: str) -> str:
290
  """
291
  1) Convert decimals to spelled-out words ("3.14" -> "three point one four").
292
- 2) Replace hyphens with spaces.
293
- 3) Insert filler words only in certain contexts (like "I think", or after '?').
294
  """
 
295
  # 1) Convert decimals
296
  def convert_decimal(m):
297
  number_str = m.group() # e.g. "3.14"
@@ -302,26 +303,31 @@ def _preprocess_text_for_tts(text: str) -> str:
302
 
303
  text = re.sub(r"\d+\.\d+", convert_decimal, text)
304
 
305
- # 2) Hyphens -> spaces
 
306
  text = re.sub(r"-", " ", text)
307
 
308
- # 3) Targeted filler insertion
309
- # a) Insert "uh" after "I think" or "I'm not sure", etc. (very naive approach)
310
- text = re.sub(
311
- r"(I think|I'm not sure|I guess)([,.]?\s)",
312
- r"\1, uh,\2",
313
- text,
314
- flags=re.IGNORECASE
315
- )
316
 
317
- # b) If there's a "?" then sometimes insert "um," right after it
318
- text = text.replace("?", "?<QMARK>")
319
- def insert_filler_qmark(m):
 
 
 
 
 
 
 
320
  if random.random() < 0.5:
321
- return "? um,"
322
  else:
323
- return "?"
324
- text = re.sub(r"\?<QMARK>", insert_filler_qmark, text)
 
 
325
 
326
  return text.strip()
327
 
@@ -339,11 +345,12 @@ def _spell_digits(d: str) -> str:
339
  def generate_audio_mp3(text: str, speaker: str) -> str:
340
  """
341
  Main TTS function, calls Deepgram with preprocessed text.
 
342
  """
343
  try:
344
  print(f"[LOG] Generating audio for speaker: {speaker}")
345
 
346
- # Preprocess text (decimal/hyphen/fillers)
347
  processed_text = _preprocess_text_for_tts(text)
348
 
349
  # Define Deepgram API endpoint
 
284
  return Dialogue(**data)
285
 
286
  # --------------------------------------------------------------
287
+ # TTS Preprocessing to handle decimals, hyphens, short thinking pauses, etc.
288
  # --------------------------------------------------------------
289
  def _preprocess_text_for_tts(text: str) -> str:
290
  """
291
  1) Convert decimals to spelled-out words ("3.14" -> "three point one four").
292
+ 2) Replace hyphens with spaces (so TTS doesn't say 'dash').
293
+ 3) Insert filler words or '...' for natural-sounding pauses at significant points.
294
  """
295
+
296
  # 1) Convert decimals
297
  def convert_decimal(m):
298
  number_str = m.group() # e.g. "3.14"
 
303
 
304
  text = re.sub(r"\d+\.\d+", convert_decimal, text)
305
 
306
+ # 2) Replace hyphens with spaces
307
+ # e.g. "mother-in-law" -> "mother in law"
308
  text = re.sub(r"-", " ", text)
309
 
310
+ # 3) Insert natural-sounding short pauses:
311
+ # a) After exclamation points or question marks, always append "..." (unconditional, no randomness)
312
+ # b) Random small "thinking" filler for major statements
 
 
 
 
 
313
 
314
+ # Step 3a: Exclamations / questions
315
+ text = re.sub(r"(!+)", r"\1...", text) # e.g. "Wow!" -> "Wow!..."
316
+ text = re.sub(r"(\?+)", r"\1...", text) # e.g. "Really?" -> "Really?..."
317
+
318
+ # Step 3b: Insert small breaks for "thinking"
319
+ # We'll define some keywords that might indicate a "significant point."
320
+ # e.g. "important", "significant", "crucial", "point", "topic"
321
+ # Then we always insert either '..., hmm,' or '..., well,' afterwards (picked 50/50 at random).
322
+ def insert_thinking_pause(m):
323
+ word = m.group(1)
324
  if random.random() < 0.5:
325
+ return f"{word}..., hmm,"
326
  else:
327
+ return f"{word}..., well,"
328
+
329
+ keywords_pattern = r"\b(important|significant|crucial|point|topic)\b"
330
+ text = re.sub(keywords_pattern, insert_thinking_pause, text, flags=re.IGNORECASE)
331
 
332
  return text.strip()
333
 
 
345
  def generate_audio_mp3(text: str, speaker: str) -> str:
346
  """
347
  Main TTS function, calls Deepgram with preprocessed text.
348
+ Returns path to a temporary MP3 file.
349
  """
350
  try:
351
  print(f"[LOG] Generating audio for speaker: {speaker}")
352
 
353
+ # Preprocess text (decimal/hyphen/pause insertion)
354
  processed_text = _preprocess_text_for_tts(text)
355
 
356
  # Define Deepgram API endpoint