Spaces:
Running
Running
Update utils.py
Browse files
utils.py
CHANGED
@@ -284,14 +284,15 @@ def generate_script(system_prompt: str, input_text: str, tone: str, target_lengt
|
|
284 |
return Dialogue(**data)
|
285 |
|
286 |
# --------------------------------------------------------------
|
287 |
-
# TTS Preprocessing to handle decimals, hyphens,
|
288 |
# --------------------------------------------------------------
|
289 |
def _preprocess_text_for_tts(text: str) -> str:
|
290 |
"""
|
291 |
1) Convert decimals to spelled-out words ("3.14" -> "three point one four").
|
292 |
-
2) Replace hyphens with spaces.
|
293 |
-
3) Insert filler words
|
294 |
"""
|
|
|
295 |
# 1) Convert decimals
|
296 |
def convert_decimal(m):
|
297 |
number_str = m.group() # e.g. "3.14"
|
@@ -302,26 +303,31 @@ def _preprocess_text_for_tts(text: str) -> str:
|
|
302 |
|
303 |
text = re.sub(r"\d+\.\d+", convert_decimal, text)
|
304 |
|
305 |
-
# 2)
|
|
|
306 |
text = re.sub(r"-", " ", text)
|
307 |
|
308 |
-
# 3)
|
309 |
-
#
|
310 |
-
|
311 |
-
r"(I think|I'm not sure|I guess)([,.]?\s)",
|
312 |
-
r"\1, uh,\2",
|
313 |
-
text,
|
314 |
-
flags=re.IGNORECASE
|
315 |
-
)
|
316 |
|
317 |
-
#
|
318 |
-
text =
|
319 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
320 |
if random.random() < 0.5:
|
321 |
-
return "
|
322 |
else:
|
323 |
-
return "
|
324 |
-
|
|
|
|
|
325 |
|
326 |
return text.strip()
|
327 |
|
@@ -339,11 +345,12 @@ def _spell_digits(d: str) -> str:
|
|
339 |
def generate_audio_mp3(text: str, speaker: str) -> str:
|
340 |
"""
|
341 |
Main TTS function, calls Deepgram with preprocessed text.
|
|
|
342 |
"""
|
343 |
try:
|
344 |
print(f"[LOG] Generating audio for speaker: {speaker}")
|
345 |
|
346 |
-
# Preprocess text (decimal/hyphen/
|
347 |
processed_text = _preprocess_text_for_tts(text)
|
348 |
|
349 |
# Define Deepgram API endpoint
|
|
|
284 |
return Dialogue(**data)
|
285 |
|
286 |
# --------------------------------------------------------------
|
287 |
+
# TTS Preprocessing to handle decimals, hyphens, short thinking pauses, etc.
|
288 |
# --------------------------------------------------------------
|
289 |
def _preprocess_text_for_tts(text: str) -> str:
|
290 |
"""
|
291 |
1) Convert decimals to spelled-out words ("3.14" -> "three point one four").
|
292 |
+
2) Replace hyphens with spaces (so TTS doesn't say 'dash').
|
293 |
+
3) Insert filler words or '...' for natural-sounding pauses at significant points.
|
294 |
"""
|
295 |
+
|
296 |
# 1) Convert decimals
|
297 |
def convert_decimal(m):
|
298 |
number_str = m.group() # e.g. "3.14"
|
|
|
303 |
|
304 |
text = re.sub(r"\d+\.\d+", convert_decimal, text)
|
305 |
|
306 |
+
# 2) Replace hyphens with spaces
|
307 |
+
# e.g. "mother-in-law" -> "mother in law"
|
308 |
text = re.sub(r"-", " ", text)
|
309 |
|
310 |
+
# 3) Insert natural-sounding short pauses:
|
311 |
+
# a) After exclamation points or question marks, always append "..." (applied to every occurrence, not randomly)
|
312 |
+
# b) Random small "thinking" filler for major statements
|
|
|
|
|
|
|
|
|
|
|
313 |
|
314 |
+
# Step 3a: Exclamations / questions
|
315 |
+
text = re.sub(r"(!+)", r"\1...", text) # e.g. "Wow!" -> "Wow!..."
|
316 |
+
text = re.sub(r"(\?+)", r"\1...", text) # e.g. "Really?" -> "Really?..."
|
317 |
+
|
318 |
+
# Step 3b: Insert small breaks for "thinking"
|
319 |
+
# We'll define some keywords that might indicate a "significant point."
|
320 |
+
# e.g. "important", "significant", "crucial", "point", "topic"
|
321 |
+
# Then we append '..., hmm,' or '..., well,' after every matched keyword; the 50/50 random choice only selects which filler is used.
|
322 |
+
def insert_thinking_pause(m):
|
323 |
+
word = m.group(1)
|
324 |
if random.random() < 0.5:
|
325 |
+
return f"{word}..., hmm,"
|
326 |
else:
|
327 |
+
return f"{word}..., well,"
|
328 |
+
|
329 |
+
keywords_pattern = r"\b(important|significant|crucial|point|topic)\b"
|
330 |
+
text = re.sub(keywords_pattern, insert_thinking_pause, text, flags=re.IGNORECASE)
|
331 |
|
332 |
return text.strip()
|
333 |
|
|
|
345 |
def generate_audio_mp3(text: str, speaker: str) -> str:
|
346 |
"""
|
347 |
Main TTS function, calls Deepgram with preprocessed text.
|
348 |
+
Returns path to a temporary MP3 file.
|
349 |
"""
|
350 |
try:
|
351 |
print(f"[LOG] Generating audio for speaker: {speaker}")
|
352 |
|
353 |
+
# Preprocess text (decimal/hyphen/pause insertion)
|
354 |
processed_text = _preprocess_text_for_tts(text)
|
355 |
|
356 |
# Define Deepgram API endpoint
|