Spaces:
Running
Running
Update utils.py
Browse files
utils.py
CHANGED
@@ -409,13 +409,13 @@ def generate_audio_mp3(text: str, speaker: str) -> str:
|
|
409 |
try:
|
410 |
print(f"[LOG] Generating audio for speaker: {speaker}")
|
411 |
|
412 |
-
# Preprocess text
|
413 |
-
processed_text = _preprocess_text_for_tts(text)
|
414 |
|
415 |
# Deepgram TTS endpoint
|
416 |
deepgram_api_url = "https://api.deepgram.com/v1/speak"
|
417 |
params = {
|
418 |
-
"model": "aura-
|
419 |
}
|
420 |
if speaker == "John":
|
421 |
params["model"] = "aura-zeus-en"
|
@@ -468,10 +468,11 @@ def transcribe_youtube_video_OLD_YTDLP(video_url: str) -> str:
|
|
468 |
# ---------------------------------------------------------------------
|
469 |
# TEXT PRE-PROCESSING FOR NATURAL TTS (punctuation, abbreviations, etc.)
|
470 |
# ---------------------------------------------------------------------
|
471 |
-
def _preprocess_text_for_tts(text: str) -> str:
|
472 |
"""
|
473 |
Enhances text for natural-sounding TTS by handling abbreviations,
|
474 |
punctuation, and intelligent filler insertion.
|
|
|
475 |
"""
|
476 |
# 1) Hyphens -> spaces
|
477 |
text = re.sub(r"-", " ", text)
|
@@ -513,23 +514,23 @@ def _preprocess_text_for_tts(text: str) -> str:
|
|
513 |
# text = re.sub(r",(\s|$)", r",...\1", text)
|
514 |
# text = re.sub(r"\?(\s|$)", r"?...\1", text)
|
515 |
|
516 |
-
# 5) Intelligent filler insertion after specific keywords
|
517 |
-
|
518 |
-
|
519 |
-
|
520 |
-
|
521 |
-
|
522 |
-
|
523 |
-
|
524 |
-
|
525 |
-
|
526 |
-
|
527 |
-
|
528 |
-
|
529 |
-
# 6) Insert dynamic pauses within sentences (e.g., after conjunctions)
|
530 |
-
|
531 |
-
|
532 |
-
|
533 |
|
534 |
# 7) Remove any unintended random fillers (safeguard)
|
535 |
text = re.sub(r"\b(uh|um|ah)\b", "", text, flags=re.IGNORECASE)
|
@@ -559,3 +560,33 @@ def _spell_digits(d: str) -> str:
|
|
559 |
'9': 'nine'
|
560 |
}
|
561 |
return " ".join(digit_map[ch] for ch in d if ch in digit_map)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
409 |
try:
|
410 |
print(f"[LOG] Generating audio for speaker: {speaker}")
|
411 |
|
412 |
+
# Preprocess text with speaker context
|
413 |
+
processed_text = _preprocess_text_for_tts(text, speaker)
|
414 |
|
415 |
# Deepgram TTS endpoint
|
416 |
deepgram_api_url = "https://api.deepgram.com/v1/speak"
|
417 |
params = {
|
418 |
+
"model": "aura-luna-en", # default
|
419 |
}
|
420 |
if speaker == "John":
|
421 |
params["model"] = "aura-zeus-en"
|
|
|
468 |
# ---------------------------------------------------------------------
|
469 |
# TEXT PRE-PROCESSING FOR NATURAL TTS (punctuation, abbreviations, etc.)
|
470 |
# ---------------------------------------------------------------------
|
471 |
+
def _preprocess_text_for_tts(text: str, speaker: str) -> str:
|
472 |
"""
|
473 |
Enhances text for natural-sounding TTS by handling abbreviations,
|
474 |
punctuation, and intelligent filler insertion.
|
475 |
+
Adjustments are made based on the speaker to optimize output quality.
|
476 |
"""
|
477 |
# 1) Hyphens -> spaces
|
478 |
text = re.sub(r"-", " ", text)
|
|
|
514 |
# text = re.sub(r",(\s|$)", r",...\1", text)
|
515 |
# text = re.sub(r"\?(\s|$)", r"?...\1", text)
|
516 |
|
517 |
+
# 5) Intelligent filler insertion after specific keywords (skip for Jane)
|
518 |
+
if speaker != "Jane":
|
519 |
+
def insert_thinking_pause(m):
|
520 |
+
word = m.group(1)
|
521 |
+
# Decide randomly whether to insert a filler
|
522 |
+
if random.random() < 0.3: # 30% chance
|
523 |
+
filler = random.choice(['hmm,', 'well,', 'let me see,'])
|
524 |
+
return f"{word}..., {filler}"
|
525 |
+
else:
|
526 |
+
return f"{word}...,"
|
527 |
+
keywords_pattern = r"\b(important|significant|crucial|point|topic)\b"
|
528 |
+
text = re.sub(keywords_pattern, insert_thinking_pause, text, flags=re.IGNORECASE)
|
529 |
+
|
530 |
+
# 6) Insert dynamic pauses within sentences (e.g., after conjunctions) for non-Jane speakers
|
531 |
+
if speaker != "Jane":
|
532 |
+
conjunctions_pattern = r"\b(and|but|so|because|however)\b"
|
533 |
+
text = re.sub(conjunctions_pattern, lambda m: f"{m.group()}...", text, flags=re.IGNORECASE)
|
534 |
|
535 |
# 7) Remove any unintended random fillers (safeguard)
|
536 |
text = re.sub(r"\b(uh|um|ah)\b", "", text, flags=re.IGNORECASE)
|
|
|
560 |
'9': 'nine'
|
561 |
}
|
562 |
return " ".join(digit_map[ch] for ch in d if ch in digit_map)
|
563 |
+
|
564 |
+
def mix_with_bg_music(spoken: AudioSegment) -> AudioSegment:
    """
    Mix 'spoken' over bg_music.mp3 (looked up relative to the working directory).

    1) The music plays alone for 2 seconds before the speech begins.
    2) The music is looped when it is shorter than the final audio length.
    3) The music volume is lowered by 18 dB so the speech stays clear.

    Returns the mixed audio, whose length is len(spoken) + 2000 ms.
    If the music file cannot be loaded, or decodes to zero length,
    'spoken' is returned unchanged (best-effort: no music rather than a failure).
    """
    bg_music_path = "bg_music.mp3"  # in root folder
    intro_ms = 2000  # music-only lead-in before the speech starts
    music_reduction_db = 18.0  # how much quieter the music is made

    try:
        bg_music = AudioSegment.from_file(bg_music_path, format="mp3")
    except Exception as e:
        # Deliberately broad: any load/decode problem falls back to speech only.
        print("[ERROR] Failed to load background music:", e)
        return spoken

    # A zero-length track can never fill the target duration; without this
    # guard the looping below would never terminate.
    if len(bg_music) == 0:
        print("[ERROR] Background music is empty; returning speech only.")
        return spoken

    # Reduce background music volume so it sits underneath the speech
    bg_music = bg_music - music_reduction_db

    # The final track must hold the lead-in plus the entire speech.
    total_length_ms = len(spoken) + intro_ms

    # Repeat the music until it covers the whole duration, then trim the excess.
    looped_music = AudioSegment.empty()
    while len(looped_music) < total_length_ms:
        looped_music += bg_music
    looped_music = looped_music[:total_length_ms]

    # Overlay the speech after the lead-in so the music is heard alone first
    final_mix = looped_music.overlay(spoken, position=intro_ms)

    return final_mix
|