Spaces:
Running
Running
Update utils.py
Browse files
utils.py
CHANGED
@@ -445,8 +445,10 @@ def generate_audio_mp3(text: str, speaker: str) -> str:
|
|
445 |
try:
|
446 |
print(f"[LOG] Generating audio for speaker: {speaker}")
|
447 |
processed_text = _preprocess_text_for_tts(text, speaker)
|
448 |
-
|
449 |
-
|
|
|
|
|
450 |
|
451 |
deepgram_api_url = "https://api.deepgram.com/v1/speak"
|
452 |
params = {
|
@@ -498,82 +500,59 @@ def transcribe_youtube_video_OLD_YTDLP(video_url: str) -> str:
|
|
498 |
|
499 |
def _preprocess_text_for_tts(text: str, speaker: str) -> str:
|
500 |
"""
|
501 |
-
|
502 |
-
|
503 |
-
|
504 |
-
|
505 |
-
|
506 |
-
|
507 |
-
|
508 |
-
|
509 |
-
|
510 |
-
|
511 |
-
|
512 |
-
|
513 |
-
|
514 |
-
def insert_periods_for_abbrev(m):
|
515 |
-
abbr = m.group(0)
|
516 |
-
parted = ".".join(list(abbr)) + "."
|
517 |
-
return parted
|
518 |
-
text = re.sub(r"\b([A-Z0-9]{2,})\b", insert_periods_for_abbrev, text)
|
519 |
-
text = re.sub(r"\.\.", ".", text)
|
520 |
-
def remove_periods_for_tts(m):
|
521 |
-
chunk = m.group(0)
|
522 |
-
return chunk.replace(".", " ").strip()
|
523 |
-
text = re.sub(r"[A-Z0-9]\.[A-Z0-9](?:\.[A-Z0-9])*\.", remove_periods_for_tts, text)
|
524 |
-
|
525 |
-
# 3) Hyphens -> spaces
|
526 |
-
text = re.sub(r"-", " ", text)
|
527 |
-
|
528 |
-
# 4) Convert decimals (e.g. "3.14")
|
529 |
-
def convert_decimal(m):
|
530 |
-
number_str = m.group()
|
531 |
-
parts = number_str.split('.')
|
532 |
-
whole_part = _spell_digits(parts[0])
|
533 |
-
decimal_part = " ".join(_spell_digits(d) for d in parts[1])
|
534 |
-
return f"{whole_part} point {decimal_part}"
|
535 |
-
text = re.sub(r"\b\d+\.\d+\b", convert_decimal, text)
|
536 |
-
|
537 |
-
# 5) Convert pure integer => words
|
538 |
-
def convert_int_to_words(m):
|
539 |
-
num_str = m.group()
|
540 |
-
# Remove commas before conversion
|
541 |
-
num_str_clean = num_str.replace(',', '')
|
542 |
try:
|
543 |
-
|
544 |
-
|
545 |
-
return
|
546 |
-
|
547 |
-
|
548 |
-
|
549 |
-
|
550 |
-
|
551 |
-
|
552 |
-
|
553 |
-
|
554 |
-
|
555 |
-
|
556 |
-
|
557 |
-
|
558 |
-
|
559 |
-
|
560 |
-
|
561 |
-
|
562 |
-
|
563 |
-
|
564 |
-
|
565 |
-
|
566 |
-
|
567 |
-
text = re.sub(r
|
568 |
-
text = re.sub(r
|
569 |
-
|
570 |
-
|
571 |
-
|
572 |
-
|
573 |
-
|
574 |
-
|
575 |
-
|
576 |
-
|
|
|
|
|
|
|
|
|
|
|
577 |
|
578 |
return text.strip()
|
579 |
|
|
|
445 |
try:
|
446 |
print(f"[LOG] Generating audio for speaker: {speaker}")
|
447 |
processed_text = _preprocess_text_for_tts(text, speaker)
|
448 |
+
|
449 |
+
# Debug: Print the processed text to verify number conversion
|
450 |
+
print("[DEBUG] Processed text for TTS:")
|
451 |
+
print(processed_text)
|
452 |
|
453 |
deepgram_api_url = "https://api.deepgram.com/v1/speak"
|
454 |
params = {
|
|
|
500 |
|
501 |
def _preprocess_text_for_tts(text: str, speaker: str) -> str:
|
502 |
"""
|
503 |
+
Comprehensive preprocessing to ensure natural speech:
|
504 |
+
1) Convert multi-digit numbers to words.
|
505 |
+
2) Handle abbreviations.
|
506 |
+
3) Convert decimals.
|
507 |
+
4) Handle emotive expressions.
|
508 |
+
5) Remove random fillers.
|
509 |
+
6) Capitalize sentence starts.
|
510 |
+
"""
|
511 |
+
# 1) Convert multi-digit numbers (including those with commas) to words
|
512 |
+
def convert_number(match):
|
513 |
+
num_str = match.group()
|
514 |
+
# Remove commas
|
515 |
+
num_clean = num_str.replace(',', '')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
516 |
try:
|
517 |
+
# Convert to integer if possible
|
518 |
+
number = int(num_clean)
|
519 |
+
return num2words(number)
|
520 |
+
except ValueError:
|
521 |
+
try:
|
522 |
+
# If not integer, try float
|
523 |
+
number = float(num_clean)
|
524 |
+
return num2words(number)
|
525 |
+
except ValueError:
|
526 |
+
# If not a number, return as is
|
527 |
+
return num_str
|
528 |
+
|
529 |
+
# Regex to match numbers with optional commas and decimal points
|
530 |
+
text = re.sub(r'\b\d{1,3}(?:,\d{3})*(?:\.\d+)?\b', convert_number, text)
|
531 |
+
|
532 |
+
# 2) Handle abbreviations (e.g., NIA -> N I A)
|
533 |
+
def expand_abbreviation(match):
|
534 |
+
abbr = match.group()
|
535 |
+
# Do not add spaces if the abbreviation is part of a word
|
536 |
+
return ' '.join(abbr)
|
537 |
+
|
538 |
+
text = re.sub(r'\b[A-Z]{2,}\b', expand_abbreviation, text)
|
539 |
+
|
540 |
+
# 3) Handle emotive expressions
|
541 |
+
text = re.sub(r'\b(ha|haha|heh|lol)\b', '(* laughs *)', text, flags=re.IGNORECASE)
|
542 |
+
text = re.sub(r'\bsigh\b', '(* sighs *)', text, flags=re.IGNORECASE)
|
543 |
+
text = re.sub(r'\b(groan|moan)\b', '(* groans *)', text, flags=re.IGNORECASE)
|
544 |
+
|
545 |
+
# 4) Remove random fillers
|
546 |
+
text = re.sub(r'\b(uh|um|ah)\b', '', text, flags=re.IGNORECASE)
|
547 |
+
|
548 |
+
# 5) Capitalize sentence starts
|
549 |
+
def capitalize_sentence(match):
|
550 |
+
return match.group().upper()
|
551 |
+
|
552 |
+
text = re.sub(r'(^\s*\w)|([.!?]\s*\w)', capitalize_sentence, text)
|
553 |
+
|
554 |
+
# 6) Replace multiple spaces with single space
|
555 |
+
text = re.sub(r'\s+', ' ', text)
|
556 |
|
557 |
return text.strip()
|
558 |
|