# File size: 5,574 Bytes
# 383520d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
"""
Text normalization and cleaning utilities for CSM-1B TTS system.
Handles common issues like contractions, numbers, and special characters.
"""
import re
import logging

logger = logging.getLogger(__name__)

class TextNormalizer:
    """Text normalization utilities for TTS.

    The class is used as a namespace: it holds the lookup tables that drive
    normalization and exposes classmethods only, so no instance is required.
    """

    # Loggers are singletons per name, so this is the same object as a
    # module-level ``logging.getLogger(__name__)``; binding it here keeps the
    # class self-contained.
    _logger = logging.getLogger(__name__)

    # Common English contractions mapping (keys are lowercase; the replacement
    # is the same word with the apostrophe dropped).
    CONTRACTIONS = {
        "don't": "dont",
        "won't": "wont",
        "can't": "cant",
        "isn't": "isnt",
        "he's": "hes",
        "she's": "shes",
        "they're": "theyre",
        "we're": "were",
        "you're": "youre",
        "that's": "thats",
        "it's": "its",
        "what's": "whats",
        "let's": "lets",
        "who's": "whos",
        "how's": "hows",
        "where's": "wheres",
        "there's": "theres",
        "wouldn't": "wouldnt",
        "shouldn't": "shouldnt",
        "couldn't": "couldnt",
        "hasn't": "hasnt",
        "haven't": "havent",
        "hadn't": "hadnt",
        "didn't": "didnt",
        "i'm": "im",
        "i've": "ive",
        "i'd": "id",
        "i'll": "ill",
        "you've": "youve",
        "you'll": "youll",
        "you'd": "youd",
        "we've": "weve",
        "we'll": "well",
        "we'd": "wed",
        "they've": "theyve",
        "they'll": "theyll",
        "they'd": "theyd",
        "aren't": "arent",
        "weren't": "werent",
        "wasn't": "wasnt",
    }

    # Common abbreviations to expand (matched case-sensitively)
    ABBREVIATIONS = {
        "Mr.": "Mister",
        "Mrs.": "Misses",
        "Dr.": "Doctor",
        "Prof.": "Professor",
        "St.": "Street",
        "Rd.": "Road",
        "Ave.": "Avenue",
        "vs.": "versus",
        "etc.": "etcetera",
        "e.g.": "for example",
        "i.e.": "that is",
        "approx.": "approximately",
    }

    # Simple number words for common numbers; numbers not listed here are
    # left as digits.
    NUMBER_WORDS = {
        "0": "zero",
        "1": "one",
        "2": "two",
        "3": "three",
        "4": "four",
        "5": "five",
        "6": "six",
        "7": "seven",
        "8": "eight",
        "9": "nine",
        "10": "ten",
        "11": "eleven",
        "12": "twelve",
        "13": "thirteen",
        "14": "fourteen",
        "15": "fifteen",
        "16": "sixteen",
        "17": "seventeen",
        "18": "eighteen",
        "19": "nineteen",
        "20": "twenty",
        "30": "thirty",
        "40": "forty",
        "50": "fifty",
        "60": "sixty",
        "70": "seventy",
        "80": "eighty",
        "90": "ninety",
        "100": "one hundred",
        "1000": "one thousand",
        "1000000": "one million",
        "1000000000": "one billion",
    }

    # Symbols replaced by a spoken word. The padding spaces keep the word
    # separate from its neighbours; surplus spaces are collapsed afterwards.
    SYMBOL_WORDS = {
        "&": " and ",
        "%": " percent ",
        "@": " at ",
        "#": " number ",
        "$": " dollar ",
        "€": " euro ",
        "£": " pound ",
        "¥": " yen ",
    }

    # Fixed patterns, compiled once at class creation.
    _BRACKETED_RE = re.compile(r'\[.*?\]')
    _DATE_RE = re.compile(r'\b(\d{1,2})/(\d{1,2})/(\d{4})\b')
    _NUMBER_RE = re.compile(r'\b\d+\b')
    _WHITESPACE_RE = re.compile(r'\s+')
    _SENTENCE_BOUNDARY_RE = re.compile(r'(?<=[.!?])\s+')
    _SYMBOL_TABLE = str.maketrans(SYMBOL_WORDS)

    # Accept both the ASCII and the typographic apostrophe in contractions.
    _APOSTROPHE_CLASS = "['\u2019]"
    _TERMINAL_PUNCTUATION = ('.', '!', '?', ';', ':', ',')

    @staticmethod
    def _match_case(original: str, replacement: str) -> str:
        """Return *replacement* re-cased to follow *original*'s capitalization.

        A fully upper-case original (more than one letter) yields an
        upper-case replacement; a capitalized original yields a capitalized
        replacement; anything else returns *replacement* unchanged.
        """
        letters = [ch for ch in original if ch.isalpha()]
        if len(letters) > 1 and all(ch.isupper() for ch in letters):
            return replacement.upper()
        if original[:1].isupper():
            return replacement[:1].upper() + replacement[1:]
        return replacement

    @classmethod
    def _expand_contractions(cls, text: str) -> str:
        """Replace every known contraction in *text*, keeping its capitalization."""
        for contraction, replacement in cls.CONTRACTIONS.items():
            # Escape the pieces around the apostrophe and rejoin them with a
            # character class so "don't" and "don’t" are both recognised.
            body = cls._APOSTROPHE_CLASS.join(
                re.escape(part) for part in contraction.split("'")
            )
            text = re.sub(
                r'\b' + body + r'\b',
                lambda match, plain=replacement: cls._match_case(match.group(0), plain),
                text,
                flags=re.IGNORECASE,
            )
        return text

    @classmethod
    def _expand_abbreviations(cls, text: str) -> str:
        """Expand known abbreviations that are not the tail of a longer word."""
        for abbr, expanded in cls.ABBREVIATIONS.items():
            # The lookbehind stops "vs." from matching inside e.g. "revs.".
            # A callable replacement avoids backslash processing of the text.
            text = re.sub(
                r'(?<!\w)' + re.escape(abbr),
                lambda _match, full=expanded: full,
                text,
            )
        return text

    @classmethod
    def _spell_out_numbers(cls, text: str) -> str:
        """Replace standalone digit runs found in ``NUMBER_WORDS`` with words."""
        return cls._NUMBER_RE.sub(
            lambda match: cls.NUMBER_WORDS.get(match.group(0), match.group(0)),
            text,
        )

    @classmethod
    def normalize_text(cls, text: str) -> str:
        """
        Normalize text for TTS: handle contractions, punctuation, and special cases.

        Steps, in order: strip ``[...]`` voice instructions, expand
        contractions and abbreviations, split MM/DD/YYYY dates into separate
        numbers, spell out known numbers, replace symbols with words, collapse
        whitespace, and append a period if the text lacks closing punctuation.

        Args:
            text: Input text to normalize

        Returns:
            Normalized text ready for TTS. Falsy input (``None`` or ``""``) is
            returned unchanged; input that is empty after cleaning (for
            example only a bracketed instruction) yields ``""``.
        """
        if not text:
            return text

        # Log original text for debugging
        cls._logger.debug("Normalizing text: '%s'", text)

        # Remove voice instructions in square brackets
        text = cls._BRACKETED_RE.sub('', text)

        text = cls._expand_contractions(text)
        text = cls._expand_abbreviations(text)

        # Dates must be split before numbers are spelled out; otherwise
        # "1/2/2024" turns into "one/two/2024" and no longer looks like a date.
        text = cls._DATE_RE.sub(r'\1 \2 \3', text)

        # Handle numbers - only convert standalone numbers
        text = cls._spell_out_numbers(text)

        # Replace problematic symbols in a single pass
        text = text.translate(cls._SYMBOL_TABLE)

        # Fix excessive spaces
        text = cls._WHITESPACE_RE.sub(' ', text).strip()

        # Nothing speakable left: return the empty string instead of indexing
        # into it below.
        if not text:
            cls._logger.debug("Normalized text: '%s'", text)
            return text

        # Ensure sentence ends with punctuation
        if not text.endswith(cls._TERMINAL_PUNCTUATION):
            text += '.'

        cls._logger.debug("Normalized text: '%s'", text)
        return text

    @classmethod
    def split_into_sentences(cls, text: str) -> list:
        """
        Split text into sentences for better TTS performance.

        The text is normalized with ``normalize_text`` first and then split on
        whitespace that follows ``.``, ``!`` or ``?``.

        Args:
            text: Input text to split

        Returns:
            List of non-empty sentences; an empty list when there is nothing
            to speak (``None``, empty, or empty after normalization).
        """
        # Normalize first
        text = cls.normalize_text(text)
        if not text:
            return []

        # Split on sentence boundaries and drop empty fragments
        return [
            sentence
            for sentence in cls._SENTENCE_BOUNDARY_RE.split(text)
            if sentence.strip()
        ]

def clean_text_for_tts(text: str) -> str:
    """Return *text* normalized for TTS.

    Module-level convenience wrapper that delegates to
    ``TextNormalizer.normalize_text`` and passes its result through untouched.
    """
    normalized = TextNormalizer.normalize_text(text)
    return normalized