Spaces:

jameszokah
/

jamiya

Running

File size: 5,574 Bytes

383520d

"""
Text normalization and cleaning utilities for CSM-1B TTS system.
Handles common issues like contractions, numbers, and special characters.
"""
import re
import logging

logger = logging.getLogger(__name__)

class TextNormalizer:
    """Text normalization utilities for TTS."""
    
    # Common English contractions mapping
    CONTRACTIONS = {
        "don't": "dont",
        "won't": "wont",
        "can't": "cant",
        "isn't": "isnt",
        "he's": "hes",
        "she's": "shes",
        "they're": "theyre",
        "we're": "were",
        "you're": "youre",
        "that's": "thats",
        "it's": "its",
        "what's": "whats",
        "let's": "lets",
        "who's": "whos",
        "how's": "hows",
        "where's": "wheres",
        "there's": "theres",
        "wouldn't": "wouldnt",
        "shouldn't": "shouldnt",
        "couldn't": "couldnt",
        "hasn't": "hasnt",
        "haven't": "havent",
        "hadn't": "hadnt",
        "didn't": "didnt",
        "i'm": "im",
        "i've": "ive",
        "i'd": "id",
        "i'll": "ill",
        "you've": "youve",
        "you'll": "youll",
        "you'd": "youd",
        "we've": "weve",
        "we'll": "well",
        "we'd": "wed",
        "they've": "theyve",
        "they'll": "theyll",
        "they'd": "theyd",
        "aren't": "arent",
        "weren't": "werent",
        "wasn't": "wasnt",
    }
    
    # Common abbreviations to expand
    ABBREVIATIONS = {
        "Mr.": "Mister",
        "Mrs.": "Misses",
        "Dr.": "Doctor",
        "Prof.": "Professor",
        "St.": "Street",
        "Rd.": "Road",
        "Ave.": "Avenue",
        "vs.": "versus",
        "etc.": "etcetera",
        "e.g.": "for example",
        "i.e.": "that is",
        "approx.": "approximately",
    }
    
    # Simple number words for common numbers
    NUMBER_WORDS = {
        "0": "zero",
        "1": "one",
        "2": "two",
        "3": "three",
        "4": "four",
        "5": "five",
        "6": "six",
        "7": "seven",
        "8": "eight",
        "9": "nine",
        "10": "ten",
        "11": "eleven",
        "12": "twelve",
        "13": "thirteen",
        "14": "fourteen",
        "15": "fifteen",
        "16": "sixteen",
        "17": "seventeen",
        "18": "eighteen",
        "19": "nineteen",
        "20": "twenty",
        "30": "thirty",
        "40": "forty",
        "50": "fifty",
        "60": "sixty",
        "70": "seventy",
        "80": "eighty",
        "90": "ninety",
        "100": "one hundred",
        "1000": "one thousand",
        "1000000": "one million",
        "1000000000": "one billion",
    }
    
    @classmethod
    def normalize_text(cls, text: str) -> str:
        """
        Normalize text for TTS: handle contractions, punctuation, and special cases.
        
        Args:
            text: Input text to normalize
            
        Returns:
            Normalized text ready for TTS
        """
        if not text:
            return text
            
        # Log original text for debugging
        logger.debug(f"Normalizing text: '{text}'")
        
        # Remove voice instructions in square brackets
        text = re.sub(r'\[.*?\]', '', text)
        
        # Handle contractions - preserving case sensitivity
        for contraction, replacement in cls.CONTRACTIONS.items():
            # Case insensitive replacement
            text = re.sub(r'\b' + re.escape(contraction) + r'\b', replacement, text, flags=re.IGNORECASE)
        
        # Expand common abbreviations
        for abbr, expanded in cls.ABBREVIATIONS.items():
            text = text.replace(abbr, expanded)
        
        # Handle numbers - only convert standalone numbers
        def replace_number(match):
            number = match.group(0)
            if number in cls.NUMBER_WORDS:
                return cls.NUMBER_WORDS[number]
            return number
            
        text = re.sub(r'\b\d+\b', replace_number, text)
        
        # Replace problematic symbols
        text = text.replace("&", " and ")
        text = text.replace("%", " percent ")
        text = text.replace("@", " at ")
        text = text.replace("#", " number ")
        text = text.replace("$", " dollar ")
        text = text.replace("€", " euro ")
        text = text.replace("£", " pound ")
        text = text.replace("¥", " yen ")
        
        # Handle dates in MM/DD/YYYY format
        text = re.sub(r'\b(\d{1,2})/(\d{1,2})/(\d{4})\b', r'\1 \2 \3', text)
        
        # Fix excessive spaces
        text = re.sub(r'\s+', ' ', text).strip()
        
        # Ensure sentence ends with punctuation
        if not text[-1] in ['.', '!', '?', ';', ':', ',']:
            text = text + '.'
            
        logger.debug(f"Normalized text: '{text}'")
        return text
    
    @classmethod
    def split_into_sentences(cls, text: str) -> list:
        """
        Split text into sentences for better TTS performance.
        
        Args:
            text: Input text to split
            
        Returns:
            List of sentences
        """
        # Normalize first
        text = cls.normalize_text(text)
        
        # Split on sentence boundaries
        sentences = re.split(r'(?<=[.!?])\s+', text)
        
        # Remove empty sentences
        sentences = [s for s in sentences if s.strip()]
        
        return sentences

def clean_text_for_tts(text: str) -> str:
    """Clean and normalize text for TTS processing."""
    return TextNormalizer.normalize_text(text)