# Standard library imports
import warnings
from typing import Dict, List, Optional, Union

# Characters whose presence marks a vocabulary token as "numeral or symbol".
_NUMERAL_SYMBOL_CHARS = frozenset("0123456789%$£")


class TokenizerUtils:
    """
    Utility class for token-related operations, in particular for finding
    vocabulary tokens that contain numerals or specific symbols.

    The class is meant to be used through its static methods; ``__init__`` is
    present for completeness only and performs no initialization.

    Methods
    -------
    find_numeral_symbol_tokens(tokenizer)
        Returns a list of token IDs whose tokens include numerals or one of
        the symbols '%', '$' or '£'.
    """

    def __init__(self):
        """Initialize the TokenizerUtils class. Present for completeness only."""
        pass

    @staticmethod
    def find_numeral_symbol_tokens(tokenizer) -> List[int]:
        """
        Collect the IDs of vocabulary tokens containing a digit, '%', '$' or '£'.

        Parameters
        ----------
        tokenizer : Any
            Object exposing a ``get_vocab()`` method that returns a mapping of
            token string to token ID (typically a Hugging Face tokenizer).

        Returns
        -------
        List[int]
            ``-1`` followed by the matching token IDs, in vocabulary
            iteration order.

        Notes
        -----
        The leading ``-1`` is always present, even when nothing matches.
        NOTE(review): presumably a sentinel expected by the consumer of this
        list (e.g. a decoder's suppress-token option) -- confirm with caller.

        Examples
        --------
        >>> TokenizerUtils.find_numeral_symbol_tokens(tokenizer)
        [-1, 123, 456, 789]
        """
        numeral_symbol_tokens = [-1]
        numeral_symbol_tokens.extend(
            token_id
            for token, token_id in tokenizer.get_vocab().items()
            if any(char in _NUMERAL_SYMBOL_CHARS for char in token)
        )
        return numeral_symbol_tokens


class Formatter:
    """
    A utility class for formatting audio-related data, such as
    sentence-speaker mappings (SSM).

    Methods
    -------
    add_indices_to_ssm(ssm, reference_length=None)
        Adds an ``"index"`` key to each SSM item and reconciles the list
        length with an optional reference length.
    format_ssm_as_dialogue(ssm, print_output=False, return_dict=False)
        Formats an SSM as a readable dialogue string, optionally printing it,
        or returns the sentences grouped by speaker.
    """

    @staticmethod
    def add_indices_to_ssm(
            ssm: List[Dict],
            reference_length: Optional[int] = None,
    ) -> List[Dict]:
        """
        Add an ``"index"`` key to each SSM item and optionally match a
        reference length.

        Parameters
        ----------
        ssm : List[Dict]
            The final SSM data. Each item receives an ``"index"`` key in
            place; the list itself is never resized.
        reference_length : int, optional
            Length the result should have. ``None`` (default) disables the
            length check. Negative values are treated as 0.

        Returns
        -------
        List[Dict]
            ``ssm`` itself when no length adjustment is needed. Otherwise a
            new list, truncated to ``reference_length`` or padded with
            placeholder items (speaker ``"Unknown"``, text
            ``"[Placeholder]"``, no timestamps).

        Warns
        -----
        UserWarning
            If ``reference_length`` is given and differs from ``len(ssm)``.
        """
        current_length = len(ssm)
        mismatch = reference_length is not None and current_length != reference_length

        if mismatch:
            warnings.warn(
                f"Mismatch: SSM Length = {len(ssm)}, Reference Length = {reference_length}. "
                f"Adjusting to match lengths...",
                UserWarning,
                stacklevel=2,
            )

        for idx, item in enumerate(ssm):
            item["index"] = idx

        if not mismatch:
            return ssm

        if current_length > reference_length:
            # Clamp so a negative reference yields an empty list instead of
            # slicing relative to the end of the list.
            return ssm[:max(reference_length, 0)]

        # Pad into a new list so the caller's list is not resized behind its
        # back (truncation above already returns a new list).
        placeholders = [
            {
                "index": i,
                "speaker": "Unknown",
                "start_time": None,
                "end_time": None,
                "text": "[Placeholder]",
            }
            for i in range(current_length, reference_length)
        ]
        return ssm + placeholders

    @staticmethod
    def format_ssm_as_dialogue(
            ssm: List[Dict],
            print_output: bool = False,
            return_dict: bool = False
    ) -> Union[str, Dict[str, List[str]]]:
        """
        Format a sentence-speaker mapping as a dialogue.

        Parameters
        ----------
        ssm : List[Dict]
            Sentences in spoken order; each item must provide ``"speaker"``
            and ``"text"`` keys. Text is stripped of surrounding whitespace.
        print_output : bool, optional
            Whether to print the formatted dialogue, default is False.
        return_dict : bool, optional
            Whether to return the sentences grouped by speaker instead of the
            dialogue string, default is False.

        Returns
        -------
        Union[str, Dict[str, List[str]]]
            If ``return_dict`` is True, a dictionary with speakers as keys and
            lists of their sentences as values. Otherwise the dialogue as
            ``"<speaker>: <text>"`` lines separated by blank lines, in the
            same order as ``ssm``.
        """
        dialogue_dict: Dict[str, List[str]] = {}
        # Kept separately from the grouped dict so the rendered dialogue
        # preserves turn order (A, B, A) rather than grouping by speaker.
        ordered_lines: List[str] = []

        for sentence in ssm:
            speaker = sentence['speaker']
            text = sentence['text'].strip()
            dialogue_dict.setdefault(speaker, []).append(text)
            ordered_lines.append(f"{speaker}: {text}")

        if print_output:
            print("Formatted Dialogue:")
            for line in ordered_lines:
                print(line)
                print()

        if return_dict:
            return dialogue_dict

        return "\n\n".join(ordered_lines)


if __name__ == "__main__":
    # noinspection PyMissingOrEmptyDocstring
    class DummyTokenizer:
        @staticmethod
        def get_vocab():
            return {
                "hello": 1,
                "world": 2,
                "100%": 3,
                "$value": 4,
                "item_123": 5,
                "£price": 6
            }

    dummy_tokenizer = DummyTokenizer()
    numeral_tokens = TokenizerUtils.find_numeral_symbol_tokens(dummy_tokenizer)
    print(f"Numeral and symbol tokens: {numeral_tokens}")

    speaker_sentence_mapping = [
        {"speaker": "Speaker 1", "text": "Hello, how are you?"},
        {"speaker": "Speaker 2", "text": "I'm fine, thank you! And you?"},
        {"speaker": "Speaker 1", "text": "I'm doing great, thanks for asking."}
    ]
    formatted_dialogue_str = Formatter.format_ssm_as_dialogue(speaker_sentence_mapping, print_output=True)
    print(f"Formatted Dialogue:\n{formatted_dialogue_str}")