Spaces:

bunyaminergen
/

CallyticsDemo

Running

File size: 6,516 Bytes

1b97239

# Standard library imports
import warnings
from typing import List, Dict, Union


class TokenizerUtils:
    """
    Stateless helpers for inspecting a tokenizer's vocabulary.

    The class only groups static helpers; instantiating it is possible but
    gives no additional behaviour, because no state is stored.

    Methods
    -------
    find_numeral_symbol_tokens(tokenizer)
        Collects the IDs of vocabulary entries containing a digit or one of
        the symbols '%', '$', '£'.
    """

    def __init__(self):
        """No-op constructor; the class holds no instance state."""

    @staticmethod
    def find_numeral_symbol_tokens(tokenizer) -> List[int]:
        """
        Collect the IDs of vocabulary entries that contain a digit or one of '%', '$', '£'.

        Parameters
        ----------
        tokenizer : Any
            Object exposing a ``get_vocab()`` method that returns a mapping of
            token string to token ID.

        Returns
        -------
        List[int]
            A list that always starts with ``-1``, followed by the matching
            token IDs in vocabulary iteration order.

        Examples
        --------
        >>> TokenizerUtils.find_numeral_symbol_tokens(tokenizer)
        [-1, 123, 456, 789]
        """
        flagged_chars = "0123456789%$£"
        matching_ids = [
            vocab_id
            for piece, vocab_id in tokenizer.get_vocab().items()
            if any(ch in flagged_chars for ch in piece)
        ]
        # NOTE(review): the leading -1 is presumably a placeholder expected by
        # whatever consumes this list - confirm with the caller.
        return [-1, *matching_ids]


class Formatter:
    """
    A utility class for formatting audio-related data, such as sentence-speaker mappings.

    Methods
    -------
    add_indices_to_ssm(ssm: List[Dict], reference_length: Union[int, None] = None) -> List[Dict]:
        Adds an index key to each item in the SSM list and pads or truncates the list to a reference length.
    format_ssm_as_dialogue(
        ssm: List[Dict],
        print_output: bool = False,
        return_dict: bool = False
    ) -> Union[str, Dict[str, List[str]]]:
        Formats sentence-speaker mappings into a readable dialogue (in conversation order) and optionally
        prints it or returns a dictionary grouped by speakers.
    """

    @staticmethod
    def add_indices_to_ssm(ssm: List[Dict], reference_length: Union[int, None] = None) -> List[Dict]:
        """
        Adds an index key to each item in the SSM list and, when a reference length is given, pads or truncates
        the list so that its length matches.

        Parameters
        ----------
        ssm : List[Dict]
            The final SSM data. Each item receives an ``"index"`` key in place.
        reference_length : int, optional
            A reference length to compare the SSM length against, default is None (no length adjustment).

        Returns
        -------
        List[Dict]
            The SSM data with added index keys. If it was longer than `reference_length`, a truncated copy
            is returned; if it was shorter, placeholder entries (speaker ``"Unknown"``, text
            ``"[Placeholder]"``) are appended to the passed-in list, which is then returned.

        Raises
        ------
        ValueError
            If `reference_length` is negative.

        Warns
        -----
        UserWarning
            If the SSM length differs from `reference_length`.
        """
        if reference_length is not None and reference_length < 0:
            # A negative value would otherwise be interpreted as a from-the-end slice below
            # and silently yield a list of the wrong length.
            raise ValueError(f"reference_length must be non-negative, got {reference_length}")

        if reference_length is not None and len(ssm) != reference_length:
            warnings.warn(
                f"Mismatch: SSM Length = {len(ssm)}, Reference Length = {reference_length}. "
                f"Adjusting to match lengths...",
                UserWarning,
                stacklevel=2,  # attribute the warning to the caller, not to this helper
            )

        # Indices are written into the caller's dictionaries (in-place update).
        for idx, item in enumerate(ssm):
            item["index"] = idx

        if reference_length is not None:
            if len(ssm) > reference_length:
                # Slicing creates a new list; the caller's list keeps its original length.
                ssm = ssm[:reference_length]
            elif len(ssm) < reference_length:
                # Padding extends the passed-in list itself.
                for i in range(len(ssm), reference_length):
                    ssm.append({
                        "index": i,
                        "speaker": "Unknown",
                        "start_time": None,
                        "end_time": None,
                        "text": "[Placeholder]"
                    })

        return ssm

    @staticmethod
    def format_ssm_as_dialogue(
            ssm: List[Dict],
            print_output: bool = False,
            return_dict: bool = False
    ) -> Union[str, Dict[str, List[str]]]:
        """
        Formats the sentence-speaker mapping (ssm) as a dialogue and optionally prints the result or returns it as a
        dictionary grouped by speakers.

        The dialogue string (and the printed output) keeps the sentences in the order in which they appear in
        `ssm`, so alternating turns between speakers are preserved. Only the dictionary form groups sentences
        by speaker.

        Parameters
        ----------
        ssm : List[Dict]
            List of sentences with speaker labels. Each item must provide the keys ``'speaker'`` and ``'text'``.
        print_output : bool, optional
            Whether to print the formatted dialogue, default is False.
        return_dict : bool, optional
            Whether to return the response as a dictionary grouped by speakers, default is False.

        Returns
        -------
        Union[str, Dict[str, List[str]]]
            If `return_dict` is True, returns a dictionary with speakers as keys and lists of their sentences as values.
            Otherwise, returns the formatted dialogue string: one ``"<speaker>: <text>"`` entry per sentence,
            in conversation order, separated by blank lines.
        """
        dialogue_dict: Dict[str, List[str]] = {}
        dialogue_lines: List[str] = []

        for sentence in ssm:
            speaker = sentence['speaker']
            text = sentence['text'].strip()

            dialogue_dict.setdefault(speaker, []).append(text)
            # Recorded separately from the grouping so the turn order survives.
            dialogue_lines.append(f"{speaker}: {text}")

        if print_output:
            print("Formatted Dialogue:")
            for line in dialogue_lines:
                print(line)
            print()

        if return_dict:
            return dialogue_dict

        return "\n\n".join(dialogue_lines)


if __name__ == "__main__":
    # Minimal stand-in exposing only the `get_vocab` method used by the demo below.
    # noinspection PyMissingOrEmptyDocstring
    class DummyTokenizer:
        @staticmethod
        def get_vocab():
            sample_tokens = ["hello", "world", "100%", "$value", "item_123", "£price"]
            # IDs are assigned 1..N in the order the sample tokens are listed.
            return {piece: piece_id for piece_id, piece in enumerate(sample_tokens, start=1)}


    # Demo 1: list the IDs of vocabulary entries containing digits or currency/percent symbols.
    numeral_ids = TokenizerUtils.find_numeral_symbol_tokens(DummyTokenizer())
    print(f"Numeral and symbol tokens: {numeral_ids}")

    # Demo 2: format a short three-turn conversation; the formatter prints it once
    # (print_output=True) and the returned string is printed again afterwards.
    demo_turns = [
        {"speaker": who, "text": said}
        for who, said in (
            ("Speaker 1", "Hello, how are you?"),
            ("Speaker 2", "I'm fine, thank you! And you?"),
            ("Speaker 1", "I'm doing great, thanks for asking."),
        )
    ]

    dialogue_text = Formatter.format_ssm_as_dialogue(demo_turns, print_output=True)
    print(f"Formatted Dialogue:\n{dialogue_text}")