Spaces:

bunyaminergen
/

CallyticsDemo

Running

App Files Files Community

CallyticsDemo / src /audio /utils.py

bunyaminergen

Initial

1b97239 23 days ago

raw

history blame contribute delete

6.52 kB

	# Standard library imports
	import warnings
	from typing import List, Dict, Union


	class TokenizerUtils:
	    """
	    Stateless helpers for inspecting a tokenizer's vocabulary.

	    The class is meant to be used through its static method. ``__init__`` exists
	    only for completeness and does not set up any state.

	    Methods
	    -------
	    find_numeral_symbol_tokens(tokenizer)
	        Collects the IDs of vocabulary tokens that contain a digit or one of '%', '$', '£'.
	    """

	    def __init__(self):
	        """Create a TokenizerUtils instance; there is nothing to initialize."""

	    @staticmethod
	    def find_numeral_symbol_tokens(tokenizer) -> List[int]:
	        """
	        Collect the IDs of vocabulary tokens that contain a digit or one of '%', '$', '£'.

	        Parameters
	        ----------
	        tokenizer : Any
	            Object exposing a ``get_vocab()`` method that returns a mapping of
	            token string to token ID (e.g. a Hugging Face tokenizer).

	        Returns
	        -------
	        List[int]
	            ``-1`` followed by the IDs of all matching tokens, in the iteration
	            order of the vocabulary mapping.

	        Examples
	        --------
	        >>> TokenizerUtils.find_numeral_symbol_tokens(tokenizer)
	        [-1, 123, 456, 789]
	        """
	        flagged_chars = "0123456789%$£"
	        matching_ids = [
	            vocab_id
	            for piece, vocab_id in tokenizer.get_vocab().items()
	            if any(ch in flagged_chars for ch in piece)
	        ]
	        # The result always starts with -1, ahead of the matched IDs.
	        # NOTE(review): presumably a sentinel expected by the consumer of this list; confirm against the caller.
	        return [-1, *matching_ids]


	class Formatter:
	    """
	    A utility class for formatting audio-related data, such as sentence-speaker mappings (SSM).

	    Methods
	    -------
	    add_indices_to_ssm(ssm: List[Dict], reference_length: int = None) -> List[Dict]:
	        Adds an index key to each item in the SSM list and pads or truncates the result to a reference length.
	    format_ssm_as_dialogue(
	        ssm: List[Dict],
	        print_output: bool = False,
	        return_dict: bool = False
	    ) -> Union[str, Dict[str, List[str]]]:
	        Formats sentence-speaker mappings as a dialogue in conversation order and optionally prints it, or
	        returns a dictionary grouped by speakers.
	    """

	    @staticmethod
	    def add_indices_to_ssm(ssm: List[Dict], reference_length: Union[int, None] = None) -> List[Dict]:
	        """
	        Adds an index key to each item in the SSM list and optionally pads or truncates the result so that its
	        length matches a reference length.

	        Parameters
	        ----------
	        ssm : List[Dict]
	            The final SSM data. Each dictionary receives an ``"index"`` key in place.
	        reference_length : int, optional
	            A reference length to compare the SSM length against, default is None (no length adjustment).

	        Returns
	        -------
	        List[Dict]
	            The SSM data with added index keys. When no adjustment is needed the input list itself is returned.
	            When the lengths differ a new list is returned: truncated to ``reference_length`` items, or padded
	            with placeholder entries (speaker "Unknown", text "[Placeholder]", no timestamps). The caller's
	            list is never resized.

	        Raises
	        ------
	        ValueError
	            If ``reference_length`` is negative.

	        Warns
	        -----
	        UserWarning
	            If ``reference_length`` is given and differs from ``len(ssm)``.
	        """
	        # A negative value would otherwise reach the slice below and silently drop items from the end.
	        if reference_length is not None and reference_length < 0:
	            raise ValueError(f"reference_length must be non-negative, got {reference_length}")

	        if reference_length is not None and len(ssm) != reference_length:
	            warnings.warn(
	                f"Mismatch: SSM Length = {len(ssm)}, Reference Length = {reference_length}. "
	                f"Adjusting to match lengths...",
	                UserWarning,
	                stacklevel=2,  # attribute the warning to the caller, not to this utility
	            )

	        # Every original item is indexed, including those that may be truncated below.
	        for idx, item in enumerate(ssm):
	            item["index"] = idx

	        if reference_length is None or len(ssm) == reference_length:
	            return ssm

	        if len(ssm) > reference_length:
	            return ssm[:reference_length]

	        # Pad in a new list so that the caller's list is treated the same way as in the truncation case.
	        placeholders = [
	            {
	                "index": i,
	                "speaker": "Unknown",
	                "start_time": None,
	                "end_time": None,
	                "text": "[Placeholder]",
	            }
	            for i in range(len(ssm), reference_length)
	        ]
	        return ssm + placeholders

	    @staticmethod
	    def format_ssm_as_dialogue(
	            ssm: List[Dict],
	            print_output: bool = False,
	            return_dict: bool = False
	    ) -> Union[str, Dict[str, List[str]]]:
	        """
	        Formats the sentence-speaker mapping (ssm) as a dialogue and optionally prints the result or returns it
	        as a dictionary grouped by speakers.

	        Parameters
	        ----------
	        ssm : List[Dict]
	            List of sentences with speaker labels. Each item must provide the keys ``'speaker'`` and ``'text'``.
	        print_output : bool, optional
	            Whether to print the formatted dialogue, default is False.
	        return_dict : bool, optional
	            Whether to return the response as a dictionary grouped by speakers, default is False.

	        Returns
	        -------
	        Union[str, Dict[str, List[str]]]
	            If `return_dict` is True, returns a dictionary with speakers as keys and lists of their sentences
	            (whitespace-stripped) as values. Otherwise, returns the formatted dialogue string: one
	            ``"<speaker>: <text>"`` entry per sentence, in the order the sentences appear in `ssm`, separated
	            by blank lines.
	        """
	        dialogue_dict: Dict[str, List[str]] = {}
	        # Kept separately from the grouped dict so that the turn order of the conversation is preserved.
	        dialogue_lines: List[str] = []

	        for sentence in ssm:
	            speaker = sentence['speaker']
	            text = sentence['text'].strip()

	            dialogue_dict.setdefault(speaker, []).append(text)
	            dialogue_lines.append(f"{speaker}: {text}")

	        if print_output:
	            print("Formatted Dialogue:")
	            for line in dialogue_lines:
	                print(line)
	            print()

	        if return_dict:
	            return dialogue_dict

	        return "\n\n".join(dialogue_lines)


	if __name__ == "__main__":
	    # Manual demo: exercise both utilities with small in-memory fixtures and print the results.

	    # noinspection PyMissingOrEmptyDocstring
	    class DummyTokenizer:
	        @staticmethod
	        def get_vocab():
	            # Token IDs are assigned 1..6 in the order the sample tokens are listed.
	            sample_tokens = ("hello", "world", "100%", "$value", "item_123", "£price")
	            return {piece: piece_id for piece_id, piece in enumerate(sample_tokens, start=1)}

	    flagged_ids = TokenizerUtils.find_numeral_symbol_tokens(DummyTokenizer())
	    print(f"Numeral and symbol tokens: {flagged_ids}")

	    demo_turns = (
	        ("Speaker 1", "Hello, how are you?"),
	        ("Speaker 2", "I'm fine, thank you! And you?"),
	        ("Speaker 1", "I'm doing great, thanks for asking."),
	    )
	    demo_ssm = [{"speaker": who, "text": said} for who, said in demo_turns]

	    # The formatter prints the dialogue itself (print_output=True); the returned string is printed again below.
	    dialogue_text = Formatter.format_ssm_as_dialogue(demo_ssm, print_output=True)
	    print(f"Formatted Dialogue:\n{dialogue_text}")