from dataclasses import dataclass

from tokenizers import Tokenizer


@dataclass
class Word:
    tokens: list[int]      # token ids that make up this word
    text: str              # decoded surface form of the word
    logprob: float         # joint log-probability of the word's tokens
    context: list[int]     # all token ids preceding the word


def split_into_words(token_probs: list[tuple[int, float]], tokenizer: Tokenizer) -> list[Word]:
    """Group token-level (token_id, logprob) pairs into whole words."""
    words: list[Word] = []
    current_word: list[int] = []
    current_log_probs: list[float] = []
    current_word_first_token_index: int = 0
    all_tokens: list[int] = [token_id for token_id, _ in token_probs]

    for i, (token_id, logprob) in enumerate(token_probs):
        token: str = tokenizer.decode([token_id])
        # chr(9601) is "▁", the SentencePiece metaspace marker that prefixes
        # tokens which begin a new word. A token continues the current word
        # only if it lacks that marker and is purely alphabetic.
        if not token.startswith(chr(9601)) and token.isalpha():
            current_word.append(token_id)
            current_log_probs.append(logprob)
        else:
            # Flush the completed word; summing the per-token logprobs gives
            # the joint log-probability of the whole word, and the context is
            # every token that precedes the word's first token.
            if current_word:
                words.append(
                    Word(
                        current_word,
                        tokenizer.decode(current_word),
                        sum(current_log_probs),
                        all_tokens[:current_word_first_token_index],
                    )
                )
            current_word = [token_id]
            current_log_probs = [logprob]
            current_word_first_token_index = i

    # Flush the final word, if any.
    if current_word:
        words.append(
            Word(
                current_word,
                tokenizer.decode(current_word),
                sum(current_log_probs),
                all_tokens[:current_word_first_token_index],
            )
        )

    return words
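
# A minimal usage sketch. The assumptions here are not from the original:
# "tokenizer.json" is a placeholder path to any SentencePiece/metaspace-style
# tokenizer file (the chr(9601) check above expects that style), and the flat
# -0.5 logprobs are fabricated stand-ins for values that would normally come
# from the model that generated the tokens.
if __name__ == "__main__":
    tokenizer = Tokenizer.from_file("tokenizer.json")  # hypothetical path
    encoding = tokenizer.encode("The quick brown fox")
    token_probs = [(token_id, -0.5) for token_id in encoding.ids]  # fake logprobs
    for word in split_into_words(token_probs, tokenizer):
        print(f"{word.text!r}: logprob={word.logprob:.2f}, context={word.context}")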