mebubo committed on
Commit
e72ea09
·
1 Parent(s): 230a441
Files changed (1) hide show
  1. text_processing.py +10 -3
text_processing.py CHANGED
@@ -6,13 +6,14 @@ class Word:
6
  tokens: list[int]
7
  text: str
8
  logprob: float
9
- first_token_index: int
10
 
11
  def split_into_words(token_probs: list[tuple[int, float]], tokenizer: Tokenizer) -> list[Word]:
12
  words: list[Word] = []
13
  current_word: list[int] = []
14
  current_log_probs: list[float] = []
15
  current_word_first_token_index: int = 0
 
16
 
17
  for i, (token_id, logprob) in enumerate(token_probs):
18
  token: str = tokenizer.decode([token_id])
@@ -21,12 +22,18 @@ def split_into_words(token_probs: list[tuple[int, float]], tokenizer: Tokenizer)
21
  current_log_probs.append(logprob)
22
  else:
23
  if current_word:
24
- words.append(Word(current_word, tokenizer.decode(current_word), sum(current_log_probs), current_word_first_token_index))
 
 
 
25
  current_word = [token_id]
26
  current_log_probs = [logprob]
27
  current_word_first_token_index = i
28
 
29
  if current_word:
30
- words.append(Word(current_word, tokenizer.decode(current_word), sum(current_log_probs), current_word_first_token_index))
 
 
 
31
 
32
  return words
 
6
  tokens: list[int]
7
  text: str
8
  logprob: float
9
+ context: list[int]
10
 
11
  def split_into_words(token_probs: list[tuple[int, float]], tokenizer: Tokenizer) -> list[Word]:
12
  words: list[Word] = []
13
  current_word: list[int] = []
14
  current_log_probs: list[float] = []
15
  current_word_first_token_index: int = 0
16
+ all_tokens: list[int] = [token_id for token_id, _ in token_probs]
17
 
18
  for i, (token_id, logprob) in enumerate(token_probs):
19
  token: str = tokenizer.decode([token_id])
 
22
  current_log_probs.append(logprob)
23
  else:
24
  if current_word:
25
+ words.append(Word(current_word,
26
+ tokenizer.decode(current_word),
27
+ sum(current_log_probs),
28
+ all_tokens[:current_word_first_token_index]))
29
  current_word = [token_id]
30
  current_log_probs = [logprob]
31
  current_word_first_token_index = i
32
 
33
  if current_word:
34
+ words.append(Word(current_word,
35
+ tokenizer.decode(current_word),
36
+ sum(current_log_probs),
37
+ all_tokens[:current_word_first_token_index]))
38
 
39
  return words