mebubo committed
Commit 87af3eb · 1 Parent(s): da342d0
Files changed (1)
  1. app.py +10 -11
app.py CHANGED
@@ -15,6 +15,9 @@ class Word:
     logprob: float
     context: list[int]
 
+def starts_with_space(token: str) -> bool:
+    return token.startswith(chr(9601)) or token.startswith(chr(288))
+
 def split_into_words(token_probs: list[tuple[int, float]], tokenizer: Tokenizer) -> list[Word]:
     words: list[Word] = []
     current_word: list[int] = []
@@ -31,7 +34,7 @@ def split_into_words(token_probs: list[tuple[int, float]], tokenizer: Tokenizer)
 
     for i, (token_id, logprob) in enumerate(token_probs):
         token: str = tokenizer.convert_ids_to_tokens([token_id])[0]
-        if not token.startswith(chr(9601)) and token.isalpha():
+        if not starts_with_space(token) and token.isalpha():
             current_word.append(token_id)
             current_log_probs.append(logprob)
         else:
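Note (not part of the commit): chr(9601) is '▁' (U+2581), the SentencePiece metaspace that marks word-initial tokens in Mistral-style tokenizers, while chr(288) is 'Ġ' (U+0120), the marker GPT-2-style byte-level BPE tokenizers (including Llama 3's) use for a leading space. A minimal sketch of the helper's behavior, with illustrative token strings:

    def starts_with_space(token: str) -> bool:
        # '\u2581' == chr(9601) == '▁' (SentencePiece metaspace)
        # '\u0120' == chr(288)  == 'Ġ' (GPT-2-style byte-level BPE)
        return token.startswith(chr(9601)) or token.startswith(chr(288))

    assert starts_with_space("\u2581refused")   # word-initial, SentencePiece style
    assert starts_with_space("\u0120refused")   # word-initial, byte-level BPE style
    assert not starts_with_space("fused")       # continuation piece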
@@ -80,12 +83,13 @@ def generate_outputs(model: PreTrainedModel, inputs: BatchEncoding, num_samples:
     outputs = model.generate(
         input_ids=input_ids,
         attention_mask=attention_mask,
-        max_length=input_ids.shape[-1] + 5,
+        max_new_tokens=4,
         num_return_sequences=num_samples,
         temperature=1.0,
         top_k=50,
         top_p=0.95,
         do_sample=True
+        # num_beams=num_samples
     )
     return outputs
 
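Note: max_length caps the total sequence (prompt plus continuation), so the old setting allowed at most 5 new tokens and depended on the prompt width of the batch; max_new_tokens caps the continuation directly. For a single unpadded prompt the two calls below are roughly equivalent (sketch, not part of the commit; both parameters are accepted by transformers' generate()):

    # budget includes the prompt: at most input length + 4 total tokens
    out_a = model.generate(input_ids=input_ids, attention_mask=attention_mask,
                           max_length=input_ids.shape[-1] + 4, do_sample=True)
    # budget counts only new tokens: at most 4 per returned sequence
    out_b = model.generate(input_ids=input_ids, attention_mask=attention_mask,
                           max_new_tokens=4, do_sample=True)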
 
@@ -96,8 +100,8 @@ def extract_replacements(outputs: GenerateOutput | torch.LongTensor, tokenizer:
         for j in range(num_samples):
             generated_ids = outputs[i * num_samples + j][input_len:]
             new_word = tokenizer.convert_ids_to_tokens(generated_ids.tolist())[0]
-            if new_word.startswith(chr(9601)):
-                replacements.append(new_word)
+            if starts_with_space(new_word):
+                replacements.append(new_word[1:])
         all_new_words.append(replacements)
     return all_new_words
 
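Note: both boundary markers are single characters, so new_word[1:] strips the marker and keeps the bare word; previously the '▁' was left in the replacement string. Sketch:

    assert "\u2581refused"[1:] == "refused"   # SentencePiece metaspace stripped
    assert "\u0120refused"[1:] == "refused"   # byte-level BPE marker stripped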
 
@@ -105,20 +109,18 @@ def extract_replacements(outputs: GenerateOutput | torch.LongTensor, tokenizer:
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-model_name = "mistralai/Mistral-7B-v0.1"
+# model_name = "mistralai/Mistral-7B-v0.1"
+model_name = "unsloth/Llama-3.2-1B"
 model, tokenizer = load_model_and_tokenizer(model_name, device)
 
 #%%
-
 input_text = "He asked me to prostrate myself before the king, but I rifused."
 inputs: BatchEncoding = tokenize(input_text, tokenizer, device)
 
 #%%
-
 token_probs: list[tuple[int, float]] = calculate_log_probabilities(model, tokenizer, inputs)
 
 #%%
-
 words = split_into_words(token_probs, tokenizer)
 log_prob_threshold = -5.0
 low_prob_words = [word for word in words if word.logprob < log_prob_threshold]
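Note: the model swap is what motivates the tokenizer change above: mistralai/Mistral-7B-v0.1 ships a SentencePiece tokenizer ('▁' prefixes), while unsloth/Llama-3.2-1B uses a GPT-2-style byte-level BPE ('Ġ' prefixes), and the much smaller checkpoint presumably speeds up iteration. A quick way to check which convention a tokenizer uses (sketch, assumes both checkpoints are accessible):

    from transformers import AutoTokenizer

    for name in ["mistralai/Mistral-7B-v0.1", "unsloth/Llama-3.2-1B"]:
        tok = AutoTokenizer.from_pretrained(name)
        # word-initial pieces typically show as '▁refused' vs 'Ġrefused'
        print(name, tok.tokenize(" refused"))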
@@ -129,7 +131,6 @@ inputs = prepare_inputs(contexts, tokenizer, device)
 input_ids = inputs["input_ids"]
 
 #%%
-
 num_samples = 5
 start_time = time.time()
 outputs = generate_outputs(model, inputs, num_samples)
@@ -140,13 +141,11 @@ print(f"Total time taken for replacements: {end_time - start_time:.4f} seconds")
 replacements_batch = extract_replacements(outputs, tokenizer, input_ids.shape[0], input_ids.shape[1], num_samples)
 
 #%%
-
 for word, replacements in zip(low_prob_words, replacements_batch):
     print(f"Original word: {word.text}, Log Probability: {word.logprob:.4f}")
     print(f"Proposed replacements: {replacements}")
 
 # %%
-
 generated_ids = outputs[:, input_ids.shape[-1]:]
 for g in generated_ids:
     print(tokenizer.convert_ids_to_tokens(g.tolist()))
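Note: this last cell prints the raw continuation tokens of every returned sequence. generate() with num_return_sequences=k returns a flat (batch_size * k, seq_len) tensor, so row i * k + j is sample j for input row i, which is the indexing extract_replacements relies on. An equivalent view-based grouping (sketch reusing the script's variables):

    new_tokens = outputs[:, input_ids.shape[-1]:]                   # continuations only
    grouped = new_tokens.view(input_ids.shape[0], num_samples, -1)  # (batch, sample, tokens)
    print(tokenizer.convert_ids_to_tokens(grouped[0, 0].tolist()))  # sample 0 for row 0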
 