Owos committed
Commit 8c233e9 · 1 Parent(s): 5f24ebc

updated token error

Files changed (1): app.py (+21 -20)
app.py CHANGED
@@ -42,12 +42,12 @@ class MLMDataset(Dataset):
     def __init__(self,sentence,tokenizer,num_samples,MLM_MASK_TOKEN,MLM_UNK_TOKEN):
         self.sentence = sentence
         self.tokenizer = tokenizer
-        self.num_samples = num_samples
+        self.num_samples = len(self.sentence) - 2
 
         self.tensor_input = self.tokenizer(sentence, return_tensors='pt')['input_ids']
-        self.batch_input = self.tensor_input.repeat(len(self.sentence), 1)
+        self.batch_input = self.tensor_input.repeat(self.num_samples, 1)
 
-        self.random_ids = np.random.choice([i for i in range(1,self.tensor_input.size(1)-1)],len(self.sentence),replace=False) # ensuring that the masking is not done on the BOS and EOS tokens since they are not connected to the sentence itself.
+        self.random_ids = np.random.choice([i for i in range(1,self.tensor_input.size(1)-1)],self.num_samples,replace=False) # ensuring that the masking is not done on the BOS and EOS tokens since they are not connected to the sentence itself.
         self.random_ids = torch.Tensor(self.random_ids).long().unsqueeze(0).T
 
         # Added by Chris Emezue on 29.01.2023
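Note on the hunk above: the fix ties self.num_samples to len(self.sentence) - 2 (the raw string length minus the two special-token slots) and reuses that count both when repeating the input ids and when drawing the random mask positions, so the three lines stay in sync. The sketch below shows the same masking idea in isolation and is only a hedged reading of the intent, not code from app.py: build_masked_batch is a hypothetical helper, and it derives the sample count from the tokenized length rather than the string length.

import numpy as np
import torch
from transformers import AutoTokenizer

def build_masked_batch(sentence, tokenizer):
    # Hypothetical helper: build one masked copy of the sentence per maskable position.
    input_ids = tokenizer(sentence, return_tensors='pt')['input_ids']  # shape (1, seq_len)
    seq_len = input_ids.size(1)
    # Skip position 0 (BOS/CLS) and position seq_len - 1 (EOS/SEP), as the diff does.
    candidate_positions = list(range(1, seq_len - 1))
    num_samples = len(candidate_positions)  # token-based count; the commit uses len(sentence) - 2 instead
    batch = input_ids.repeat(num_samples, 1)  # (num_samples, seq_len)
    positions = np.random.choice(candidate_positions, num_samples, replace=False)
    positions = torch.tensor(positions, dtype=torch.long)
    batch[torch.arange(num_samples), positions] = tokenizer.mask_token_id
    return batch, positions

# Example usage with the checkpoint named in the diff:
# tok = AutoTokenizer.from_pretrained('castorini/afriberta_base')
# masked, positions = build_masked_batch('This is a test sentence.', tok)

Masking each interior position at most once is what keeps replace=False safe: np.random.choice raises a ValueError whenever the requested sample count exceeds the number of candidate positions, which is plausibly the "token error" the commit message refers to.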
@@ -178,25 +178,26 @@ models = get_model_infos(multilingual=None)
 selected_models = st.multiselect("Select of number of models you would like to compare", models['id']
 
 )
-
-progress_text = "Computing recommendation Scores"
-st.write(help(st.progress))
-my_bar = st.progress(0)
+
+run = st.button("Get Scores")
+if run:
 
+    progress_text = "Computing recommendation Scores"
+    my_bar = st.progress(0)
 
 
-scores={}
-for index, model_id in enumerate(selected_models):
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
-    model = AutoModelWithLMHead.from_pretrained(model_id)
-    if model_id == 'castorini/afriberta_base':
-        tokenizer.model_max_length = 512
-    MLM_MASK_TOKEN = tokenizer.mask_token_id #[(103, '[MASK]')]
-    MLM_UNK_TOKEN = tokenizer.unk_token_id
+    scores={}
+    for index, model_id in enumerate(selected_models):
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        model = AutoModelWithLMHead.from_pretrained(model_id)
+        if model_id == 'castorini/afriberta_base':
+            tokenizer.model_max_length = 512
+        MLM_MASK_TOKEN = tokenizer.mask_token_id #[(103, '[MASK]')]
+        MLM_UNK_TOKEN = tokenizer.unk_token_id
 
-    BATCH_SIZE = 1
-    score = get_sense_score_batched(sentence,tokenizer,model,MLM_MASK_TOKEN,MLM_UNK_TOKEN,None,BATCH_SIZE)
-    scores[model_id] = score
-    my_bar.progress(index + 1, text=progress_text)
-
-st.write("Our recommendation is:", scores)
+        BATCH_SIZE = 1
+        score = get_sense_score_batched(sentence,tokenizer,model,MLM_MASK_TOKEN,MLM_UNK_TOKEN,None,BATCH_SIZE)
+        scores[model_id] = score
+        my_bar.progress(index + 1, text=progress_text)
+    scores = sort_dictionary(scores)
+    st.write("Our recommendation is:", scores)
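Note on the hunk above: the scoring loop is now gated behind a "Get Scores" button so models are only downloaded and scored after an explicit click, the debug line st.write(help(st.progress)) is dropped, and a progress bar advances once per model before the sorted scores are displayed. Below is a minimal sketch of that button-plus-progress pattern, not the app's code: score_model is a hypothetical stand-in for get_sense_score_batched, the sorting line only mirrors what sort_dictionary appears to do, and the progress value is normalized to the 0.0 to 1.0 range accepted by st.progress, whereas the commit passes index + 1 directly.

import streamlit as st

def score_model(model_id: str) -> float:
    # Hypothetical placeholder; the real app computes an MLM-based sense score per model.
    return 0.0

selected_models = st.multiselect('Models to compare', ['castorini/afriberta_base'])

if st.button('Get Scores'):
    progress_text = 'Computing recommendation Scores'
    my_bar = st.progress(0, text=progress_text)
    scores = {}
    for index, model_id in enumerate(selected_models):
        scores[model_id] = score_model(model_id)
        my_bar.progress((index + 1) / len(selected_models), text=progress_text)
    scores = dict(sorted(scores.items(), key=lambda kv: kv[1], reverse=True))  # best score first, assumed
    st.write('Our recommendation is:', scores)

Because Streamlit reruns the whole script on every widget interaction, keeping the expensive from_pretrained calls inside the if st.button(...) block means they only execute on the rerun triggered by the click.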
 
 
 
 
 
 