Owos committed
Commit 8c233e9 · 1 Parent(s): 5f24ebc

updated token error

Files changed (1): app.py (+21 -20)
app.py CHANGED
@@ -42,12 +42,12 @@ class MLMDataset(Dataset):
     def __init__(self,sentence,tokenizer,num_samples,MLM_MASK_TOKEN,MLM_UNK_TOKEN):
         self.sentence = sentence
         self.tokenizer = tokenizer
-        self.num_samples = num_samples
+        self.num_samples = len(self.sentence) - 2
 
         self.tensor_input = self.tokenizer(sentence, return_tensors='pt')['input_ids']
-        self.batch_input = self.tensor_input.repeat(len(self.sentence), 1)
+        self.batch_input = self.tensor_input.repeat(self.num_samples, 1)
 
-        self.random_ids = np.random.choice([i for i in range(1,self.tensor_input.size(1)-1)],len(self.sentence),replace=False) # ensuring that the masking is not done on the BOS and EOS tokens since they are not connected to the sentence itself.
+        self.random_ids = np.random.choice([i for i in range(1,self.tensor_input.size(1)-1)],self.num_samples,replace=False) # ensuring that the masking is not done on the BOS and EOS tokens since they are not connected to the sentence itself.
         self.random_ids = torch.Tensor(self.random_ids).long().unsqueeze(0).T
 
         # Added by Chris Emezue on 29.01.2023
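Note on the hunk above: the fix ties self.num_samples to len(self.sentence) - 2 (the raw string length minus the two special-token slots) and reuses that count both when repeating the input ids and when drawing the random mask positions, so the three lines stay in sync. The sketch below shows the same masking idea in isolation and is only a hedged reading of the intent, not code from app.py: build_masked_batch is a hypothetical helper, and it derives the sample count from the tokenized length rather than the string length.

import numpy as np
import torch
from transformers import AutoTokenizer

def build_masked_batch(sentence, tokenizer):
    # Hypothetical helper: build one masked copy of the sentence per maskable position.
    input_ids = tokenizer(sentence, return_tensors='pt')['input_ids']  # shape (1, seq_len)
    seq_len = input_ids.size(1)
    # Skip position 0 (BOS/CLS) and position seq_len - 1 (EOS/SEP), as the diff does.
    candidate_positions = list(range(1, seq_len - 1))
    num_samples = len(candidate_positions)  # token-based count; the commit uses len(sentence) - 2 instead
    batch = input_ids.repeat(num_samples, 1)  # (num_samples, seq_len)
    positions = np.random.choice(candidate_positions, num_samples, replace=False)
    positions = torch.tensor(positions, dtype=torch.long)
    batch[torch.arange(num_samples), positions] = tokenizer.mask_token_id
    return batch, positions

# Example usage with the checkpoint named in the diff:
# tok = AutoTokenizer.from_pretrained('castorini/afriberta_base')
# masked, positions = build_masked_batch('This is a test sentence.', tok)

Masking each interior position at most once is what keeps replace=False safe: np.random.choice raises a ValueError whenever the requested sample count exceeds the number of candidate positions, which is plausibly the "token error" the commit message refers to.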
@@ -178,25 +178,26 @@ models = get_model_infos(multilingual=None)
 selected_models = st.multiselect("Select of number of models you would like to compare", models['id']
 
 )
-
-progress_text = "Computing recommendation Scores"
-st.write(help(st.progress))
-my_bar = st.progress(0)
+
+run = st.button("Get Scores")
+if run:
 
+    progress_text = "Computing recommendation Scores"
+    my_bar = st.progress(0)
 
 
-scores={}
-for index, model_id in enumerate(selected_models):
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
-    model = AutoModelWithLMHead.from_pretrained(model_id)
-    if model_id == 'castorini/afriberta_base':
-        tokenizer.model_max_length = 512
-    MLM_MASK_TOKEN = tokenizer.mask_token_id #[(103, '[MASK]')]
-    MLM_UNK_TOKEN = tokenizer.unk_token_id
+    scores={}
+    for index, model_id in enumerate(selected_models):
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        model = AutoModelWithLMHead.from_pretrained(model_id)
+        if model_id == 'castorini/afriberta_base':
+            tokenizer.model_max_length = 512
+        MLM_MASK_TOKEN = tokenizer.mask_token_id #[(103, '[MASK]')]
+        MLM_UNK_TOKEN = tokenizer.unk_token_id
 
-    BATCH_SIZE = 1
-    score = get_sense_score_batched(sentence,tokenizer,model,MLM_MASK_TOKEN,MLM_UNK_TOKEN,None,BATCH_SIZE)
-    scores[model_id] = score
-    my_bar.progress(index + 1, text=progress_text)
-
-st.write("Our recommendation is:", scores)
+        BATCH_SIZE = 1
+        score = get_sense_score_batched(sentence,tokenizer,model,MLM_MASK_TOKEN,MLM_UNK_TOKEN,None,BATCH_SIZE)
+        scores[model_id] = score
+        my_bar.progress(index + 1, text=progress_text)
+    scores = sort_dictionary(scores)
+    st.write("Our recommendation is:", scores)
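Note on the hunk above: the scoring loop is now gated behind a "Get Scores" button so models are only downloaded and scored after an explicit click, the debug line st.write(help(st.progress)) is dropped, and a progress bar advances once per model before the sorted scores are displayed. Below is a minimal sketch of that button-plus-progress pattern, not the app's code: score_model is a hypothetical stand-in for get_sense_score_batched, the sorting line only mirrors what sort_dictionary appears to do, and the progress value is normalized to the 0.0 to 1.0 range accepted by st.progress, whereas the commit passes index + 1 directly.

import streamlit as st

def score_model(model_id: str) -> float:
    # Hypothetical placeholder; the real app computes an MLM-based sense score per model.
    return 0.0

selected_models = st.multiselect('Models to compare', ['castorini/afriberta_base'])

if st.button('Get Scores'):
    progress_text = 'Computing recommendation Scores'
    my_bar = st.progress(0, text=progress_text)
    scores = {}
    for index, model_id in enumerate(selected_models):
        scores[model_id] = score_model(model_id)
        my_bar.progress((index + 1) / len(selected_models), text=progress_text)
    scores = dict(sorted(scores.items(), key=lambda kv: kv[1], reverse=True))  # best score first, assumed
    st.write('Our recommendation is:', scores)

Because Streamlit reruns the whole script on every widget interaction, keeping the expensive from_pretrained calls inside the if st.button(...) block means they only execute on the rerun triggered by the click.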
 
 
 
 
 
 