Spaces:
Running
Running
added causal lm eval
Browse files- human_eval.py +109 -23
human_eval.py
CHANGED
@@ -33,6 +33,22 @@ def create_html_media(media_path, is_gif=False):
|
|
33 |
"""
|
34 |
return html_string
|
35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
class LMBattleArena:
|
37 |
def __init__(self, dataset_path):
|
38 |
"""Initialize battle arena with dataset"""
|
@@ -40,23 +56,29 @@ class LMBattleArena:
|
|
40 |
print(self.df.head())
|
41 |
self.current_index = 0
|
42 |
self.saving_freq = 10 # save the results in csv/push to hub every 10 evaluations
|
43 |
-
self.
|
|
|
44 |
self.model_scores = defaultdict(lambda: {'wins': 0, 'total_comparisons': 0})
|
45 |
|
46 |
-
def get_next_battle_pair(self):
|
47 |
"""Retrieve next pair of summaries for comparison"""
|
48 |
if self.current_index >= len(self.df):
|
49 |
return None
|
50 |
|
51 |
row = self.df.iloc[self.current_index]
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
|
|
|
|
|
|
|
|
|
|
57 |
selected_models = random.sample(model_summary_cols, 2)
|
58 |
battle_data = {
|
59 |
-
'prompt': row['
|
60 |
'model_1': row[selected_models[0]],
|
61 |
'model_2': row[selected_models[1]],
|
62 |
'model1_name': selected_models[0],
|
@@ -65,7 +87,7 @@ class LMBattleArena:
|
|
65 |
self.current_index += 1
|
66 |
return battle_data
|
67 |
|
68 |
-
def record_evaluation(self, preferred_models, input_text, output1, output2, model1_name, model2_name):
|
69 |
"""Record user's model preference and update scores"""
|
70 |
self.model_scores[model1_name]['total_comparisons'] += 1
|
71 |
self.model_scores[model2_name]['total_comparisons'] += 1
|
@@ -87,14 +109,23 @@ class LMBattleArena:
|
|
87 |
'model2_name': model2_name,
|
88 |
'preferred_models': preferred_models
|
89 |
}
|
90 |
-
|
|
|
|
|
|
|
91 |
|
92 |
-
return self.get_model_scores_df()
|
93 |
|
94 |
-
def get_model_scores_df(self):
|
95 |
"""Convert model scores to DataFrame"""
|
96 |
scores_data = []
|
97 |
for model, stats in self.model_scores.items():
|
|
|
|
|
|
|
|
|
|
|
|
|
98 |
win_rate = (stats['wins'] / stats['total_comparisons'] * 100) if stats['total_comparisons'] > 0 else 0
|
99 |
scores_data.append({
|
100 |
'Model': model,
|
@@ -113,11 +144,11 @@ class LMBattleArena:
|
|
113 |
return results_df
|
114 |
|
115 |
|
116 |
-
def create_battle_arena(dataset_path, is_gif):
|
117 |
arena = LMBattleArena(dataset_path)
|
118 |
|
119 |
-
def battle_round():
|
120 |
-
battle_data = arena.get_next_battle_pair()
|
121 |
|
122 |
if battle_data is None:
|
123 |
return "No more texts to evaluate!", "", "", "", "", gr.DataFrame(visible=False)
|
@@ -131,11 +162,11 @@ def create_battle_arena(dataset_path, is_gif):
|
|
131 |
gr.DataFrame(visible=True)
|
132 |
)
|
133 |
|
134 |
-
def submit_preference(input_text, output_1, output_2, model1_name, model2_name, preferred_models):
|
135 |
scores_df = arena.record_evaluation(
|
136 |
-
preferred_models, input_text, output_1, output_2, model1_name, model2_name
|
137 |
)
|
138 |
-
next_battle = battle_round()
|
139 |
return (*next_battle[:-1], scores_df)
|
140 |
|
141 |
with gr.Blocks(css="footer{display:none !important}") as demo:
|
@@ -145,9 +176,60 @@ def create_battle_arena(dataset_path, is_gif):
|
|
145 |
gr.HTML(create_html_media(local_image_path, is_gif=is_gif))
|
146 |
|
147 |
with gr.Tabs():
|
148 |
-
with gr.Tab("Battle Arena"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
149 |
gr.Markdown("# π€ Pretrained SmolLMs Battle Arena")
|
150 |
|
|
|
|
|
|
|
151 |
input_text = gr.Textbox(
|
152 |
label="Input prompt",
|
153 |
interactive=False,
|
@@ -180,18 +262,22 @@ def create_battle_arena(dataset_path, is_gif):
|
|
180 |
|
181 |
submit_btn.click(
|
182 |
submit_preference,
|
183 |
-
inputs=[input_text, output_1, output_2, model1_name, model2_name, preferred_models],
|
184 |
outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table]
|
185 |
)
|
186 |
|
187 |
-
demo.load(
|
|
|
|
|
|
|
|
|
188 |
|
189 |
return demo
|
190 |
|
191 |
if __name__ == "__main__":
|
192 |
|
193 |
# load the existing dataset that contains outputs of the LMs
|
194 |
-
human_eval_dataset = load_dataset("atlasia/Moroccan-Darija-
|
195 |
|
196 |
# precision
|
197 |
torch_dtype = torch.float16
|
@@ -200,5 +286,5 @@ if __name__ == "__main__":
|
|
200 |
device = "cpu" #"cuda" if torch.cuda.is_available() else "cpu"
|
201 |
dataset_path = 'human_eval_dataset.csv'
|
202 |
is_gif = True
|
203 |
-
demo = create_battle_arena(dataset_path, is_gif)
|
204 |
demo.launch(debug=True)
|
|
|
33 |
"""
|
34 |
return html_string
|
35 |
|
36 |
+
MASKED_LM_MODELS = [
|
37 |
+
"BounharAbdelaziz/XLM-RoBERTa-Morocco",
|
38 |
+
"SI2M-Lab/DarijaBERT",
|
39 |
+
"BounharAbdelaziz/ModernBERT-Morocco",
|
40 |
+
"google-bert/bert-base-multilingual-cased",
|
41 |
+
"FacebookAI/xlm-roberta-large",
|
42 |
+
"aubmindlab/bert-base-arabertv02",
|
43 |
+
]
|
44 |
+
|
45 |
+
CAUSAL_LM_MODELS = [
|
46 |
+
"BounharAbdelaziz/Al-Atlas-LLM-0.5B",
|
47 |
+
"Qwen/Qwen2.5-0.5B",
|
48 |
+
"tiiuae/Falcon3-1B-Base",
|
49 |
+
"MBZUAI-Paris/Atlas-Chat-2B",
|
50 |
+
]
|
51 |
+
|
52 |
class LMBattleArena:
|
53 |
def __init__(self, dataset_path):
|
54 |
"""Initialize battle arena with dataset"""
|
|
|
56 |
print(self.df.head())
|
57 |
self.current_index = 0
|
58 |
self.saving_freq = 10 # save the results in csv/push to hub every 10 evaluations
|
59 |
+
self.evaluation_results_masked = []
|
60 |
+
self.evaluation_results_causal = []
|
61 |
self.model_scores = defaultdict(lambda: {'wins': 0, 'total_comparisons': 0})
|
62 |
|
63 |
+
def get_next_battle_pair(self, is_causal):
|
64 |
"""Retrieve next pair of summaries for comparison"""
|
65 |
if self.current_index >= len(self.df):
|
66 |
return None
|
67 |
|
68 |
row = self.df.iloc[self.current_index]
|
69 |
+
if is_causal:
|
70 |
+
model_summary_cols = [
|
71 |
+
col
|
72 |
+
for col in CAUSAL_LM_MODELS
|
73 |
+
]
|
74 |
+
else:
|
75 |
+
model_summary_cols = [
|
76 |
+
col
|
77 |
+
for col in MASKED_LM_MODELS
|
78 |
+
]
|
79 |
selected_models = random.sample(model_summary_cols, 2)
|
80 |
battle_data = {
|
81 |
+
'prompt': row['masked_sentence'] if not is_causal else row['causal_sentence'],
|
82 |
'model_1': row[selected_models[0]],
|
83 |
'model_2': row[selected_models[1]],
|
84 |
'model1_name': selected_models[0],
|
|
|
87 |
self.current_index += 1
|
88 |
return battle_data
|
89 |
|
90 |
+
def record_evaluation(self, preferred_models, input_text, output1, output2, model1_name, model2_name, is_causal):
|
91 |
"""Record user's model preference and update scores"""
|
92 |
self.model_scores[model1_name]['total_comparisons'] += 1
|
93 |
self.model_scores[model2_name]['total_comparisons'] += 1
|
|
|
109 |
'model2_name': model2_name,
|
110 |
'preferred_models': preferred_models
|
111 |
}
|
112 |
+
if is_causal:
|
113 |
+
self.evaluation_results_causal.append(evaluation)
|
114 |
+
else:
|
115 |
+
self.evaluation_results_masked.append(evaluation)
|
116 |
|
117 |
+
return self.get_model_scores_df(is_causal)
|
118 |
|
119 |
+
def get_model_scores_df(self, is_causal):
|
120 |
"""Convert model scores to DataFrame"""
|
121 |
scores_data = []
|
122 |
for model, stats in self.model_scores.items():
|
123 |
+
if is_causal:
|
124 |
+
if model not in CAUSAL_LM_MODELS:
|
125 |
+
continue
|
126 |
+
else:
|
127 |
+
if model not in MASKED_LM_MODELS:
|
128 |
+
continue
|
129 |
win_rate = (stats['wins'] / stats['total_comparisons'] * 100) if stats['total_comparisons'] > 0 else 0
|
130 |
scores_data.append({
|
131 |
'Model': model,
|
|
|
144 |
return results_df
|
145 |
|
146 |
|
147 |
+
def create_battle_arena(dataset_path, is_gif, is_causal):
|
148 |
arena = LMBattleArena(dataset_path)
|
149 |
|
150 |
+
def battle_round(is_causal):
|
151 |
+
battle_data = arena.get_next_battle_pair(is_causal)
|
152 |
|
153 |
if battle_data is None:
|
154 |
return "No more texts to evaluate!", "", "", "", "", gr.DataFrame(visible=False)
|
|
|
162 |
gr.DataFrame(visible=True)
|
163 |
)
|
164 |
|
165 |
+
def submit_preference(input_text, output_1, output_2, model1_name, model2_name, preferred_models, is_causal):
|
166 |
scores_df = arena.record_evaluation(
|
167 |
+
preferred_models, input_text, output_1, output_2, model1_name, model2_name, is_causal
|
168 |
)
|
169 |
+
next_battle = battle_round(is_causal)
|
170 |
return (*next_battle[:-1], scores_df)
|
171 |
|
172 |
with gr.Blocks(css="footer{display:none !important}") as demo:
|
|
|
176 |
gr.HTML(create_html_media(local_image_path, is_gif=is_gif))
|
177 |
|
178 |
with gr.Tabs():
|
179 |
+
with gr.Tab("Masked LM Battle Arena"):
|
180 |
+
gr.Markdown("# π€ Pretrained SmolLMs Battle Arena")
|
181 |
+
|
182 |
+
# Use gr.State to store the boolean value without displaying it
|
183 |
+
is_causal = gr.State(value=False)
|
184 |
+
|
185 |
+
input_text = gr.Textbox(
|
186 |
+
label="Input prompt",
|
187 |
+
interactive=False,
|
188 |
+
)
|
189 |
+
|
190 |
+
with gr.Row():
|
191 |
+
output_1 = gr.Textbox(
|
192 |
+
label="Model A",
|
193 |
+
interactive=False
|
194 |
+
)
|
195 |
+
model1_name = gr.State() # Hidden state for model1 name
|
196 |
+
|
197 |
+
with gr.Row():
|
198 |
+
output_2 = gr.Textbox(
|
199 |
+
label="Model B",
|
200 |
+
interactive=False
|
201 |
+
)
|
202 |
+
model2_name = gr.State() # Hidden state for model2 name
|
203 |
+
|
204 |
+
preferred_models = gr.Radio(
|
205 |
+
label="Which model is better?",
|
206 |
+
choices=["Model A", "Model B", "Both Good", "Both Bad"]
|
207 |
+
)
|
208 |
+
submit_btn = gr.Button("Vote", variant="primary")
|
209 |
+
|
210 |
+
scores_table = gr.DataFrame(
|
211 |
+
headers=['Model', 'Wins', 'Total Comparisons', 'Win Rate (%)'],
|
212 |
+
label="π Leaderboard"
|
213 |
+
)
|
214 |
+
|
215 |
+
submit_btn.click(
|
216 |
+
submit_preference,
|
217 |
+
inputs=[input_text, output_1, output_2, model1_name, model2_name, preferred_models, is_causal],
|
218 |
+
outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table]
|
219 |
+
)
|
220 |
+
|
221 |
+
demo.load(
|
222 |
+
battle_round,
|
223 |
+
inputs=[is_causal],
|
224 |
+
outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table]
|
225 |
+
)
|
226 |
+
|
227 |
+
with gr.Tab("Causal LM Battle Arena"):
|
228 |
gr.Markdown("# π€ Pretrained SmolLMs Battle Arena")
|
229 |
|
230 |
+
# Use gr.State to store the boolean value without displaying it
|
231 |
+
is_causal = gr.State(value=True)
|
232 |
+
|
233 |
input_text = gr.Textbox(
|
234 |
label="Input prompt",
|
235 |
interactive=False,
|
|
|
262 |
|
263 |
submit_btn.click(
|
264 |
submit_preference,
|
265 |
+
inputs=[input_text, output_1, output_2, model1_name, model2_name, preferred_models, is_causal],
|
266 |
outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table]
|
267 |
)
|
268 |
|
269 |
+
demo.load(
|
270 |
+
battle_round,
|
271 |
+
inputs=[is_causal],
|
272 |
+
outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table]
|
273 |
+
)
|
274 |
|
275 |
return demo
|
276 |
|
277 |
if __name__ == "__main__":
|
278 |
|
279 |
# load the existing dataset that contains outputs of the LMs
|
280 |
+
human_eval_dataset = load_dataset("atlasia/LM-Moroccan-Darija-Bench", split='test').to_csv('human_eval_dataset.csv')
|
281 |
|
282 |
# precision
|
283 |
torch_dtype = torch.float16
|
|
|
286 |
device = "cpu" #"cuda" if torch.cuda.is_available() else "cpu"
|
287 |
dataset_path = 'human_eval_dataset.csv'
|
288 |
is_gif = True
|
289 |
+
demo = create_battle_arena(dataset_path, is_gif, is_causal=False)
|
290 |
demo.launch(debug=True)
|