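"""Gradio battle arena for human evaluation of Moroccan Darija language models.

Precomputed outputs of several masked and causal LMs are read from a CSV
(exported from the atlasia/LM-Moroccan-Darija-Bench dataset) and shown in
anonymized pairs; user votes are aggregated into a win-rate leaderboard.
The CSV is expected to contain a 'masked_sentence' column, a 'causal_sentence'
column, and one output column per model listed in MASKED_LM_MODELS and
CAUSAL_LM_MODELS below.
"""
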
import gradio as gr
import os
import base64
import random
from collections import defaultdict

import pandas as pd
from datasets import (
    Dataset,  # used by the (currently commented-out) push_to_hub export
    load_dataset,
)

def encode_image_to_base64(image_path):
    """Encode an image or GIF file to base64."""
    with open(image_path, "rb") as file:
        encoded_string = base64.b64encode(file.read()).decode()
    return encoded_string

def create_html_media(media_path, is_gif=False):
    """Create HTML for displaying an image or GIF."""
    media_base64 = encode_image_to_base64(media_path)
    media_type = "gif" if is_gif else "jpeg"
    
    html_string = f"""
    <div style="display: flex; justify-content: center; align-items: center; width: 100%; text-align: center;">
        <div style="max-width: 450px; margin: auto;">
            <img src="data:image/{media_type};base64,{media_base64}"
                 style="max-width: 75%; height: auto; display: block; margin: 0 auto; margin-top: 50px;"
                 alt="Displayed Media">
        </div>
    </div>
    """
    return html_string

# Masked-LM candidates; the evaluation CSV has one output column per model identifier.
MASKED_LM_MODELS = [
    "BounharAbdelaziz/XLM-RoBERTa-Morocco",
    "SI2M-Lab/DarijaBERT",
    "BounharAbdelaziz/ModernBERT-Morocco",
    "google-bert/bert-base-multilingual-cased",
    "FacebookAI/xlm-roberta-large",
    "aubmindlab/bert-base-arabertv02",
]

# Causal-LM candidates, likewise keyed by column name in the evaluation CSV.
CAUSAL_LM_MODELS = [
    "BounharAbdelaziz/Al-Atlas-LLM-0.5B",
    "Qwen/Qwen2.5-0.5B",
    "tiiuae/Falcon3-1B-Base",
    "MBZUAI-Paris/Atlas-Chat-2B",
]

class LMBattleArena:
    def __init__(self, dataset_path):
        """Initialize battle arena with dataset"""
        self.df = pd.read_csv(dataset_path)
        print(self.df.head())
        self.current_index = 0
        self.saving_freq = 10  # persist the leaderboard (CSV, optionally the Hub) every 10 evaluations
        self.evaluation_results_masked = []
        self.evaluation_results_causal = []
        self.model_scores = defaultdict(lambda: {'wins': 0, 'total_comparisons': 0})
    
    def get_next_battle_pair(self, is_causal):
        """Retrieve next pair of summaries for comparison"""
        if self.current_index >= len(self.df):
            return None
        
        row = self.df.iloc[self.current_index]
        # Pick the candidate pool for this arena and sample two distinct models.
        model_cols = CAUSAL_LM_MODELS if is_causal else MASKED_LM_MODELS
        selected_models = random.sample(model_cols, 2)
        battle_data = {
            'prompt': row['masked_sentence'] if not is_causal else row['causal_sentence'],
            'model_1': row[selected_models[0]],
            'model_2': row[selected_models[1]],
            'model1_name': selected_models[0],
            'model2_name': selected_models[1]
        }
        self.current_index += 1
        return battle_data
    
    def record_evaluation(self, preferred_models, input_text, output1, output2, model1_name, model2_name, is_causal):
        """Record user's model preference and update scores"""
        self.model_scores[model1_name]['total_comparisons'] += 1
        self.model_scores[model2_name]['total_comparisons'] += 1
        
        if preferred_models == "Both Good":
            self.model_scores[model1_name]['wins'] += 1
            self.model_scores[model2_name]['wins'] += 1
        elif preferred_models == "Model A":  # Maps to first model
            self.model_scores[model1_name]['wins'] += 1
        elif preferred_models == "Model B":  # Maps to second model
            self.model_scores[model2_name]['wins'] += 1
        # "Both Bad" case - no wins recorded
        
        evaluation = {
            'input_text': input_text,
            'output1': output1,
            'output2': output2,
            'model1_name': model1_name,
            'model2_name': model2_name,
            'preferred_models': preferred_models
        }
        if is_causal:
            self.evaluation_results_causal.append(evaluation)
        else:
            self.evaluation_results_masked.append(evaluation)
        
        return self.get_model_scores_df(is_causal)
    
    def get_model_scores_df(self, is_causal):
        """Convert model scores to DataFrame"""
        scores_data = []
        for model, stats in self.model_scores.items():
            # Only report models that belong to the arena being displayed.
            relevant_models = CAUSAL_LM_MODELS if is_causal else MASKED_LM_MODELS
            if model not in relevant_models:
                continue
            win_rate = (stats['wins'] / stats['total_comparisons'] * 100) if stats['total_comparisons'] > 0 else 0
            scores_data.append({
                'Model': model,
                'Wins': stats['wins'],
                'Total Comparisons': stats['total_comparisons'],
                'Win Rate (%)': round(win_rate, 2)
            })
        results_df = pd.DataFrame(scores_data).sort_values('Win Rate (%)', ascending=False)
        
        # Periodically persist the leaderboard to CSV (Hub push kept below for reference).
        if self.current_index % self.saving_freq == 0 and self.current_index > 0:
            # results_dataset = Dataset.from_pandas(results_df)
            # results_dataset.push_to_hub('atlasia/Res-Moroccan-Darija-LLM-Battle-Al-Atlas', private=True)
            results_df.to_csv('human_eval_results.csv', index=False)
            
        return results_df
    

def create_battle_arena(dataset_path, is_gif):
    arena = LMBattleArena(dataset_path)
    
    def battle_round(is_causal):
        battle_data = arena.get_next_battle_pair(is_causal)
        
        if battle_data is None:
            return "No more texts to evaluate!", "", "", "", "", gr.DataFrame(visible=False)
        
        # Order must match the `outputs=` lists wired to battle_round below.
        return (
            battle_data['prompt'], 
            battle_data['model_1'], 
            battle_data['model_2'],
            battle_data['model1_name'], 
            battle_data['model2_name'],
            gr.DataFrame(visible=True)
        )
    
    def submit_preference(input_text, output_1, output_2, model1_name, model2_name, preferred_models, is_causal):
        scores_df = arena.record_evaluation(
            preferred_models, input_text, output_1, output_2, model1_name, model2_name, is_causal
        )
        next_battle = battle_round(is_causal)
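        # battle_round's last element is only a DataFrame visibility toggle;
        # replace it with the freshly updated leaderboard before returning.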
        return (*next_battle[:-1], scores_df)

    with gr.Blocks(css="footer{display:none !important}") as demo:
        
        # The banner asset 'battle_leaderboard.gif' is expected to sit next to this script.
        base_path = os.path.dirname(__file__)
        local_image_path = os.path.join(base_path, 'battle_leaderboard.gif')
        gr.HTML(create_html_media(local_image_path, is_gif=is_gif))
        
        with gr.Tabs():
            with gr.Tab("Masked LM Battle Arena"):
                gr.Markdown("# πŸ€– Pretrained SmolLMs Battle Arena")
                
                # Use gr.State to store the boolean value without displaying it
                is_causal = gr.State(value=False)
                
                input_text = gr.Textbox(
                    label="Input prompt", 
                    interactive=False,
                )
                
                with gr.Row():
                    output_1 = gr.Textbox(
                        label="Model A", 
                        interactive=False
                    )
                    model1_name = gr.State()  # Hidden state for model1 name
                
                with gr.Row():
                    output_2 = gr.Textbox(
                        label="Model B", 
                        interactive=False
                    )
                    model2_name = gr.State()  # Hidden state for model2 name
                
                preferred_models = gr.Radio(
                    label="Which model is better?",
                    choices=["Model A", "Model B", "Both Good", "Both Bad"]
                )
                submit_btn = gr.Button("Vote", variant="primary")
                
                scores_table = gr.DataFrame(
                    headers=['Model', 'Wins', 'Total Comparisons', 'Win Rate (%)'],
                    label="πŸ† Leaderboard"
                )
                
                # Record the vote, then load the next battle pair into the same components.
                submit_btn.click(
                    submit_preference,
                    inputs=[input_text, output_1, output_2, model1_name, model2_name, preferred_models, is_causal],
                    outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table]
                )
                
                # Pre-load the first battle pair for this tab when the page opens.
                demo.load(
                    battle_round, 
                    inputs=[is_causal],
                    outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table]
                )
                
            with gr.Tab("Causal LM Battle Arena"):
                gr.Markdown("# πŸ€– Pretrained SmolLMs Battle Arena")
                
                # Use gr.State to store the boolean value without displaying it
                is_causal = gr.State(value=True)
                
                input_text = gr.Textbox(
                    label="Input prompt", 
                    interactive=False,
                )
                
                with gr.Row():
                    output_1 = gr.Textbox(
                        label="Model A", 
                        interactive=False
                    )
                    model1_name = gr.State()  # Hidden state for model1 name
                
                with gr.Row():
                    output_2 = gr.Textbox(
                        label="Model B", 
                        interactive=False
                    )
                    model2_name = gr.State()  # Hidden state for model2 name
                
                preferred_models = gr.Radio(
                    label="Which model is better?",
                    choices=["Model A", "Model B", "Both Good", "Both Bad"]
                )
                submit_btn = gr.Button("Vote", variant="primary")
                
                scores_table = gr.DataFrame(
                    headers=['Model', 'Wins', 'Total Comparisons', 'Win Rate (%)'],
                    label="πŸ† Leaderboard"
                )
                
                submit_btn.click(
                    submit_preference,
                    inputs=[input_text, output_1, output_2, model1_name, model2_name, preferred_models, is_causal],
                    outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table]
                )
                
                demo.load(
                    battle_round, 
                    inputs=[is_causal],
                    outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table]
                )
                        
    return demo

if __name__ == "__main__":
    
    # Export the precomputed LM outputs to a local CSV that the arena reads from.
    dataset_path = 'human_eval_dataset.csv'
    load_dataset("atlasia/LM-Moroccan-Darija-Bench", split='test').to_csv(dataset_path)

    is_gif = True
    demo = create_battle_arena(dataset_path, is_gif)
    demo.launch(debug=True)