Spaces:

atlasia
/

Atlaset-Arena

Running

App Files Community

BounharAbdelaziz committed on Feb 26

Commit

ad3a876

verified ·

1 Parent(s): 1f1dbb3

added causal lm eval

Browse files

Files changed (1) hide show

human_eval.py +109 -23

human_eval.py CHANGED Viewed

@@ -33,6 +33,22 @@ def create_html_media(media_path, is_gif=False):
     """
     return html_string
 class LMBattleArena:
     def __init__(self, dataset_path):
         """Initialize battle arena with dataset"""
@@ -40,23 +56,29 @@ class LMBattleArena:
         print(self.df.head())
         self.current_index = 0
         self.saving_freq = 10 # save the results in csv/push to hub every 10 evaluations
-        self.evaluation_results = []
         self.model_scores = defaultdict(lambda: {'wins': 0, 'total_comparisons': 0})
-    def get_next_battle_pair(self):
         """Retrieve next pair of summaries for comparison"""
         if self.current_index >= len(self.df):
             return None
         row = self.df.iloc[self.current_index]
-        model_summary_cols = [
-            col
-            for col in row.index
-            if col.upper() != 'PROMPT'
-        ]
         selected_models = random.sample(model_summary_cols, 2)
         battle_data = {
-            'prompt': row['prompt'],
             'model_1': row[selected_models[0]],
             'model_2': row[selected_models[1]],
             'model1_name': selected_models[0],
@@ -65,7 +87,7 @@ class LMBattleArena:
         self.current_index += 1
         return battle_data
-    def record_evaluation(self, preferred_models, input_text, output1, output2, model1_name, model2_name):
         """Record user's model preference and update scores"""
         self.model_scores[model1_name]['total_comparisons'] += 1
         self.model_scores[model2_name]['total_comparisons'] += 1
@@ -87,14 +109,23 @@ class LMBattleArena:
             'model2_name': model2_name,
             'preferred_models': preferred_models
         }
-        self.evaluation_results.append(evaluation)
-        return self.get_model_scores_df()
-    def get_model_scores_df(self):
         """Convert model scores to DataFrame"""
         scores_data = []
         for model, stats in self.model_scores.items():
             win_rate = (stats['wins'] / stats['total_comparisons'] * 100) if stats['total_comparisons'] > 0 else 0
             scores_data.append({
                 'Model': model,
@@ -113,11 +144,11 @@ class LMBattleArena:
         return results_df
-def create_battle_arena(dataset_path, is_gif):
     arena = LMBattleArena(dataset_path)
-    def battle_round():
-        battle_data = arena.get_next_battle_pair()
         if battle_data is None:
             return "No more texts to evaluate!", "", "", "", "", gr.DataFrame(visible=False)
@@ -131,11 +162,11 @@ def create_battle_arena(dataset_path, is_gif):
             gr.DataFrame(visible=True)
         )
-    def submit_preference(input_text, output_1, output_2, model1_name, model2_name, preferred_models):
         scores_df = arena.record_evaluation(
-            preferred_models, input_text, output_1, output_2, model1_name, model2_name
         )
-        next_battle = battle_round()
         return (*next_battle[:-1], scores_df)
     with gr.Blocks(css="footer{display:none !important}") as demo:
@@ -145,9 +176,60 @@ def create_battle_arena(dataset_path, is_gif):
         gr.HTML(create_html_media(local_image_path, is_gif=is_gif))
         with gr.Tabs():
-            with gr.Tab("Battle Arena"):
                 gr.Markdown("# 🤖 Pretrained SmolLMs Battle Arena")
                 input_text = gr.Textbox(
                     label="Input prompt",
                     interactive=False,
@@ -180,18 +262,22 @@ def create_battle_arena(dataset_path, is_gif):
                 submit_btn.click(
                     submit_preference,
-                    inputs=[input_text, output_1, output_2, model1_name, model2_name, preferred_models],
                     outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table]
                 )
-                demo.load(battle_round, outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table])
     return demo
 if __name__ == "__main__":
     # load the existing dataset that contains outputs of the LMs
-    human_eval_dataset = load_dataset("atlasia/Moroccan-Darija-LLM-Battle-Al-Atlas", split='train').to_csv('human_eval_dataset.csv')
     # precision
     torch_dtype = torch.float16
@@ -200,5 +286,5 @@ if __name__ == "__main__":
     device = "cpu" #"cuda" if torch.cuda.is_available() else "cpu"
     dataset_path = 'human_eval_dataset.csv'
     is_gif = True
-    demo = create_battle_arena(dataset_path, is_gif)
     demo.launch(debug=True)

     """
     return html_string
+MASKED_LM_MODELS = [
+    "BounharAbdelaziz/XLM-RoBERTa-Morocco",
+    "SI2M-Lab/DarijaBERT",
+    "BounharAbdelaziz/ModernBERT-Morocco",
+    "google-bert/bert-base-multilingual-cased",
+    "FacebookAI/xlm-roberta-large",
+    "aubmindlab/bert-base-arabertv02",
+]
+CAUSAL_LM_MODELS = [
+    "BounharAbdelaziz/Al-Atlas-LLM-0.5B",
+    "Qwen/Qwen2.5-0.5B",
+    "tiiuae/Falcon3-1B-Base",
+    "MBZUAI-Paris/Atlas-Chat-2B",
+]
 class LMBattleArena:
     def __init__(self, dataset_path):
         """Initialize battle arena with dataset"""
         print(self.df.head())
         self.current_index = 0
         self.saving_freq = 10 # save the results in csv/push to hub every 10 evaluations
+        self.evaluation_results_masked = []
+        self.evaluation_results_causal = []
         self.model_scores = defaultdict(lambda: {'wins': 0, 'total_comparisons': 0})
+    def get_next_battle_pair(self, is_causal):
         """Retrieve next pair of summaries for comparison"""
         if self.current_index >= len(self.df):
             return None
         row = self.df.iloc[self.current_index]
+        if is_causal:
+            model_summary_cols = [
+                col
+                for col in CAUSAL_LM_MODELS
+            ]
+        else:
+            model_summary_cols = [
+                col
+                for col in MASKED_LM_MODELS
+            ]
         selected_models = random.sample(model_summary_cols, 2)
         battle_data = {
+            'prompt': row['masked_sentence'] if not is_causal else row['causal_sentence'],
             'model_1': row[selected_models[0]],
             'model_2': row[selected_models[1]],
             'model1_name': selected_models[0],
         self.current_index += 1
         return battle_data
+    def record_evaluation(self, preferred_models, input_text, output1, output2, model1_name, model2_name, is_causal):
         """Record user's model preference and update scores"""
         self.model_scores[model1_name]['total_comparisons'] += 1
         self.model_scores[model2_name]['total_comparisons'] += 1
             'model2_name': model2_name,
             'preferred_models': preferred_models
         }
+        if is_causal:
+            self.evaluation_results_causal.append(evaluation)
+        else:
+            self.evaluation_results_masked.append(evaluation)
+        return self.get_model_scores_df(is_causal)
+    def get_model_scores_df(self, is_causal):
         """Convert model scores to DataFrame"""
         scores_data = []
         for model, stats in self.model_scores.items():
+            if is_causal:
+                if model not in CAUSAL_LM_MODELS:
+                    continue
+            else:
+                if model not in MASKED_LM_MODELS:
+                    continue
             win_rate = (stats['wins'] / stats['total_comparisons'] * 100) if stats['total_comparisons'] > 0 else 0
             scores_data.append({
                 'Model': model,
         return results_df
+def create_battle_arena(dataset_path, is_gif, is_causal):
     arena = LMBattleArena(dataset_path)
+    def battle_round(is_causal):
+        battle_data = arena.get_next_battle_pair(is_causal)
         if battle_data is None:
             return "No more texts to evaluate!", "", "", "", "", gr.DataFrame(visible=False)
             gr.DataFrame(visible=True)
         )
+    def submit_preference(input_text, output_1, output_2, model1_name, model2_name, preferred_models, is_causal):
         scores_df = arena.record_evaluation(
+            preferred_models, input_text, output_1, output_2, model1_name, model2_name, is_causal
         )
+        next_battle = battle_round(is_causal)
         return (*next_battle[:-1], scores_df)
     with gr.Blocks(css="footer{display:none !important}") as demo:
         gr.HTML(create_html_media(local_image_path, is_gif=is_gif))
         with gr.Tabs():
+            with gr.Tab("Masked LM Battle Arena"):
+                gr.Markdown("# 🤖 Pretrained SmolLMs Battle Arena")
+                # Use gr.State to store the boolean value without displaying it
+                is_causal = gr.State(value=False)
+                input_text = gr.Textbox(
+                    label="Input prompt",
+                    interactive=False,
+                )
+                with gr.Row():
+                    output_1 = gr.Textbox(
+                        label="Model A",
+                        interactive=False
+                    )
+                    model1_name = gr.State()  # Hidden state for model1 name
+                with gr.Row():
+                    output_2 = gr.Textbox(
+                        label="Model B",
+                        interactive=False
+                    )
+                    model2_name = gr.State()  # Hidden state for model2 name
+                preferred_models = gr.Radio(
+                    label="Which model is better?",
+                    choices=["Model A", "Model B", "Both Good", "Both Bad"]
+                )
+                submit_btn = gr.Button("Vote", variant="primary")
+                scores_table = gr.DataFrame(
+                    headers=['Model', 'Wins', 'Total Comparisons', 'Win Rate (%)'],
+                    label="🏆 Leaderboard"
+                )
+                submit_btn.click(
+                    submit_preference,
+                    inputs=[input_text, output_1, output_2, model1_name, model2_name, preferred_models, is_causal],
+                    outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table]
+                )
+                demo.load(
+                    battle_round,
+                    inputs=[is_causal],
+                    outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table]
+                )
+            with gr.Tab("Causal LM Battle Arena"):
                 gr.Markdown("# 🤖 Pretrained SmolLMs Battle Arena")
+                # Use gr.State to store the boolean value without displaying it
+                is_causal = gr.State(value=True)
                 input_text = gr.Textbox(
                     label="Input prompt",
                     interactive=False,
                 submit_btn.click(
                     submit_preference,
+                    inputs=[input_text, output_1, output_2, model1_name, model2_name, preferred_models, is_causal],
                     outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table]
                 )
+                demo.load(
+                    battle_round,
+                    inputs=[is_causal],
+                    outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table]
+                )
     return demo
 if __name__ == "__main__":
     # load the existing dataset that contains outputs of the LMs
+    human_eval_dataset = load_dataset("atlasia/LM-Moroccan-Darija-Bench", split='test').to_csv('human_eval_dataset.csv')
     # precision
     torch_dtype = torch.float16
     device = "cpu" #"cuda" if torch.cuda.is_available() else "cpu"
     dataset_path = 'human_eval_dataset.csv'
     is_gif = True
+    demo = create_battle_arena(dataset_path, is_gif, is_causal=False)
     demo.launch(debug=True)