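"""Gradio battle arena for human evaluation of Moroccan Darija language models.

Precomputed outputs of several masked and causal LMs are read from a CSV
(exported from the atlasia/LM-Moroccan-Darija-Bench dataset) and shown in
anonymized pairs; user votes are aggregated into a win-rate leaderboard.
The CSV is expected to contain a 'masked_sentence' column, a 'causal_sentence'
column, and one output column per model listed in MASKED_LM_MODELS and
CAUSAL_LM_MODELS below.
"""
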
import gradio as gr
import os
import base64
import random
from collections import defaultdict

import pandas as pd
from datasets import (
    Dataset,  # used by the (currently commented-out) push_to_hub export
    load_dataset,
)

def encode_image_to_base64(image_path):
    """Encode an image or GIF file to base64."""
    with open(image_path, "rb") as file:
        encoded_string = base64.b64encode(file.read()).decode()
    return encoded_string

def create_html_media(media_path, is_gif=False):
    """Create HTML for displaying an image or GIF."""
    media_base64 = encode_image_to_base64(media_path)
    media_type = "gif" if is_gif else "jpeg"
    
    html_string = f"""
    <div style="display: flex; justify-content: center; align-items: center; width: 100%; text-align: center;">
        <div style="max-width: 450px; margin: auto;">
            <img src="data:image/{media_type};base64,{media_base64}"
                 style="max-width: 75%; height: auto; display: block; margin: 0 auto; margin-top: 50px;"
                 alt="Displayed Media">
        </div>
    </div>
    """
    return html_string

# Masked-LM candidates; the evaluation CSV has one output column per model identifier.
MASKED_LM_MODELS = [
    "BounharAbdelaziz/XLM-RoBERTa-Morocco",
    "SI2M-Lab/DarijaBERT",
    "BounharAbdelaziz/ModernBERT-Morocco",
    "google-bert/bert-base-multilingual-cased",
    "FacebookAI/xlm-roberta-large",
    "aubmindlab/bert-base-arabertv02",
]

# Causal-LM candidates, likewise keyed by column name in the evaluation CSV.
CAUSAL_LM_MODELS = [
    "BounharAbdelaziz/Al-Atlas-LLM-0.5B",
    "Qwen/Qwen2.5-0.5B",
    "tiiuae/Falcon3-1B-Base",
    "MBZUAI-Paris/Atlas-Chat-2B",
]

class LMBattleArena:
    def __init__(self, dataset_path):
        """Initialize battle arena with dataset"""
        self.df = pd.read_csv(dataset_path)
        print(self.df.head())
        self.current_index = 0
        self.saving_freq = 10  # persist the leaderboard (CSV, optionally the Hub) every 10 evaluations
        self.evaluation_results_masked = []
        self.evaluation_results_causal = []
        self.model_scores = defaultdict(lambda: {'wins': 0, 'total_comparisons': 0})
    
    def get_next_battle_pair(self, is_causal):
        """Retrieve next pair of summaries for comparison"""
        if self.current_index >= len(self.df):
            return None
        
        row = self.df.iloc[self.current_index]
        # Pick the candidate pool for this arena and sample two distinct models.
        model_cols = CAUSAL_LM_MODELS if is_causal else MASKED_LM_MODELS
        selected_models = random.sample(model_cols, 2)
        battle_data = {
            'prompt': row['masked_sentence'] if not is_causal else row['causal_sentence'],
            'model_1': row[selected_models[0]],
            'model_2': row[selected_models[1]],
            'model1_name': selected_models[0],
            'model2_name': selected_models[1]
        }
        self.current_index += 1
        return battle_data
    
    def record_evaluation(self, preferred_models, input_text, output1, output2, model1_name, model2_name, is_causal):
        """Record user's model preference and update scores"""
        self.model_scores[model1_name]['total_comparisons'] += 1
        self.model_scores[model2_name]['total_comparisons'] += 1
        
        if preferred_models == "Both Good":
            self.model_scores[model1_name]['wins'] += 1
            self.model_scores[model2_name]['wins'] += 1
        elif preferred_models == "Model A":  # Maps to first model
            self.model_scores[model1_name]['wins'] += 1
        elif preferred_models == "Model B":  # Maps to second model
            self.model_scores[model2_name]['wins'] += 1
        # "Both Bad" case - no wins recorded
        
        evaluation = {
            'input_text': input_text,
            'output1': output1,
            'output2': output2,
            'model1_name': model1_name,
            'model2_name': model2_name,
            'preferred_models': preferred_models
        }
        if is_causal:
            self.evaluation_results_causal.append(evaluation)
        else:
            self.evaluation_results_masked.append(evaluation)
        
        return self.get_model_scores_df(is_causal)
    
    def get_model_scores_df(self, is_causal):
        """Convert model scores to DataFrame"""
        scores_data = []
        for model, stats in self.model_scores.items():
            # Only report models that belong to the arena being displayed.
            relevant_models = CAUSAL_LM_MODELS if is_causal else MASKED_LM_MODELS
            if model not in relevant_models:
                continue
            win_rate = (stats['wins'] / stats['total_comparisons'] * 100) if stats['total_comparisons'] > 0 else 0
            scores_data.append({
                'Model': model,
                'Wins': stats['wins'],
                'Total Comparisons': stats['total_comparisons'],
                'Win Rate (%)': round(win_rate, 2)
            })
        results_df = pd.DataFrame(scores_data).sort_values('Win Rate (%)', ascending=False)
        
        # Periodically persist the leaderboard to CSV (Hub push kept below for reference).
        if self.current_index % self.saving_freq == 0 and self.current_index > 0:
            # results_dataset = Dataset.from_pandas(results_df)
            # results_dataset.push_to_hub('atlasia/Res-Moroccan-Darija-LLM-Battle-Al-Atlas', private=True)
            results_df.to_csv('human_eval_results.csv', index=False)
            
        return results_df
    

def create_battle_arena(dataset_path, is_gif):
    arena = LMBattleArena(dataset_path)
    
    def battle_round(is_causal):
        battle_data = arena.get_next_battle_pair(is_causal)
        
        if battle_data is None:
            return "No more texts to evaluate!", "", "", "", "", gr.DataFrame(visible=False)
        
        # Order must match the `outputs=` lists wired to battle_round below.
        return (
            battle_data['prompt'], 
            battle_data['model_1'], 
            battle_data['model_2'],
            battle_data['model1_name'], 
            battle_data['model2_name'],
            gr.DataFrame(visible=True)
        )
    
    def submit_preference(input_text, output_1, output_2, model1_name, model2_name, preferred_models, is_causal):
        scores_df = arena.record_evaluation(
            preferred_models, input_text, output_1, output_2, model1_name, model2_name, is_causal
        )
        next_battle = battle_round(is_causal)
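        # battle_round's last element is only a DataFrame visibility toggle;
        # replace it with the freshly updated leaderboard before returning.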
        return (*next_battle[:-1], scores_df)

    with gr.Blocks(css="footer{display:none !important}") as demo:
        
        # The banner asset 'battle_leaderboard.gif' is expected to sit next to this script.
        base_path = os.path.dirname(__file__)
        local_image_path = os.path.join(base_path, 'battle_leaderboard.gif')
        gr.HTML(create_html_media(local_image_path, is_gif=is_gif))
        
        with gr.Tabs():
            with gr.Tab("Masked LM Battle Arena"):
                gr.Markdown("# πŸ€– Pretrained SmolLMs Battle Arena")
                
                # Use gr.State to store the boolean value without displaying it
                is_causal = gr.State(value=False)
                
                input_text = gr.Textbox(
                    label="Input prompt", 
                    interactive=False,
                )
                
                with gr.Row():
                    output_1 = gr.Textbox(
                        label="Model A", 
                        interactive=False
                    )
                    model1_name = gr.State()  # Hidden state for model1 name
                
                with gr.Row():
                    output_2 = gr.Textbox(
                        label="Model B", 
                        interactive=False
                    )
                    model2_name = gr.State()  # Hidden state for model2 name
                
                preferred_models = gr.Radio(
                    label="Which model is better?",
                    choices=["Model A", "Model B", "Both Good", "Both Bad"]
                )
                submit_btn = gr.Button("Vote", variant="primary")
                
                scores_table = gr.DataFrame(
                    headers=['Model', 'Wins', 'Total Comparisons', 'Win Rate (%)'],
                    label="πŸ† Leaderboard"
                )
                
                # Record the vote, then load the next battle pair into the same components.
                submit_btn.click(
                    submit_preference,
                    inputs=[input_text, output_1, output_2, model1_name, model2_name, preferred_models, is_causal],
                    outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table]
                )
                
                # Pre-load the first battle pair for this tab when the page opens.
                demo.load(
                    battle_round, 
                    inputs=[is_causal],
                    outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table]
                )
                
            with gr.Tab("Causal LM Battle Arena"):
                gr.Markdown("# πŸ€– Pretrained SmolLMs Battle Arena")
                
                # Use gr.State to store the boolean value without displaying it
                is_causal = gr.State(value=True)
                
                input_text = gr.Textbox(
                    label="Input prompt", 
                    interactive=False,
                )
                
                with gr.Row():
                    output_1 = gr.Textbox(
                        label="Model A", 
                        interactive=False
                    )
                    model1_name = gr.State()  # Hidden state for model1 name
                
                with gr.Row():
                    output_2 = gr.Textbox(
                        label="Model B", 
                        interactive=False
                    )
                    model2_name = gr.State()  # Hidden state for model2 name
                
                preferred_models = gr.Radio(
                    label="Which model is better?",
                    choices=["Model A", "Model B", "Both Good", "Both Bad"]
                )
                submit_btn = gr.Button("Vote", variant="primary")
                
                scores_table = gr.DataFrame(
                    headers=['Model', 'Wins', 'Total Comparisons', 'Win Rate (%)'],
                    label="πŸ† Leaderboard"
                )
                
                submit_btn.click(
                    submit_preference,
                    inputs=[input_text, output_1, output_2, model1_name, model2_name, preferred_models, is_causal],
                    outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table]
                )
                
                demo.load(
                    battle_round, 
                    inputs=[is_causal],
                    outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table]
                )
                        
    return demo

if __name__ == "__main__":
    
    # Export the precomputed LM outputs to a local CSV that the arena reads from.
    dataset_path = 'human_eval_dataset.csv'
    load_dataset("atlasia/LM-Moroccan-Darija-Bench", split='test').to_csv(dataset_path)

    is_gif = True
    demo = create_battle_arena(dataset_path, is_gif)
    demo.launch(debug=True)