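"""Gradio "battle arena" app for human evaluation of Moroccan Darija language models.

Pairs of precomputed model outputs (one tab for masked LMs, one for causal LMs) are shown
side by side; user votes are aggregated into per-model win rates and periodically saved
as a leaderboard CSV.
"""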
import base64
import os
import random
from collections import defaultdict

import gradio as gr
import pandas as pd
import torch
from datasets import (
    Dataset,
    load_dataset,
)


def encode_image_to_base64(image_path):
    """Encode an image or GIF file to base64."""
    with open(image_path, "rb") as file:
        encoded_string = base64.b64encode(file.read()).decode()
    return encoded_string


def create_html_media(media_path, is_gif=False):
    """Create HTML for displaying an image or GIF."""
    media_base64 = encode_image_to_base64(media_path)
    media_type = "gif" if is_gif else "jpeg"
    html_string = f"""
    <div style="display: flex; justify-content: center; align-items: center; width: 100%; text-align: center;">
        <div style="max-width: 450px; margin: auto;">
            <img src="data:image/{media_type};base64,{media_base64}"
                 style="max-width: 75%; height: auto; display: block; margin: 0 auto; margin-top: 50px;"
                 alt="Displayed Media">
        </div>
    </div>
    """
    return html_string
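

# Model identifiers under evaluation. The benchmark CSV is expected to contain one column
# per model ID below (holding that model's precomputed output for each row), plus the
# 'masked_sentence' and 'causal_sentence' prompt columns used further down.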
MASKED_LM_MODELS = [
    "BounharAbdelaziz/XLM-RoBERTa-Morocco",
    "SI2M-Lab/DarijaBERT",
    "BounharAbdelaziz/ModernBERT-Morocco",
    "google-bert/bert-base-multilingual-cased",
    "FacebookAI/xlm-roberta-large",
    "aubmindlab/bert-base-arabertv02",
]

CAUSAL_LM_MODELS = [
    "BounharAbdelaziz/Al-Atlas-LLM-0.5B",
    "Qwen/Qwen2.5-0.5B",
    "tiiuae/Falcon3-1B-Base",
    "MBZUAI-Paris/Atlas-Chat-2B",
]


class LMBattleArena:
    def __init__(self, dataset_path):
        """Initialize battle arena with dataset."""
        self.df = pd.read_csv(dataset_path)
        print(self.df.head())
        self.current_index = 0
        self.saving_freq = 10  # save the results in csv/push to hub every 10 evaluations
        self.evaluation_results_masked = []
        self.evaluation_results_causal = []
        self.model_scores = defaultdict(lambda: {'wins': 0, 'total_comparisons': 0})

    def get_next_battle_pair(self, is_causal):
        """Retrieve the next pair of model outputs for comparison."""
        if self.current_index >= len(self.df):
            return None
        row = self.df.iloc[self.current_index]
        # Pick the column set that matches the arena (causal vs. masked LMs).
        model_cols = CAUSAL_LM_MODELS if is_causal else MASKED_LM_MODELS
        selected_models = random.sample(model_cols, 2)
        battle_data = {
            'prompt': row['causal_sentence'] if is_causal else row['masked_sentence'],
            'model_1': row[selected_models[0]],
            'model_2': row[selected_models[1]],
            'model1_name': selected_models[0],
            'model2_name': selected_models[1]
        }
        self.current_index += 1
        return battle_data

    def record_evaluation(self, preferred_models, input_text, output1, output2, model1_name, model2_name, is_causal):
        """Record user's model preference and update scores."""
        self.model_scores[model1_name]['total_comparisons'] += 1
        self.model_scores[model2_name]['total_comparisons'] += 1
        if preferred_models == "Both Good":
            self.model_scores[model1_name]['wins'] += 1
            self.model_scores[model2_name]['wins'] += 1
        elif preferred_models == "Model A":  # Maps to first model
            self.model_scores[model1_name]['wins'] += 1
        elif preferred_models == "Model B":  # Maps to second model
            self.model_scores[model2_name]['wins'] += 1
        # "Both Bad" case - no wins recorded
        evaluation = {
            'input_text': input_text,
            'output1': output1,
            'output2': output2,
            'model1_name': model1_name,
            'model2_name': model2_name,
            'preferred_models': preferred_models
        }
        if is_causal:
            self.evaluation_results_causal.append(evaluation)
        else:
            self.evaluation_results_masked.append(evaluation)
        return self.get_model_scores_df(is_causal)

    def get_model_scores_df(self, is_causal):
        """Convert model scores to a leaderboard DataFrame."""
        scores_data = []
        for model, stats in self.model_scores.items():
            # Only report models that belong to the arena being displayed.
            if is_causal and model not in CAUSAL_LM_MODELS:
                continue
            if not is_causal and model not in MASKED_LM_MODELS:
                continue
            win_rate = (stats['wins'] / stats['total_comparisons'] * 100) if stats['total_comparisons'] > 0 else 0
            scores_data.append({
                'Model': model,
                'Wins': stats['wins'],
                'Total Comparisons': stats['total_comparisons'],
                'Win Rate (%)': round(win_rate, 2)
            })
        results_df = pd.DataFrame(scores_data).sort_values('Win Rate (%)', ascending=False)
        # save the results in a huggingface dataset / local CSV every self.saving_freq evaluations
        if self.current_index % self.saving_freq == 0 and self.current_index > 0:
            # results_dataset = Dataset.from_pandas(results_df)
            # results_dataset.push_to_hub('atlasia/Res-Moroccan-Darija-LLM-Battle-Al-Atlas', private=True)
            results_df.to_csv('human_eval_results.csv')
        return results_df
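

# A minimal usage sketch of the arena class (hypothetical; the Gradio app below follows
# the same flow, and the CSV path matches the file written in the __main__ block):
#   arena = LMBattleArena('human_eval_dataset.csv')
#   pair = arena.get_next_battle_pair(is_causal=False)
#   leaderboard_df = arena.record_evaluation("Model A", pair['prompt'], pair['model_1'],
#                                            pair['model_2'], pair['model1_name'],
#                                            pair['model2_name'], is_causal=False)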


def create_battle_arena(dataset_path, is_gif, is_causal):
    arena = LMBattleArena(dataset_path)

    def battle_round(is_causal):
        """Fetch the next battle pair and format it for the UI components."""
        battle_data = arena.get_next_battle_pair(is_causal)
        if battle_data is None:
            return "No more texts to evaluate!", "", "", "", "", gr.DataFrame(visible=False)
        return (
            battle_data['prompt'],
            battle_data['model_1'],
            battle_data['model_2'],
            battle_data['model1_name'],
            battle_data['model2_name'],
            gr.DataFrame(visible=True)
        )

    def submit_preference(input_text, output_1, output_2, model1_name, model2_name, preferred_models, is_causal):
        """Record the vote, then load the next pair along with the updated leaderboard."""
        scores_df = arena.record_evaluation(
            preferred_models, input_text, output_1, output_2, model1_name, model2_name, is_causal
        )
        next_battle = battle_round(is_causal)
        # Keep everything from the next battle except its placeholder DataFrame,
        # which is replaced by the freshly computed scores.
        return (*next_battle[:-1], scores_df)

    with gr.Blocks(css="footer{display:none !important}") as demo:
        base_path = os.path.dirname(__file__)
        local_image_path = os.path.join(base_path, 'battle_leaderboard.gif')
        gr.HTML(create_html_media(local_image_path, is_gif=is_gif))

        with gr.Tabs():
            with gr.Tab("Masked LM Battle Arena"):
                gr.Markdown("# 🤖 Pretrained SmolLMs Battle Arena")
                # Use gr.State to store the boolean value without displaying it
                is_causal = gr.State(value=False)
                input_text = gr.Textbox(
                    label="Input prompt",
                    interactive=False,
                )
                with gr.Row():
                    output_1 = gr.Textbox(
                        label="Model A",
                        interactive=False
                    )
                    model1_name = gr.State()  # Hidden state for model1 name
                with gr.Row():
                    output_2 = gr.Textbox(
                        label="Model B",
                        interactive=False
                    )
                    model2_name = gr.State()  # Hidden state for model2 name
                preferred_models = gr.Radio(
                    label="Which model is better?",
                    choices=["Model A", "Model B", "Both Good", "Both Bad"]
                )
                submit_btn = gr.Button("Vote", variant="primary")
                scores_table = gr.DataFrame(
                    headers=['Model', 'Wins', 'Total Comparisons', 'Win Rate (%)'],
                    label="🏆 Leaderboard"
                )

                submit_btn.click(
                    submit_preference,
                    inputs=[input_text, output_1, output_2, model1_name, model2_name, preferred_models, is_causal],
                    outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table]
                )
                demo.load(
                    battle_round,
                    inputs=[is_causal],
                    outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table]
                )
with gr.Tab("Causal LM Battle Arena"): | |
gr.Markdown("# π€ Pretrained SmolLMs Battle Arena") | |
# Use gr.State to store the boolean value without displaying it | |
is_causal = gr.State(value=True) | |
input_text = gr.Textbox( | |
label="Input prompt", | |
interactive=False, | |
) | |
with gr.Row(): | |
output_1 = gr.Textbox( | |
label="Model A", | |
interactive=False | |
) | |
model1_name = gr.State() # Hidden state for model1 name | |
with gr.Row(): | |
output_2 = gr.Textbox( | |
label="Model B", | |
interactive=False | |
) | |
model2_name = gr.State() # Hidden state for model2 name | |
preferred_models = gr.Radio( | |
label="Which model is better?", | |
choices=["Model A", "Model B", "Both Good", "Both Bad"] | |
) | |
submit_btn = gr.Button("Vote", variant="primary") | |
scores_table = gr.DataFrame( | |
headers=['Model', 'Wins', 'Total Comparisons', 'Win Rate (%)'], | |
label="π Leaderboard" | |
) | |
submit_btn.click( | |
submit_preference, | |
inputs=[input_text, output_1, output_2, model1_name, model2_name, preferred_models, is_causal], | |
outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table] | |
) | |
demo.load( | |
battle_round, | |
inputs=[is_causal], | |
outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table] | |
) | |
return demo | |
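

# Note: the per-vote logs collected in `evaluation_results_masked` / `evaluation_results_causal`
# are kept in memory but never written out. A minimal sketch of how they could be persisted,
# mirroring the commented-out push_to_hub call in get_model_scores_df (run from inside
# create_battle_arena where `arena` is in scope; the repo ID is reused from that comment
# only as an assumption):
#   votes_df = pd.DataFrame(arena.evaluation_results_masked + arena.evaluation_results_causal)
#   Dataset.from_pandas(votes_df).push_to_hub('atlasia/Res-Moroccan-Darija-LLM-Battle-Al-Atlas', private=True)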


if __name__ == "__main__":
    # Export the existing dataset that contains the LMs' outputs to a local CSV.
    load_dataset("atlasia/LM-Moroccan-Darija-Bench", split='test').to_csv('human_eval_dataset.csv')

    # Precision and inference device (currently unused: the app only serves precomputed outputs).
    torch_dtype = torch.float16
    device = "cpu"  # "cuda" if torch.cuda.is_available() else "cpu"

    dataset_path = 'human_eval_dataset.csv'
    is_gif = True

    demo = create_battle_arena(dataset_path, is_gif, is_causal=False)
    demo.launch(debug=True)