Spaces:

Metric-AI
/

ArmBench-LLM

Running

App Files Files Community

ArmBench-LLM / app.py

Bagratuni

commit

1b75b9d 2 months ago

raw

history blame

5.62 kB

	import gradio as gr
	import pandas as pd
	import plotly.express as px

	def display_table(exam_type):
	    """Load one benchmark's results and arrange them for the leaderboard table.

	    Args:
	        exam_type: Either "Armenian Exams" (reads 'unified_exam_results.csv')
	            or "MMLU-Pro-Hy" (reads 'mmlu_pro_hy_results.csv').

	    Returns:
	        A DataFrame sorted by the 'Average' column in descending order, with
	        'Average' moved to the second position and values rounded to four
	        decimal places. For MMLU-Pro-Hy the 'Accuracy' column is dropped,
	        'Average' is recomputed as the mean of the subject columns, and
	        'Other' is moved to the last position.

	    Raises:
	        ValueError: If exam_type is not one of the two supported names.
	    """
	    if exam_type == "Armenian Exams":
	        df = pd.read_csv('unified_exam_results.csv')
	        df = df.sort_values(by='Average', ascending=False)
	        cols = df.columns.tolist()
	        cols.insert(1, cols.pop(cols.index('Average')))
	        # The embedded newline lets the long header wrap onto two lines.
	        df = df[cols].rename(
	            columns={'Armenian language and literature': 'Armenian language\nand literature'}
	        )
	    elif exam_type == "MMLU-Pro-Hy":
	        df = pd.read_csv('mmlu_pro_hy_results.csv')
	        subject_cols = [
	            'Biology', 'Business', 'Chemistry', 'Computer Science', 'Economics',
	            'Engineering', 'Health', 'History', 'Law', 'Math', 'Other',
	            'Philosophy', 'Physics', 'Psychology',
	        ]
	        # The displayed average is the unweighted mean over subjects; the
	        # file's own 'Accuracy' column is not shown.
	        df['Average'] = df[subject_cols].mean(axis=1)
	        df = df.sort_values(by='Average', ascending=False)
	        cols = df.columns.tolist()
	        cols.remove('Accuracy')
	        cols.insert(1, cols.pop(cols.index('Average')))
	        cols.append(cols.pop(cols.index('Other')))
	        df = df[cols]
	    else:
	        # Previously an unknown name fell through and failed with an
	        # UnboundLocalError on the return statement.
	        raise ValueError(
	            f"Unknown exam_type {exam_type!r}; expected 'Armenian Exams' or 'MMLU-Pro-Hy'"
	        )
	    return df.round(4)

	def _score_label(score):
	    """Return the result band for a score: below 8 Fail, up to 18 Pass, else Distinction."""
	    if score < 8:
	        return "Fail"
	    if score <= 18:
	        return "Pass"
	    return "Distinction"


	def _apply_bar_layout(fig, title, x_range_max):
	    """Apply the axis range, title/axis fonts and reversed y-axis shared by both charts."""
	    fig.update_layout(
	        xaxis=dict(range=[0, x_range_max]),
	        title=dict(text=title, font=dict(size=16)),
	        xaxis_title=dict(font=dict(size=12)),
	        yaxis_title=dict(font=dict(size=12)),
	        # Reversed so the first (highest-sorted) row is drawn at the top.
	        yaxis=dict(autorange="reversed"),
	        autosize=True,
	    )
	    return fig


	def create_bar_chart(exam_type, plot_column):
	    """Build a horizontal bar chart of one score column, one bar per model.

	    Args:
	        exam_type: Either "Armenian Exams" (reads 'unified_exam_results.csv')
	            or "MMLU-Pro-Hy" (reads 'mmlu_pro_hy_results.csv').
	        plot_column: Name of the column to plot; it is also used as the
	            chart title and as the sort key (descending).

	    Returns:
	        The figure produced by px.bar. Armenian Exams bars are coloured by
	        result band (Fail / Pass / Distinction) on a 0-20 x-axis;
	        MMLU-Pro-Hy bars use the Viridis continuous scale on a 0-1 x-axis.

	    Raises:
	        ValueError: If exam_type is not one of the two supported names.
	    """
	    title = str(plot_column)

	    if exam_type == "Armenian Exams":
	        df = pd.read_csv('unified_exam_results.csv')
	        df = df.sort_values(
	            by=[plot_column, 'Model'], ascending=[False, True]
	        ).reset_index(drop=True)
	        df['Test Result'] = df[plot_column].apply(_score_label)
	        fig = px.bar(
	            df,
	            x=plot_column,
	            y='Model',
	            color='Test Result',
	            color_discrete_map={
	                "Fail": "#ff5f56",
	                "Pass": "#ffbd2e",
	                "Distinction": "#27c93f",
	            },
	            labels={plot_column: 'Score', 'Model': 'Model'},
	            title=title,
	            orientation='h',
	        )
	        return _apply_bar_layout(fig, title, 20)

	    if exam_type == "MMLU-Pro-Hy":
	        df = pd.read_csv('mmlu_pro_hy_results.csv')
	        subject_cols = [
	            'Biology', 'Business', 'Chemistry', 'Computer Science', 'Economics',
	            'Engineering', 'Health', 'History', 'Law', 'Math', 'Other',
	            'Philosophy', 'Physics', 'Psychology',
	        ]
	        df['Average'] = df[subject_cols].mean(axis=1)
	        # 'Model' breaks ties so equal scores order the same way on every
	        # render, matching the Armenian Exams branch.
	        df = df.sort_values(
	            by=[plot_column, 'Model'], ascending=[False, True]
	        ).reset_index(drop=True)
	        df = df.drop(columns=['Accuracy'])
	        fig = px.bar(
	            df,
	            x=plot_column,
	            y='Model',
	            color=plot_column,
	            color_continuous_scale='Viridis',
	            labels={plot_column: 'Accuracy', 'Model': 'Model'},
	            title=title,
	            orientation='h',
	            range_color=[0, 1],
	        )
	        return _apply_bar_layout(fig, title, 1.0)

	    # Previously an unknown name silently returned None.
	    raise ValueError(
	        f"Unknown exam_type {exam_type!r}; expected 'Armenian Exams' or 'MMLU-Pro-Hy'"
	    )

	# Two-tab leaderboard UI: each tab shows a results table and a bar chart whose
	# plotted column is chosen from a dropdown.
	with gr.Blocks() as app:
	    with gr.Tabs():
	        with gr.TabItem("Armenian Unified Exams"):
	            gr.Markdown("# Armenian Unified Test Exams")
	            # Band wording follows the thresholds create_bar_chart applies
	            # (below 8 Fail, 8 to 18 Pass, above 18 Distinction); the earlier
	            # "0-8 / 8-18 / 18-20" text put 8 and 18 in two bands each.
	            gr.HTML("""
	            <div style="font-size: 16px;">
	            This benchmark contains results of various Language Models on Armenian Unified Test Exams for Armenian language and literature, Armenian history and mathematics. The scoring system is a 20-point scale, where a score below 8 is a Fail, 8 to 18 is a Pass, and above 18 is a Distinction.
	            </div>
	            """)
	            # A callable value is evaluated by Gradio rather than at import time.
	            table_output_armenian = gr.DataFrame(value=lambda: display_table("Armenian Exams"))
	            plot_column_dropdown = gr.Dropdown(
	                choices=['Average', 'Armenian language and literature', 'Armenian history', 'Mathematics'],
	                value='Average',
	                label='Select Column to Plot',
	            )
	            # The chart is recomputed from the dropdown's current value.
	            plot_output_armenian = gr.Plot(
	                lambda column: create_bar_chart("Armenian Exams", column),
	                inputs=plot_column_dropdown,
	            )
	        with gr.TabItem("MMLU-Pro-Hy"):
	            gr.Markdown("# MMLU-Pro Translated to Armenian (MMLU-Pro-Hy)")
	            gr.HTML("""
	            <div style="font-size: 16px;">
	            This benchmark contains results of various Language Models on the MMLU-Pro benchmark, translated into Armenian. MMLU-Pro is a massive multi-task test in MCQA format. The scores represent accuracy.
	            </div>
	            """)
	            table_output_mmlu = gr.DataFrame(value=lambda: display_table("MMLU-Pro-Hy"))
	            # Dropdown order: 'Average' first and 'Other' last.
	            subject_cols = [
	                'Average', 'Biology', 'Business', 'Chemistry', 'Computer Science',
	                'Economics', 'Engineering', 'Health', 'History', 'Law', 'Math',
	                'Philosophy', 'Physics', 'Psychology', 'Other',
	            ]
	            plot_column_dropdown_mmlu = gr.Dropdown(
	                choices=subject_cols,
	                value='Average',
	                label='Select Column to Plot',
	            )
	            plot_output_mmlu = gr.Plot(
	                lambda column: create_bar_chart("MMLU-Pro-Hy", column),
	                inputs=plot_column_dropdown_mmlu,
	            )

	app.launch(share=True, debug=True)