import gradio as gr
import pandas as pd
import plotly.express as px
from model_handler import ModelHandler
from data_handler import unified_exam_result_table, mmlu_result_table, unified_exam_chart, mmlu_chart
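
# Module-level caches for the fetched benchmark data and the rendered result tables.
# They are populated at startup in main() and updated by refresh_data().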
global_unified_exam_df = None
global_mmlu_df = None
global_output_armenian = None
global_output_mmlu = None
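

# Re-fetch the benchmark results and rebuild the tables and the default ('Average')
# charts. Wired to the "Refresh Data" button defined in main().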
def refresh_data():
    global global_mmlu_df, global_unified_exam_df, global_output_armenian, global_output_mmlu
    model_handler = ModelHandler()
    global_mmlu_df, global_unified_exam_df = model_handler.get_arm_bench_data()
    global_output_armenian = unified_exam_result_table(global_unified_exam_df)
    global_output_mmlu = mmlu_result_table(global_mmlu_df)
    return (
        global_output_armenian,
        global_output_mmlu,
        unified_exam_chart(global_output_armenian, 'Average'),
        mmlu_chart(global_output_mmlu, 'Average'),
    )
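

# Build the Gradio app: one tab per benchmark (Armenian Unified Exams, MMLU-Pro-Hy),
# an About tab with submission instructions, and a "Refresh Data" button that
# reloads results via refresh_data().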
def main():
    global global_mmlu_df, global_unified_exam_df, global_output_armenian, global_output_mmlu
    # Load the benchmark data once at startup so the tables and plots have initial values.
    model_handler = ModelHandler()
    global_mmlu_df, global_unified_exam_df = model_handler.get_arm_bench_data()
    global_output_armenian = unified_exam_result_table(global_unified_exam_df)
    global_output_mmlu = mmlu_result_table(global_mmlu_df)

    with gr.Blocks() as app:
        with gr.Tabs():
            with gr.TabItem("Armenian Unified Exams"):
                gr.Markdown("# Armenian Unified Test Exams")
                gr.Markdown(
                    """
                    This benchmark contains results of various language models on the Armenian Unified Test Exams in Armenian language and literature, Armenian history, and mathematics. The scoring system is a 20-point scale, where 0-8 is a Fail, 8-18 is a Pass, and 18-20 is a Distinction.
                    """
                )
                table_output_armenian = gr.DataFrame(value=global_output_armenian)
                plot_column_dropdown_unified_exam = gr.Dropdown(choices=['Average', 'Armenian language and literature', 'Armenian history', 'Mathematics'], value='Average', label='Select Column to Plot')
                plot_output_armenian = gr.Plot(lambda column: unified_exam_chart(global_output_armenian, column), inputs=plot_column_dropdown_unified_exam)
            with gr.TabItem("MMLU-Pro-Hy"):
                gr.Markdown("# MMLU-Pro Translated to Armenian (MMLU-Pro-Hy)")
                gr.Markdown(
                    """
                    This benchmark contains results of various language models on the MMLU-Pro benchmark translated into Armenian. MMLU-Pro is a massive multi-task test in multiple-choice (MCQA) format. The scores represent accuracy.
                    """
                )
                table_output_mmlu = gr.DataFrame(value=global_output_mmlu)
                subject_cols = ['Average', 'Biology', 'Business', 'Chemistry', 'Computer Science', 'Economics', 'Engineering', 'Health', 'History', 'Law', 'Math', 'Philosophy', 'Physics', 'Psychology', 'Other']
                plot_column_dropdown_mmlu = gr.Dropdown(choices=subject_cols, value='Average', label='Select Column to Plot')
                plot_output_mmlu = gr.Plot(lambda column: mmlu_chart(global_output_mmlu, column), inputs=plot_column_dropdown_mmlu)
            with gr.TabItem("About"):
                gr.Markdown("# About the Benchmark")
                gr.Markdown(
                    """
                    This benchmark evaluates language models on Armenian-specific tasks, including the Armenian Unified Test Exams and a translated version of the MMLU-Pro benchmark (MMLU-Pro-Hy). It is designed to measure the models' understanding and generation capabilities in the Armenian language.

                    **Creator Company:** Metric AI Research Lab, Yerevan, Armenia.
                    """
                )
                gr.Image("logo.png", width=200, show_label=False, show_download_button=False, show_fullscreen_button=False, show_share_button=False)
                gr.Markdown(
                    """
                    - [Website](https://metric.am/)
                    - [Hugging Face](https://huggingface.co/Metric-AI)

                    MMLU-Pro-Hy is a massive multi-task test in MCQA format, inspired by the original MMLU benchmark and adapted for the Armenian language. The Armenian Unified Exams benchmark allows for comparison with human-level knowledge.
                    """
                )
                gr.Markdown("## Submission Guide")
                gr.Markdown(
                    """
                    To submit a model for evaluation, please follow these steps:

                    1. **Evaluate your model**:
                        - Follow the evaluation script provided here: [https://github.com/Anania-AI/Arm-LLM-Benchmark](https://github.com/Anania-AI/Arm-LLM-Benchmark)
                    2. **Format your submission file**:
                        - After evaluation, you will get a `result.json` file. Ensure the file follows this format (one way to produce it is sketched after this list):
                        ```json
                        {
                            "mmlu_results": [
                                {
                                    "category": "category_name",
                                    "score": score_value
                                },
                                ...
                            ],
                            "unified_exam_results": [
                                {
                                    "category": "category_name",
                                    "score": score_value
                                },
                                ...
                            ]
                        }
                        ```
                    3. **Submit your model**:
                        - Add the `arm_bench` tag and the `result.json` file to your model card.
                        - Click on the "Refresh Data" button in this app, and you will see your model's results.
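
                    A minimal sketch of producing `result.json` is shown below. The category names and score values are placeholders; the actual categories should match those produced by the evaluation script above.

                    ```python
                    import json

                    # Placeholder results; replace with the scores produced by the evaluation script.
                    results = {
                        "mmlu_results": [
                            {"category": "Biology", "score": 0.52},
                            # ... one entry per MMLU-Pro-Hy category
                        ],
                        "unified_exam_results": [
                            {"category": "Armenian language and literature", "score": 14.0},
                            # ... one entry per exam subject
                        ],
                    }

                    with open("result.json", "w", encoding="utf-8") as f:
                        json.dump(results, f, ensure_ascii=False, indent=4)
                    ```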
"""
)
gr.Markdown("## Contributing")
gr.Markdown(
"""
You can contribute to this benchmark in several ways:
- Providing API credits for evaluating API-based models.
- Citing our work in your research and publications.
- Contributing to the development of the benchmark itself.
"""
)
refresh_button = gr.Button("Refresh Data")
refresh_button.click(
fn=refresh_data,
outputs=[table_output_armenian,
table_output_mmlu,
plot_output_armenian,
plot_output_mmlu],
)
app.launch(share=True, debug=True)
if __name__ == "__main__":
main()