import gradio as gr
import pandas as pd
import plotly.express as px
from model_handler import ModelHandler
from data_handler import unified_exam_result_table, mmlu_result_table, unified_exam_chart, mmlu_chart
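
# Module-level caches for the fetched benchmark data and the rendered result tables.
# They are populated at startup in main() and updated by refresh_data().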
global_unified_exam_df = None
global_mmlu_df = None
global_output_armenian = None
global_output_mmlu = None
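

# Re-fetch the benchmark results and rebuild the tables and the default ('Average')
# charts. Wired to the "Refresh Data" button defined in main().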
def refresh_data():
    global global_mmlu_df, global_unified_exam_df, global_output_armenian, global_output_mmlu
    model_handler = ModelHandler()
    global_mmlu_df, global_unified_exam_df = model_handler.get_arm_bench_data()
    global_output_armenian = unified_exam_result_table(global_unified_exam_df)
    global_output_mmlu = mmlu_result_table(global_mmlu_df)
    return (
        global_output_armenian,
        global_output_mmlu,
        unified_exam_chart(global_output_armenian, 'Average'),
        mmlu_chart(global_output_mmlu, 'Average'),
    )
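

# Build the Gradio app: one tab per benchmark (Armenian Unified Exams, MMLU-Pro-Hy),
# an About tab with submission instructions, and a "Refresh Data" button that
# reloads results via refresh_data().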
def main():
    global global_mmlu_df, global_unified_exam_df, global_output_armenian, global_output_mmlu
    # Load the benchmark data once at startup so the tables and plots have initial values.
    model_handler = ModelHandler()
    global_mmlu_df, global_unified_exam_df = model_handler.get_arm_bench_data()
    global_output_armenian = unified_exam_result_table(global_unified_exam_df)
    global_output_mmlu = mmlu_result_table(global_mmlu_df)

    with gr.Blocks() as app:
        with gr.Tabs():
            with gr.TabItem("Armenian Unified Exams"):
                gr.Markdown("# Armenian Unified Test Exams")
                gr.Markdown(
                    """
                    This benchmark contains results of various language models on the Armenian Unified Test Exams in Armenian language and literature, Armenian history, and mathematics. The scoring system is a 20-point scale, where 0-8 is a Fail, 8-18 is a Pass, and 18-20 is a Distinction.
                    """
                )
                table_output_armenian = gr.DataFrame(value=global_output_armenian)
                plot_column_dropdown_unified_exam = gr.Dropdown(choices=['Average', 'Armenian language and literature', 'Armenian history', 'Mathematics'], value='Average', label='Select Column to Plot')
                plot_output_armenian = gr.Plot(lambda column: unified_exam_chart(global_output_armenian, column), inputs=plot_column_dropdown_unified_exam)
            with gr.TabItem("MMLU-Pro-Hy"):
                gr.Markdown("# MMLU-Pro Translated to Armenian (MMLU-Pro-Hy)")
                gr.Markdown(
                    """
                    This benchmark contains results of various language models on the MMLU-Pro benchmark translated into Armenian. MMLU-Pro is a massive multi-task test in multiple-choice (MCQA) format. The scores represent accuracy.
                    """
                )
                table_output_mmlu = gr.DataFrame(value=global_output_mmlu)
                subject_cols = ['Average', 'Biology', 'Business', 'Chemistry', 'Computer Science', 'Economics', 'Engineering', 'Health', 'History', 'Law', 'Math', 'Philosophy', 'Physics', 'Psychology', 'Other']
                plot_column_dropdown_mmlu = gr.Dropdown(choices=subject_cols, value='Average', label='Select Column to Plot')
                plot_output_mmlu = gr.Plot(lambda column: mmlu_chart(global_output_mmlu, column), inputs=plot_column_dropdown_mmlu)
            with gr.TabItem("About"):
                gr.Markdown("# About the Benchmark")
                gr.Markdown(
                    """
                    This benchmark evaluates language models on Armenian-specific tasks, including the Armenian Unified Test Exams and a translated version of the MMLU-Pro benchmark (MMLU-Pro-Hy). It is designed to measure the models' understanding and generation capabilities in the Armenian language.

                    **Creator Company:** Metric AI Research Lab, Yerevan, Armenia.
                    """
                )
                gr.Image("logo.png", width=200, show_label=False, show_download_button=False, show_fullscreen_button=False, show_share_button=False)
                gr.Markdown(
                    """
                    - [Website](https://metric.am/)
                    - [Hugging Face](https://huggingface.co/Metric-AI)

                    MMLU-Pro-Hy is a massive multi-task test in MCQA format, inspired by the original MMLU benchmark and adapted for the Armenian language. The Armenian Unified Exams benchmark allows for comparison with human-level knowledge.
                    """
                )
                gr.Markdown("## Submission Guide")
                gr.Markdown(
                    """
                    To submit a model for evaluation, please follow these steps:

                    1. **Evaluate your model**:
                        - Follow the evaluation script provided here: [https://github.com/Anania-AI/Arm-LLM-Benchmark](https://github.com/Anania-AI/Arm-LLM-Benchmark)
                    2. **Format your submission file**:
                        - After evaluation, you will get a `result.json` file. Ensure the file follows this format (one way to produce it is sketched after this list):
                        ```json
                        {
                            "mmlu_results": [
                                {
                                    "category": "category_name",
                                    "score": score_value
                                },
                                ...
                            ],
                            "unified_exam_results": [
                                {
                                    "category": "category_name",
                                    "score": score_value
                                },
                                ...
                            ]
                        }
                        ```
                    3. **Submit your model**:
                        - Add the `arm_bench` tag and the `result.json` file to your model card.
                        - Click on the "Refresh Data" button in this app, and you will see your model's results.
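
                    A minimal sketch of producing `result.json` is shown below. The category names and score values are placeholders; the actual categories should match those produced by the evaluation script above.

                    ```python
                    import json

                    # Placeholder results; replace with the scores produced by the evaluation script.
                    results = {
                        "mmlu_results": [
                            {"category": "Biology", "score": 0.52},
                            # ... one entry per MMLU-Pro-Hy category
                        ],
                        "unified_exam_results": [
                            {"category": "Armenian language and literature", "score": 14.0},
                            # ... one entry per exam subject
                        ],
                    }

                    with open("result.json", "w", encoding="utf-8") as f:
                        json.dump(results, f, ensure_ascii=False, indent=4)
                    ```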
"""
)
gr.Markdown("## Contributing")
gr.Markdown(
"""
You can contribute to this benchmark in several ways:
- Providing API credits for evaluating API-based models.
- Citing our work in your research and publications.
- Contributing to the development of the benchmark itself.
"""
)
refresh_button = gr.Button("Refresh Data")
refresh_button.click(
fn=refresh_data,
outputs=[table_output_armenian,
table_output_mmlu,
plot_output_armenian,
plot_output_mmlu],
)
app.launch(share=True, debug=True)
if __name__ == "__main__":
main()