Spaces:

Metric-AI
/

ArmBench-LLM

Running

File size: 6,974 Bytes

4781b83
 
 
a4d362f
 
4781b83
779cbde
 
 
 
4781b83
779cbde
 
 
 
 
 
 
2f6fff2
6b4ef20
 
 
 
 
779cbde
 
cabb2f4
 
 
4781b83
cabb2f4
2f6fff2
779cbde
 
 
cabb2f4
 
 
 
 
 
 
 
 
 
2f6fff2
 
 
 
 
 
 
 
 
 
 
779cbde
 
 
 
 
 
 
 
 
6b4ef20
779cbde
 
6b4ef20
779cbde
 
 
 
 
 
 
 
6b4ef20
1e273fd
 
779cbde
1e273fd
 
 
779cbde
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1e273fd
 
 
779cbde
 
 
 
 
 
 
 
 
 
 
 
 
cabb2f4
 
 
6b4ef20
cabb2f4
6b4ef20
779cbde

import gradio as gr
import pandas as pd
import plotly.express as px
from model_handler import ModelHandler
from data_handler import unified_exam_result_table, mmlu_result_table, unified_exam_chart, mmlu_chart

# Module-level state shared by main() and refresh_data(); both rebind these
# names via `global`, so everything starts out unset.
# Raw frames as returned by ModelHandler.get_arm_bench_data():
global_unified_exam_df = global_mmlu_df = None
# Tables built from those frames by unified_exam_result_table / mmlu_result_table:
global_output_armenian = global_output_mmlu = None

def refresh_data():
    """Reload the benchmark data and rebuild both tables and both charts.

    Rebinds the four module-level globals (raw frames and result tables)
    so that any code reading them afterwards sees the fresh data.

    Returns:
        A 6-tuple: the unified-exam table, the MMLU table, the unified-exam
        chart, the MMLU chart, and the string 'Average' twice. The order
        matches the `outputs` list wired to the "Refresh Data" button in
        main(), where the last two entries feed the column dropdowns.
    """
    global global_mmlu_df, global_unified_exam_df, global_output_armenian, global_output_mmlu

    # Both charts and both dropdowns are reset to this column after a refresh.
    default_column = 'Average'

    global_mmlu_df, global_unified_exam_df = ModelHandler().get_arm_bench_data()
    global_output_armenian = unified_exam_result_table(global_unified_exam_df)
    global_output_mmlu = mmlu_result_table(global_mmlu_df)

    return (
        global_output_armenian,
        global_output_mmlu,
        unified_exam_chart(global_output_armenian, default_column),
        mmlu_chart(global_output_mmlu, default_column),
        default_column,
        default_column,
    )

def main() -> None:
    """Build the Gradio UI for the benchmark and launch the app.

    Loads the benchmark data once into the module-level globals, then lays
    out three tabs ("Armenian Unified Exams", "MMLU-Pro-Hy", "About") plus a
    "Refresh Data" button wired to refresh_data(), and finally starts the
    server with app.launch().
    """
    global global_mmlu_df, global_unified_exam_df, global_output_armenian, global_output_mmlu
    # Initial load so the tables below have a value when the UI is first built.
    model_handler = ModelHandler()
    global_mmlu_df, global_unified_exam_df = model_handler.get_arm_bench_data()

    global_output_armenian = unified_exam_result_table(global_unified_exam_df)
    global_output_mmlu = mmlu_result_table(global_mmlu_df)

    with gr.Blocks() as app:
        with gr.Tabs():
            # --- Tab 1: Armenian Unified Exams results table + chart ---
            with gr.TabItem("Armenian Unified Exams"):
                gr.Markdown("# Armenian Unified Test Exams")
                gr.Markdown(
                    """
                    This benchmark contains results of various Language Models on Armenian Unified Test Exams for Armenian language and literature, Armenian history and mathematics. The scoring system is a 20-point scale, where 0-8 is a Fail, 8-18 is a Pass, and 18-20 is a Distinction.
                    """
                )
                table_output_armenian = gr.DataFrame(value=global_output_armenian)
                plot_column_dropdown_unified_exam = gr.Dropdown(choices=['Average', 'Armenian language and literature', 'Armenian history', 'Mathematics'], value='Average', label='Select Column to Plot')
                # The lambda looks up the module-level table at call time, so after
                # refresh_data() rebinds the global the chart is built from the new data.
                plot_output_armenian = gr.Plot(lambda column: unified_exam_chart(global_output_armenian, column), inputs=plot_column_dropdown_unified_exam)
            # --- Tab 2: MMLU-Pro-Hy results table + chart ---
            with gr.TabItem("MMLU-Pro-Hy"):
                gr.Markdown("# MMLU-Pro Translated to Armenian (MMLU-Pro-Hy)")
                gr.Markdown(
                    """
                    This benchmark contains results of various Language Models on the MMLU-Pro benchmark, translated into Armenian. MMLU-Pro is a massive multi-task test in MCQA format. The scores represent accuracy.
                    """
                )
                table_output_mmlu = gr.DataFrame(value=global_output_mmlu)
                # Columns offered in the dropdown; presumably these match the column
                # names produced by mmlu_result_table -- verify against data_handler.
                subject_cols = ['Average','Biology', 'Business', 'Chemistry', 'Computer Science', 'Economics', 'Engineering', 'Health', 'History', 'Law', 'Math', 'Philosophy', 'Physics', 'Psychology','Other']
                plot_column_dropdown_mmlu = gr.Dropdown(choices=subject_cols, value='Average', label='Select Column to Plot')
                # Same call-time lookup of the global table as in the first tab.
                plot_output_mmlu = gr.Plot(lambda column: mmlu_chart(global_output_mmlu, column), inputs=plot_column_dropdown_mmlu)
            # --- Tab 3: static description, submission guide and contribution notes ---
            with gr.TabItem("About"):
                gr.Markdown("# About the Benchmark")
                gr.Markdown(
                    """
                    This benchmark evaluates Language Models on Armenian-specific tasks, including Armenian Unified Test Exams and a translated version of the MMLU-Pro benchmark (MMLU-Pro-Hy). It is designed to measure the models' understanding and generation capabilities in the Armenian language.

                    **Creator Company:** Metric AI Research Lab, Yerevan, Armenia."""
                )
                # Logo is loaded from a relative path, so "logo.png" must be
                # resolvable from the working directory the app is started in.
                gr.Image("logo.png", width=200, show_label=False, show_download_button=False, show_fullscreen_button=False, show_share_button=False)
                gr.Markdown("""  
                    - [Website](https://metric.am/)
                    - [Hugging Face](https://huggingface.co/Metric-AI)

                    MMLU-Pro-Hy is a massive multi-task test in MCQA format, inspired by the original MMLU benchmark, adapted for the Armenian language. The Armenian Unified Exams benchmark allows for comparison with human-level knowledge.
                    """
                )
                gr.Markdown("## Submission Guide")
                gr.Markdown(
                    """
                    To submit a model for evaluation, please follow these steps:
                    1. **Evaluate your model**:
                        - Follow the evaluation script provided here: [https://github.com/Anania-AI/Arm-LLM-Benchmark](https://github.com/Anania-AI/Arm-LLM-Benchmark)
                        - For more details about the evaluation process, read the README in the Arm-LLM-Benchmark GitHub repository.

                    2. **Format your submission file**:
                        - After evaluation, you will get a `results.json` file. Ensure the file follows this format:
                        
                    ```json
                        {
                            "mmlu_results": [
                                {
                                    "category": "category_name",
                                    "score": score_value
                                },
                                ...
                            ],
                            "unified_exam_results": [
                                {
                                    "category": "category_name",
                                    "score": score_value
                                },
                                ...
                            ]
                        }
                    ```
                    3. **Submit your model**:
                        - Add the `Arm-LLM-Bench` tag and the `results.json` file to your model card.
                        - Click on the "Refresh Data" button in this app, and you will see your model's results.
                    """
                )
                gr.Markdown("## Contributing")
                gr.Markdown(
                    """
                    You can contribute to this benchmark in several ways:
                    - Providing API credits for evaluating API-based models.
                    - Citing our work in your research and publications.
                    - Contributing to the development of the benchmark itself.
                    """
                )

        # The button is created outside the Tabs container but inside Blocks.
        # The outputs list must stay in the same order as the tuple returned
        # by refresh_data(): two tables, two plots, then the two dropdown values.
        refresh_button = gr.Button("Refresh Data")
        refresh_button.click(
            fn=refresh_data,
            outputs=[table_output_armenian, table_output_mmlu, plot_output_armenian, plot_output_mmlu, plot_column_dropdown_unified_exam, plot_column_dropdown_mmlu],
        )

    # NOTE(review): share=True asks Gradio for a public share link and debug=True
    # enables debug mode -- confirm both are intended for the hosted deployment.
    app.launch(share=True, debug=True)

# Build and launch the app only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()