File size: 5,619 Bytes
4781b83
 
 
 
 
 
 
0d208c0
4781b83
0d208c0
4781b83
1b75b9d
 
4781b83
 
0d208c0
 
 
 
 
 
 
 
1b75b9d
4781b83
 
 
 
 
 
 
0d208c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4781b83
 
 
 
 
1b75b9d
 
4781b83
 
 
 
0d208c0
 
1b75b9d
0d208c0
 
 
4781b83
 
0d208c0
 
 
 
 
 
 
 
4781b83
 
 
 
 
1b75b9d
 
4781b83
 
 
 
 
 
0d208c0
1b75b9d
 
 
 
 
4781b83
0d208c0
4781b83
 
0d208c0
1b75b9d
 
 
 
 
4781b83
1b75b9d
0d208c0
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import gradio as gr
import pandas as pd
import plotly.express as px

def display_table(exam_type):
    """Load one benchmark's results and arrange them for the leaderboard table.

    Args:
        exam_type: Either ``"Armenian Exams"`` (reads
            ``unified_exam_results.csv``) or ``"MMLU-Pro-Hy"`` (reads
            ``mmlu_pro_hy_results.csv``).

    Returns:
        A DataFrame sorted by ``Average`` in descending order, with the
        ``Average`` column moved to position 1 and numeric values rounded
        to 4 decimal places.

    Raises:
        ValueError: If ``exam_type`` is not one of the two supported values.
    """
    # Reject unknown benchmarks up front; otherwise no branch would assign
    # ``df`` and the function would die with an UnboundLocalError on return.
    if exam_type not in ("Armenian Exams", "MMLU-Pro-Hy"):
        raise ValueError(
            f"Unsupported exam_type {exam_type!r}; "
            "expected 'Armenian Exams' or 'MMLU-Pro-Hy'"
        )

    if exam_type == "Armenian Exams":
        df = pd.read_csv('unified_exam_results.csv')
        df = df.sort_values(by='Average', ascending=False)
        cols = df.columns.tolist()
        cols.insert(1, cols.pop(cols.index('Average')))
        df = df[cols]
        # Put a line break into the longest header.
        df = df.rename(columns={'Armenian language and literature': 'Armenian language\nand literature'})
        return df.round(4)

    # MMLU-Pro-Hy: 'Average' is computed here as the row-wise mean of the
    # per-subject columns.
    df = pd.read_csv('mmlu_pro_hy_results.csv')
    subject_cols = ['Biology', 'Business', 'Chemistry', 'Computer Science', 'Economics', 'Engineering', 'Health', 'History', 'Law', 'Math', 'Other', 'Philosophy', 'Physics', 'Psychology']
    df['Average'] = df[subject_cols].mean(axis=1)
    df = df.sort_values(by='Average', ascending=False)
    cols = df.columns.tolist()
    cols.remove('Accuracy')  # the table shows the computed Average instead
    cols.insert(1, cols.pop(cols.index('Average')))
    cols.append(cols.pop(cols.index('Other')))  # 'Other' goes last
    df = df[cols]
    return df.round(4)

def _exam_result_label(score):
    """Map a score on the 20-point exam scale to its result band."""
    if score < 8:
        return "Fail"
    if score <= 18:
        return "Pass"
    return "Distinction"


def _apply_bar_layout(fig, title, x_range_max):
    """Apply the layout shared by both leaderboard bar charts and return ``fig``."""
    fig.update_layout(
        xaxis=dict(range=[0, x_range_max]),
        title=dict(text=title, font=dict(size=16)),
        xaxis_title=dict(font=dict(size=12)),
        yaxis_title=dict(font=dict(size=12)),
        # Rows are sorted best-first; reversing the y axis draws the first
        # row at the top of the horizontal bar chart.
        yaxis=dict(autorange="reversed"),
        autosize=True
    )
    return fig


def create_bar_chart(exam_type, plot_column):
    """Build a horizontal bar chart of one results column, one bar per model.

    Args:
        exam_type: Either ``"Armenian Exams"`` (reads
            ``unified_exam_results.csv``; x axis spans 0-20 and bars are
            coloured by Fail / Pass / Distinction band) or ``"MMLU-Pro-Hy"``
            (reads ``mmlu_pro_hy_results.csv``; x axis spans 0-1 and bars are
            coloured on a continuous Viridis scale).
        plot_column: Name of the column to plot. For ``"MMLU-Pro-Hy"`` this
            may also be ``'Average'``, which is computed here as the mean of
            the per-subject columns.

    Returns:
        The Plotly figure.

    Raises:
        ValueError: If ``exam_type`` is not one of the two supported values
            (previously this case silently returned ``None``).
    """
    if exam_type not in ("Armenian Exams", "MMLU-Pro-Hy"):
        raise ValueError(
            f"Unsupported exam_type {exam_type!r}; "
            "expected 'Armenian Exams' or 'MMLU-Pro-Hy'"
        )

    title = f'{plot_column}'

    if exam_type == "Armenian Exams":
        df = pd.read_csv('unified_exam_results.csv')
        df = df.sort_values(by=[plot_column, 'Model'], ascending=[False, True]).reset_index(drop=True)
        df['Test Result'] = df[plot_column].apply(_exam_result_label)
        fig = px.bar(df,
            x=plot_column,
            y='Model',
            color='Test Result',
            color_discrete_map={
                "Fail": "#ff5f56",
                "Pass": "#ffbd2e",
                "Distinction": "#27c93f"
            },
            labels={plot_column: 'Score', 'Model': 'Model'},
            title=title,
            orientation='h')
        return _apply_bar_layout(fig, title, 20)

    df = pd.read_csv('mmlu_pro_hy_results.csv')
    subject_cols = ['Biology', 'Business', 'Chemistry', 'Computer Science', 'Economics', 'Engineering', 'Health', 'History', 'Law', 'Math', 'Other', 'Philosophy', 'Physics', 'Psychology']
    df['Average'] = df[subject_cols].mean(axis=1)
    # Break ties by model name, as the Armenian branch does, so the bar order
    # is deterministic when two models share a score.
    df = df.sort_values(by=[plot_column, 'Model'], ascending=[False, True]).reset_index(drop=True)
    df = df.drop(columns=['Accuracy'])
    fig = px.bar(df,
        x=plot_column,
        y='Model',
        color=plot_column,
        color_continuous_scale='Viridis',
        labels={plot_column: 'Accuracy', 'Model': 'Model'},
        title=title,
        orientation='h',
        range_color=[0,1])
    return _apply_bar_layout(fig, title, 1.0)

# Leaderboard UI: one tab per benchmark. Each tab holds a heading, a short
# description, the results table, and a bar chart whose plotted column is
# chosen through a dropdown.
with gr.Blocks() as app:
    with gr.Tabs():
        # ---- Armenian unified exams -------------------------------------
        with gr.TabItem("Armenian Unified Exams"):
            gr.Markdown("# Armenian Unified Test Exams")
            gr.HTML("""
                <div style="font-size: 16px;">
                    This benchmark contains results of various Language Models on Armenian Unified Test Exams for Armenian language and literature, Armenian history and mathematics. The scoring system is a 20-point scale, where 0-8 is a Fail, 8-18 is a Pass, and 18-20 is a Distinction.
                </div>
            """)
            # The table value is a callable rather than a precomputed frame.
            table_output_armenian = gr.DataFrame(
                value=lambda: display_table("Armenian Exams"),
            )
            plot_column_dropdown = gr.Dropdown(
                choices=[
                    'Average',
                    'Armenian language and literature',
                    'Armenian history',
                    'Mathematics',
                ],
                value='Average',
                label='Select Column to Plot',
            )
            # The chart value is a callable that takes the dropdown as input.
            plot_output_armenian = gr.Plot(
                lambda column: create_bar_chart("Armenian Exams", column),
                inputs=plot_column_dropdown,
            )

        # ---- MMLU-Pro translated to Armenian ----------------------------
        with gr.TabItem("MMLU-Pro-Hy"):
            gr.Markdown("# MMLU-Pro Translated to Armenian (MMLU-Pro-Hy)")
            gr.HTML("""
                <div style="font-size: 16px;">
                    This benchmark contains results of various Language Models on the MMLU-Pro benchmark, translated into Armenian. MMLU-Pro is a massive multi-task test in MCQA format. The scores represent accuracy.
                </div>
            """)
            table_output_mmlu = gr.DataFrame(
                value=lambda: display_table("MMLU-Pro-Hy"),
            )
            # Dropdown order: 'Average' first, 'Other' last.
            subject_cols = [
                'Average',
                'Biology',
                'Business',
                'Chemistry',
                'Computer Science',
                'Economics',
                'Engineering',
                'Health',
                'History',
                'Law',
                'Math',
                'Philosophy',
                'Physics',
                'Psychology',
                'Other',
            ]
            plot_column_dropdown_mmlu = gr.Dropdown(
                choices=subject_cols,
                value='Average',
                label='Select Column to Plot',
            )
            plot_output_mmlu = gr.Plot(
                lambda column: create_bar_chart("MMLU-Pro-Hy", column),
                inputs=plot_column_dropdown_mmlu,
            )

# Start the server with a public share link requested and debug mode enabled.
app.launch(share=True, debug=True)