Spaces:
Running
Running
daniel7an
committed on
Commit
·
0d208c0
1
Parent(s):
4781b83
updates
Browse files
- app.py +60 -67
- mmlu_pro_hy_results.csv +8 -5
- unified_exam_results.csv +1 -1
app.py
CHANGED
@@ -5,62 +5,52 @@ import plotly.express as px
|
|
5 |
def display_table(exam_type):
|
6 |
if exam_type == "Armenian Exams":
|
7 |
df = pd.read_csv('unified_exam_results.csv')
|
8 |
-
df = df.sort_values(by='Average
|
9 |
cols = df.columns.tolist()
|
10 |
-
cols.insert(1, cols.pop(cols.index('Average
|
11 |
df = df[cols]
|
12 |
elif exam_type == "MMLU-Pro-Hy":
|
13 |
df = pd.read_csv('mmlu_pro_hy_results.csv')
|
14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
return df
|
16 |
|
17 |
def create_bar_chart(exam_type, plot_column):
|
18 |
if exam_type == "Armenian Exams":
|
19 |
df = pd.read_csv('unified_exam_results.csv')
|
20 |
-
df = df.sort_values(by='Average score', ascending=False)
|
21 |
df = df.sort_values(by=[plot_column, 'Model'], ascending=[False, True]).reset_index(drop=True)
|
22 |
|
23 |
x_col = plot_column
|
24 |
-
title = f'{plot_column}
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
elif 8 <= score <= 18:
|
35 |
-
return "Pass"
|
36 |
-
else:
|
37 |
-
return "Distinction"
|
38 |
-
df['Test Result'] = df[plot_column].apply(get_label)
|
39 |
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
"Pass": "#edd8be",
|
54 |
-
"Distinction": "#059492"
|
55 |
-
}
|
56 |
-
fig = px.bar(df,
|
57 |
-
x=x_col,
|
58 |
-
y='Model',
|
59 |
-
color=df['Test Result'],
|
60 |
-
color_discrete_map=color_discrete_map,
|
61 |
-
labels={x_col: plot_column, 'Model': 'Model'},
|
62 |
-
title=title,
|
63 |
-
orientation='h')
|
64 |
|
65 |
fig.update_layout(
|
66 |
xaxis=dict(range=[0, x_range_max]),
|
@@ -74,30 +64,23 @@ def create_bar_chart(exam_type, plot_column):
|
|
74 |
|
75 |
elif exam_type == "MMLU-Pro-Hy":
|
76 |
df = pd.read_csv('mmlu_pro_hy_results.csv')
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
|
|
|
|
81 |
x_range_max = 1.0
|
82 |
-
if plot_column != 'Accuracy':
|
83 |
-
def get_label(accuracy):
|
84 |
-
if accuracy < 0.5:
|
85 |
-
return "Low"
|
86 |
-
elif 0.5 <= accuracy <= 0.8:
|
87 |
-
return "Medium"
|
88 |
-
else:
|
89 |
-
return "High"
|
90 |
-
df['Test Result'] = df['Accuracy'].apply(get_label)
|
91 |
|
92 |
fig = px.bar(df,
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
|
102 |
fig.update_layout(
|
103 |
xaxis=dict(range=[0, x_range_max]),
|
@@ -112,11 +95,21 @@ def create_bar_chart(exam_type, plot_column):
|
|
112 |
with gr.Blocks() as app:
|
113 |
with gr.Tabs():
|
114 |
with gr.TabItem("Armenian Unified Exams"):
|
|
|
|
|
|
|
|
|
115 |
table_output_armenian = gr.DataFrame(value=lambda: display_table("Armenian Exams"))
|
116 |
-
plot_column_dropdown = gr.Dropdown(choices=['Average
|
117 |
plot_output_armenian = gr.Plot(lambda column: create_bar_chart("Armenian Exams", column), inputs=plot_column_dropdown)
|
118 |
with gr.TabItem("MMLU-Pro-Hy"):
|
|
|
|
|
|
|
|
|
119 |
table_output_mmlu = gr.DataFrame(value=lambda: display_table("MMLU-Pro-Hy"))
|
120 |
-
|
|
|
|
|
121 |
|
122 |
-
app.launch(share=True)
|
|
|
5 |
def display_table(exam_type):
|
6 |
if exam_type == "Armenian Exams":
|
7 |
df = pd.read_csv('unified_exam_results.csv')
|
8 |
+
df = df.sort_values(by='Average', ascending=False)
|
9 |
cols = df.columns.tolist()
|
10 |
+
cols.insert(1, cols.pop(cols.index('Average')))
|
11 |
df = df[cols]
|
12 |
elif exam_type == "MMLU-Pro-Hy":
|
13 |
df = pd.read_csv('mmlu_pro_hy_results.csv')
|
14 |
+
subject_cols = ['Biology', 'Business', 'Chemistry', 'Computer Science', 'Economics', 'Engineering', 'Health', 'History', 'Law', 'Math', 'Other', 'Philosophy', 'Physics', 'Psychology']
|
15 |
+
df['Average'] = df[subject_cols].mean(axis=1)
|
16 |
+
df = df.sort_values(by='Average', ascending=False)
|
17 |
+
cols = df.columns.tolist()
|
18 |
+
cols.remove('Accuracy')
|
19 |
+
cols.insert(1, cols.pop(cols.index('Average')))
|
20 |
+
cols.append(cols.pop(cols.index('Other')))
|
21 |
+
df = df[cols]
|
22 |
return df
|
23 |
|
24 |
def create_bar_chart(exam_type, plot_column):
|
25 |
if exam_type == "Armenian Exams":
|
26 |
df = pd.read_csv('unified_exam_results.csv')
|
|
|
27 |
df = df.sort_values(by=[plot_column, 'Model'], ascending=[False, True]).reset_index(drop=True)
|
28 |
|
29 |
x_col = plot_column
|
30 |
+
title = f'{plot_column}'
|
31 |
+
x_range_max = 20
|
32 |
+
def get_label(score):
|
33 |
+
if score < 8:
|
34 |
+
return "Fail"
|
35 |
+
elif 8 <= score <= 18:
|
36 |
+
return "Pass"
|
37 |
+
else:
|
38 |
+
return "Distinction"
|
39 |
+
df['Test Result'] = df[plot_column].apply(get_label)
|
|
|
|
|
|
|
|
|
|
|
40 |
|
41 |
+
color_discrete_map = {
|
42 |
+
"Fail": "#ff5f56",
|
43 |
+
"Pass": "#ffbd2e",
|
44 |
+
"Distinction": "#27c93f"
|
45 |
+
}
|
46 |
+
fig = px.bar(df,
|
47 |
+
x=x_col,
|
48 |
+
y='Model',
|
49 |
+
color=df['Test Result'],
|
50 |
+
color_discrete_map=color_discrete_map,
|
51 |
+
labels={x_col: 'Score', 'Model': 'Model'},
|
52 |
+
title=title,
|
53 |
+
orientation='h')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
|
55 |
fig.update_layout(
|
56 |
xaxis=dict(range=[0, x_range_max]),
|
|
|
64 |
|
65 |
elif exam_type == "MMLU-Pro-Hy":
|
66 |
df = pd.read_csv('mmlu_pro_hy_results.csv')
|
67 |
+
subject_cols = ['Biology', 'Business', 'Chemistry', 'Computer Science', 'Economics', 'Engineering', 'Health', 'History', 'Law', 'Math', 'Other', 'Philosophy', 'Physics', 'Psychology']
|
68 |
+
df['Average'] = df[subject_cols].mean(axis=1)
|
69 |
+
df = df.sort_values(by='Average', ascending=False)
|
70 |
+
df = df.drop(columns=['Accuracy'])
|
71 |
+
x_col = plot_column
|
72 |
+
title = f'{plot_column}'
|
73 |
x_range_max = 1.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
|
75 |
fig = px.bar(df,
|
76 |
+
x=x_col,
|
77 |
+
y='Model',
|
78 |
+
color=x_col,
|
79 |
+
color_continuous_scale='Viridis',
|
80 |
+
labels={x_col: 'Accuracy', 'Model': 'Model'},
|
81 |
+
title=title,
|
82 |
+
orientation='h',
|
83 |
+
range_color=[0,1])
|
84 |
|
85 |
fig.update_layout(
|
86 |
xaxis=dict(range=[0, x_range_max]),
|
|
|
95 |
with gr.Blocks() as app:
|
96 |
with gr.Tabs():
|
97 |
with gr.TabItem("Armenian Unified Exams"):
|
98 |
+
gr.Markdown("# Armenian Unified Test Exams")
|
99 |
+
gr.Markdown("### This benchmark contains results of various Language Models on Armenian Unified Test Exams for Armenian language and literature, Armenian history and mathematics. The scoring system is a 20-point scale, where 0-8 is a Fail, 8-18 is a Pass, and 18-20 is a Distinction.")
|
100 |
+
# gr.Markdown("### Այս աղյուսակը պարունակում է տարբեր լեզվական մոդելների արդյունքները հայերեն լեզվի և գրականության, հայոց պատմության և մաթեմատիկայի միասնական քնությունների թեսթերի համար։ Գնահատման համակարգը 20 բալանոց սանդղակ է, որտեղ 0-8-ը նշանակում է Անբավարար, 8-18-ը՝ Բավարար, իսկ 18-20-ը՝ Գերազանց:")
|
101 |
+
|
102 |
table_output_armenian = gr.DataFrame(value=lambda: display_table("Armenian Exams"))
|
103 |
+
plot_column_dropdown = gr.Dropdown(choices=['Average', 'Armenian language and literature', 'Armenian history', 'Mathematics'], value='Average', label='Select Column to Plot')
|
104 |
plot_output_armenian = gr.Plot(lambda column: create_bar_chart("Armenian Exams", column), inputs=plot_column_dropdown)
|
105 |
with gr.TabItem("MMLU-Pro-Hy"):
|
106 |
+
gr.Markdown("# MMLU-Pro Translated to Armenian (MMLU-Pro-Hy)")
|
107 |
+
gr.Markdown("### This benchmark contains results of various Language Models on the MMLU-Pro benchmark, translated into Armenian. MMLU-Pro is a massive multi-task test in MCQA format. The scores represent accuracy.")
|
108 |
+
# gr.Markdown("### Այս աղյուսակը պարունակում է տարբեր լեզվական մոդելների արդյունքները MMLU-Pro թեսթի համար, որը թարգմանվել է հայերեն: MMLU-Pro-ն իրենից ներկայացնում է : Միավորները ներկայացնում են ճշգրտությունը:")
|
109 |
+
|
110 |
table_output_mmlu = gr.DataFrame(value=lambda: display_table("MMLU-Pro-Hy"))
|
111 |
+
subject_cols = ['Biology', 'Business', 'Chemistry', 'Computer Science', 'Economics', 'Engineering', 'Health', 'History', 'Law', 'Math', 'Other', 'Philosophy', 'Physics', 'Psychology', 'Average']
|
112 |
+
plot_column_dropdown_mmlu = gr.Dropdown(choices=subject_cols, value='Average', label='Select Column to Plot')
|
113 |
+
plot_output_mmlu = gr.Plot(lambda column: create_bar_chart("MMLU-Pro-Hy", column), inputs=plot_column_dropdown_mmlu)
|
114 |
|
115 |
+
app.launch(share=True, debug=True)
|
mmlu_pro_hy_results.csv
CHANGED
@@ -1,5 +1,8 @@
|
|
1 |
-
Model,Accuracy
|
2 |
-
|
3 |
-
claude-3-5-
|
4 |
-
|
5 |
-
|
|
|
|
|
|
|
|
1 |
+
Model,Accuracy,Biology,Business,Chemistry,Computer Science,Economics,Engineering,Health,History,Law,Math,Other,Philosophy,Physics,Psychology
|
2 |
+
gpt-4o,0.685,0.8667,0.7424,0.6842,0.6176,0.7887,0.5625,0.7794,0.5517,0.5393,0.7788,0.5974,0.5476,0.6881,0.7164
|
3 |
+
claude-3-5-haiku-20241022,0.522,0.75,0.5758,0.5579,0.4412,0.6901,0.4125,0.5882,0.5172,0.2472,0.6018,0.3636,0.4048,0.5596,0.5672
|
4 |
+
claude-3-5-sonnet-20241022,0.701,0.8667,0.803,0.7579,0.7059,0.7887,0.5625,0.6618,0.6552,0.4944,0.7788,0.6494,0.5476,0.7523,0.7164
|
5 |
+
DeepSeek-V3,0.672,0.8167,0.8182,0.6947,0.7353,0.7887,0.5875,0.6471,0.4828,0.3596,0.8584,0.5455,0.5476,0.6881,0.7164
|
6 |
+
gemini-1.5-flash,0.579,0.75,0.7121,0.6947,0.5,0.7183,0.4,0.5,0.4483,0.2584,0.8319,0.3506,0.3571,0.6514,0.6567
|
7 |
+
gemini-2.0-flash,0.737,0.85,0.8182,0.7895,0.7353,0.8169,0.6,0.75,0.5517,0.5281,0.8673,0.6364,0.6429,0.7982,0.7612
|
8 |
+
Meta-Llama-3.3-70B-Instruct,0.523,0.7333,0.5303,0.5895,0.3824,0.6338,0.4875,0.5735,0.4138,0.3146,0.6018,0.3377,0.4524,0.5321,0.6119
|
unified_exam_results.csv
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
Model,Armenian language
|
2 |
claude-3-7-sonnet-20250219,10.5,7.75,15.0,11.08
|
3 |
claude-3-5-sonnet-20241022,10.0,9.25,12.75,10.67
|
4 |
gemini-2.0-flash,5.5,6.75,17.25,9.83
|
|
|
1 |
+
Model,Armenian language and literature,Armenian history,Mathematics,Average
|
2 |
claude-3-7-sonnet-20250219,10.5,7.75,15.0,11.08
|
3 |
claude-3-5-sonnet-20241022,10.0,9.25,12.75,10.67
|
4 |
gemini-2.0-flash,5.5,6.75,17.25,9.83
|