daniel7an committed on
Commit
0d208c0
·
1 Parent(s): 4781b83
Files changed (3) hide show
  1. app.py +60 -67
  2. mmlu_pro_hy_results.csv +8 -5
  3. unified_exam_results.csv +1 -1
app.py CHANGED
@@ -5,62 +5,52 @@ import plotly.express as px
5
  def display_table(exam_type):
6
  if exam_type == "Armenian Exams":
7
  df = pd.read_csv('unified_exam_results.csv')
8
- df = df.sort_values(by='Average score', ascending=False)
9
  cols = df.columns.tolist()
10
- cols.insert(1, cols.pop(cols.index('Average score')))
11
  df = df[cols]
12
  elif exam_type == "MMLU-Pro-Hy":
13
  df = pd.read_csv('mmlu_pro_hy_results.csv')
14
- df = df.sort_values(by='Accuracy', ascending=False)
 
 
 
 
 
 
 
15
  return df
16
 
17
  def create_bar_chart(exam_type, plot_column):
18
  if exam_type == "Armenian Exams":
19
  df = pd.read_csv('unified_exam_results.csv')
20
- df = df.sort_values(by='Average score', ascending=False)
21
  df = df.sort_values(by=[plot_column, 'Model'], ascending=[False, True]).reset_index(drop=True)
22
 
23
  x_col = plot_column
24
- title = f'{plot_column} per Model'
25
- if plot_column == 'Average score':
26
- range_max = 20
27
- x_range_max = 20
28
- else:
29
- range_max = 20
30
- x_range_max = 20
31
- def get_label(score):
32
- if score < 8:
33
- return "Fail"
34
- elif 8 <= score <= 18:
35
- return "Pass"
36
- else:
37
- return "Distinction"
38
- df['Test Result'] = df[plot_column].apply(get_label)
39
 
40
- if plot_column in ['Average score', 'Accuracy']:
41
- fig = px.bar(df,
42
- x=x_col,
43
- y='Model',
44
- color=x_col,
45
- color_continuous_scale='tealrose_r',
46
- labels={x_col: plot_column, 'Model': 'Model'},
47
- title=title,
48
- orientation='h',
49
- range_color=[0, range_max])
50
- else:
51
- color_discrete_map = {
52
- "Fail": "#d15d80",
53
- "Pass": "#edd8be",
54
- "Distinction": "#059492"
55
- }
56
- fig = px.bar(df,
57
- x=x_col,
58
- y='Model',
59
- color=df['Test Result'],
60
- color_discrete_map=color_discrete_map,
61
- labels={x_col: plot_column, 'Model': 'Model'},
62
- title=title,
63
- orientation='h')
64
 
65
  fig.update_layout(
66
  xaxis=dict(range=[0, x_range_max]),
@@ -74,30 +64,23 @@ def create_bar_chart(exam_type, plot_column):
74
 
75
  elif exam_type == "MMLU-Pro-Hy":
76
  df = pd.read_csv('mmlu_pro_hy_results.csv')
77
- df = df.sort_values(by='Accuracy', ascending=False)
78
- x_col = 'Accuracy'
79
- title = 'Accuracy per Model (MMLU-Pro-Hy)'
80
- range_max = 1.0
 
 
81
  x_range_max = 1.0
82
- if plot_column != 'Accuracy':
83
- def get_label(accuracy):
84
- if accuracy < 0.5:
85
- return "Low"
86
- elif 0.5 <= accuracy <= 0.8:
87
- return "Medium"
88
- else:
89
- return "High"
90
- df['Test Result'] = df['Accuracy'].apply(get_label)
91
 
92
  fig = px.bar(df,
93
- x=x_col,
94
- y='Model',
95
- color=x_col,
96
- color_continuous_scale='tealrose_r',
97
- labels={x_col: plot_column, 'Model': 'Model'},
98
- title=title,
99
- orientation='h',
100
- range_color=[0, range_max])
101
 
102
  fig.update_layout(
103
  xaxis=dict(range=[0, x_range_max]),
@@ -112,11 +95,21 @@ def create_bar_chart(exam_type, plot_column):
112
  with gr.Blocks() as app:
113
  with gr.Tabs():
114
  with gr.TabItem("Armenian Unified Exams"):
 
 
 
 
115
  table_output_armenian = gr.DataFrame(value=lambda: display_table("Armenian Exams"))
116
- plot_column_dropdown = gr.Dropdown(choices=['Average score', 'Armenian language exam score', 'Armenian history exam score', 'Mathematics exam score'], value='Average score', label='Select Column to Plot')
117
  plot_output_armenian = gr.Plot(lambda column: create_bar_chart("Armenian Exams", column), inputs=plot_column_dropdown)
118
  with gr.TabItem("MMLU-Pro-Hy"):
 
 
 
 
119
  table_output_mmlu = gr.DataFrame(value=lambda: display_table("MMLU-Pro-Hy"))
120
- plot_output_mmlu = gr.Plot(lambda: create_bar_chart("MMLU-Pro-Hy", 'Accuracy'))
 
 
121
 
122
- app.launch(share=True)
 
5
  def display_table(exam_type):
6
  if exam_type == "Armenian Exams":
7
  df = pd.read_csv('unified_exam_results.csv')
8
+ df = df.sort_values(by='Average', ascending=False)
9
  cols = df.columns.tolist()
10
+ cols.insert(1, cols.pop(cols.index('Average')))
11
  df = df[cols]
12
  elif exam_type == "MMLU-Pro-Hy":
13
  df = pd.read_csv('mmlu_pro_hy_results.csv')
14
+ subject_cols = ['Biology', 'Business', 'Chemistry', 'Computer Science', 'Economics', 'Engineering', 'Health', 'History', 'Law', 'Math', 'Other', 'Philosophy', 'Physics', 'Psychology']
15
+ df['Average'] = df[subject_cols].mean(axis=1)
16
+ df = df.sort_values(by='Average', ascending=False)
17
+ cols = df.columns.tolist()
18
+ cols.remove('Accuracy')
19
+ cols.insert(1, cols.pop(cols.index('Average')))
20
+ cols.append(cols.pop(cols.index('Other')))
21
+ df = df[cols]
22
  return df
23
 
24
  def create_bar_chart(exam_type, plot_column):
25
  if exam_type == "Armenian Exams":
26
  df = pd.read_csv('unified_exam_results.csv')
 
27
  df = df.sort_values(by=[plot_column, 'Model'], ascending=[False, True]).reset_index(drop=True)
28
 
29
  x_col = plot_column
30
+ title = f'{plot_column}'
31
+ x_range_max = 20
32
+ def get_label(score):
33
+ if score < 8:
34
+ return "Fail"
35
+ elif 8 <= score <= 18:
36
+ return "Pass"
37
+ else:
38
+ return "Distinction"
39
+ df['Test Result'] = df[plot_column].apply(get_label)
 
 
 
 
 
40
 
41
+ color_discrete_map = {
42
+ "Fail": "#ff5f56",
43
+ "Pass": "#ffbd2e",
44
+ "Distinction": "#27c93f"
45
+ }
46
+ fig = px.bar(df,
47
+ x=x_col,
48
+ y='Model',
49
+ color=df['Test Result'],
50
+ color_discrete_map=color_discrete_map,
51
+ labels={x_col: 'Score', 'Model': 'Model'},
52
+ title=title,
53
+ orientation='h')
 
 
 
 
 
 
 
 
 
 
 
54
 
55
  fig.update_layout(
56
  xaxis=dict(range=[0, x_range_max]),
 
64
 
65
  elif exam_type == "MMLU-Pro-Hy":
66
  df = pd.read_csv('mmlu_pro_hy_results.csv')
67
+ subject_cols = ['Biology', 'Business', 'Chemistry', 'Computer Science', 'Economics', 'Engineering', 'Health', 'History', 'Law', 'Math', 'Other', 'Philosophy', 'Physics', 'Psychology']
68
+ df['Average'] = df[subject_cols].mean(axis=1)
69
+ df = df.sort_values(by='Average', ascending=False)
70
+ df = df.drop(columns=['Accuracy'])
71
+ x_col = plot_column
72
+ title = f'{plot_column}'
73
  x_range_max = 1.0
 
 
 
 
 
 
 
 
 
74
 
75
  fig = px.bar(df,
76
+ x=x_col,
77
+ y='Model',
78
+ color=x_col,
79
+ color_continuous_scale='Viridis',
80
+ labels={x_col: 'Accuracy', 'Model': 'Model'},
81
+ title=title,
82
+ orientation='h',
83
+ range_color=[0,1])
84
 
85
  fig.update_layout(
86
  xaxis=dict(range=[0, x_range_max]),
 
95
  with gr.Blocks() as app:
96
  with gr.Tabs():
97
  with gr.TabItem("Armenian Unified Exams"):
98
+ gr.Markdown("# Armenian Unified Test Exams")
99
+ gr.Markdown("### This benchmark contains results of various Language Models on Armenian Unified Test Exams for Armenian language and literature, Armenian history and mathematics. The scoring system is a 20-point scale, where 0-8 is a Fail, 8-18 is a Pass, and 18-20 is a Distinction.")
100
+ # gr.Markdown("### Այս աղյուսակը պարունակում է տարբեր լեզվական մոդելների արդյունքները հայերեն լեզվի և գրականության, հայոց պատմության և մաթեմատիկայի միասնական քնությունների թեսթերի համար։ Գնահատման համակարգը 20 բալանոց սանդղակ է, որտեղ 0-8-ը նշանակում է Անբավարար, 8-18-ը՝ Բավարար, իսկ 18-20-ը՝ Գերազանց:")
101
+
102
  table_output_armenian = gr.DataFrame(value=lambda: display_table("Armenian Exams"))
103
+ plot_column_dropdown = gr.Dropdown(choices=['Average', 'Armenian language and literature', 'Armenian history', 'Mathematics'], value='Average', label='Select Column to Plot')
104
  plot_output_armenian = gr.Plot(lambda column: create_bar_chart("Armenian Exams", column), inputs=plot_column_dropdown)
105
  with gr.TabItem("MMLU-Pro-Hy"):
106
+ gr.Markdown("# MMLU-Pro Translated to Armenian (MMLU-Pro-Hy)")
107
+ gr.Markdown("### This benchmark contains results of various Language Models on the MMLU-Pro benchmark, translated into Armenian. MMLU-Pro is a massive multi-task test in MCQA format. The scores represent accuracy.")
108
+ # gr.Markdown("### Այս աղյուսակը պարունակում է տարբեր լեզվական մոդելների արդյունքները MMLU-Pro թեսթի համար, որը թարգմանվել է հայերեն: MMLU-Pro-ն իրենից ներկայացնում է : Միավորները ներկայացնում են ճշգրտությունը:")
109
+
110
  table_output_mmlu = gr.DataFrame(value=lambda: display_table("MMLU-Pro-Hy"))
111
+ subject_cols = ['Biology', 'Business', 'Chemistry', 'Computer Science', 'Economics', 'Engineering', 'Health', 'History', 'Law', 'Math', 'Other', 'Philosophy', 'Physics', 'Psychology', 'Average']
112
+ plot_column_dropdown_mmlu = gr.Dropdown(choices=subject_cols, value='Average', label='Select Column to Plot')
113
+ plot_output_mmlu = gr.Plot(lambda column: create_bar_chart("MMLU-Pro-Hy", column), inputs=plot_column_dropdown_mmlu)
114
 
115
+ app.launch(share=True, debug=True)
mmlu_pro_hy_results.csv CHANGED
@@ -1,5 +1,8 @@
1
- Model,Accuracy
2
- claude-3-5-haiku-20241022,0.526
3
- claude-3-5-sonnet-20241022,0.701
4
- gemini-2.0-flash,0.741
5
- gemini-1.5-flash,0.586
 
 
 
 
1
+ Model,Accuracy,Biology,Business,Chemistry,Computer Science,Economics,Engineering,Health,History,Law,Math,Other,Philosophy,Physics,Psychology
2
+ gpt-4o,0.685,0.8667,0.7424,0.6842,0.6176,0.7887,0.5625,0.7794,0.5517,0.5393,0.7788,0.5974,0.5476,0.6881,0.7164
3
+ claude-3-5-haiku-20241022,0.522,0.75,0.5758,0.5579,0.4412,0.6901,0.4125,0.5882,0.5172,0.2472,0.6018,0.3636,0.4048,0.5596,0.5672
4
+ claude-3-5-sonnet-20241022,0.701,0.8667,0.803,0.7579,0.7059,0.7887,0.5625,0.6618,0.6552,0.4944,0.7788,0.6494,0.5476,0.7523,0.7164
5
+ DeepSeek-V3,0.672,0.8167,0.8182,0.6947,0.7353,0.7887,0.5875,0.6471,0.4828,0.3596,0.8584,0.5455,0.5476,0.6881,0.7164
6
+ gemini-1.5-flash,0.579,0.75,0.7121,0.6947,0.5,0.7183,0.4,0.5,0.4483,0.2584,0.8319,0.3506,0.3571,0.6514,0.6567
7
+ gemini-2.0-flash,0.737,0.85,0.8182,0.7895,0.7353,0.8169,0.6,0.75,0.5517,0.5281,0.8673,0.6364,0.6429,0.7982,0.7612
8
+ Meta-Llama-3.3-70B-Instruct,0.523,0.7333,0.5303,0.5895,0.3824,0.6338,0.4875,0.5735,0.4138,0.3146,0.6018,0.3377,0.4524,0.5321,0.6119
unified_exam_results.csv CHANGED
@@ -1,4 +1,4 @@
1
- Model,Armenian language exam score,Armenian history exam score,Mathematics exam score,Average score
2
  claude-3-7-sonnet-20250219,10.5,7.75,15.0,11.08
3
  claude-3-5-sonnet-20241022,10.0,9.25,12.75,10.67
4
  gemini-2.0-flash,5.5,6.75,17.25,9.83
 
1
+ Model,Armenian language and literature,Armenian history,Mathematics,Average
2
  claude-3-7-sonnet-20250219,10.5,7.75,15.0,11.08
3
  claude-3-5-sonnet-20241022,10.0,9.25,12.75,10.67
4
  gemini-2.0-flash,5.5,6.75,17.25,9.83