Bagratuni committed on
Commit
779cbde
·
1 Parent(s): 1b75b9d
Files changed (7) hide show
  1. app.py +114 -107
  2. data_handler.py +110 -0
  3. logo.png +0 -0
  4. mmlu_pro_hy_results.csv +0 -8
  5. model_handler.py +80 -0
  6. model_results.json +581 -0
  7. unified_exam_results.csv +0 -10
app.py CHANGED
@@ -1,115 +1,122 @@
1
  import gradio as gr
2
  import pandas as pd
3
  import plotly.express as px
 
 
4
 
5
- def display_table(exam_type):
6
- if exam_type == "Armenian Exams":
7
- df = pd.read_csv('unified_exam_results.csv')
8
- df = df.sort_values(by='Average', ascending=False)
9
- cols = df.columns.tolist()
10
- cols.insert(1, cols.pop(cols.index('Average')))
11
- df = df[cols]
12
- df.rename(columns={'Armenian language and literature': 'Armenian language\nand literature'}, inplace=True)
13
- df = df.round(4)
14
- elif exam_type == "MMLU-Pro-Hy":
15
- df = pd.read_csv('mmlu_pro_hy_results.csv')
16
- subject_cols = ['Biology', 'Business', 'Chemistry', 'Computer Science', 'Economics', 'Engineering', 'Health', 'History', 'Law', 'Math', 'Other', 'Philosophy', 'Physics', 'Psychology']
17
- df['Average'] = df[subject_cols].mean(axis=1)
18
- df = df.sort_values(by='Average', ascending=False)
19
- cols = df.columns.tolist()
20
- cols.remove('Accuracy')
21
- cols.insert(1, cols.pop(cols.index('Average')))
22
- cols.append(cols.pop(cols.index('Other')))
23
- df = df[cols]
24
- df = df.round(4)
25
- return df
26
 
27
- def create_bar_chart(exam_type, plot_column):
28
- if exam_type == "Armenian Exams":
29
- df = pd.read_csv('unified_exam_results.csv')
30
- df = df.sort_values(by=[plot_column, 'Model'], ascending=[False, True]).reset_index(drop=True)
31
- x_col = plot_column
32
- title = f'{plot_column}'
33
- x_range_max = 20
34
- def get_label(score):
35
- if score < 8:
36
- return "Fail"
37
- elif 8 <= score <= 18:
38
- return "Pass"
39
- else:
40
- return "Distinction"
41
- df['Test Result'] = df[plot_column].apply(get_label)
42
- color_discrete_map = {
43
- "Fail": "#ff5f56",
44
- "Pass": "#ffbd2e",
45
- "Distinction": "#27c93f"
46
- }
47
- fig = px.bar(df,
48
- x=x_col,
49
- y='Model',
50
- color=df['Test Result'],
51
- color_discrete_map=color_discrete_map,
52
- labels={x_col: 'Score', 'Model': 'Model'},
53
- title=title,
54
- orientation='h')
55
- fig.update_layout(
56
- xaxis=dict(range=[0, x_range_max]),
57
- title=dict(text=title, font=dict(size=16)),
58
- xaxis_title=dict(font=dict(size=12)),
59
- yaxis_title=dict(font=dict(size=12)),
60
- yaxis=dict(autorange="reversed"),
61
- autosize=True
62
- )
63
- return fig
64
- elif exam_type == "MMLU-Pro-Hy":
65
- df = pd.read_csv('mmlu_pro_hy_results.csv')
66
- subject_cols = ['Biology', 'Business', 'Chemistry', 'Computer Science', 'Economics', 'Engineering', 'Health', 'History', 'Law', 'Math', 'Other', 'Philosophy', 'Physics', 'Psychology']
67
- df['Average'] = df[subject_cols].mean(axis=1)
68
- df = df.sort_values(by=plot_column, ascending=False).reset_index(drop=True)
69
- df = df.drop(columns=['Accuracy'])
70
- x_col = plot_column
71
- title = f'{plot_column}'
72
- x_range_max = 1.0
73
- fig = px.bar(df,
74
- x=x_col,
75
- y='Model',
76
- color=x_col,
77
- color_continuous_scale='Viridis',
78
- labels={x_col: 'Accuracy', 'Model': 'Model'},
79
- title=title,
80
- orientation='h',
81
- range_color=[0,1])
82
- fig.update_layout(
83
- xaxis=dict(range=[0, x_range_max]),
84
- title=dict(text=title, font=dict(size=16)),
85
- xaxis_title=dict(font=dict(size=12)),
86
- yaxis_title=dict(font=dict(size=12)),
87
- yaxis=dict(autorange="reversed"),
88
- autosize=True
89
- )
90
- return fig
91
 
92
- with gr.Blocks() as app:
93
- with gr.Tabs():
94
- with gr.TabItem("Armenian Unified Exams"):
95
- gr.Markdown("# Armenian Unified Test Exams")
96
- gr.HTML(f"""
97
- <div style="font-size: 16px;">
 
 
 
98
  This benchmark contains results of various Language Models on Armenian Unified Test Exams for Armenian language and literature, Armenian history and mathematics. The scoring system is a 20-point scale, where 0-8 is a Fail, 8-18 is a Pass, and 18-20 is a Distinction.
99
- </div>
100
- """)
101
- table_output_armenian = gr.DataFrame(value=lambda: display_table("Armenian Exams"))
102
- plot_column_dropdown = gr.Dropdown(choices=['Average', 'Armenian language and literature', 'Armenian history', 'Mathematics'], value='Average', label='Select Column to Plot')
103
- plot_output_armenian = gr.Plot(lambda column: create_bar_chart("Armenian Exams", column), inputs=plot_column_dropdown)
104
- with gr.TabItem("MMLU-Pro-Hy"):
105
- gr.Markdown("# MMLU-Pro Translated to Armenian (MMLU-Pro-Hy)")
106
- gr.HTML(f"""
107
- <div style="font-size: 16px;">
108
  This benchmark contains results of various Language Models on the MMLU-Pro benchmark, translated into Armenian. MMLU-Pro is a massive multi-task test in MCQA format. The scores represent accuracy.
109
- </div>
110
- """)
111
- table_output_mmlu = gr.DataFrame(value=lambda: display_table("MMLU-Pro-Hy"))
112
- subject_cols = ['Average','Biology', 'Business', 'Chemistry', 'Computer Science', 'Economics', 'Engineering', 'Health', 'History', 'Law', 'Math', 'Philosophy', 'Physics', 'Psychology','Other']
113
- plot_column_dropdown_mmlu = gr.Dropdown(choices=subject_cols, value='Average', label='Select Column to Plot')
114
- plot_output_mmlu = gr.Plot(lambda column: create_bar_chart("MMLU-Pro-Hy", column), inputs=plot_column_dropdown_mmlu)
115
- app.launch(share=True, debug=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
  import pandas as pd
3
  import plotly.express as px
4
+ from model_handler import ModelHandler
5
+ from data_handler import unified_exam_result_table, mmlu_result_table, unified_exam_chart, mmlu_chart
6
 
7
+ global_unified_exam_df = None
8
+ global_mmlu_df = None
9
+ global_output_armenian = None
10
+ global_output_mmlu = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
def refresh_data():
    """Reload benchmark results and rebuild the tables and default charts.

    Re-reads the results through a fresh ``ModelHandler`` and overwrites the
    four module-level caches (the raw MMLU / unified-exam frames and their
    display tables).

    Returns:
        A 4-tuple ``(unified_exam_table, mmlu_table, unified_exam_figure,
        mmlu_figure)``. Both figures plot the ``'Average'`` column.
    """
    global global_mmlu_df, global_unified_exam_df, global_output_armenian, global_output_mmlu

    global_mmlu_df, global_unified_exam_df = ModelHandler().get_arm_bench_data()

    global_output_armenian = unified_exam_result_table(global_unified_exam_df)
    global_output_mmlu = mmlu_result_table(global_mmlu_df)

    armenian_figure = unified_exam_chart(global_output_armenian, 'Average')
    mmlu_figure = mmlu_chart(global_output_mmlu, 'Average')
    return global_output_armenian, global_output_mmlu, armenian_figure, mmlu_figure
22
+
23
def main():
    """Load the benchmark results, build the Gradio UI and launch it.

    The interface has three tabs (Armenian Unified Exams, MMLU-Pro-Hy, About)
    and a "Refresh Data" button that re-runs ``refresh_data`` and pushes the
    new tables and charts into the two leaderboard tabs.

    The submission guide shown in the About tab names the Hub tag
    (``arm_llm``) and file name (``results.json``) that the result loader in
    ``model_handler.py`` searches for; keep the two in sync.
    """
    global global_mmlu_df, global_unified_exam_df, global_output_armenian, global_output_mmlu
    model_handler = ModelHandler()
    global_mmlu_df, global_unified_exam_df = model_handler.get_arm_bench_data()

    global_output_armenian = unified_exam_result_table(global_unified_exam_df)
    global_output_mmlu = mmlu_result_table(global_mmlu_df)

    with gr.Blocks() as app:
        with gr.Tabs():
            with gr.TabItem("Armenian Unified Exams"):
                gr.Markdown("# Armenian Unified Test Exams")
                gr.Markdown(
                    """
                    This benchmark contains results of various Language Models on Armenian Unified Test Exams for Armenian language and literature, Armenian history and mathematics. The scoring system is a 20-point scale, where 0-8 is a Fail, 8-18 is a Pass, and 18-20 is a Distinction.
                    """
                )
                table_output_armenian = gr.DataFrame(value=global_output_armenian)
                plot_column_dropdown_unified_exam = gr.Dropdown(choices=['Average', 'Armenian language and literature', 'Armenian history', 'Mathematics'], value='Average', label='Select Column to Plot')
                # The lambda looks the global up at call time, so it plots
                # whatever table refresh_data() stored most recently.
                plot_output_armenian = gr.Plot(lambda column: unified_exam_chart(global_output_armenian, column), inputs=plot_column_dropdown_unified_exam)
            with gr.TabItem("MMLU-Pro-Hy"):
                gr.Markdown("# MMLU-Pro Translated to Armenian (MMLU-Pro-Hy)")
                gr.Markdown(
                    """
                    This benchmark contains results of various Language Models on the MMLU-Pro benchmark, translated into Armenian. MMLU-Pro is a massive multi-task test in MCQA format. The scores represent accuracy.
                    """
                )
                table_output_mmlu = gr.DataFrame(value=global_output_mmlu)
                subject_cols = ['Average','Biology', 'Business', 'Chemistry', 'Computer Science', 'Economics', 'Engineering', 'Health', 'History', 'Law', 'Math', 'Philosophy', 'Physics', 'Psychology','Other']
                plot_column_dropdown_mmlu = gr.Dropdown(choices=subject_cols, value='Average', label='Select Column to Plot')
                plot_output_mmlu = gr.Plot(lambda column: mmlu_chart(global_output_mmlu, column), inputs=plot_column_dropdown_mmlu)
            with gr.TabItem("About"):
                gr.Markdown("# About the Benchmark")
                gr.Markdown(
                    """
                    This benchmark evaluates Language Models on Armenian-specific tasks, including Armenian Unified Test Exams and a translated version of the MMLU-Pro benchmark (MMLU-Pro-Hy). It is designed to measure the models' understanding and generation capabilities in the Armenian language.

                    **Creator Company:** Metric AI Research Lab, Yerevan, Armenia."""
                )
                gr.Image("logo.png", width=200, show_label=False, show_download_button=False, show_fullscreen_button=False, show_share_button=False)
                gr.Markdown("""
                    - [Website](https://metric.am/)
                    - [Hugging Face](https://huggingface.co/Metric-AI)

                    MMLU-Pro-Hy is a massive multi-task test in MCQA format, inspired by the original MMLU benchmark, adapted for the Armenian language. The Armenian Unified Exams benchmark allows for comparison with human-level knowledge.
                    """
                )
                gr.Markdown("## Submission Guide")
                gr.Markdown(
                    """
                    To submit a model for evaluation, please follow these steps:
                    1. **Evaluate your model**:
                        - Follow the evaluation script provided here: [https://github.com/Anania-AI/Arm-LLM-Benchmark](https://github.com/Anania-AI/Arm-LLM-Benchmark)
                    2. **Format your submission file**:
                        - After evaluation, you will get a `results.json` file. Ensure the file follows this format:
                        ```json
                        {
                            "mmlu_results": [
                                {
                                    "category": "category_name",
                                    "score": score_value
                                },
                                ...
                            ],
                            "unified_exam_results": [
                                {
                                    "category": "category_name",
                                    "score": score_value
                                },
                                ...
                            ]
                        }
                        ```
                    3. **Submit your model**:
                        - Add the `arm_llm` tag and the `results.json` file to your model card.
                        - Click on the "Refresh Data" button in this app, and you will see your model's results.
                    """
                )
                gr.Markdown("## Contributing")
                gr.Markdown(
                    """
                    You can contribute to this benchmark in several ways:
                    - Providing API credits for evaluating API-based models.
                    - Citing our work in your research and publications.
                    - Contributing to the development of the benchmark itself.
                    """
                )

        refresh_button = gr.Button("Refresh Data")
        refresh_button.click(
            fn=refresh_data,
            outputs=[table_output_armenian,
                     table_output_mmlu,
                     plot_output_armenian,
                     plot_output_mmlu],
        )
    app.launch(share=True, debug=True)
120
+
121
+ if __name__ == "__main__":
122
+ main()
data_handler.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import plotly.express as px
4
+ from model_handler import ModelHandler
5
+
6
def unified_exam_result_table(unified_exam_df):
    """Build the ranked display table for the Armenian unified exams.

    Works on a copy of ``unified_exam_df``. Adds an ``'Average'`` column (row
    mean of all numeric columns), sorts by it in descending order, prepends a
    1-based ``'Rank'`` column, moves ``'Average'`` to the third position,
    inserts a line break into the long "Armenian language and literature"
    header, and rounds to 4 decimals.

    Args:
        unified_exam_df: one row per model with its per-subject scores.

    Returns:
        The new, ranked DataFrame; the input is left untouched.
    """
    table = unified_exam_df.copy()
    table["Average"] = table.select_dtypes(include=["number"]).mean(axis=1)
    table = table.sort_values(by='Average', ascending=False).reset_index(drop=True)
    table.insert(0, 'Rank', range(1, len(table) + 1))

    # Rank, <first original column>, Average, then everything else.
    ordered = [name for name in table.columns if name != 'Average']
    ordered.insert(2, 'Average')
    table = table[ordered]

    table = table.rename(columns={'Armenian language and literature': 'Armenian language\nand literature'})
    return table.round(4)
18
+
19
def mmlu_result_table(mmlu_df):
    """Build the ranked display table for MMLU-Pro-Hy.

    Works on a copy of ``mmlu_df``. Adds an ``'Average'`` column (row mean of
    all numeric columns), sorts by it in descending order, prepends a 1-based
    ``'Rank'`` column, moves ``'Average'`` to the third position and, when an
    ``'Other'`` column exists, moves it to the end. Values are rounded to 4
    decimals.

    Args:
        mmlu_df: one row per model with its per-category accuracies.

    Returns:
        The new, ranked DataFrame; the input is left untouched.
    """
    df = mmlu_df.copy()
    df["Average"] = df.select_dtypes(include=["number"]).mean(axis=1)
    df = df.sort_values(by='Average', ascending=False).reset_index(drop=True)
    df.insert(0, 'Rank', range(1, len(df) + 1))

    cols = df.columns.tolist()
    cols.insert(2, cols.pop(cols.index('Average')))
    # Categories come from submitted result files, so 'Other' is not
    # guaranteed to be present; only reposition it when it is.
    if 'Other' in cols:
        cols.append(cols.pop(cols.index('Other')))

    return df[cols].round(4)
31
+
32
def unified_exam_chart(unified_exam_df, plot_column):
    """Draw a horizontal bar chart of one unified-exam column, coloured by grade.

    Args:
        unified_exam_df: table with a ``'Model'`` column and the score column
            to plot (the dropdown label ``'Armenian language and literature'``
            is mapped to the line-broken header used in the display table).
        plot_column: name of the score column to plot.

    Returns:
        A Plotly figure with one bar per model on a fixed 0-20 score axis,
        sorted by score (descending) and then by model name.
    """
    if plot_column == 'Armenian language and literature':
        plot_column = 'Armenian language\nand literature'
    df = unified_exam_df.copy()
    df = df.sort_values(by=[plot_column, 'Model'], ascending=[False, True]).reset_index(drop=True)
    x_col = plot_column
    title = f'{plot_column}'
    x_range_max = 20

    def get_label(score):
        # A missing score compares False against every threshold, so without
        # this guard it would fall through to "Distinction".
        if pd.isna(score):
            return "No Score"
        if score < 8:
            return "Fail"
        if score <= 18:
            return "Pass"
        return "Distinction"

    df['Test Result'] = df[plot_column].apply(get_label)
    color_discrete_map = {
        "Fail": "#ff5f56",
        "Pass": "#ffbd2e",
        "Distinction": "#27c93f",
        "No Score": "#9e9e9e",
    }
    fig = px.bar(df,
                 x=x_col,
                 y='Model',
                 color='Test Result',
                 color_discrete_map=color_discrete_map,
                 labels={x_col: 'Score', 'Model': 'Model'},
                 title=title,
                 orientation='h'
                 )
    fig.update_layout(
        xaxis=dict(range=[0, x_range_max]),
        title=dict(text=title, font=dict(size=16)),
        xaxis_title=dict(font=dict(size=12)),
        yaxis_title=dict(font=dict(size=12)),
        # Reversed so the first (highest-scoring) row is drawn at the top.
        yaxis=dict(autorange="reversed"),
        width=1400
    )
    return fig
77
+
78
def mmlu_chart(mmlu_df, plot_column):
    """Draw a horizontal bar chart of one MMLU-Pro-Hy column.

    ``'Average'`` is recomputed as the row mean of the known subject columns
    that are actually present in ``mmlu_df``, so a table lacking one of the
    subjects no longer raises ``KeyError``. If none of them is present, an
    existing ``'Average'`` column is left as it is.

    Args:
        mmlu_df: table with a ``'Model'`` column and per-subject accuracies.
        plot_column: name of the column to plot.

    Returns:
        A Plotly figure with one bar per model on a fixed 0-1 accuracy axis,
        sorted by ``plot_column`` in descending order and coloured on the
        Viridis scale.
    """
    df = mmlu_df.copy()
    subject_cols = ['Biology', 'Business', 'Chemistry', 'Computer Science', 'Economics', 'Engineering', 'Health', 'History', 'Law', 'Math', 'Other', 'Philosophy', 'Physics', 'Psychology']
    present_subjects = [col for col in subject_cols if col in df.columns]
    if present_subjects:
        df['Average'] = df[present_subjects].mean(axis=1)
    df = df.sort_values(by=plot_column, ascending=False).reset_index(drop=True)
    x_col = plot_column
    title = f'{plot_column}'
    x_range_max = 1.0
    fig = px.bar(df,
                 x=x_col,
                 y='Model',
                 color=x_col,
                 color_continuous_scale='Viridis',
                 labels={x_col: 'Accuracy', 'Model': 'Model'},
                 title=title,
                 orientation='h',
                 range_color=[0, 1]
                 )
    fig.update_layout(
        xaxis=dict(range=[0, x_range_max]),
        title=dict(text=title, font=dict(size=16)),
        xaxis_title=dict(font=dict(size=12)),
        yaxis_title=dict(font=dict(size=12)),
        # Reversed so the first (highest-scoring) row is drawn at the top.
        yaxis=dict(autorange="reversed"),
        width=1400
    )
    return fig
logo.png ADDED
mmlu_pro_hy_results.csv DELETED
@@ -1,8 +0,0 @@
1
- Model,Accuracy,Biology,Business,Chemistry,Computer Science,Economics,Engineering,Health,History,Law,Math,Other,Philosophy,Physics,Psychology
2
- gpt-4o,0.685,0.8667,0.7424,0.6842,0.6176,0.7887,0.5625,0.7794,0.5517,0.5393,0.7788,0.5974,0.5476,0.6881,0.7164
3
- claude-3-5-haiku-20241022,0.522,0.75,0.5758,0.5579,0.4412,0.6901,0.4125,0.5882,0.5172,0.2472,0.6018,0.3636,0.4048,0.5596,0.5672
4
- claude-3-5-sonnet-20241022,0.701,0.8667,0.803,0.7579,0.7059,0.7887,0.5625,0.6618,0.6552,0.4944,0.7788,0.6494,0.5476,0.7523,0.7164
5
- DeepSeek-V3,0.672,0.8167,0.8182,0.6947,0.7353,0.7887,0.5875,0.6471,0.4828,0.3596,0.8584,0.5455,0.5476,0.6881,0.7164
6
- gemini-1.5-flash,0.579,0.75,0.7121,0.6947,0.5,0.7183,0.4,0.5,0.4483,0.2584,0.8319,0.3506,0.3571,0.6514,0.6567
7
- gemini-2.0-flash,0.737,0.85,0.8182,0.7895,0.7353,0.8169,0.6,0.75,0.5517,0.5281,0.8673,0.6364,0.6429,0.7982,0.7612
8
- Meta-Llama-3.3-70B-Instruct,0.523,0.7333,0.5303,0.5895,0.3824,0.6338,0.4875,0.5735,0.4138,0.3146,0.6018,0.3377,0.4524,0.5321,0.6119
 
 
 
 
 
 
 
 
 
model_handler.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from typing import Any, Dict
4
+
5
+ import pandas as pd
6
+ from huggingface_hub import HfApi, hf_hub_download
7
+
8
class ModelHandler:
    """Collect benchmark results from the Hugging Face Hub, cached in a JSON file.

    The cache is a JSON list of ``{"model_name": ..., "results": ...}``
    entries. ``get_arm_bench_data`` adds entries for Hub repositories that are
    not cached yet and converts the whole cache into two DataFrames.
    """

    def __init__(self, model_infos_path=None):
        """Create the Hub client and load the cached results.

        Args:
            model_infos_path: path of the JSON cache. Defaults to
                ``model_results.json`` next to this module, so the default
                works on any machine rather than pointing at a
                developer-local absolute path.
        """
        if model_infos_path is None:
            model_infos_path = os.path.join(
                os.path.dirname(os.path.abspath(__file__)), "model_results.json"
            )
        self.api = HfApi()
        self.model_infos_path = model_infos_path
        self.model_infos = self._load_model_infos()

    def _load_model_infos(self) -> list:
        """Return the cached entries, or an empty list when no cache file exists."""
        if not os.path.exists(self.model_infos_path):
            # Must be a list: get_arm_bench_data() appends to it.
            return []
        with open(self.model_infos_path, encoding="utf-8") as f:
            return json.load(f)

    def _save_model_infos(self):
        """Write the in-memory entries back to the cache file."""
        print("Saving model infos")
        with open(self.model_infos_path, "w", encoding="utf-8") as f:
            json.dump(self.model_infos, f, indent=4)

    def _fetch_new_results(self):
        """Download ``results.json`` from tagged Hub repos that are not cached yet."""
        # NOTE(review): confirm the tag and file name below match the
        # submission instructions shown to users.
        models = self.api.list_models(filter="arm_llm")
        model_names = {model["model_name"] for model in self.model_infos}
        added = False

        for repo_id in [model.modelId for model in models]:
            # Skip cached models before touching the network again.
            if repo_id in model_names:
                continue
            if "results.json" not in self.api.list_repo_files(repo_id):
                continue
            try:
                result_path = hf_hub_download(repo_id, filename="results.json")
                with open(result_path, encoding="utf-8") as f:
                    results = json.load(f)
            except Exception as e:
                # Best effort: one broken submission must not block the rest.
                print(f"Error loading {repo_id} - {e}")
                continue

            self.model_infos.append({
                "model_name": repo_id,
                "results": results
            })
            model_names.add(repo_id)
            added = True

        # Only rewrite the cache file when something actually changed.
        if added:
            self._save_model_infos()

    def _build_result_frames(self):
        """Turn the cached entries into ``(mmlu_df, unified_exam_df)``.

        Each frame has a ``"Model"`` column plus one column per result
        category; a model without results of a given kind gets no row in
        that frame.
        """
        mmlu_data = []
        unified_exam_data = []

        for model in self.model_infos:
            model_name = model["model_name"]
            results = model.get("results", {})

            for key, rows in (("mmlu_results", mmlu_data),
                              ("unified_exam_results", unified_exam_data)):
                entries = results.get(key, [])
                if not entries:
                    continue
                row = {"Model": model_name}
                for result in entries:
                    row[result["category"]] = result["score"]
                rows.append(row)

        return pd.DataFrame(mmlu_data), pd.DataFrame(unified_exam_data)

    def get_arm_bench_data(self):
        """Refresh the cache from the Hub and return the result tables.

        Returns:
            ``(mmlu_df, unified_exam_df)``: two DataFrames with one row per
            model and one column per category.
        """
        self._fetch_new_results()
        return self._build_result_frames()
model_results.json ADDED
@@ -0,0 +1,581 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "model_name": "claude-3-7-sonnet-20250219",
4
+ "results": {
5
+ "mmlu_results": [],
6
+ "unified_exam_results": [
7
+ {
8
+ "category": "Armenian language and literature",
9
+ "score": 10.5
10
+ },
11
+ {
12
+ "category": "Armenian history",
13
+ "score": 7.75
14
+ },
15
+ {
16
+ "category": "Mathematics",
17
+ "score": 15.0
18
+ }
19
+ ]
20
+ }
21
+ },
22
+ {
23
+ "model_name": "claude-3-5-sonnet-20241022",
24
+ "results": {
25
+ "mmlu_results": [
26
+ {
27
+ "category": "Biology",
28
+ "score": 0.8667
29
+ },
30
+ {
31
+ "category": "Business",
32
+ "score": 0.803
33
+ },
34
+ {
35
+ "category": "Chemistry",
36
+ "score": 0.7579
37
+ },
38
+ {
39
+ "category": "Computer Science",
40
+ "score": 0.7059
41
+ },
42
+ {
43
+ "category": "Economics",
44
+ "score": 0.7887
45
+ },
46
+ {
47
+ "category": "Engineering",
48
+ "score": 0.5625
49
+ },
50
+ {
51
+ "category": "Health",
52
+ "score": 0.6618
53
+ },
54
+ {
55
+ "category": "History",
56
+ "score": 0.6552
57
+ },
58
+ {
59
+ "category": "Law",
60
+ "score": 0.4944
61
+ },
62
+ {
63
+ "category": "Math",
64
+ "score": 0.7788
65
+ },
66
+ {
67
+ "category": "Other",
68
+ "score": 0.6494
69
+ },
70
+ {
71
+ "category": "Philosophy",
72
+ "score": 0.5476
73
+ },
74
+ {
75
+ "category": "Physics",
76
+ "score": 0.7523
77
+ },
78
+ {
79
+ "category": "Psychology",
80
+ "score": 0.7164
81
+ }
82
+ ],
83
+ "unified_exam_results": [
84
+ {
85
+ "category": "Armenian language and literature",
86
+ "score": 10.0
87
+ },
88
+ {
89
+ "category": "Armenian history",
90
+ "score": 9.25
91
+ },
92
+ {
93
+ "category": "Mathematics",
94
+ "score": 12.75
95
+ }
96
+ ]
97
+ }
98
+ },
99
+ {
100
+ "model_name": "gemini-2.0-flash",
101
+ "results": {
102
+ "mmlu_results": [
103
+ {
104
+ "category": "Biology",
105
+ "score": 0.85
106
+ },
107
+ {
108
+ "category": "Business",
109
+ "score": 0.8182
110
+ },
111
+ {
112
+ "category": "Chemistry",
113
+ "score": 0.7895
114
+ },
115
+ {
116
+ "category": "Computer Science",
117
+ "score": 0.7353
118
+ },
119
+ {
120
+ "category": "Economics",
121
+ "score": 0.8169
122
+ },
123
+ {
124
+ "category": "Engineering",
125
+ "score": 0.6
126
+ },
127
+ {
128
+ "category": "Health",
129
+ "score": 0.75
130
+ },
131
+ {
132
+ "category": "History",
133
+ "score": 0.5517
134
+ },
135
+ {
136
+ "category": "Law",
137
+ "score": 0.5281
138
+ },
139
+ {
140
+ "category": "Math",
141
+ "score": 0.8673
142
+ },
143
+ {
144
+ "category": "Other",
145
+ "score": 0.6364
146
+ },
147
+ {
148
+ "category": "Philosophy",
149
+ "score": 0.6429
150
+ },
151
+ {
152
+ "category": "Physics",
153
+ "score": 0.7982
154
+ },
155
+ {
156
+ "category": "Psychology",
157
+ "score": 0.7612
158
+ }
159
+ ],
160
+ "unified_exam_results": [
161
+ {
162
+ "category": "Armenian language and literature",
163
+ "score": 5.5
164
+ },
165
+ {
166
+ "category": "Armenian history",
167
+ "score": 6.75
168
+ },
169
+ {
170
+ "category": "Mathematics",
171
+ "score": 17.25
172
+ }
173
+ ]
174
+ }
175
+ },
176
+ {
177
+ "model_name": "gpt-4o",
178
+ "results": {
179
+ "mmlu_results": [
180
+ {
181
+ "category": "Biology",
182
+ "score": 0.8667
183
+ },
184
+ {
185
+ "category": "Business",
186
+ "score": 0.7424
187
+ },
188
+ {
189
+ "category": "Chemistry",
190
+ "score": 0.6842
191
+ },
192
+ {
193
+ "category": "Computer Science",
194
+ "score": 0.6176
195
+ },
196
+ {
197
+ "category": "Economics",
198
+ "score": 0.7887
199
+ },
200
+ {
201
+ "category": "Engineering",
202
+ "score": 0.5625
203
+ },
204
+ {
205
+ "category": "Health",
206
+ "score": 0.7794
207
+ },
208
+ {
209
+ "category": "History",
210
+ "score": 0.5517
211
+ },
212
+ {
213
+ "category": "Law",
214
+ "score": 0.5393
215
+ },
216
+ {
217
+ "category": "Math",
218
+ "score": 0.7788
219
+ },
220
+ {
221
+ "category": "Other",
222
+ "score": 0.5974
223
+ },
224
+ {
225
+ "category": "Philosophy",
226
+ "score": 0.5476
227
+ },
228
+ {
229
+ "category": "Physics",
230
+ "score": 0.6881
231
+ },
232
+ {
233
+ "category": "Psychology",
234
+ "score": 0.7164
235
+ }
236
+ ],
237
+ "unified_exam_results": [
238
+ {
239
+ "category": "Armenian language and literature",
240
+ "score": 6.75
241
+ },
242
+ {
243
+ "category": "Armenian history",
244
+ "score": 6.75
245
+ },
246
+ {
247
+ "category": "Mathematics",
248
+ "score": 13.25
249
+ }
250
+ ]
251
+ }
252
+ },
253
+ {
254
+ "model_name": "qwen-max-2025-01-25",
255
+ "results": {
256
+ "mmlu_results": [],
257
+ "unified_exam_results": [
258
+ {
259
+ "category": "Armenian language and literature",
260
+ "score": 7.25
261
+ },
262
+ {
263
+ "category": "Armenian history",
264
+ "score": 4.5
265
+ },
266
+ {
267
+ "category": "Mathematics",
268
+ "score": 14.25
269
+ }
270
+ ]
271
+ }
272
+ },
273
+ {
274
+ "model_name": "gemini-1.5-flash",
275
+ "results": {
276
+ "mmlu_results": [
277
+ {
278
+ "category": "Biology",
279
+ "score": 0.75
280
+ },
281
+ {
282
+ "category": "Business",
283
+ "score": 0.7121
284
+ },
285
+ {
286
+ "category": "Chemistry",
287
+ "score": 0.6947
288
+ },
289
+ {
290
+ "category": "Computer Science",
291
+ "score": 0.5
292
+ },
293
+ {
294
+ "category": "Economics",
295
+ "score": 0.7183
296
+ },
297
+ {
298
+ "category": "Engineering",
299
+ "score": 0.4
300
+ },
301
+ {
302
+ "category": "Health",
303
+ "score": 0.5
304
+ },
305
+ {
306
+ "category": "History",
307
+ "score": 0.4483
308
+ },
309
+ {
310
+ "category": "Law",
311
+ "score": 0.2584
312
+ },
313
+ {
314
+ "category": "Math",
315
+ "score": 0.8319
316
+ },
317
+ {
318
+ "category": "Other",
319
+ "score": 0.3506
320
+ },
321
+ {
322
+ "category": "Philosophy",
323
+ "score": 0.3571
324
+ },
325
+ {
326
+ "category": "Physics",
327
+ "score": 0.6514
328
+ },
329
+ {
330
+ "category": "Psychology",
331
+ "score": 0.6567
332
+ }
333
+ ],
334
+ "unified_exam_results": [
335
+ {
336
+ "category": "Armenian language and literature",
337
+ "score": 4.75
338
+ },
339
+ {
340
+ "category": "Armenian history",
341
+ "score": 3.75
342
+ },
343
+ {
344
+ "category": "Mathematics",
345
+ "score": 15.0
346
+ }
347
+ ]
348
+ }
349
+ },
350
+ {
351
+ "model_name": "DeepSeek-V3",
352
+ "results": {
353
+ "mmlu_results": [
354
+ {
355
+ "category": "Biology",
356
+ "score": 0.8167
357
+ },
358
+ {
359
+ "category": "Business",
360
+ "score": 0.8182
361
+ },
362
+ {
363
+ "category": "Chemistry",
364
+ "score": 0.6947
365
+ },
366
+ {
367
+ "category": "Computer Science",
368
+ "score": 0.7353
369
+ },
370
+ {
371
+ "category": "Economics",
372
+ "score": 0.7887
373
+ },
374
+ {
375
+ "category": "Engineering",
376
+ "score": 0.5875
377
+ },
378
+ {
379
+ "category": "Health",
380
+ "score": 0.6471
381
+ },
382
+ {
383
+ "category": "History",
384
+ "score": 0.4828
385
+ },
386
+ {
387
+ "category": "Law",
388
+ "score": 0.3596
389
+ },
390
+ {
391
+ "category": "Math",
392
+ "score": 0.8584
393
+ },
394
+ {
395
+ "category": "Other",
396
+ "score": 0.5455
397
+ },
398
+ {
399
+ "category": "Philosophy",
400
+ "score": 0.5476
401
+ },
402
+ {
403
+ "category": "Physics",
404
+ "score": 0.6881
405
+ },
406
+ {
407
+ "category": "Psychology",
408
+ "score": 0.7164
409
+ }
410
+ ],
411
+ "unified_exam_results": [
412
+ {
413
+ "category": "Armenian language and literature",
414
+ "score": 5.25
415
+ },
416
+ {
417
+ "category": "Armenian history",
418
+ "score": 5.0
419
+ },
420
+ {
421
+ "category": "Mathematics",
422
+ "score": 12.25
423
+ }
424
+ ]
425
+ }
426
+ },
427
+ {
428
+ "model_name": "Meta-Llama-3.3-70B-Instruct",
429
+ "results": {
430
+ "mmlu_results": [
431
+ {
432
+ "category": "Biology",
433
+ "score": 0.7333
434
+ },
435
+ {
436
+ "category": "Business",
437
+ "score": 0.5303
438
+ },
439
+ {
440
+ "category": "Chemistry",
441
+ "score": 0.5895
442
+ },
443
+ {
444
+ "category": "Computer Science",
445
+ "score": 0.3824
446
+ },
447
+ {
448
+ "category": "Economics",
449
+ "score": 0.6338
450
+ },
451
+ {
452
+ "category": "Engineering",
453
+ "score": 0.4875
454
+ },
455
+ {
456
+ "category": "Health",
457
+ "score": 0.5735
458
+ },
459
+ {
460
+ "category": "History",
461
+ "score": 0.4138
462
+ },
463
+ {
464
+ "category": "Law",
465
+ "score": 0.3146
466
+ },
467
+ {
468
+ "category": "Math",
469
+ "score": 0.6018
470
+ },
471
+ {
472
+ "category": "Other",
473
+ "score": 0.3377
474
+ },
475
+ {
476
+ "category": "Philosophy",
477
+ "score": 0.4524
478
+ },
479
+ {
480
+ "category": "Physics",
481
+ "score": 0.5321
482
+ },
483
+ {
484
+ "category": "Psychology",
485
+ "score": 0.6119
486
+ }
487
+ ],
488
+ "unified_exam_results": [
489
+ {
490
+ "category": "Armenian language and literature",
491
+ "score": 4.5
492
+ },
493
+ {
494
+ "category": "Armenian history",
495
+ "score": 5.25
496
+ },
497
+ {
498
+ "category": "Mathematics",
499
+ "score": 11.5
500
+ }
501
+ ]
502
+ }
503
+ },
504
+ {
505
+ "model_name": "claude-3-5-haiku-20241022",
506
+ "results": {
507
+ "mmlu_results": [
508
+ {
509
+ "category": "Biology",
510
+ "score": 0.75
511
+ },
512
+ {
513
+ "category": "Business",
514
+ "score": 0.5758
515
+ },
516
+ {
517
+ "category": "Chemistry",
518
+ "score": 0.5579
519
+ },
520
+ {
521
+ "category": "Computer Science",
522
+ "score": 0.4412
523
+ },
524
+ {
525
+ "category": "Economics",
526
+ "score": 0.6901
527
+ },
528
+ {
529
+ "category": "Engineering",
530
+ "score": 0.4125
531
+ },
532
+ {
533
+ "category": "Health",
534
+ "score": 0.5882
535
+ },
536
+ {
537
+ "category": "History",
538
+ "score": 0.5172
539
+ },
540
+ {
541
+ "category": "Law",
542
+ "score": 0.2472
543
+ },
544
+ {
545
+ "category": "Math",
546
+ "score": 0.6018
547
+ },
548
+ {
549
+ "category": "Other",
550
+ "score": 0.3636
551
+ },
552
+ {
553
+ "category": "Philosophy",
554
+ "score": 0.4048
555
+ },
556
+ {
557
+ "category": "Physics",
558
+ "score": 0.5596
559
+ },
560
+ {
561
+ "category": "Psychology",
562
+ "score": 0.5672
563
+ }
564
+ ],
565
+ "unified_exam_results": [
566
+ {
567
+ "category": "Armenian language and literature",
568
+ "score": 5.0
569
+ },
570
+ {
571
+ "category": "Armenian history",
572
+ "score": 3.75
573
+ },
574
+ {
575
+ "category": "Mathematics",
576
+ "score": 10.75
577
+ }
578
+ ]
579
+ }
580
+ }
581
+ ]
unified_exam_results.csv DELETED
@@ -1,10 +0,0 @@
1
- Model,Armenian language and literature,Armenian history,Mathematics,Average
2
- claude-3-7-sonnet-20250219,10.5,7.75,15.0,11.08
3
- claude-3-5-sonnet-20241022,10.0,9.25,12.75,10.67
4
- gemini-2.0-flash,5.5,6.75,17.25,9.83
5
- gpt-4o,6.75,6.75,13.25,8.92
6
- qwen-max-2025-01-25,7.25,4.5,14.25,8.67
7
- gemini-1.5-flash,4.75,3.75,15.0,7.83
8
- DeepSeek-V3,5.25,5.0,12.25,7.5
9
- Meta-Llama-3.3-70B-Instruct,4.5,5.25,11.5,7.08
10
- claude-3-5-haiku-20241022,5.0,3.75,10.75,6.5