ArmBench-LLM / model_results.json
Bagratuni's picture
commit
a4d362f
raw
history blame
18.4 kB
[
{
"model_name": "claude-3-7-sonnet-20250219",
"results": {
"mmlu_results": [],
"unified_exam_results": [
{
"category": "Average",
"score": 11.0833
},
{
"category": "Armenian language and literature",
"score": 10.5
},
{
"category": "Armenian history",
"score": 7.75
},
{
"category": "Mathematics",
"score": 15.0
}
]
}
},
{
"model_name": "claude-3-5-sonnet-20241022",
"results": {
"mmlu_results": [
{
"category": "Average",
"score": 0.6958
},
{
"category": "Biology",
"score": 0.8667
},
{
"category": "Business",
"score": 0.803
},
{
"category": "Chemistry",
"score": 0.7579
},
{
"category": "Computer Science",
"score": 0.7059
},
{
"category": "Economics",
"score": 0.7887
},
{
"category": "Engineering",
"score": 0.5625
},
{
"category": "Health",
"score": 0.6618
},
{
"category": "History",
"score": 0.6552
},
{
"category": "Law",
"score": 0.4944
},
{
"category": "Math",
"score": 0.7788
},
{
"category": "Other",
"score": 0.6494
},
{
"category": "Philosophy",
"score": 0.5476
},
{
"category": "Physics",
"score": 0.7523
},
{
"category": "Psychology",
"score": 0.7164
}
],
"unified_exam_results": [
{
"category": "Average",
"score": 10.6667
},
{
"category": "Armenian language and literature",
"score": 10.0
},
{
"category": "Armenian history",
"score": 9.25
},
{
"category": "Mathematics",
"score": 12.75
}
]
}
},
{
"model_name": "gemini-2.0-flash",
"results": {
"mmlu_results": [
{
"category": "Average",
"score": 0.7247
},
{
"category": "Biology",
"score": 0.85
},
{
"category": "Business",
"score": 0.8182
},
{
"category": "Chemistry",
"score": 0.7895
},
{
"category": "Computer Science",
"score": 0.7353
},
{
"category": "Economics",
"score": 0.8169
},
{
"category": "Engineering",
"score": 0.6
},
{
"category": "Health",
"score": 0.75
},
{
"category": "History",
"score": 0.5517
},
{
"category": "Law",
"score": 0.5281
},
{
"category": "Math",
"score": 0.8673
},
{
"category": "Other",
"score": 0.6364
},
{
"category": "Philosophy",
"score": 0.6429
},
{
"category": "Physics",
"score": 0.7982
},
{
"category": "Psychology",
"score": 0.7612
}
],
"unified_exam_results": [
{
"category": "Average",
"score": 9.8333
},
{
"category": "Armenian language and literature",
"score": 5.5
},
{
"category": "Armenian history",
"score": 6.75
},
{
"category": "Mathematics",
"score": 17.25
}
]
}
},
{
"model_name": "gpt-4o",
"results": {
"mmlu_results": [
{
"category": "Average",
"score": 0.6758
},
{
"category": "Biology",
"score": 0.8667
},
{
"category": "Business",
"score": 0.7424
},
{
"category": "Chemistry",
"score": 0.6842
},
{
"category": "Computer Science",
"score": 0.6176
},
{
"category": "Economics",
"score": 0.7887
},
{
"category": "Engineering",
"score": 0.5625
},
{
"category": "Health",
"score": 0.7794
},
{
"category": "History",
"score": 0.5517
},
{
"category": "Law",
"score": 0.5393
},
{
"category": "Math",
"score": 0.7788
},
{
"category": "Other",
"score": 0.5974
},
{
"category": "Philosophy",
"score": 0.5476
},
{
"category": "Physics",
"score": 0.6881
},
{
"category": "Psychology",
"score": 0.7164
}
],
"unified_exam_results": [
{
"category": "Average",
"score": 8.9167
},
{
"category": "Armenian language and literature",
"score": 6.75
},
{
"category": "Armenian history",
"score": 6.75
},
{
"category": "Mathematics",
"score": 13.25
}
]
}
},
{
"model_name": "qwen-max-2025-01-25",
"results": {
"mmlu_results": [],
"unified_exam_results": [
{
"category": "Average",
"score": 8.6667
},
{
"category": "Armenian language and literature",
"score": 7.25
},
{
"category": "Armenian history",
"score": 4.5
},
{
"category": "Mathematics",
"score": 14.25
}
]
}
},
{
"model_name": "gemini-1.5-flash",
"results": {
"mmlu_results": [
{
"category": "Average",
"score": 0.5592
},
{
"category": "Biology",
"score": 0.75
},
{
"category": "Business",
"score": 0.7121
},
{
"category": "Chemistry",
"score": 0.6947
},
{
"category": "Computer Science",
"score": 0.5
},
{
"category": "Economics",
"score": 0.7183
},
{
"category": "Engineering",
"score": 0.4
},
{
"category": "Health",
"score": 0.5
},
{
"category": "History",
"score": 0.4483
},
{
"category": "Law",
"score": 0.2584
},
{
"category": "Math",
"score": 0.8319
},
{
"category": "Other",
"score": 0.3506
},
{
"category": "Philosophy",
"score": 0.3571
},
{
"category": "Physics",
"score": 0.6514
},
{
"category": "Psychology",
"score": 0.6567
}
],
"unified_exam_results": [
{
"category": "Average",
"score": 7.8333
},
{
"category": "Armenian language and literature",
"score": 4.75
},
{
"category": "Armenian history",
"score": 3.75
},
{
"category": "Mathematics",
"score": 15.0
}
]
}
},
{
"model_name": "DeepSeek-V3",
"results": {
"mmlu_results": [
{
"category": "Average",
"score": 0.6633
},
{
"category": "Biology",
"score": 0.8167
},
{
"category": "Business",
"score": 0.8182
},
{
"category": "Chemistry",
"score": 0.6947
},
{
"category": "Computer Science",
"score": 0.7353
},
{
"category": "Economics",
"score": 0.7887
},
{
"category": "Engineering",
"score": 0.5875
},
{
"category": "Health",
"score": 0.6471
},
{
"category": "History",
"score": 0.4828
},
{
"category": "Law",
"score": 0.3596
},
{
"category": "Math",
"score": 0.8584
},
{
"category": "Other",
"score": 0.5455
},
{
"category": "Philosophy",
"score": 0.5476
},
{
"category": "Physics",
"score": 0.6881
},
{
"category": "Psychology",
"score": 0.7164
}
],
"unified_exam_results": [
{
"category": "Average",
"score": 7.5
},
{
"category": "Armenian language and literature",
"score": 5.25
},
{
"category": "Armenian history",
"score": 5.0
},
{
"category": "Mathematics",
"score": 12.25
}
]
}
},
{
"model_name": "Meta-Llama-3.3-70B-Instruct",
"results": {
"mmlu_results": [
{
"category": "Average",
"score": 0.5139
},
{
"category": "Biology",
"score": 0.7333
},
{
"category": "Business",
"score": 0.5303
},
{
"category": "Chemistry",
"score": 0.5895
},
{
"category": "Computer Science",
"score": 0.3824
},
{
"category": "Economics",
"score": 0.6338
},
{
"category": "Engineering",
"score": 0.4875
},
{
"category": "Health",
"score": 0.5735
},
{
"category": "History",
"score": 0.4138
},
{
"category": "Law",
"score": 0.3146
},
{
"category": "Math",
"score": 0.6018
},
{
"category": "Other",
"score": 0.3377
},
{
"category": "Philosophy",
"score": 0.4524
},
{
"category": "Physics",
"score": 0.5321
},
{
"category": "Psychology",
"score": 0.6119
}
],
"unified_exam_results": [
{
"category": "Average",
"score": 7.0833
},
{
"category": "Armenian language and literature",
"score": 4.5
},
{
"category": "Armenian history",
"score": 5.25
},
{
"category": "Mathematics",
"score": 11.5
}
]
}
},
{
"model_name": "claude-3-5-haiku-20241022",
"results": {
"mmlu_results": [
{
"category": "Average",
"score": 0.5198
},
{
"category": "Biology",
"score": 0.75
},
{
"category": "Business",
"score": 0.5758
},
{
"category": "Chemistry",
"score": 0.5579
},
{
"category": "Computer Science",
"score": 0.4412
},
{
"category": "Economics",
"score": 0.6901
},
{
"category": "Engineering",
"score": 0.4125
},
{
"category": "Health",
"score": 0.5882
},
{
"category": "History",
"score": 0.5172
},
{
"category": "Law",
"score": 0.2472
},
{
"category": "Math",
"score": 0.6018
},
{
"category": "Other",
"score": 0.3636
},
{
"category": "Philosophy",
"score": 0.4048
},
{
"category": "Physics",
"score": 0.5596
},
{
"category": "Psychology",
"score": 0.5672
}
],
"unified_exam_results": [
{
"category": "Average",
"score": 6.5
},
{
"category": "Armenian language and literature",
"score": 5.0
},
{
"category": "Armenian history",
"score": 3.75
},
{
"category": "Mathematics",
"score": 10.75
}
]
}
}
]