Spaces:
Running
Running
[ | |
{ | |
"model_name": "claude-3-7-sonnet-20250219", | |
"results": { | |
"mmlu_results": [], | |
"unified_exam_results": [ | |
{ | |
"category": "Average", | |
"score": 11.0833 | |
}, | |
{ | |
"category": "Armenian language and literature", | |
"score": 10.5 | |
}, | |
{ | |
"category": "Armenian history", | |
"score": 7.75 | |
}, | |
{ | |
"category": "Mathematics", | |
"score": 15.0 | |
} | |
] | |
} | |
}, | |
{ | |
"model_name": "claude-3-5-sonnet-20241022", | |
"results": { | |
"mmlu_results": [ | |
{ | |
"category": "Average", | |
"score": 0.6958 | |
}, | |
{ | |
"category": "Biology", | |
"score": 0.8667 | |
}, | |
{ | |
"category": "Business", | |
"score": 0.803 | |
}, | |
{ | |
"category": "Chemistry", | |
"score": 0.7579 | |
}, | |
{ | |
"category": "Computer Science", | |
"score": 0.7059 | |
}, | |
{ | |
"category": "Economics", | |
"score": 0.7887 | |
}, | |
{ | |
"category": "Engineering", | |
"score": 0.5625 | |
}, | |
{ | |
"category": "Health", | |
"score": 0.6618 | |
}, | |
{ | |
"category": "History", | |
"score": 0.6552 | |
}, | |
{ | |
"category": "Law", | |
"score": 0.4944 | |
}, | |
{ | |
"category": "Math", | |
"score": 0.7788 | |
}, | |
{ | |
"category": "Other", | |
"score": 0.6494 | |
}, | |
{ | |
"category": "Philosophy", | |
"score": 0.5476 | |
}, | |
{ | |
"category": "Physics", | |
"score": 0.7523 | |
}, | |
{ | |
"category": "Psychology", | |
"score": 0.7164 | |
} | |
], | |
"unified_exam_results": [ | |
{ | |
"category": "Average", | |
"score": 10.6667 | |
}, | |
{ | |
"category": "Armenian language and literature", | |
"score": 10.0 | |
}, | |
{ | |
"category": "Armenian history", | |
"score": 9.25 | |
}, | |
{ | |
"category": "Mathematics", | |
"score": 12.75 | |
} | |
] | |
} | |
}, | |
{ | |
"model_name": "gemini-2.0-flash", | |
"results": { | |
"mmlu_results": [ | |
{ | |
"category": "Average", | |
"score": 0.7247 | |
}, | |
{ | |
"category": "Biology", | |
"score": 0.85 | |
}, | |
{ | |
"category": "Business", | |
"score": 0.8182 | |
}, | |
{ | |
"category": "Chemistry", | |
"score": 0.7895 | |
}, | |
{ | |
"category": "Computer Science", | |
"score": 0.7353 | |
}, | |
{ | |
"category": "Economics", | |
"score": 0.8169 | |
}, | |
{ | |
"category": "Engineering", | |
"score": 0.6 | |
}, | |
{ | |
"category": "Health", | |
"score": 0.75 | |
}, | |
{ | |
"category": "History", | |
"score": 0.5517 | |
}, | |
{ | |
"category": "Law", | |
"score": 0.5281 | |
}, | |
{ | |
"category": "Math", | |
"score": 0.8673 | |
}, | |
{ | |
"category": "Other", | |
"score": 0.6364 | |
}, | |
{ | |
"category": "Philosophy", | |
"score": 0.6429 | |
}, | |
{ | |
"category": "Physics", | |
"score": 0.7982 | |
}, | |
{ | |
"category": "Psychology", | |
"score": 0.7612 | |
} | |
], | |
"unified_exam_results": [ | |
{ | |
"category": "Average", | |
"score": 9.8333 | |
}, | |
{ | |
"category": "Armenian language and literature", | |
"score": 5.5 | |
}, | |
{ | |
"category": "Armenian history", | |
"score": 6.75 | |
}, | |
{ | |
"category": "Mathematics", | |
"score": 17.25 | |
} | |
] | |
} | |
}, | |
{ | |
"model_name": "gpt-4o", | |
"results": { | |
"mmlu_results": [ | |
{ | |
"category": "Average", | |
"score": 0.6758 | |
}, | |
{ | |
"category": "Biology", | |
"score": 0.8667 | |
}, | |
{ | |
"category": "Business", | |
"score": 0.7424 | |
}, | |
{ | |
"category": "Chemistry", | |
"score": 0.6842 | |
}, | |
{ | |
"category": "Computer Science", | |
"score": 0.6176 | |
}, | |
{ | |
"category": "Economics", | |
"score": 0.7887 | |
}, | |
{ | |
"category": "Engineering", | |
"score": 0.5625 | |
}, | |
{ | |
"category": "Health", | |
"score": 0.7794 | |
}, | |
{ | |
"category": "History", | |
"score": 0.5517 | |
}, | |
{ | |
"category": "Law", | |
"score": 0.5393 | |
}, | |
{ | |
"category": "Math", | |
"score": 0.7788 | |
}, | |
{ | |
"category": "Other", | |
"score": 0.5974 | |
}, | |
{ | |
"category": "Philosophy", | |
"score": 0.5476 | |
}, | |
{ | |
"category": "Physics", | |
"score": 0.6881 | |
}, | |
{ | |
"category": "Psychology", | |
"score": 0.7164 | |
} | |
], | |
"unified_exam_results": [ | |
{ | |
"category": "Average", | |
"score": 8.9167 | |
}, | |
{ | |
"category": "Armenian language and literature", | |
"score": 6.75 | |
}, | |
{ | |
"category": "Armenian history", | |
"score": 6.75 | |
}, | |
{ | |
"category": "Mathematics", | |
"score": 13.25 | |
} | |
] | |
} | |
}, | |
{ | |
"model_name": "qwen-max-2025-01-25", | |
"results": { | |
"mmlu_results": [], | |
"unified_exam_results": [ | |
{ | |
"category": "Average", | |
"score": 8.6667 | |
}, | |
{ | |
"category": "Armenian language and literature", | |
"score": 7.25 | |
}, | |
{ | |
"category": "Armenian history", | |
"score": 4.5 | |
}, | |
{ | |
"category": "Mathematics", | |
"score": 14.25 | |
} | |
] | |
} | |
}, | |
{ | |
"model_name": "gemini-1.5-flash", | |
"results": { | |
"mmlu_results": [ | |
{ | |
"category": "Average", | |
"score": 0.5592 | |
}, | |
{ | |
"category": "Biology", | |
"score": 0.75 | |
}, | |
{ | |
"category": "Business", | |
"score": 0.7121 | |
}, | |
{ | |
"category": "Chemistry", | |
"score": 0.6947 | |
}, | |
{ | |
"category": "Computer Science", | |
"score": 0.5 | |
}, | |
{ | |
"category": "Economics", | |
"score": 0.7183 | |
}, | |
{ | |
"category": "Engineering", | |
"score": 0.4 | |
}, | |
{ | |
"category": "Health", | |
"score": 0.5 | |
}, | |
{ | |
"category": "History", | |
"score": 0.4483 | |
}, | |
{ | |
"category": "Law", | |
"score": 0.2584 | |
}, | |
{ | |
"category": "Math", | |
"score": 0.8319 | |
}, | |
{ | |
"category": "Other", | |
"score": 0.3506 | |
}, | |
{ | |
"category": "Philosophy", | |
"score": 0.3571 | |
}, | |
{ | |
"category": "Physics", | |
"score": 0.6514 | |
}, | |
{ | |
"category": "Psychology", | |
"score": 0.6567 | |
} | |
], | |
"unified_exam_results": [ | |
{ | |
"category": "Average", | |
"score": 7.8333 | |
}, | |
{ | |
"category": "Armenian language and literature", | |
"score": 4.75 | |
}, | |
{ | |
"category": "Armenian history", | |
"score": 3.75 | |
}, | |
{ | |
"category": "Mathematics", | |
"score": 15.0 | |
} | |
] | |
} | |
}, | |
{ | |
"model_name": "DeepSeek-V3", | |
"results": { | |
"mmlu_results": [ | |
{ | |
"category": "Average", | |
"score": 0.6633 | |
}, | |
{ | |
"category": "Biology", | |
"score": 0.8167 | |
}, | |
{ | |
"category": "Business", | |
"score": 0.8182 | |
}, | |
{ | |
"category": "Chemistry", | |
"score": 0.6947 | |
}, | |
{ | |
"category": "Computer Science", | |
"score": 0.7353 | |
}, | |
{ | |
"category": "Economics", | |
"score": 0.7887 | |
}, | |
{ | |
"category": "Engineering", | |
"score": 0.5875 | |
}, | |
{ | |
"category": "Health", | |
"score": 0.6471 | |
}, | |
{ | |
"category": "History", | |
"score": 0.4828 | |
}, | |
{ | |
"category": "Law", | |
"score": 0.3596 | |
}, | |
{ | |
"category": "Math", | |
"score": 0.8584 | |
}, | |
{ | |
"category": "Other", | |
"score": 0.5455 | |
}, | |
{ | |
"category": "Philosophy", | |
"score": 0.5476 | |
}, | |
{ | |
"category": "Physics", | |
"score": 0.6881 | |
}, | |
{ | |
"category": "Psychology", | |
"score": 0.7164 | |
} | |
], | |
"unified_exam_results": [ | |
{ | |
"category": "Average", | |
"score": 7.5 | |
}, | |
{ | |
"category": "Armenian language and literature", | |
"score": 5.25 | |
}, | |
{ | |
"category": "Armenian history", | |
"score": 5.0 | |
}, | |
{ | |
"category": "Mathematics", | |
"score": 12.25 | |
} | |
] | |
} | |
}, | |
{ | |
"model_name": "Meta-Llama-3.3-70B-Instruct", | |
"results": { | |
"mmlu_results": [ | |
{ | |
"category": "Average", | |
"score": 0.5139 | |
}, | |
{ | |
"category": "Biology", | |
"score": 0.7333 | |
}, | |
{ | |
"category": "Business", | |
"score": 0.5303 | |
}, | |
{ | |
"category": "Chemistry", | |
"score": 0.5895 | |
}, | |
{ | |
"category": "Computer Science", | |
"score": 0.3824 | |
}, | |
{ | |
"category": "Economics", | |
"score": 0.6338 | |
}, | |
{ | |
"category": "Engineering", | |
"score": 0.4875 | |
}, | |
{ | |
"category": "Health", | |
"score": 0.5735 | |
}, | |
{ | |
"category": "History", | |
"score": 0.4138 | |
}, | |
{ | |
"category": "Law", | |
"score": 0.3146 | |
}, | |
{ | |
"category": "Math", | |
"score": 0.6018 | |
}, | |
{ | |
"category": "Other", | |
"score": 0.3377 | |
}, | |
{ | |
"category": "Philosophy", | |
"score": 0.4524 | |
}, | |
{ | |
"category": "Physics", | |
"score": 0.5321 | |
}, | |
{ | |
"category": "Psychology", | |
"score": 0.6119 | |
} | |
], | |
"unified_exam_results": [ | |
{ | |
"category": "Average", | |
"score": 7.0833 | |
}, | |
{ | |
"category": "Armenian language and literature", | |
"score": 4.5 | |
}, | |
{ | |
"category": "Armenian history", | |
"score": 5.25 | |
}, | |
{ | |
"category": "Mathematics", | |
"score": 11.5 | |
} | |
] | |
} | |
}, | |
{ | |
"model_name": "claude-3-5-haiku-20241022", | |
"results": { | |
"mmlu_results": [ | |
{ | |
"category": "Average", | |
"score": 0.5198 | |
}, | |
{ | |
"category": "Biology", | |
"score": 0.75 | |
}, | |
{ | |
"category": "Business", | |
"score": 0.5758 | |
}, | |
{ | |
"category": "Chemistry", | |
"score": 0.5579 | |
}, | |
{ | |
"category": "Computer Science", | |
"score": 0.4412 | |
}, | |
{ | |
"category": "Economics", | |
"score": 0.6901 | |
}, | |
{ | |
"category": "Engineering", | |
"score": 0.4125 | |
}, | |
{ | |
"category": "Health", | |
"score": 0.5882 | |
}, | |
{ | |
"category": "History", | |
"score": 0.5172 | |
}, | |
{ | |
"category": "Law", | |
"score": 0.2472 | |
}, | |
{ | |
"category": "Math", | |
"score": 0.6018 | |
}, | |
{ | |
"category": "Other", | |
"score": 0.3636 | |
}, | |
{ | |
"category": "Philosophy", | |
"score": 0.4048 | |
}, | |
{ | |
"category": "Physics", | |
"score": 0.5596 | |
}, | |
{ | |
"category": "Psychology", | |
"score": 0.5672 | |
} | |
], | |
"unified_exam_results": [ | |
{ | |
"category": "Average", | |
"score": 6.5 | |
}, | |
{ | |
"category": "Armenian language and literature", | |
"score": 5.0 | |
}, | |
{ | |
"category": "Armenian history", | |
"score": 3.75 | |
}, | |
{ | |
"category": "Mathematics", | |
"score": 10.75 | |
} | |
] | |
} | |
} | |
] |