zhwang4ai's picture
init leaderboard
f49345e
{
"qwen2.5-3b-instruct": {
"success_rate": 0.0624,
"relative_action_count": 2.4255102697302697
},
"llama-3.2-3b-instruct": {
"success_rate": 0.064,
"relative_action_count": 2.4438042524142523
},
"llama-3.1-8b-instruct": {
"success_rate": 0.11599999999999999,
"relative_action_count": 2.3321907026307027
},
"gpt-3.5-turbo": {
"success_rate": 0.12079999999999999,
"relative_action_count": 2.34957508047508
},
"gemma-2-9b-it": {
"success_rate": 0.132,
"relative_action_count": 2.3394684981684977
},
"qwen2.5-7b-instruct": {
"success_rate": 0.1664,
"relative_action_count": 2.3259762459762454
},
"gemma-2-27b-it": {
"success_rate": 0.1696,
"relative_action_count": 2.28467764013764
},
"llama-3.1-70b-instruct": {
"success_rate": 0.256,
"relative_action_count": 1.9653564912864916
},
"yi-lightning": {
"success_rate": 0.30720000000000003,
"relative_action_count": 2.031278719058719
},
"gpt-4o-mini": {
"success_rate": 0.31040000000000006,
"relative_action_count": 1.9804984304584305
},
"llama-3.3-70b-instruct": {
"success_rate": 0.33840000000000003,
"relative_action_count": 1.90917626040626
},
"claude-3.5-haiku": {
"success_rate": 0.3592000000000001,
"relative_action_count": 2.0113219180819177
},
"gemini-1.5-pro": {
"success_rate": 0.36879999999999996,
"relative_action_count": 1.9371788544788544
},
"qwen2.5-14b-instruct": {
"success_rate": 0.3816,
"relative_action_count": 1.9383408547008547
},
"qwen2.5-72b-instruct": {
"success_rate": 0.4008,
"relative_action_count": 1.8648658674658674
},
"mistral-large-instruct-2411": {
"success_rate": 0.4144,
"relative_action_count": 1.795764299034299
},
"qwen2.5-32b-instruct": {
"success_rate": 0.43920000000000003,
"relative_action_count": 1.8831460717060717
},
"claude-3.5-sonnet": {
"success_rate": 0.44000000000000006,
"relative_action_count": 1.6636790032190032
},
"gpt-4o": {
"success_rate": 0.44960000000000006,
"relative_action_count": 1.7164597657897656
},
"deepseek-v3": {
"success_rate": 0.5496000000000001,
"relative_action_count": 1.705338828948829
},
"deepseek-r1": {
"success_rate": 0.6112,
"relative_action_count": 1.4205231568431569
},
"qwq-32b": {
"success_rate": 0.6112,
"relative_action_count": 1.5151790675990677
},
"o1-mini": {
"success_rate": 0.6296,
"relative_action_count": 1.4230264535464534
}
}