zhwang4ai's picture
init leaderboard
f49345e
{
"llama-3.2-3b-instruct": {
"success_rate": 0.18239999999999998,
"relative_action_count": 0.8956800000000001
},
"qwen2.5-3b-instruct": {
"success_rate": 0.184,
"relative_action_count": 0.8255466666666665
},
"gpt-3.5-turbo": {
"success_rate": 0.272,
"relative_action_count": 0.8743314285714285
},
"qwen2.5-7b-instruct": {
"success_rate": 0.36639999999999995,
"relative_action_count": 0.8682133333333335
},
"gemma-2-9b-it": {
"success_rate": 0.392,
"relative_action_count": 0.8522190476190475
},
"llama-3.1-8b-instruct": {
"success_rate": 0.4424,
"relative_action_count": 0.8441104761904763
},
"gemma-2-27b-it": {
"success_rate": 0.548,
"relative_action_count": 0.6583142857142856
},
"yi-lightning": {
"success_rate": 0.6728,
"relative_action_count": 0.5962819047619048
},
"llama-3.1-70b-instruct": {
"success_rate": 0.696,
"relative_action_count": 0.5514495238095238
},
"llama-3.3-70b-instruct": {
"success_rate": 0.712,
"relative_action_count": 0.5916438095238095
},
"gpt-4o-mini": {
"success_rate": 0.7239999999999999,
"relative_action_count": 0.5270952380952381
},
"gemini-1.5-pro": {
"success_rate": 0.7256,
"relative_action_count": 0.5686514285714285
},
"claude-3.5-haiku": {
"success_rate": 0.7343999999999999,
"relative_action_count": 0.757095238095238
},
"qwen2.5-14b-instruct": {
"success_rate": 0.756,
"relative_action_count": 0.5723257142857143
},
"qwen2.5-72b-instruct": {
"success_rate": 0.7584,
"relative_action_count": 0.5753561904761904
},
"gpt-4o": {
"success_rate": 0.7856000000000002,
"relative_action_count": 0.506207619047619
},
"qwen2.5-32b-instruct": {
"success_rate": 0.7879999999999999,
"relative_action_count": 0.5955619047619047
},
"mistral-large-instruct-2411": {
"success_rate": 0.7879999999999999,
"relative_action_count": 0.5365238095238094
},
"claude-3.5-sonnet": {
"success_rate": 0.8263999999999999,
"relative_action_count": 0.46185714285714285
},
"deepseek-r1": {
"success_rate": 0.8712,
"relative_action_count": 0.51432
},
"o1-mini": {
"success_rate": 0.8784000000000001,
"relative_action_count": 0.46449523809523807
},
"deepseek-v3": {
"success_rate": 0.8928,
"relative_action_count": 0.5308400000000001
},
"qwq-32b": {
"success_rate": 0.9032,
"relative_action_count": 0.5338533333333333
}
}