rts-commander / docs /reports /comprehensive_mcp_evaluation.json
Luigi's picture
Organize project structure: move test scripts to tests/scripts and documentation to docs/reports
d28c36c
{
"evaluation_type": "comprehensive_mcp_test",
"total_models_tested": 9,
"successful_models": 5,
"results": [
{
"name": "Qwen2.5-0.5B",
"file_size_mb": 408.8689880371094,
"avg_score": 2.6,
"avg_time": 2.6360722541809083,
"efficiency": 0.9863159084036122,
"results": [
{
"test": "Commande simple",
"difficulty": "easy",
"score": 0,
"time": 0.31192469596862793,
"response": ""
},
{
"test": "Action avec coordonnées",
"difficulty": "easy",
"score": 0,
"time": 0.18253064155578613,
"response": ""
},
{
"test": "Attaque spécifique",
"difficulty": "medium",
"score": 7,
"time": 4.232211351394653,
"response": "Where tool_name is the name of the tool used and args is a dictionary containing the arguments for t..."
},
{
"test": "Construction",
"difficulty": "medium",
"score": 2,
"time": 4.225749492645264,
"response": "Where tool_name is the name of the tool and args is a dictionary with the arguments. If no arguments..."
},
{
"test": "Commande complexe",
"difficulty": "hard",
"score": 4,
"time": 4.22794508934021,
"response": "where tool_name is the name of the tool and args is a dictionary containing the arguments. If no too..."
}
],
"type": "general"
},
{
"name": "Qwen3-0.6B",
"file_size_mb": 609.8238830566406,
"avg_score": 2.8,
"avg_time": 8.223706769943238,
"efficiency": 0.3404790659892809,
"results": [
{
"test": "Commande simple",
"difficulty": "easy",
"score": 7,
"time": 8.638539791107178,
"response": ".\n\nIf the user command is not supported by the available tools, respond with an empty array.\n\nNow, t..."
},
{
"test": "Action avec coordonnées",
"difficulty": "easy",
"score": 5,
"time": 8.075484991073608,
"response": ".\n\nMake sure to use the correct tool name and format the JSON correctly.\n\nIf the command is not poss..."
},
{
"test": "Attaque spécifique",
"difficulty": "medium",
"score": 0,
"time": 7.951770067214966,
"response": ".\n\nMake sure to use the correct tool names and format the JSON correctly.\n\nNow, the game state is as..."
},
{
"test": "Construction",
"difficulty": "medium",
"score": 2,
"time": 8.252855062484741,
"response": ".\n\nMake sure to use the correct tool names and format the JSON properly.\n\nNow, the user is in a game..."
},
{
"test": "Commande complexe",
"difficulty": "hard",
"score": 0,
"time": 8.199883937835693,
"response": ".\n\nMake sure to use the correct tool name and format the JSON.\n\nIf the command is not possible, retu..."
}
],
"type": "general"
},
{
"name": "Gemma-3-270M",
"file_size_mb": 428.0401306152344,
"avg_score": 0.0,
"avg_time": 0.16690435409545898,
"efficiency": 0.0,
"results": [
{
"test": "Commande simple",
"difficulty": "easy",
"score": 0,
"time": 0.2941462993621826,
"response": ""
},
{
"test": "Action avec coordonnées",
"difficulty": "easy",
"score": 0,
"time": 0.13967180252075195,
"response": ""
},
{
"test": "Attaque spécifique",
"difficulty": "medium",
"score": 0,
"time": 0.1264328956604004,
"response": ""
},
{
"test": "Construction",
"difficulty": "medium",
"score": 0,
"time": 0.14153170585632324,
"response": ""
},
{
"test": "Commande complexe",
"difficulty": "hard",
"score": 0,
"time": 0.13273906707763672,
"response": ""
}
],
"type": "general"
},
{
"name": "Qwen3-1.7B",
"file_size_mb": 1007.8267211914062,
"avg_score": 3.0,
"avg_time": 13.003729963302613,
"efficiency": 0.23070303739513193,
"results": [
{
"test": "Commande simple",
"difficulty": "easy",
"score": 2,
"time": 12.862720251083374,
"response": ".\n\nYou must use the JSON format specified, without any additional text or explanation. The JSON must..."
},
{
"test": "Action avec coordonnées",
"difficulty": "easy",
"score": 2,
"time": 12.972241401672363,
"response": ".\n\nYou must use the correct tool name and format the JSON properly.\n\nThe game state is as follows:\n-..."
},
{
"test": "Attaque spécifique",
"difficulty": "medium",
"score": 3,
"time": 13.497555255889893,
"response": ".\n\nYou must use the correct tool name and format the JSON properly.\n\nThe tool to use is attack_unit...."
},
{
"test": "Construction",
"difficulty": "medium",
"score": 5,
"time": 12.513315677642822,
"response": ".\n\nYou must use the correct tool name and format the JSON properly.\n\nThe correct tool name is \"build..."
},
{
"test": "Commande complexe",
"difficulty": "hard",
"score": 3,
"time": 13.17281723022461,
"response": ".\n\nYou can use the following tool definitions:\n- move_units: move units to a new position\n- attack_u..."
}
],
"type": "general"
},
{
"name": "MCP-Instruct-v1",
"file_size_mb": 697.0347290039062,
"avg_score": 0.0,
"avg_time": 0.1320805072784424,
"efficiency": 0.0,
"results": [
{
"test": "Commande simple",
"difficulty": "easy",
"score": 0,
"time": 0.6604025363922119,
"response": ""
},
{
"test": "Action avec coordonnées",
"difficulty": "easy",
"score": 0,
"time": 0,
"error": "llama_decode returned -1"
},
{
"test": "Attaque spécifique",
"difficulty": "medium",
"score": 0,
"time": 0,
"error": "llama_decode returned -1"
},
{
"test": "Construction",
"difficulty": "medium",
"score": 0,
"time": 0,
"error": "llama_decode returned -1"
},
{
"test": "Commande complexe",
"difficulty": "hard",
"score": 0,
"time": 0,
"error": "llama_decode returned -1"
}
],
"type": "mcp_specialized"
},
{
"name": "MCPR L-3B-Exa",
"file_size_mb": 1215.7023620605469,
"avg_score": 0.0,
"avg_time": 22.14646472930908,
"efficiency": 0.0,
"results": [
{
"test": "Commande simple",
"difficulty": "easy",
"score": 0,
"time": 22.817347049713135,
"response": "+\\),),“), and“““““““““““““““““““““““““”“”““““““““““”“““““““““““““““““““““““““““““““““““““““initializ..."
},
{
"test": "Action avec coordonnées",
"difficulty": "easy",
"score": 0,
"time": 21.51675510406494,
"response": "+\\),),“),3“”“”“”“),),““““““““““”“),),),), and“),),), and@@ the Sty□ Sty□ Sty□ Sty□ Sty□ Sty□ Sty□ St..."
},
{
"test": "Attaque spécifique",
"difficulty": "medium",
"score": 0,
"time": 22.118958473205566,
"response": "+\\),),+\\),), and““”““““““““““““““““““““”“““““”“”“““““““““““““““““““““”“”““”““”““““““““““““““““““““““..."
},
{
"test": "Construction",
"difficulty": "medium",
"score": 0,
"time": 22.297714471817017,
"response": "+\\),),“), and@@ the Sty mini mini mini mini mini mini mini mini mini the““““”“),),+\\),),), and“),),)..."
},
{
"test": "Commande complexe",
"difficulty": "hard",
"score": 0,
"time": 21.98154854774475,
"response": "and@@ Sty@@patterns@@ mini@@ Sty@@ Sty mini mini mini mini mini mini mini mini mini the“““““““““”“““..."
}
],
"type": "mcp_specialized"
},
{
"name": "Gemma-3n-E2B-it",
"file_size_mb": 1958.3001403808594,
"avg_score": 0.0,
"avg_time": 1.5714858055114747,
"efficiency": 0.0,
"results": [
{
"test": "Commande simple",
"difficulty": "easy",
"score": 0,
"time": 3.1773452758789062,
"response": ""
},
{
"test": "Action avec coordonnées",
"difficulty": "easy",
"score": 0,
"time": 1.1669323444366455,
"response": ""
},
{
"test": "Attaque spécifique",
"difficulty": "medium",
"score": 0,
"time": 1.1747264862060547,
"response": ""
},
{
"test": "Construction",
"difficulty": "medium",
"score": 0,
"time": 1.2873260974884033,
"response": ""
},
{
"test": "Commande complexe",
"difficulty": "hard",
"score": 0,
"time": 1.0510988235473633,
"response": ""
}
],
"type": "mcp_specialized"
},
{
"name": "Llama-Breeze2-3B",
"file_size_mb": 1424.04345703125,
"avg_score": 3.6,
"avg_time": 14.693956804275512,
"efficiency": 0.24499867856917243,
"results": [
{
"test": "Commande simple",
"difficulty": "easy",
"score": 3,
"time": 3.5608396530151367,
"response": "[get_game_state()]"
},
{
"test": "Action avec coordonnées",
"difficulty": "easy",
"score": 3,
"time": 5.626140356063843,
"response": "[move_units(unit_ids='infantry', target_x='150', target_y='200')]"
},
{
"test": "Attaque spécifique",
"difficulty": "medium",
"score": 5,
"time": 23.91610813140869,
"response": "The tool used is \"get_game_state\" and the tool name is \"get_game_state\". The args for this tool is a..."
},
{
"test": "Construction",
"difficulty": "medium",
"score": 5,
"time": 23.745216846466064,
"response": "{\n \"tool\": \"build_building\",\n \"args\": {\n \"building_type\": \"power plant\",\n \"position_x\": 100,..."
},
{
"test": "Commande complexe",
"difficulty": "hard",
"score": 2,
"time": 16.621479034423828,
"response": "The game is ready for action. You are the AI, and you are ready to assist. You have the ability to m..."
}
],
"type": "general"
},
{
"name": "Qwen2.5-Coder-0.5B",
"file_size_mb": 408.8690185546875,
"avg_score": 4.4,
"avg_time": 4.1166441440582275,
"efficiency": 1.0688317585941343,
"results": [
{
"test": "Commande simple",
"difficulty": "easy",
"score": 5,
"time": 4.25421667098999,
"response": ".\n\nHere's a possible response:\n\n{\"tool\": \"get_game_state\", \"args\": {\"game_state\": {\"units\": [{\"id\": ..."
},
{
"test": "Action avec coordonnées",
"difficulty": "easy",
"score": 5,
"time": 4.333646059036255,
"response": "where \"tool_name\" is the name of the tool used and \"args\" is a dictionary containing the arguments p..."
},
{
"test": "Attaque spécifique",
"difficulty": "medium",
"score": 5,
"time": 4.139528274536133,
"response": "where \"tool_name\" is the name of the tool used and \"args\" is a dictionary containing the arguments p..."
},
{
"test": "Construction",
"difficulty": "medium",
"score": 2,
"time": 3.9508562088012695,
"response": "where \"tool_name\" is the name of the tool used and \"args\" is a JSON object containing the arguments ..."
},
{
"test": "Commande complexe",
"difficulty": "hard",
"score": 5,
"time": 3.9049735069274902,
"response": "where \"tool_name\" is the name of the tool used and \"args\" is a dictionary containing the arguments p..."
}
],
"type": "code_specialized"
}
],
"ranking_by_score": [
{
"name": "Qwen2.5-Coder-0.5B",
"file_size_mb": 408.8690185546875,
"avg_score": 4.4,
"avg_time": 4.1166441440582275,
"efficiency": 1.0688317585941343,
"results": [
{
"test": "Commande simple",
"difficulty": "easy",
"score": 5,
"time": 4.25421667098999,
"response": ".\n\nHere's a possible response:\n\n{\"tool\": \"get_game_state\", \"args\": {\"game_state\": {\"units\": [{\"id\": ..."
},
{
"test": "Action avec coordonnées",
"difficulty": "easy",
"score": 5,
"time": 4.333646059036255,
"response": "where \"tool_name\" is the name of the tool used and \"args\" is a dictionary containing the arguments p..."
},
{
"test": "Attaque spécifique",
"difficulty": "medium",
"score": 5,
"time": 4.139528274536133,
"response": "where \"tool_name\" is the name of the tool used and \"args\" is a dictionary containing the arguments p..."
},
{
"test": "Construction",
"difficulty": "medium",
"score": 2,
"time": 3.9508562088012695,
"response": "where \"tool_name\" is the name of the tool used and \"args\" is a JSON object containing the arguments ..."
},
{
"test": "Commande complexe",
"difficulty": "hard",
"score": 5,
"time": 3.9049735069274902,
"response": "where \"tool_name\" is the name of the tool used and \"args\" is a dictionary containing the arguments p..."
}
],
"type": "code_specialized"
},
{
"name": "Llama-Breeze2-3B",
"file_size_mb": 1424.04345703125,
"avg_score": 3.6,
"avg_time": 14.693956804275512,
"efficiency": 0.24499867856917243,
"results": [
{
"test": "Commande simple",
"difficulty": "easy",
"score": 3,
"time": 3.5608396530151367,
"response": "[get_game_state()]"
},
{
"test": "Action avec coordonnées",
"difficulty": "easy",
"score": 3,
"time": 5.626140356063843,
"response": "[move_units(unit_ids='infantry', target_x='150', target_y='200')]"
},
{
"test": "Attaque spécifique",
"difficulty": "medium",
"score": 5,
"time": 23.91610813140869,
"response": "The tool used is \"get_game_state\" and the tool name is \"get_game_state\". The args for this tool is a..."
},
{
"test": "Construction",
"difficulty": "medium",
"score": 5,
"time": 23.745216846466064,
"response": "{\n \"tool\": \"build_building\",\n \"args\": {\n \"building_type\": \"power plant\",\n \"position_x\": 100,..."
},
{
"test": "Commande complexe",
"difficulty": "hard",
"score": 2,
"time": 16.621479034423828,
"response": "The game is ready for action. You are the AI, and you are ready to assist. You have the ability to m..."
}
],
"type": "general"
},
{
"name": "Qwen3-1.7B",
"file_size_mb": 1007.8267211914062,
"avg_score": 3.0,
"avg_time": 13.003729963302613,
"efficiency": 0.23070303739513193,
"results": [
{
"test": "Commande simple",
"difficulty": "easy",
"score": 2,
"time": 12.862720251083374,
"response": ".\n\nYou must use the JSON format specified, without any additional text or explanation. The JSON must..."
},
{
"test": "Action avec coordonnées",
"difficulty": "easy",
"score": 2,
"time": 12.972241401672363,
"response": ".\n\nYou must use the correct tool name and format the JSON properly.\n\nThe game state is as follows:\n-..."
},
{
"test": "Attaque spécifique",
"difficulty": "medium",
"score": 3,
"time": 13.497555255889893,
"response": ".\n\nYou must use the correct tool name and format the JSON properly.\n\nThe tool to use is attack_unit...."
},
{
"test": "Construction",
"difficulty": "medium",
"score": 5,
"time": 12.513315677642822,
"response": ".\n\nYou must use the correct tool name and format the JSON properly.\n\nThe correct tool name is \"build..."
},
{
"test": "Commande complexe",
"difficulty": "hard",
"score": 3,
"time": 13.17281723022461,
"response": ".\n\nYou can use the following tool definitions:\n- move_units: move units to a new position\n- attack_u..."
}
],
"type": "general"
},
{
"name": "Qwen3-0.6B",
"file_size_mb": 609.8238830566406,
"avg_score": 2.8,
"avg_time": 8.223706769943238,
"efficiency": 0.3404790659892809,
"results": [
{
"test": "Commande simple",
"difficulty": "easy",
"score": 7,
"time": 8.638539791107178,
"response": ".\n\nIf the user command is not supported by the available tools, respond with an empty array.\n\nNow, t..."
},
{
"test": "Action avec coordonnées",
"difficulty": "easy",
"score": 5,
"time": 8.075484991073608,
"response": ".\n\nMake sure to use the correct tool name and format the JSON correctly.\n\nIf the command is not poss..."
},
{
"test": "Attaque spécifique",
"difficulty": "medium",
"score": 0,
"time": 7.951770067214966,
"response": ".\n\nMake sure to use the correct tool names and format the JSON correctly.\n\nNow, the game state is as..."
},
{
"test": "Construction",
"difficulty": "medium",
"score": 2,
"time": 8.252855062484741,
"response": ".\n\nMake sure to use the correct tool names and format the JSON properly.\n\nNow, the user is in a game..."
},
{
"test": "Commande complexe",
"difficulty": "hard",
"score": 0,
"time": 8.199883937835693,
"response": ".\n\nMake sure to use the correct tool name and format the JSON.\n\nIf the command is not possible, retu..."
}
],
"type": "general"
},
{
"name": "Qwen2.5-0.5B",
"file_size_mb": 408.8689880371094,
"avg_score": 2.6,
"avg_time": 2.6360722541809083,
"efficiency": 0.9863159084036122,
"results": [
{
"test": "Commande simple",
"difficulty": "easy",
"score": 0,
"time": 0.31192469596862793,
"response": ""
},
{
"test": "Action avec coordonnées",
"difficulty": "easy",
"score": 0,
"time": 0.18253064155578613,
"response": ""
},
{
"test": "Attaque spécifique",
"difficulty": "medium",
"score": 7,
"time": 4.232211351394653,
"response": "Where tool_name is the name of the tool used and args is a dictionary containing the arguments for t..."
},
{
"test": "Construction",
"difficulty": "medium",
"score": 2,
"time": 4.225749492645264,
"response": "Where tool_name is the name of the tool and args is a dictionary with the arguments. If no arguments..."
},
{
"test": "Commande complexe",
"difficulty": "hard",
"score": 4,
"time": 4.22794508934021,
"response": "where tool_name is the name of the tool and args is a dictionary containing the arguments. If no too..."
}
],
"type": "general"
}
],
"ranking_by_efficiency": [
{
"name": "Qwen2.5-Coder-0.5B",
"file_size_mb": 408.8690185546875,
"avg_score": 4.4,
"avg_time": 4.1166441440582275,
"efficiency": 1.0688317585941343,
"results": [
{
"test": "Commande simple",
"difficulty": "easy",
"score": 5,
"time": 4.25421667098999,
"response": ".\n\nHere's a possible response:\n\n{\"tool\": \"get_game_state\", \"args\": {\"game_state\": {\"units\": [{\"id\": ..."
},
{
"test": "Action avec coordonnées",
"difficulty": "easy",
"score": 5,
"time": 4.333646059036255,
"response": "where \"tool_name\" is the name of the tool used and \"args\" is a dictionary containing the arguments p..."
},
{
"test": "Attaque spécifique",
"difficulty": "medium",
"score": 5,
"time": 4.139528274536133,
"response": "where \"tool_name\" is the name of the tool used and \"args\" is a dictionary containing the arguments p..."
},
{
"test": "Construction",
"difficulty": "medium",
"score": 2,
"time": 3.9508562088012695,
"response": "where \"tool_name\" is the name of the tool used and \"args\" is a JSON object containing the arguments ..."
},
{
"test": "Commande complexe",
"difficulty": "hard",
"score": 5,
"time": 3.9049735069274902,
"response": "where \"tool_name\" is the name of the tool used and \"args\" is a dictionary containing the arguments p..."
}
],
"type": "code_specialized"
},
{
"name": "Qwen2.5-0.5B",
"file_size_mb": 408.8689880371094,
"avg_score": 2.6,
"avg_time": 2.6360722541809083,
"efficiency": 0.9863159084036122,
"results": [
{
"test": "Commande simple",
"difficulty": "easy",
"score": 0,
"time": 0.31192469596862793,
"response": ""
},
{
"test": "Action avec coordonnées",
"difficulty": "easy",
"score": 0,
"time": 0.18253064155578613,
"response": ""
},
{
"test": "Attaque spécifique",
"difficulty": "medium",
"score": 7,
"time": 4.232211351394653,
"response": "Where tool_name is the name of the tool used and args is a dictionary containing the arguments for t..."
},
{
"test": "Construction",
"difficulty": "medium",
"score": 2,
"time": 4.225749492645264,
"response": "Where tool_name is the name of the tool and args is a dictionary with the arguments. If no arguments..."
},
{
"test": "Commande complexe",
"difficulty": "hard",
"score": 4,
"time": 4.22794508934021,
"response": "where tool_name is the name of the tool and args is a dictionary containing the arguments. If no too..."
}
],
"type": "general"
},
{
"name": "Qwen3-0.6B",
"file_size_mb": 609.8238830566406,
"avg_score": 2.8,
"avg_time": 8.223706769943238,
"efficiency": 0.3404790659892809,
"results": [
{
"test": "Commande simple",
"difficulty": "easy",
"score": 7,
"time": 8.638539791107178,
"response": ".\n\nIf the user command is not supported by the available tools, respond with an empty array.\n\nNow, t..."
},
{
"test": "Action avec coordonnées",
"difficulty": "easy",
"score": 5,
"time": 8.075484991073608,
"response": ".\n\nMake sure to use the correct tool name and format the JSON correctly.\n\nIf the command is not poss..."
},
{
"test": "Attaque spécifique",
"difficulty": "medium",
"score": 0,
"time": 7.951770067214966,
"response": ".\n\nMake sure to use the correct tool names and format the JSON correctly.\n\nNow, the game state is as..."
},
{
"test": "Construction",
"difficulty": "medium",
"score": 2,
"time": 8.252855062484741,
"response": ".\n\nMake sure to use the correct tool names and format the JSON properly.\n\nNow, the user is in a game..."
},
{
"test": "Commande complexe",
"difficulty": "hard",
"score": 0,
"time": 8.199883937835693,
"response": ".\n\nMake sure to use the correct tool name and format the JSON.\n\nIf the command is not possible, retu..."
}
],
"type": "general"
},
{
"name": "Llama-Breeze2-3B",
"file_size_mb": 1424.04345703125,
"avg_score": 3.6,
"avg_time": 14.693956804275512,
"efficiency": 0.24499867856917243,
"results": [
{
"test": "Commande simple",
"difficulty": "easy",
"score": 3,
"time": 3.5608396530151367,
"response": "[get_game_state()]"
},
{
"test": "Action avec coordonnées",
"difficulty": "easy",
"score": 3,
"time": 5.626140356063843,
"response": "[move_units(unit_ids='infantry', target_x='150', target_y='200')]"
},
{
"test": "Attaque spécifique",
"difficulty": "medium",
"score": 5,
"time": 23.91610813140869,
"response": "The tool used is \"get_game_state\" and the tool name is \"get_game_state\". The args for this tool is a..."
},
{
"test": "Construction",
"difficulty": "medium",
"score": 5,
"time": 23.745216846466064,
"response": "{\n \"tool\": \"build_building\",\n \"args\": {\n \"building_type\": \"power plant\",\n \"position_x\": 100,..."
},
{
"test": "Commande complexe",
"difficulty": "hard",
"score": 2,
"time": 16.621479034423828,
"response": "The game is ready for action. You are the AI, and you are ready to assist. You have the ability to m..."
}
],
"type": "general"
},
{
"name": "Qwen3-1.7B",
"file_size_mb": 1007.8267211914062,
"avg_score": 3.0,
"avg_time": 13.003729963302613,
"efficiency": 0.23070303739513193,
"results": [
{
"test": "Commande simple",
"difficulty": "easy",
"score": 2,
"time": 12.862720251083374,
"response": ".\n\nYou must use the JSON format specified, without any additional text or explanation. The JSON must..."
},
{
"test": "Action avec coordonnées",
"difficulty": "easy",
"score": 2,
"time": 12.972241401672363,
"response": ".\n\nYou must use the correct tool name and format the JSON properly.\n\nThe game state is as follows:\n-..."
},
{
"test": "Attaque spécifique",
"difficulty": "medium",
"score": 3,
"time": 13.497555255889893,
"response": ".\n\nYou must use the correct tool name and format the JSON properly.\n\nThe tool to use is attack_unit...."
},
{
"test": "Construction",
"difficulty": "medium",
"score": 5,
"time": 12.513315677642822,
"response": ".\n\nYou must use the correct tool name and format the JSON properly.\n\nThe correct tool name is \"build..."
},
{
"test": "Commande complexe",
"difficulty": "hard",
"score": 3,
"time": 13.17281723022461,
"response": ".\n\nYou can use the following tool definitions:\n- move_units: move units to a new position\n- attack_u..."
}
],
"type": "general"
}
],
"best_overall": {
"name": "Qwen2.5-Coder-0.5B",
"file_size_mb": 408.8690185546875,
"avg_score": 4.4,
"avg_time": 4.1166441440582275,
"efficiency": 1.0688317585941343,
"results": [
{
"test": "Commande simple",
"difficulty": "easy",
"score": 5,
"time": 4.25421667098999,
"response": ".\n\nHere's a possible response:\n\n{\"tool\": \"get_game_state\", \"args\": {\"game_state\": {\"units\": [{\"id\": ..."
},
{
"test": "Action avec coordonnées",
"difficulty": "easy",
"score": 5,
"time": 4.333646059036255,
"response": "where \"tool_name\" is the name of the tool used and \"args\" is a dictionary containing the arguments p..."
},
{
"test": "Attaque spécifique",
"difficulty": "medium",
"score": 5,
"time": 4.139528274536133,
"response": "where \"tool_name\" is the name of the tool used and \"args\" is a dictionary containing the arguments p..."
},
{
"test": "Construction",
"difficulty": "medium",
"score": 2,
"time": 3.9508562088012695,
"response": "where \"tool_name\" is the name of the tool used and \"args\" is a JSON object containing the arguments ..."
},
{
"test": "Commande complexe",
"difficulty": "hard",
"score": 5,
"time": 3.9049735069274902,
"response": "where \"tool_name\" is the name of the tool used and \"args\" is a dictionary containing the arguments p..."
}
],
"type": "code_specialized"
},
"most_efficient": {
"name": "Qwen2.5-Coder-0.5B",
"file_size_mb": 408.8690185546875,
"avg_score": 4.4,
"avg_time": 4.1166441440582275,
"efficiency": 1.0688317585941343,
"results": [
{
"test": "Commande simple",
"difficulty": "easy",
"score": 5,
"time": 4.25421667098999,
"response": ".\n\nHere's a possible response:\n\n{\"tool\": \"get_game_state\", \"args\": {\"game_state\": {\"units\": [{\"id\": ..."
},
{
"test": "Action avec coordonnées",
"difficulty": "easy",
"score": 5,
"time": 4.333646059036255,
"response": "where \"tool_name\" is the name of the tool used and \"args\" is a dictionary containing the arguments p..."
},
{
"test": "Attaque spécifique",
"difficulty": "medium",
"score": 5,
"time": 4.139528274536133,
"response": "where \"tool_name\" is the name of the tool used and \"args\" is a dictionary containing the arguments p..."
},
{
"test": "Construction",
"difficulty": "medium",
"score": 2,
"time": 3.9508562088012695,
"response": "where \"tool_name\" is the name of the tool used and \"args\" is a JSON object containing the arguments ..."
},
{
"test": "Commande complexe",
"difficulty": "hard",
"score": 5,
"time": 3.9049735069274902,
"response": "where \"tool_name\" is the name of the tool used and \"args\" is a dictionary containing the arguments p..."
}
],
"type": "code_specialized"
}
}