Spaces:

Luigi
/

rts-commander

Sleeping

App Files Files Community

rts-commander / docs /reports /comprehensive_mcp_evaluation.json

Luigi

Organize project structure: move test scripts to tests/scripts and documentation to docs/reports

d28c36c 2 months ago

raw

history blame contribute delete

32.8 kB

	{
	"evaluation_type": "comprehensive_mcp_test",
	"total_models_tested": 9,
	"successful_models": 5,
	"results": [
	{
	"name": "Qwen2.5-0.5B",
	"file_size_mb": 408.8689880371094,
	"avg_score": 2.6,
	"avg_time": 2.6360722541809083,
	"efficiency": 0.9863159084036122,
	"results": [
	{
	"test": "Commande simple",
	"difficulty": "easy",
	"score": 0,
	"time": 0.31192469596862793,
	"response": ""
	},
	{
	"test": "Action avec coordonnées",
	"difficulty": "easy",
	"score": 0,
	"time": 0.18253064155578613,
	"response": ""
	},
	{
	"test": "Attaque spécifique",
	"difficulty": "medium",
	"score": 7,
	"time": 4.232211351394653,
	"response": "Where tool_name is the name of the tool used and args is a dictionary containing the arguments for t..."
	},
	{
	"test": "Construction",
	"difficulty": "medium",
	"score": 2,
	"time": 4.225749492645264,
	"response": "Where tool_name is the name of the tool and args is a dictionary with the arguments. If no arguments..."
	},
	{
	"test": "Commande complexe",
	"difficulty": "hard",
	"score": 4,
	"time": 4.22794508934021,
	"response": "where tool_name is the name of the tool and args is a dictionary containing the arguments. If no too..."
	}
	],
	"type": "general"
	},
	{
	"name": "Qwen3-0.6B",
	"file_size_mb": 609.8238830566406,
	"avg_score": 2.8,
	"avg_time": 8.223706769943238,
	"efficiency": 0.3404790659892809,
	"results": [
	{
	"test": "Commande simple",
	"difficulty": "easy",
	"score": 7,
	"time": 8.638539791107178,
	"response": ".\n\nIf the user command is not supported by the available tools, respond with an empty array.\n\nNow, t..."
	},
	{
	"test": "Action avec coordonnées",
	"difficulty": "easy",
	"score": 5,
	"time": 8.075484991073608,
	"response": ".\n\nMake sure to use the correct tool name and format the JSON correctly.\n\nIf the command is not poss..."
	},
	{
	"test": "Attaque spécifique",
	"difficulty": "medium",
	"score": 0,
	"time": 7.951770067214966,
	"response": ".\n\nMake sure to use the correct tool names and format the JSON correctly.\n\nNow, the game state is as..."
	},
	{
	"test": "Construction",
	"difficulty": "medium",
	"score": 2,
	"time": 8.252855062484741,
	"response": ".\n\nMake sure to use the correct tool names and format the JSON properly.\n\nNow, the user is in a game..."
	},
	{
	"test": "Commande complexe",
	"difficulty": "hard",
	"score": 0,
	"time": 8.199883937835693,
	"response": ".\n\nMake sure to use the correct tool name and format the JSON.\n\nIf the command is not possible, retu..."
	}
	],
	"type": "general"
	},
	{
	"name": "Gemma-3-270M",
	"file_size_mb": 428.0401306152344,
	"avg_score": 0.0,
	"avg_time": 0.16690435409545898,
	"efficiency": 0.0,
	"results": [
	{
	"test": "Commande simple",
	"difficulty": "easy",
	"score": 0,
	"time": 0.2941462993621826,
	"response": ""
	},
	{
	"test": "Action avec coordonnées",
	"difficulty": "easy",
	"score": 0,
	"time": 0.13967180252075195,
	"response": ""
	},
	{
	"test": "Attaque spécifique",
	"difficulty": "medium",
	"score": 0,
	"time": 0.1264328956604004,
	"response": ""
	},
	{
	"test": "Construction",
	"difficulty": "medium",
	"score": 0,
	"time": 0.14153170585632324,
	"response": ""
	},
	{
	"test": "Commande complexe",
	"difficulty": "hard",
	"score": 0,
	"time": 0.13273906707763672,
	"response": ""
	}
	],
	"type": "general"
	},
	{
	"name": "Qwen3-1.7B",
	"file_size_mb": 1007.8267211914062,
	"avg_score": 3.0,
	"avg_time": 13.003729963302613,
	"efficiency": 0.23070303739513193,
	"results": [
	{
	"test": "Commande simple",
	"difficulty": "easy",
	"score": 2,
	"time": 12.862720251083374,
	"response": ".\n\nYou must use the JSON format specified, without any additional text or explanation. The JSON must..."
	},
	{
	"test": "Action avec coordonnées",
	"difficulty": "easy",
	"score": 2,
	"time": 12.972241401672363,
	"response": ".\n\nYou must use the correct tool name and format the JSON properly.\n\nThe game state is as follows:\n-..."
	},
	{
	"test": "Attaque spécifique",
	"difficulty": "medium",
	"score": 3,
	"time": 13.497555255889893,
	"response": ".\n\nYou must use the correct tool name and format the JSON properly.\n\nThe tool to use is attack_unit...."
	},
	{
	"test": "Construction",
	"difficulty": "medium",
	"score": 5,
	"time": 12.513315677642822,
	"response": ".\n\nYou must use the correct tool name and format the JSON properly.\n\nThe correct tool name is \"build..."
	},
	{
	"test": "Commande complexe",
	"difficulty": "hard",
	"score": 3,
	"time": 13.17281723022461,
	"response": ".\n\nYou can use the following tool definitions:\n- move_units: move units to a new position\n- attack_u..."
	}
	],
	"type": "general"
	},
	{
	"name": "MCP-Instruct-v1",
	"file_size_mb": 697.0347290039062,
	"avg_score": 0.0,
	"avg_time": 0.1320805072784424,
	"efficiency": 0.0,
	"results": [
	{
	"test": "Commande simple",
	"difficulty": "easy",
	"score": 0,
	"time": 0.6604025363922119,
	"response": ""
	},
	{
	"test": "Action avec coordonnées",
	"difficulty": "easy",
	"score": 0,
	"time": 0,
	"error": "llama_decode returned -1"
	},
	{
	"test": "Attaque spécifique",
	"difficulty": "medium",
	"score": 0,
	"time": 0,
	"error": "llama_decode returned -1"
	},
	{
	"test": "Construction",
	"difficulty": "medium",
	"score": 0,
	"time": 0,
	"error": "llama_decode returned -1"
	},
	{
	"test": "Commande complexe",
	"difficulty": "hard",
	"score": 0,
	"time": 0,
	"error": "llama_decode returned -1"
	}
	],
	"type": "mcp_specialized"
	},
	{
	"name": "MCPR L-3B-Exa",
	"file_size_mb": 1215.7023620605469,
	"avg_score": 0.0,
	"avg_time": 22.14646472930908,
	"efficiency": 0.0,
	"results": [
	{
	"test": "Commande simple",
	"difficulty": "easy",
	"score": 0,
	"time": 22.817347049713135,
	"response": "+\\),),“), and“““““““““““““““““““““““““”“”““““““““““”“““““““““““““““““““““““““““““““““““““““initializ..."
	},
	{
	"test": "Action avec coordonnées",
	"difficulty": "easy",
	"score": 0,
	"time": 21.51675510406494,
	"response": "+\\),),“),3“”“”“”“),),““““““““““”“),),),), and“),),), and@@ the Sty□ Sty□ Sty□ Sty□ Sty□ Sty□ Sty□ St..."
	},
	{
	"test": "Attaque spécifique",
	"difficulty": "medium",
	"score": 0,
	"time": 22.118958473205566,
	"response": "+\\),),+\\),), and““”““““““““““““““““““““”“““““”“”“““““““““““““““““““““”“”““”““”““““““““““““““““““““““..."
	},
	{
	"test": "Construction",
	"difficulty": "medium",
	"score": 0,
	"time": 22.297714471817017,
	"response": "+\\),),“), and@@ the Sty mini mini mini mini mini mini mini mini mini the““““”“),),+\\),),), and“),),)..."
	},
	{
	"test": "Commande complexe",
	"difficulty": "hard",
	"score": 0,
	"time": 21.98154854774475,
	"response": "and@@ Sty@@patterns@@ mini@@ Sty@@ Sty mini mini mini mini mini mini mini mini mini the“““““““““”“““..."
	}
	],
	"type": "mcp_specialized"
	},
	{
	"name": "Gemma-3n-E2B-it",
	"file_size_mb": 1958.3001403808594,
	"avg_score": 0.0,
	"avg_time": 1.5714858055114747,
	"efficiency": 0.0,
	"results": [
	{
	"test": "Commande simple",
	"difficulty": "easy",
	"score": 0,
	"time": 3.1773452758789062,
	"response": ""
	},
	{
	"test": "Action avec coordonnées",
	"difficulty": "easy",
	"score": 0,
	"time": 1.1669323444366455,
	"response": ""
	},
	{
	"test": "Attaque spécifique",
	"difficulty": "medium",
	"score": 0,
	"time": 1.1747264862060547,
	"response": ""
	},
	{
	"test": "Construction",
	"difficulty": "medium",
	"score": 0,
	"time": 1.2873260974884033,
	"response": ""
	},
	{
	"test": "Commande complexe",
	"difficulty": "hard",
	"score": 0,
	"time": 1.0510988235473633,
	"response": ""
	}
	],
	"type": "mcp_specialized"
	},
	{
	"name": "Llama-Breeze2-3B",
	"file_size_mb": 1424.04345703125,
	"avg_score": 3.6,
	"avg_time": 14.693956804275512,
	"efficiency": 0.24499867856917243,
	"results": [
	{
	"test": "Commande simple",
	"difficulty": "easy",
	"score": 3,
	"time": 3.5608396530151367,
	"response": "[get_game_state()]"
	},
	{
	"test": "Action avec coordonnées",
	"difficulty": "easy",
	"score": 3,
	"time": 5.626140356063843,
	"response": "[move_units(unit_ids='infantry', target_x='150', target_y='200')]"
	},
	{
	"test": "Attaque spécifique",
	"difficulty": "medium",
	"score": 5,
	"time": 23.91610813140869,
	"response": "The tool used is \"get_game_state\" and the tool name is \"get_game_state\". The args for this tool is a..."
	},
	{
	"test": "Construction",
	"difficulty": "medium",
	"score": 5,
	"time": 23.745216846466064,
	"response": "{\n \"tool\": \"build_building\",\n \"args\": {\n \"building_type\": \"power plant\",\n \"position_x\": 100,..."
	},
	{
	"test": "Commande complexe",
	"difficulty": "hard",
	"score": 2,
	"time": 16.621479034423828,
	"response": "The game is ready for action. You are the AI, and you are ready to assist. You have the ability to m..."
	}
	],
	"type": "general"
	},
	{
	"name": "Qwen2.5-Coder-0.5B",
	"file_size_mb": 408.8690185546875,
	"avg_score": 4.4,
	"avg_time": 4.1166441440582275,
	"efficiency": 1.0688317585941343,
	"results": [
	{
	"test": "Commande simple",
	"difficulty": "easy",
	"score": 5,
	"time": 4.25421667098999,
	"response": ".\n\nHere's a possible response:\n\n{\"tool\": \"get_game_state\", \"args\": {\"game_state\": {\"units\": [{\"id\": ..."
	},
	{
	"test": "Action avec coordonnées",
	"difficulty": "easy",
	"score": 5,
	"time": 4.333646059036255,
	"response": "where \"tool_name\" is the name of the tool used and \"args\" is a dictionary containing the arguments p..."
	},
	{
	"test": "Attaque spécifique",
	"difficulty": "medium",
	"score": 5,
	"time": 4.139528274536133,
	"response": "where \"tool_name\" is the name of the tool used and \"args\" is a dictionary containing the arguments p..."
	},
	{
	"test": "Construction",
	"difficulty": "medium",
	"score": 2,
	"time": 3.9508562088012695,
	"response": "where \"tool_name\" is the name of the tool used and \"args\" is a JSON object containing the arguments ..."
	},
	{
	"test": "Commande complexe",
	"difficulty": "hard",
	"score": 5,
	"time": 3.9049735069274902,
	"response": "where \"tool_name\" is the name of the tool used and \"args\" is a dictionary containing the arguments p..."
	}
	],
	"type": "code_specialized"
	}
	],
	"ranking_by_score": [
	{
	"name": "Qwen2.5-Coder-0.5B",
	"file_size_mb": 408.8690185546875,
	"avg_score": 4.4,
	"avg_time": 4.1166441440582275,
	"efficiency": 1.0688317585941343,
	"results": [
	{
	"test": "Commande simple",
	"difficulty": "easy",
	"score": 5,
	"time": 4.25421667098999,
	"response": ".\n\nHere's a possible response:\n\n{\"tool\": \"get_game_state\", \"args\": {\"game_state\": {\"units\": [{\"id\": ..."
	},
	{
	"test": "Action avec coordonnées",
	"difficulty": "easy",
	"score": 5,
	"time": 4.333646059036255,
	"response": "where \"tool_name\" is the name of the tool used and \"args\" is a dictionary containing the arguments p..."
	},
	{
	"test": "Attaque spécifique",
	"difficulty": "medium",
	"score": 5,
	"time": 4.139528274536133,
	"response": "where \"tool_name\" is the name of the tool used and \"args\" is a dictionary containing the arguments p..."
	},
	{
	"test": "Construction",
	"difficulty": "medium",
	"score": 2,
	"time": 3.9508562088012695,
	"response": "where \"tool_name\" is the name of the tool used and \"args\" is a JSON object containing the arguments ..."
	},
	{
	"test": "Commande complexe",
	"difficulty": "hard",
	"score": 5,
	"time": 3.9049735069274902,
	"response": "where \"tool_name\" is the name of the tool used and \"args\" is a dictionary containing the arguments p..."
	}
	],
	"type": "code_specialized"
	},
	{
	"name": "Llama-Breeze2-3B",
	"file_size_mb": 1424.04345703125,
	"avg_score": 3.6,
	"avg_time": 14.693956804275512,
	"efficiency": 0.24499867856917243,
	"results": [
	{
	"test": "Commande simple",
	"difficulty": "easy",
	"score": 3,
	"time": 3.5608396530151367,
	"response": "[get_game_state()]"
	},
	{
	"test": "Action avec coordonnées",
	"difficulty": "easy",
	"score": 3,
	"time": 5.626140356063843,
	"response": "[move_units(unit_ids='infantry', target_x='150', target_y='200')]"
	},
	{
	"test": "Attaque spécifique",
	"difficulty": "medium",
	"score": 5,
	"time": 23.91610813140869,
	"response": "The tool used is \"get_game_state\" and the tool name is \"get_game_state\". The args for this tool is a..."
	},
	{
	"test": "Construction",
	"difficulty": "medium",
	"score": 5,
	"time": 23.745216846466064,
	"response": "{\n \"tool\": \"build_building\",\n \"args\": {\n \"building_type\": \"power plant\",\n \"position_x\": 100,..."
	},
	{
	"test": "Commande complexe",
	"difficulty": "hard",
	"score": 2,
	"time": 16.621479034423828,
	"response": "The game is ready for action. You are the AI, and you are ready to assist. You have the ability to m..."
	}
	],
	"type": "general"
	},
	{
	"name": "Qwen3-1.7B",
	"file_size_mb": 1007.8267211914062,
	"avg_score": 3.0,
	"avg_time": 13.003729963302613,
	"efficiency": 0.23070303739513193,
	"results": [
	{
	"test": "Commande simple",
	"difficulty": "easy",
	"score": 2,
	"time": 12.862720251083374,
	"response": ".\n\nYou must use the JSON format specified, without any additional text or explanation. The JSON must..."
	},
	{
	"test": "Action avec coordonnées",
	"difficulty": "easy",
	"score": 2,
	"time": 12.972241401672363,
	"response": ".\n\nYou must use the correct tool name and format the JSON properly.\n\nThe game state is as follows:\n-..."
	},
	{
	"test": "Attaque spécifique",
	"difficulty": "medium",
	"score": 3,
	"time": 13.497555255889893,
	"response": ".\n\nYou must use the correct tool name and format the JSON properly.\n\nThe tool to use is attack_unit...."
	},
	{
	"test": "Construction",
	"difficulty": "medium",
	"score": 5,
	"time": 12.513315677642822,
	"response": ".\n\nYou must use the correct tool name and format the JSON properly.\n\nThe correct tool name is \"build..."
	},
	{
	"test": "Commande complexe",
	"difficulty": "hard",
	"score": 3,
	"time": 13.17281723022461,
	"response": ".\n\nYou can use the following tool definitions:\n- move_units: move units to a new position\n- attack_u..."
	}
	],
	"type": "general"
	},
	{
	"name": "Qwen3-0.6B",
	"file_size_mb": 609.8238830566406,
	"avg_score": 2.8,
	"avg_time": 8.223706769943238,
	"efficiency": 0.3404790659892809,
	"results": [
	{
	"test": "Commande simple",
	"difficulty": "easy",
	"score": 7,
	"time": 8.638539791107178,
	"response": ".\n\nIf the user command is not supported by the available tools, respond with an empty array.\n\nNow, t..."
	},
	{
	"test": "Action avec coordonnées",
	"difficulty": "easy",
	"score": 5,
	"time": 8.075484991073608,
	"response": ".\n\nMake sure to use the correct tool name and format the JSON correctly.\n\nIf the command is not poss..."
	},
	{
	"test": "Attaque spécifique",
	"difficulty": "medium",
	"score": 0,
	"time": 7.951770067214966,
	"response": ".\n\nMake sure to use the correct tool names and format the JSON correctly.\n\nNow, the game state is as..."
	},
	{
	"test": "Construction",
	"difficulty": "medium",
	"score": 2,
	"time": 8.252855062484741,
	"response": ".\n\nMake sure to use the correct tool names and format the JSON properly.\n\nNow, the user is in a game..."
	},
	{
	"test": "Commande complexe",
	"difficulty": "hard",
	"score": 0,
	"time": 8.199883937835693,
	"response": ".\n\nMake sure to use the correct tool name and format the JSON.\n\nIf the command is not possible, retu..."
	}
	],
	"type": "general"
	},
	{
	"name": "Qwen2.5-0.5B",
	"file_size_mb": 408.8689880371094,
	"avg_score": 2.6,
	"avg_time": 2.6360722541809083,
	"efficiency": 0.9863159084036122,
	"results": [
	{
	"test": "Commande simple",
	"difficulty": "easy",
	"score": 0,
	"time": 0.31192469596862793,
	"response": ""
	},
	{
	"test": "Action avec coordonnées",
	"difficulty": "easy",
	"score": 0,
	"time": 0.18253064155578613,
	"response": ""
	},
	{
	"test": "Attaque spécifique",
	"difficulty": "medium",
	"score": 7,
	"time": 4.232211351394653,
	"response": "Where tool_name is the name of the tool used and args is a dictionary containing the arguments for t..."
	},
	{
	"test": "Construction",
	"difficulty": "medium",
	"score": 2,
	"time": 4.225749492645264,
	"response": "Where tool_name is the name of the tool and args is a dictionary with the arguments. If no arguments..."
	},
	{
	"test": "Commande complexe",
	"difficulty": "hard",
	"score": 4,
	"time": 4.22794508934021,
	"response": "where tool_name is the name of the tool and args is a dictionary containing the arguments. If no too..."
	}
	],
	"type": "general"
	}
	],
	"ranking_by_efficiency": [
	{
	"name": "Qwen2.5-Coder-0.5B",
	"file_size_mb": 408.8690185546875,
	"avg_score": 4.4,
	"avg_time": 4.1166441440582275,
	"efficiency": 1.0688317585941343,
	"results": [
	{
	"test": "Commande simple",
	"difficulty": "easy",
	"score": 5,
	"time": 4.25421667098999,
	"response": ".\n\nHere's a possible response:\n\n{\"tool\": \"get_game_state\", \"args\": {\"game_state\": {\"units\": [{\"id\": ..."
	},
	{
	"test": "Action avec coordonnées",
	"difficulty": "easy",
	"score": 5,
	"time": 4.333646059036255,
	"response": "where \"tool_name\" is the name of the tool used and \"args\" is a dictionary containing the arguments p..."
	},
	{
	"test": "Attaque spécifique",
	"difficulty": "medium",
	"score": 5,
	"time": 4.139528274536133,
	"response": "where \"tool_name\" is the name of the tool used and \"args\" is a dictionary containing the arguments p..."
	},
	{
	"test": "Construction",
	"difficulty": "medium",
	"score": 2,
	"time": 3.9508562088012695,
	"response": "where \"tool_name\" is the name of the tool used and \"args\" is a JSON object containing the arguments ..."
	},
	{
	"test": "Commande complexe",
	"difficulty": "hard",
	"score": 5,
	"time": 3.9049735069274902,
	"response": "where \"tool_name\" is the name of the tool used and \"args\" is a dictionary containing the arguments p..."
	}
	],
	"type": "code_specialized"
	},
	{
	"name": "Qwen2.5-0.5B",
	"file_size_mb": 408.8689880371094,
	"avg_score": 2.6,
	"avg_time": 2.6360722541809083,
	"efficiency": 0.9863159084036122,
	"results": [
	{
	"test": "Commande simple",
	"difficulty": "easy",
	"score": 0,
	"time": 0.31192469596862793,
	"response": ""
	},
	{
	"test": "Action avec coordonnées",
	"difficulty": "easy",
	"score": 0,
	"time": 0.18253064155578613,
	"response": ""
	},
	{
	"test": "Attaque spécifique",
	"difficulty": "medium",
	"score": 7,
	"time": 4.232211351394653,
	"response": "Where tool_name is the name of the tool used and args is a dictionary containing the arguments for t..."
	},
	{
	"test": "Construction",
	"difficulty": "medium",
	"score": 2,
	"time": 4.225749492645264,
	"response": "Where tool_name is the name of the tool and args is a dictionary with the arguments. If no arguments..."
	},
	{
	"test": "Commande complexe",
	"difficulty": "hard",
	"score": 4,
	"time": 4.22794508934021,
	"response": "where tool_name is the name of the tool and args is a dictionary containing the arguments. If no too..."
	}
	],
	"type": "general"
	},
	{
	"name": "Qwen3-0.6B",
	"file_size_mb": 609.8238830566406,
	"avg_score": 2.8,
	"avg_time": 8.223706769943238,
	"efficiency": 0.3404790659892809,
	"results": [
	{
	"test": "Commande simple",
	"difficulty": "easy",
	"score": 7,
	"time": 8.638539791107178,
	"response": ".\n\nIf the user command is not supported by the available tools, respond with an empty array.\n\nNow, t..."
	},
	{
	"test": "Action avec coordonnées",
	"difficulty": "easy",
	"score": 5,
	"time": 8.075484991073608,
	"response": ".\n\nMake sure to use the correct tool name and format the JSON correctly.\n\nIf the command is not poss..."
	},
	{
	"test": "Attaque spécifique",
	"difficulty": "medium",
	"score": 0,
	"time": 7.951770067214966,
	"response": ".\n\nMake sure to use the correct tool names and format the JSON correctly.\n\nNow, the game state is as..."
	},
	{
	"test": "Construction",
	"difficulty": "medium",
	"score": 2,
	"time": 8.252855062484741,
	"response": ".\n\nMake sure to use the correct tool names and format the JSON properly.\n\nNow, the user is in a game..."
	},
	{
	"test": "Commande complexe",
	"difficulty": "hard",
	"score": 0,
	"time": 8.199883937835693,
	"response": ".\n\nMake sure to use the correct tool name and format the JSON.\n\nIf the command is not possible, retu..."
	}
	],
	"type": "general"
	},
	{
	"name": "Llama-Breeze2-3B",
	"file_size_mb": 1424.04345703125,
	"avg_score": 3.6,
	"avg_time": 14.693956804275512,
	"efficiency": 0.24499867856917243,
	"results": [
	{
	"test": "Commande simple",
	"difficulty": "easy",
	"score": 3,
	"time": 3.5608396530151367,
	"response": "[get_game_state()]"
	},
	{
	"test": "Action avec coordonnées",
	"difficulty": "easy",
	"score": 3,
	"time": 5.626140356063843,
	"response": "[move_units(unit_ids='infantry', target_x='150', target_y='200')]"
	},
	{
	"test": "Attaque spécifique",
	"difficulty": "medium",
	"score": 5,
	"time": 23.91610813140869,
	"response": "The tool used is \"get_game_state\" and the tool name is \"get_game_state\". The args for this tool is a..."
	},
	{
	"test": "Construction",
	"difficulty": "medium",
	"score": 5,
	"time": 23.745216846466064,
	"response": "{\n \"tool\": \"build_building\",\n \"args\": {\n \"building_type\": \"power plant\",\n \"position_x\": 100,..."
	},
	{
	"test": "Commande complexe",
	"difficulty": "hard",
	"score": 2,
	"time": 16.621479034423828,
	"response": "The game is ready for action. You are the AI, and you are ready to assist. You have the ability to m..."
	}
	],
	"type": "general"
	},
	{
	"name": "Qwen3-1.7B",
	"file_size_mb": 1007.8267211914062,
	"avg_score": 3.0,
	"avg_time": 13.003729963302613,
	"efficiency": 0.23070303739513193,
	"results": [
	{
	"test": "Commande simple",
	"difficulty": "easy",
	"score": 2,
	"time": 12.862720251083374,
	"response": ".\n\nYou must use the JSON format specified, without any additional text or explanation. The JSON must..."
	},
	{
	"test": "Action avec coordonnées",
	"difficulty": "easy",
	"score": 2,
	"time": 12.972241401672363,
	"response": ".\n\nYou must use the correct tool name and format the JSON properly.\n\nThe game state is as follows:\n-..."
	},
	{
	"test": "Attaque spécifique",
	"difficulty": "medium",
	"score": 3,
	"time": 13.497555255889893,
	"response": ".\n\nYou must use the correct tool name and format the JSON properly.\n\nThe tool to use is attack_unit...."
	},
	{
	"test": "Construction",
	"difficulty": "medium",
	"score": 5,
	"time": 12.513315677642822,
	"response": ".\n\nYou must use the correct tool name and format the JSON properly.\n\nThe correct tool name is \"build..."
	},
	{
	"test": "Commande complexe",
	"difficulty": "hard",
	"score": 3,
	"time": 13.17281723022461,
	"response": ".\n\nYou can use the following tool definitions:\n- move_units: move units to a new position\n- attack_u..."
	}
	],
	"type": "general"
	}
	],
	"best_overall": {
	"name": "Qwen2.5-Coder-0.5B",
	"file_size_mb": 408.8690185546875,
	"avg_score": 4.4,
	"avg_time": 4.1166441440582275,
	"efficiency": 1.0688317585941343,
	"results": [
	{
	"test": "Commande simple",
	"difficulty": "easy",
	"score": 5,
	"time": 4.25421667098999,
	"response": ".\n\nHere's a possible response:\n\n{\"tool\": \"get_game_state\", \"args\": {\"game_state\": {\"units\": [{\"id\": ..."
	},
	{
	"test": "Action avec coordonnées",
	"difficulty": "easy",
	"score": 5,
	"time": 4.333646059036255,
	"response": "where \"tool_name\" is the name of the tool used and \"args\" is a dictionary containing the arguments p..."
	},
	{
	"test": "Attaque spécifique",
	"difficulty": "medium",
	"score": 5,
	"time": 4.139528274536133,
	"response": "where \"tool_name\" is the name of the tool used and \"args\" is a dictionary containing the arguments p..."
	},
	{
	"test": "Construction",
	"difficulty": "medium",
	"score": 2,
	"time": 3.9508562088012695,
	"response": "where \"tool_name\" is the name of the tool used and \"args\" is a JSON object containing the arguments ..."
	},
	{
	"test": "Commande complexe",
	"difficulty": "hard",
	"score": 5,
	"time": 3.9049735069274902,
	"response": "where \"tool_name\" is the name of the tool used and \"args\" is a dictionary containing the arguments p..."
	}
	],
	"type": "code_specialized"
	},
	"most_efficient": {
	"name": "Qwen2.5-Coder-0.5B",
	"file_size_mb": 408.8690185546875,
	"avg_score": 4.4,
	"avg_time": 4.1166441440582275,
	"efficiency": 1.0688317585941343,
	"results": [
	{
	"test": "Commande simple",
	"difficulty": "easy",
	"score": 5,
	"time": 4.25421667098999,
	"response": ".\n\nHere's a possible response:\n\n{\"tool\": \"get_game_state\", \"args\": {\"game_state\": {\"units\": [{\"id\": ..."
	},
	{
	"test": "Action avec coordonnées",
	"difficulty": "easy",
	"score": 5,
	"time": 4.333646059036255,
	"response": "where \"tool_name\" is the name of the tool used and \"args\" is a dictionary containing the arguments p..."
	},
	{
	"test": "Attaque spécifique",
	"difficulty": "medium",
	"score": 5,
	"time": 4.139528274536133,
	"response": "where \"tool_name\" is the name of the tool used and \"args\" is a dictionary containing the arguments p..."
	},
	{
	"test": "Construction",
	"difficulty": "medium",
	"score": 2,
	"time": 3.9508562088012695,
	"response": "where \"tool_name\" is the name of the tool used and \"args\" is a JSON object containing the arguments ..."
	},
	{
	"test": "Commande complexe",
	"difficulty": "hard",
	"score": 5,
	"time": 3.9049735069274902,
	"response": "where \"tool_name\" is the name of the tool used and \"args\" is a dictionary containing the arguments p..."
	}
	],
	"type": "code_specialized"
	}
	}