Spaces:
Sleeping
Sleeping
| { | |
| "evaluation_type": "comprehensive_mcp_test", | |
| "total_models_tested": 9, | |
| "successful_models": 5, | |
| "results": [ | |
| { | |
| "name": "Qwen2.5-0.5B", | |
| "file_size_mb": 408.8689880371094, | |
| "avg_score": 2.6, | |
| "avg_time": 2.6360722541809083, | |
| "efficiency": 0.9863159084036122, | |
| "results": [ | |
| { | |
| "test": "Commande simple", | |
| "difficulty": "easy", | |
| "score": 0, | |
| "time": 0.31192469596862793, | |
| "response": "" | |
| }, | |
| { | |
| "test": "Action avec coordonnées", | |
| "difficulty": "easy", | |
| "score": 0, | |
| "time": 0.18253064155578613, | |
| "response": "" | |
| }, | |
| { | |
| "test": "Attaque spécifique", | |
| "difficulty": "medium", | |
| "score": 7, | |
| "time": 4.232211351394653, | |
| "response": "Where tool_name is the name of the tool used and args is a dictionary containing the arguments for t..." | |
| }, | |
| { | |
| "test": "Construction", | |
| "difficulty": "medium", | |
| "score": 2, | |
| "time": 4.225749492645264, | |
| "response": "Where tool_name is the name of the tool and args is a dictionary with the arguments. If no arguments..." | |
| }, | |
| { | |
| "test": "Commande complexe", | |
| "difficulty": "hard", | |
| "score": 4, | |
| "time": 4.22794508934021, | |
| "response": "where tool_name is the name of the tool and args is a dictionary containing the arguments. If no too..." | |
| } | |
| ], | |
| "type": "general" | |
| }, | |
| { | |
| "name": "Qwen3-0.6B", | |
| "file_size_mb": 609.8238830566406, | |
| "avg_score": 2.8, | |
| "avg_time": 8.223706769943238, | |
| "efficiency": 0.3404790659892809, | |
| "results": [ | |
| { | |
| "test": "Commande simple", | |
| "difficulty": "easy", | |
| "score": 7, | |
| "time": 8.638539791107178, | |
| "response": ".\n\nIf the user command is not supported by the available tools, respond with an empty array.\n\nNow, t..." | |
| }, | |
| { | |
| "test": "Action avec coordonnées", | |
| "difficulty": "easy", | |
| "score": 5, | |
| "time": 8.075484991073608, | |
| "response": ".\n\nMake sure to use the correct tool name and format the JSON correctly.\n\nIf the command is not poss..." | |
| }, | |
| { | |
| "test": "Attaque spécifique", | |
| "difficulty": "medium", | |
| "score": 0, | |
| "time": 7.951770067214966, | |
| "response": ".\n\nMake sure to use the correct tool names and format the JSON correctly.\n\nNow, the game state is as..." | |
| }, | |
| { | |
| "test": "Construction", | |
| "difficulty": "medium", | |
| "score": 2, | |
| "time": 8.252855062484741, | |
| "response": ".\n\nMake sure to use the correct tool names and format the JSON properly.\n\nNow, the user is in a game..." | |
| }, | |
| { | |
| "test": "Commande complexe", | |
| "difficulty": "hard", | |
| "score": 0, | |
| "time": 8.199883937835693, | |
| "response": ".\n\nMake sure to use the correct tool name and format the JSON.\n\nIf the command is not possible, retu..." | |
| } | |
| ], | |
| "type": "general" | |
| }, | |
| { | |
| "name": "Gemma-3-270M", | |
| "file_size_mb": 428.0401306152344, | |
| "avg_score": 0.0, | |
| "avg_time": 0.16690435409545898, | |
| "efficiency": 0.0, | |
| "results": [ | |
| { | |
| "test": "Commande simple", | |
| "difficulty": "easy", | |
| "score": 0, | |
| "time": 0.2941462993621826, | |
| "response": "" | |
| }, | |
| { | |
| "test": "Action avec coordonnées", | |
| "difficulty": "easy", | |
| "score": 0, | |
| "time": 0.13967180252075195, | |
| "response": "" | |
| }, | |
| { | |
| "test": "Attaque spécifique", | |
| "difficulty": "medium", | |
| "score": 0, | |
| "time": 0.1264328956604004, | |
| "response": "" | |
| }, | |
| { | |
| "test": "Construction", | |
| "difficulty": "medium", | |
| "score": 0, | |
| "time": 0.14153170585632324, | |
| "response": "" | |
| }, | |
| { | |
| "test": "Commande complexe", | |
| "difficulty": "hard", | |
| "score": 0, | |
| "time": 0.13273906707763672, | |
| "response": "" | |
| } | |
| ], | |
| "type": "general" | |
| }, | |
| { | |
| "name": "Qwen3-1.7B", | |
| "file_size_mb": 1007.8267211914062, | |
| "avg_score": 3.0, | |
| "avg_time": 13.003729963302613, | |
| "efficiency": 0.23070303739513193, | |
| "results": [ | |
| { | |
| "test": "Commande simple", | |
| "difficulty": "easy", | |
| "score": 2, | |
| "time": 12.862720251083374, | |
| "response": ".\n\nYou must use the JSON format specified, without any additional text or explanation. The JSON must..." | |
| }, | |
| { | |
| "test": "Action avec coordonnées", | |
| "difficulty": "easy", | |
| "score": 2, | |
| "time": 12.972241401672363, | |
| "response": ".\n\nYou must use the correct tool name and format the JSON properly.\n\nThe game state is as follows:\n-..." | |
| }, | |
| { | |
| "test": "Attaque spécifique", | |
| "difficulty": "medium", | |
| "score": 3, | |
| "time": 13.497555255889893, | |
| "response": ".\n\nYou must use the correct tool name and format the JSON properly.\n\nThe tool to use is attack_unit...." | |
| }, | |
| { | |
| "test": "Construction", | |
| "difficulty": "medium", | |
| "score": 5, | |
| "time": 12.513315677642822, | |
| "response": ".\n\nYou must use the correct tool name and format the JSON properly.\n\nThe correct tool name is \"build..." | |
| }, | |
| { | |
| "test": "Commande complexe", | |
| "difficulty": "hard", | |
| "score": 3, | |
| "time": 13.17281723022461, | |
| "response": ".\n\nYou can use the following tool definitions:\n- move_units: move units to a new position\n- attack_u..." | |
| } | |
| ], | |
| "type": "general" | |
| }, | |
| { | |
| "name": "MCP-Instruct-v1", | |
| "file_size_mb": 697.0347290039062, | |
| "avg_score": 0.0, | |
| "avg_time": 0.1320805072784424, | |
| "efficiency": 0.0, | |
| "results": [ | |
| { | |
| "test": "Commande simple", | |
| "difficulty": "easy", | |
| "score": 0, | |
| "time": 0.6604025363922119, | |
| "response": "" | |
| }, | |
| { | |
| "test": "Action avec coordonnées", | |
| "difficulty": "easy", | |
| "score": 0, | |
| "time": 0, | |
| "error": "llama_decode returned -1" | |
| }, | |
| { | |
| "test": "Attaque spécifique", | |
| "difficulty": "medium", | |
| "score": 0, | |
| "time": 0, | |
| "error": "llama_decode returned -1" | |
| }, | |
| { | |
| "test": "Construction", | |
| "difficulty": "medium", | |
| "score": 0, | |
| "time": 0, | |
| "error": "llama_decode returned -1" | |
| }, | |
| { | |
| "test": "Commande complexe", | |
| "difficulty": "hard", | |
| "score": 0, | |
| "time": 0, | |
| "error": "llama_decode returned -1" | |
| } | |
| ], | |
| "type": "mcp_specialized" | |
| }, | |
| { | |
| "name": "MCPR L-3B-Exa", | |
| "file_size_mb": 1215.7023620605469, | |
| "avg_score": 0.0, | |
| "avg_time": 22.14646472930908, | |
| "efficiency": 0.0, | |
| "results": [ | |
| { | |
| "test": "Commande simple", | |
| "difficulty": "easy", | |
| "score": 0, | |
| "time": 22.817347049713135, | |
| "response": "+\\),),“), and“““““““““““““““““““““““““”“”““““““““““”“““““““““““““““““““““““““““““““““““““““initializ..." | |
| }, | |
| { | |
| "test": "Action avec coordonnées", | |
| "difficulty": "easy", | |
| "score": 0, | |
| "time": 21.51675510406494, | |
| "response": "+\\),),“),3“”“”“”“),),““““““““““”“),),),), and“),),), and@@ the Sty□ Sty□ Sty□ Sty□ Sty□ Sty□ Sty□ St..." | |
| }, | |
| { | |
| "test": "Attaque spécifique", | |
| "difficulty": "medium", | |
| "score": 0, | |
| "time": 22.118958473205566, | |
| "response": "+\\),),+\\),), and““”““““““““““““““““““““”“““““”“”“““““““““““““““““““““”“”““”““”““““““““““““““““““““““..." | |
| }, | |
| { | |
| "test": "Construction", | |
| "difficulty": "medium", | |
| "score": 0, | |
| "time": 22.297714471817017, | |
| "response": "+\\),),“), and@@ the Sty mini mini mini mini mini mini mini mini mini the““““”“),),+\\),),), and“),),)..." | |
| }, | |
| { | |
| "test": "Commande complexe", | |
| "difficulty": "hard", | |
| "score": 0, | |
| "time": 21.98154854774475, | |
| "response": "and@@ Sty@@patterns@@ mini@@ Sty@@ Sty mini mini mini mini mini mini mini mini mini the“““““““““”“““..." | |
| } | |
| ], | |
| "type": "mcp_specialized" | |
| }, | |
| { | |
| "name": "Gemma-3n-E2B-it", | |
| "file_size_mb": 1958.3001403808594, | |
| "avg_score": 0.0, | |
| "avg_time": 1.5714858055114747, | |
| "efficiency": 0.0, | |
| "results": [ | |
| { | |
| "test": "Commande simple", | |
| "difficulty": "easy", | |
| "score": 0, | |
| "time": 3.1773452758789062, | |
| "response": "" | |
| }, | |
| { | |
| "test": "Action avec coordonnées", | |
| "difficulty": "easy", | |
| "score": 0, | |
| "time": 1.1669323444366455, | |
| "response": "" | |
| }, | |
| { | |
| "test": "Attaque spécifique", | |
| "difficulty": "medium", | |
| "score": 0, | |
| "time": 1.1747264862060547, | |
| "response": "" | |
| }, | |
| { | |
| "test": "Construction", | |
| "difficulty": "medium", | |
| "score": 0, | |
| "time": 1.2873260974884033, | |
| "response": "" | |
| }, | |
| { | |
| "test": "Commande complexe", | |
| "difficulty": "hard", | |
| "score": 0, | |
| "time": 1.0510988235473633, | |
| "response": "" | |
| } | |
| ], | |
| "type": "mcp_specialized" | |
| }, | |
| { | |
| "name": "Llama-Breeze2-3B", | |
| "file_size_mb": 1424.04345703125, | |
| "avg_score": 3.6, | |
| "avg_time": 14.693956804275512, | |
| "efficiency": 0.24499867856917243, | |
| "results": [ | |
| { | |
| "test": "Commande simple", | |
| "difficulty": "easy", | |
| "score": 3, | |
| "time": 3.5608396530151367, | |
| "response": "[get_game_state()]" | |
| }, | |
| { | |
| "test": "Action avec coordonnées", | |
| "difficulty": "easy", | |
| "score": 3, | |
| "time": 5.626140356063843, | |
| "response": "[move_units(unit_ids='infantry', target_x='150', target_y='200')]" | |
| }, | |
| { | |
| "test": "Attaque spécifique", | |
| "difficulty": "medium", | |
| "score": 5, | |
| "time": 23.91610813140869, | |
| "response": "The tool used is \"get_game_state\" and the tool name is \"get_game_state\". The args for this tool is a..." | |
| }, | |
| { | |
| "test": "Construction", | |
| "difficulty": "medium", | |
| "score": 5, | |
| "time": 23.745216846466064, | |
| "response": "{\n \"tool\": \"build_building\",\n \"args\": {\n \"building_type\": \"power plant\",\n \"position_x\": 100,..." | |
| }, | |
| { | |
| "test": "Commande complexe", | |
| "difficulty": "hard", | |
| "score": 2, | |
| "time": 16.621479034423828, | |
| "response": "The game is ready for action. You are the AI, and you are ready to assist. You have the ability to m..." | |
| } | |
| ], | |
| "type": "general" | |
| }, | |
| { | |
| "name": "Qwen2.5-Coder-0.5B", | |
| "file_size_mb": 408.8690185546875, | |
| "avg_score": 4.4, | |
| "avg_time": 4.1166441440582275, | |
| "efficiency": 1.0688317585941343, | |
| "results": [ | |
| { | |
| "test": "Commande simple", | |
| "difficulty": "easy", | |
| "score": 5, | |
| "time": 4.25421667098999, | |
| "response": ".\n\nHere's a possible response:\n\n{\"tool\": \"get_game_state\", \"args\": {\"game_state\": {\"units\": [{\"id\": ..." | |
| }, | |
| { | |
| "test": "Action avec coordonnées", | |
| "difficulty": "easy", | |
| "score": 5, | |
| "time": 4.333646059036255, | |
| "response": "where \"tool_name\" is the name of the tool used and \"args\" is a dictionary containing the arguments p..." | |
| }, | |
| { | |
| "test": "Attaque spécifique", | |
| "difficulty": "medium", | |
| "score": 5, | |
| "time": 4.139528274536133, | |
| "response": "where \"tool_name\" is the name of the tool used and \"args\" is a dictionary containing the arguments p..." | |
| }, | |
| { | |
| "test": "Construction", | |
| "difficulty": "medium", | |
| "score": 2, | |
| "time": 3.9508562088012695, | |
| "response": "where \"tool_name\" is the name of the tool used and \"args\" is a JSON object containing the arguments ..." | |
| }, | |
| { | |
| "test": "Commande complexe", | |
| "difficulty": "hard", | |
| "score": 5, | |
| "time": 3.9049735069274902, | |
| "response": "where \"tool_name\" is the name of the tool used and \"args\" is a dictionary containing the arguments p..." | |
| } | |
| ], | |
| "type": "code_specialized" | |
| } | |
| ], | |
| "ranking_by_score": [ | |
| { | |
| "name": "Qwen2.5-Coder-0.5B", | |
| "file_size_mb": 408.8690185546875, | |
| "avg_score": 4.4, | |
| "avg_time": 4.1166441440582275, | |
| "efficiency": 1.0688317585941343, | |
| "results": [ | |
| { | |
| "test": "Commande simple", | |
| "difficulty": "easy", | |
| "score": 5, | |
| "time": 4.25421667098999, | |
| "response": ".\n\nHere's a possible response:\n\n{\"tool\": \"get_game_state\", \"args\": {\"game_state\": {\"units\": [{\"id\": ..." | |
| }, | |
| { | |
| "test": "Action avec coordonnées", | |
| "difficulty": "easy", | |
| "score": 5, | |
| "time": 4.333646059036255, | |
| "response": "where \"tool_name\" is the name of the tool used and \"args\" is a dictionary containing the arguments p..." | |
| }, | |
| { | |
| "test": "Attaque spécifique", | |
| "difficulty": "medium", | |
| "score": 5, | |
| "time": 4.139528274536133, | |
| "response": "where \"tool_name\" is the name of the tool used and \"args\" is a dictionary containing the arguments p..." | |
| }, | |
| { | |
| "test": "Construction", | |
| "difficulty": "medium", | |
| "score": 2, | |
| "time": 3.9508562088012695, | |
| "response": "where \"tool_name\" is the name of the tool used and \"args\" is a JSON object containing the arguments ..." | |
| }, | |
| { | |
| "test": "Commande complexe", | |
| "difficulty": "hard", | |
| "score": 5, | |
| "time": 3.9049735069274902, | |
| "response": "where \"tool_name\" is the name of the tool used and \"args\" is a dictionary containing the arguments p..." | |
| } | |
| ], | |
| "type": "code_specialized" | |
| }, | |
| { | |
| "name": "Llama-Breeze2-3B", | |
| "file_size_mb": 1424.04345703125, | |
| "avg_score": 3.6, | |
| "avg_time": 14.693956804275512, | |
| "efficiency": 0.24499867856917243, | |
| "results": [ | |
| { | |
| "test": "Commande simple", | |
| "difficulty": "easy", | |
| "score": 3, | |
| "time": 3.5608396530151367, | |
| "response": "[get_game_state()]" | |
| }, | |
| { | |
| "test": "Action avec coordonnées", | |
| "difficulty": "easy", | |
| "score": 3, | |
| "time": 5.626140356063843, | |
| "response": "[move_units(unit_ids='infantry', target_x='150', target_y='200')]" | |
| }, | |
| { | |
| "test": "Attaque spécifique", | |
| "difficulty": "medium", | |
| "score": 5, | |
| "time": 23.91610813140869, | |
| "response": "The tool used is \"get_game_state\" and the tool name is \"get_game_state\". The args for this tool is a..." | |
| }, | |
| { | |
| "test": "Construction", | |
| "difficulty": "medium", | |
| "score": 5, | |
| "time": 23.745216846466064, | |
| "response": "{\n \"tool\": \"build_building\",\n \"args\": {\n \"building_type\": \"power plant\",\n \"position_x\": 100,..." | |
| }, | |
| { | |
| "test": "Commande complexe", | |
| "difficulty": "hard", | |
| "score": 2, | |
| "time": 16.621479034423828, | |
| "response": "The game is ready for action. You are the AI, and you are ready to assist. You have the ability to m..." | |
| } | |
| ], | |
| "type": "general" | |
| }, | |
| { | |
| "name": "Qwen3-1.7B", | |
| "file_size_mb": 1007.8267211914062, | |
| "avg_score": 3.0, | |
| "avg_time": 13.003729963302613, | |
| "efficiency": 0.23070303739513193, | |
| "results": [ | |
| { | |
| "test": "Commande simple", | |
| "difficulty": "easy", | |
| "score": 2, | |
| "time": 12.862720251083374, | |
| "response": ".\n\nYou must use the JSON format specified, without any additional text or explanation. The JSON must..." | |
| }, | |
| { | |
| "test": "Action avec coordonnées", | |
| "difficulty": "easy", | |
| "score": 2, | |
| "time": 12.972241401672363, | |
| "response": ".\n\nYou must use the correct tool name and format the JSON properly.\n\nThe game state is as follows:\n-..." | |
| }, | |
| { | |
| "test": "Attaque spécifique", | |
| "difficulty": "medium", | |
| "score": 3, | |
| "time": 13.497555255889893, | |
| "response": ".\n\nYou must use the correct tool name and format the JSON properly.\n\nThe tool to use is attack_unit...." | |
| }, | |
| { | |
| "test": "Construction", | |
| "difficulty": "medium", | |
| "score": 5, | |
| "time": 12.513315677642822, | |
| "response": ".\n\nYou must use the correct tool name and format the JSON properly.\n\nThe correct tool name is \"build..." | |
| }, | |
| { | |
| "test": "Commande complexe", | |
| "difficulty": "hard", | |
| "score": 3, | |
| "time": 13.17281723022461, | |
| "response": ".\n\nYou can use the following tool definitions:\n- move_units: move units to a new position\n- attack_u..." | |
| } | |
| ], | |
| "type": "general" | |
| }, | |
| { | |
| "name": "Qwen3-0.6B", | |
| "file_size_mb": 609.8238830566406, | |
| "avg_score": 2.8, | |
| "avg_time": 8.223706769943238, | |
| "efficiency": 0.3404790659892809, | |
| "results": [ | |
| { | |
| "test": "Commande simple", | |
| "difficulty": "easy", | |
| "score": 7, | |
| "time": 8.638539791107178, | |
| "response": ".\n\nIf the user command is not supported by the available tools, respond with an empty array.\n\nNow, t..." | |
| }, | |
| { | |
| "test": "Action avec coordonnées", | |
| "difficulty": "easy", | |
| "score": 5, | |
| "time": 8.075484991073608, | |
| "response": ".\n\nMake sure to use the correct tool name and format the JSON correctly.\n\nIf the command is not poss..." | |
| }, | |
| { | |
| "test": "Attaque spécifique", | |
| "difficulty": "medium", | |
| "score": 0, | |
| "time": 7.951770067214966, | |
| "response": ".\n\nMake sure to use the correct tool names and format the JSON correctly.\n\nNow, the game state is as..." | |
| }, | |
| { | |
| "test": "Construction", | |
| "difficulty": "medium", | |
| "score": 2, | |
| "time": 8.252855062484741, | |
| "response": ".\n\nMake sure to use the correct tool names and format the JSON properly.\n\nNow, the user is in a game..." | |
| }, | |
| { | |
| "test": "Commande complexe", | |
| "difficulty": "hard", | |
| "score": 0, | |
| "time": 8.199883937835693, | |
| "response": ".\n\nMake sure to use the correct tool name and format the JSON.\n\nIf the command is not possible, retu..." | |
| } | |
| ], | |
| "type": "general" | |
| }, | |
| { | |
| "name": "Qwen2.5-0.5B", | |
| "file_size_mb": 408.8689880371094, | |
| "avg_score": 2.6, | |
| "avg_time": 2.6360722541809083, | |
| "efficiency": 0.9863159084036122, | |
| "results": [ | |
| { | |
| "test": "Commande simple", | |
| "difficulty": "easy", | |
| "score": 0, | |
| "time": 0.31192469596862793, | |
| "response": "" | |
| }, | |
| { | |
| "test": "Action avec coordonnées", | |
| "difficulty": "easy", | |
| "score": 0, | |
| "time": 0.18253064155578613, | |
| "response": "" | |
| }, | |
| { | |
| "test": "Attaque spécifique", | |
| "difficulty": "medium", | |
| "score": 7, | |
| "time": 4.232211351394653, | |
| "response": "Where tool_name is the name of the tool used and args is a dictionary containing the arguments for t..." | |
| }, | |
| { | |
| "test": "Construction", | |
| "difficulty": "medium", | |
| "score": 2, | |
| "time": 4.225749492645264, | |
| "response": "Where tool_name is the name of the tool and args is a dictionary with the arguments. If no arguments..." | |
| }, | |
| { | |
| "test": "Commande complexe", | |
| "difficulty": "hard", | |
| "score": 4, | |
| "time": 4.22794508934021, | |
| "response": "where tool_name is the name of the tool and args is a dictionary containing the arguments. If no too..." | |
| } | |
| ], | |
| "type": "general" | |
| } | |
| ], | |
| "ranking_by_efficiency": [ | |
| { | |
| "name": "Qwen2.5-Coder-0.5B", | |
| "file_size_mb": 408.8690185546875, | |
| "avg_score": 4.4, | |
| "avg_time": 4.1166441440582275, | |
| "efficiency": 1.0688317585941343, | |
| "results": [ | |
| { | |
| "test": "Commande simple", | |
| "difficulty": "easy", | |
| "score": 5, | |
| "time": 4.25421667098999, | |
| "response": ".\n\nHere's a possible response:\n\n{\"tool\": \"get_game_state\", \"args\": {\"game_state\": {\"units\": [{\"id\": ..." | |
| }, | |
| { | |
| "test": "Action avec coordonnées", | |
| "difficulty": "easy", | |
| "score": 5, | |
| "time": 4.333646059036255, | |
| "response": "where \"tool_name\" is the name of the tool used and \"args\" is a dictionary containing the arguments p..." | |
| }, | |
| { | |
| "test": "Attaque spécifique", | |
| "difficulty": "medium", | |
| "score": 5, | |
| "time": 4.139528274536133, | |
| "response": "where \"tool_name\" is the name of the tool used and \"args\" is a dictionary containing the arguments p..." | |
| }, | |
| { | |
| "test": "Construction", | |
| "difficulty": "medium", | |
| "score": 2, | |
| "time": 3.9508562088012695, | |
| "response": "where \"tool_name\" is the name of the tool used and \"args\" is a JSON object containing the arguments ..." | |
| }, | |
| { | |
| "test": "Commande complexe", | |
| "difficulty": "hard", | |
| "score": 5, | |
| "time": 3.9049735069274902, | |
| "response": "where \"tool_name\" is the name of the tool used and \"args\" is a dictionary containing the arguments p..." | |
| } | |
| ], | |
| "type": "code_specialized" | |
| }, | |
| { | |
| "name": "Qwen2.5-0.5B", | |
| "file_size_mb": 408.8689880371094, | |
| "avg_score": 2.6, | |
| "avg_time": 2.6360722541809083, | |
| "efficiency": 0.9863159084036122, | |
| "results": [ | |
| { | |
| "test": "Commande simple", | |
| "difficulty": "easy", | |
| "score": 0, | |
| "time": 0.31192469596862793, | |
| "response": "" | |
| }, | |
| { | |
| "test": "Action avec coordonnées", | |
| "difficulty": "easy", | |
| "score": 0, | |
| "time": 0.18253064155578613, | |
| "response": "" | |
| }, | |
| { | |
| "test": "Attaque spécifique", | |
| "difficulty": "medium", | |
| "score": 7, | |
| "time": 4.232211351394653, | |
| "response": "Where tool_name is the name of the tool used and args is a dictionary containing the arguments for t..." | |
| }, | |
| { | |
| "test": "Construction", | |
| "difficulty": "medium", | |
| "score": 2, | |
| "time": 4.225749492645264, | |
| "response": "Where tool_name is the name of the tool and args is a dictionary with the arguments. If no arguments..." | |
| }, | |
| { | |
| "test": "Commande complexe", | |
| "difficulty": "hard", | |
| "score": 4, | |
| "time": 4.22794508934021, | |
| "response": "where tool_name is the name of the tool and args is a dictionary containing the arguments. If no too..." | |
| } | |
| ], | |
| "type": "general" | |
| }, | |
| { | |
| "name": "Qwen3-0.6B", | |
| "file_size_mb": 609.8238830566406, | |
| "avg_score": 2.8, | |
| "avg_time": 8.223706769943238, | |
| "efficiency": 0.3404790659892809, | |
| "results": [ | |
| { | |
| "test": "Commande simple", | |
| "difficulty": "easy", | |
| "score": 7, | |
| "time": 8.638539791107178, | |
| "response": ".\n\nIf the user command is not supported by the available tools, respond with an empty array.\n\nNow, t..." | |
| }, | |
| { | |
| "test": "Action avec coordonnées", | |
| "difficulty": "easy", | |
| "score": 5, | |
| "time": 8.075484991073608, | |
| "response": ".\n\nMake sure to use the correct tool name and format the JSON correctly.\n\nIf the command is not poss..." | |
| }, | |
| { | |
| "test": "Attaque spécifique", | |
| "difficulty": "medium", | |
| "score": 0, | |
| "time": 7.951770067214966, | |
| "response": ".\n\nMake sure to use the correct tool names and format the JSON correctly.\n\nNow, the game state is as..." | |
| }, | |
| { | |
| "test": "Construction", | |
| "difficulty": "medium", | |
| "score": 2, | |
| "time": 8.252855062484741, | |
| "response": ".\n\nMake sure to use the correct tool names and format the JSON properly.\n\nNow, the user is in a game..." | |
| }, | |
| { | |
| "test": "Commande complexe", | |
| "difficulty": "hard", | |
| "score": 0, | |
| "time": 8.199883937835693, | |
| "response": ".\n\nMake sure to use the correct tool name and format the JSON.\n\nIf the command is not possible, retu..." | |
| } | |
| ], | |
| "type": "general" | |
| }, | |
| { | |
| "name": "Llama-Breeze2-3B", | |
| "file_size_mb": 1424.04345703125, | |
| "avg_score": 3.6, | |
| "avg_time": 14.693956804275512, | |
| "efficiency": 0.24499867856917243, | |
| "results": [ | |
| { | |
| "test": "Commande simple", | |
| "difficulty": "easy", | |
| "score": 3, | |
| "time": 3.5608396530151367, | |
| "response": "[get_game_state()]" | |
| }, | |
| { | |
| "test": "Action avec coordonnées", | |
| "difficulty": "easy", | |
| "score": 3, | |
| "time": 5.626140356063843, | |
| "response": "[move_units(unit_ids='infantry', target_x='150', target_y='200')]" | |
| }, | |
| { | |
| "test": "Attaque spécifique", | |
| "difficulty": "medium", | |
| "score": 5, | |
| "time": 23.91610813140869, | |
| "response": "The tool used is \"get_game_state\" and the tool name is \"get_game_state\". The args for this tool is a..." | |
| }, | |
| { | |
| "test": "Construction", | |
| "difficulty": "medium", | |
| "score": 5, | |
| "time": 23.745216846466064, | |
| "response": "{\n \"tool\": \"build_building\",\n \"args\": {\n \"building_type\": \"power plant\",\n \"position_x\": 100,..." | |
| }, | |
| { | |
| "test": "Commande complexe", | |
| "difficulty": "hard", | |
| "score": 2, | |
| "time": 16.621479034423828, | |
| "response": "The game is ready for action. You are the AI, and you are ready to assist. You have the ability to m..." | |
| } | |
| ], | |
| "type": "general" | |
| }, | |
| { | |
| "name": "Qwen3-1.7B", | |
| "file_size_mb": 1007.8267211914062, | |
| "avg_score": 3.0, | |
| "avg_time": 13.003729963302613, | |
| "efficiency": 0.23070303739513193, | |
| "results": [ | |
| { | |
| "test": "Commande simple", | |
| "difficulty": "easy", | |
| "score": 2, | |
| "time": 12.862720251083374, | |
| "response": ".\n\nYou must use the JSON format specified, without any additional text or explanation. The JSON must..." | |
| }, | |
| { | |
| "test": "Action avec coordonnées", | |
| "difficulty": "easy", | |
| "score": 2, | |
| "time": 12.972241401672363, | |
| "response": ".\n\nYou must use the correct tool name and format the JSON properly.\n\nThe game state is as follows:\n-..." | |
| }, | |
| { | |
| "test": "Attaque spécifique", | |
| "difficulty": "medium", | |
| "score": 3, | |
| "time": 13.497555255889893, | |
| "response": ".\n\nYou must use the correct tool name and format the JSON properly.\n\nThe tool to use is attack_unit...." | |
| }, | |
| { | |
| "test": "Construction", | |
| "difficulty": "medium", | |
| "score": 5, | |
| "time": 12.513315677642822, | |
| "response": ".\n\nYou must use the correct tool name and format the JSON properly.\n\nThe correct tool name is \"build..." | |
| }, | |
| { | |
| "test": "Commande complexe", | |
| "difficulty": "hard", | |
| "score": 3, | |
| "time": 13.17281723022461, | |
| "response": ".\n\nYou can use the following tool definitions:\n- move_units: move units to a new position\n- attack_u..." | |
| } | |
| ], | |
| "type": "general" | |
| } | |
| ], | |
| "best_overall": { | |
| "name": "Qwen2.5-Coder-0.5B", | |
| "file_size_mb": 408.8690185546875, | |
| "avg_score": 4.4, | |
| "avg_time": 4.1166441440582275, | |
| "efficiency": 1.0688317585941343, | |
| "results": [ | |
| { | |
| "test": "Commande simple", | |
| "difficulty": "easy", | |
| "score": 5, | |
| "time": 4.25421667098999, | |
| "response": ".\n\nHere's a possible response:\n\n{\"tool\": \"get_game_state\", \"args\": {\"game_state\": {\"units\": [{\"id\": ..." | |
| }, | |
| { | |
| "test": "Action avec coordonnées", | |
| "difficulty": "easy", | |
| "score": 5, | |
| "time": 4.333646059036255, | |
| "response": "where \"tool_name\" is the name of the tool used and \"args\" is a dictionary containing the arguments p..." | |
| }, | |
| { | |
| "test": "Attaque spécifique", | |
| "difficulty": "medium", | |
| "score": 5, | |
| "time": 4.139528274536133, | |
| "response": "where \"tool_name\" is the name of the tool used and \"args\" is a dictionary containing the arguments p..." | |
| }, | |
| { | |
| "test": "Construction", | |
| "difficulty": "medium", | |
| "score": 2, | |
| "time": 3.9508562088012695, | |
| "response": "where \"tool_name\" is the name of the tool used and \"args\" is a JSON object containing the arguments ..." | |
| }, | |
| { | |
| "test": "Commande complexe", | |
| "difficulty": "hard", | |
| "score": 5, | |
| "time": 3.9049735069274902, | |
| "response": "where \"tool_name\" is the name of the tool used and \"args\" is a dictionary containing the arguments p..." | |
| } | |
| ], | |
| "type": "code_specialized" | |
| }, | |
| "most_efficient": { | |
| "name": "Qwen2.5-Coder-0.5B", | |
| "file_size_mb": 408.8690185546875, | |
| "avg_score": 4.4, | |
| "avg_time": 4.1166441440582275, | |
| "efficiency": 1.0688317585941343, | |
| "results": [ | |
| { | |
| "test": "Commande simple", | |
| "difficulty": "easy", | |
| "score": 5, | |
| "time": 4.25421667098999, | |
| "response": ".\n\nHere's a possible response:\n\n{\"tool\": \"get_game_state\", \"args\": {\"game_state\": {\"units\": [{\"id\": ..." | |
| }, | |
| { | |
| "test": "Action avec coordonnées", | |
| "difficulty": "easy", | |
| "score": 5, | |
| "time": 4.333646059036255, | |
| "response": "where \"tool_name\" is the name of the tool used and \"args\" is a dictionary containing the arguments p..." | |
| }, | |
| { | |
| "test": "Attaque spécifique", | |
| "difficulty": "medium", | |
| "score": 5, | |
| "time": 4.139528274536133, | |
| "response": "where \"tool_name\" is the name of the tool used and \"args\" is a dictionary containing the arguments p..." | |
| }, | |
| { | |
| "test": "Construction", | |
| "difficulty": "medium", | |
| "score": 2, | |
| "time": 3.9508562088012695, | |
| "response": "where \"tool_name\" is the name of the tool used and \"args\" is a JSON object containing the arguments ..." | |
| }, | |
| { | |
| "test": "Commande complexe", | |
| "difficulty": "hard", | |
| "score": 5, | |
| "time": 3.9049735069274902, | |
| "response": "where \"tool_name\" is the name of the tool used and \"args\" is a dictionary containing the arguments p..." | |
| } | |
| ], | |
| "type": "code_specialized" | |
| } | |
| } |