{ "evaluation_info": { "title": "Comprehensive MCP Capability Evaluation - 14 Models", "date": "2025-10-05", "total_models": 14, "framework": "llama-cpp-python 0.3.16", "test_scenarios": 3, "max_score_per_test": 10 }, "test_scenarios": [ { "id": 1, "description": "Get the current game state", "expected_tool": "get_game_state", "scoring": { "correct_tool": 4, "valid_json": 3, "proper_terminology": 2, "coordinates": 1 } }, { "id": 2, "description": "Move infantry unit to coordinates 150,200", "expected_tool": "move_units", "expected_coordinates": [150, 200], "scoring": { "correct_tool": 4, "valid_json": 3, "proper_terminology": 2, "coordinates": 1 } }, { "id": 3, "description": "Attack enemy tank at location 300,150", "expected_tool": "attack_unit", "expected_coordinates": [300, 150], "scoring": { "correct_tool": 4, "valid_json": 3, "proper_terminology": 2, "coordinates": 1 } } ], "results": [ { "rank": 1, "model_name": "Qwen2.5-Coder-1.5B-Q4", "filename": "qwen2.5-coder-1.5b-instruct-q4_0.gguf", "avg_score": 9.7, "avg_time": 4.12, "size_mb": 1017, "efficiency": 2.34, "status": "champion", "notes": "Exceptional MCP performance. Wraps JSON in markdown code blocks. 10/10 on tests 2 and 3, 9/10 on test 1.", "detailed_scores": [9, 10, 10] }, { "rank": 2, "model_name": "Qwen2.5-Coder-0.5B", "filename": "qwen2.5-coder-0.5b-instruct-q4_0.gguf", "avg_score": 4.3, "avg_time": 2.08, "size_mb": 409, "efficiency": 2.08, "status": "previous_champion", "notes": "Best budget option. Good balance of size and performance." }, { "rank": 3, "model_name": "Qwen3-0.6B", "filename": "Qwen3-0.6B-Q8_0.gguf", "avg_score": 3.7, "avg_time": 3.98, "size_mb": 610, "efficiency": 0.92, "status": "functional" }, { "rank": 4, "model_name": "Gemma-3-270M", "filename": "gemma-3-270m-it-qat-Q8_0.gguf", "avg_score": 3.7, "avg_time": 2.29, "size_mb": 428, "efficiency": 1.60, "status": "functional", "notes": "Ultra-lightweight champion. Excellent efficiency for its tiny size." 
}, { "rank": 5, "model_name": "MCPR-L-3B-Exa-Q8", "filename": "mcprl-3b-exa.Q8_0.gguf", "avg_score": 3.7, "avg_time": 17.42, "size_mb": 3133, "efficiency": 0.21, "status": "functional", "notes": "MCP-specialized but slow. Large size, poor efficiency." }, { "rank": 6, "model_name": "Gemma-3n-E2B-it-Q8", "filename": "google_gemma-3n-E2B-it-Q8_0.gguf", "avg_score": 3.7, "avg_time": 14.80, "size_mb": 4566, "efficiency": 0.25, "status": "functional", "notes": "Largest model tested. Poor efficiency despite high-precision Q8 quantization." }, { "rank": 7, "model_name": "Qwen3-1.7B", "filename": "Qwen3-1.7B-Q4_0.gguf", "avg_score": 3.7, "avg_time": 6.24, "size_mb": 1008, "efficiency": 0.59, "status": "functional" }, { "rank": 8, "model_name": "Qwen2.5-0.5B", "filename": "qwen2.5-0.5b-instruct-q4_0.gguf", "avg_score": 2.7, "avg_time": 1.17, "size_mb": 409, "efficiency": 2.28, "status": "functional", "notes": "Fast but limited MCP capability. General-purpose model." }, { "rank": 9, "model_name": "Gemma-3n-E2B-it-IQ2", "filename": "gemma-3n-E2B-it-UD-IQ2_XXS.gguf", "avg_score": 2.3, "avg_time": 14.11, "size_mb": 1958, "efficiency": 0.17, "status": "functional", "notes": "Heavy quantization impacts quality." }, { "rank": 10, "model_name": "Llama-Breeze2-3B-Q2", "filename": "Llama-Breeze2-3B-Instruct-Text.Q2_K.gguf", "avg_score": 1.3, "avg_time": 11.39, "size_mb": 1424, "efficiency": 0.12, "status": "functional", "notes": "Poor performance. Q2 quantization too aggressive." }, { "rank": 11, "model_name": "Yi-Coder-1.5B-Q4", "filename": "Yi-Coder-1.5B.Q4_0.gguf", "avg_score": 0.0, "avg_time": 11.64, "size_mb": 826, "efficiency": 0.0, "status": "failed", "notes": "Prompt format incompatibility. Returns system prompt instead of generating responses."
}, { "rank": 12, "model_name": "MCP-Instruct-v1-Q4", "filename": "mcp-instruct-v1.Q4_K_M.gguf", "avg_score": 0.0, "avg_time": 0.0, "size_mb": 697, "efficiency": 0.0, "status": "failed", "notes": "Technical error: llama_decode returned -1" }, { "rank": 13, "model_name": "MCPR-L-3B-Exa-Q2", "filename": "mcprl-3b-exa.Q2_K.gguf", "avg_score": 0.0, "avg_time": 10.63, "size_mb": 1216, "efficiency": 0.0, "status": "failed", "notes": "Produces gibberish output. Q2 quantization too aggressive for this architecture." }, { "rank": 14, "model_name": "MCP-Instruct-v1-Q8", "filename": "mcp-instruct-v1.Q8_0.gguf", "avg_score": 0.0, "avg_time": 0.0, "size_mb": 1465, "efficiency": 0.0, "status": "failed", "notes": "Technical error: llama_decode returned -1. Same issue as Q4 version." } ], "key_insights": { "champion": { "model": "Qwen2.5-Coder-1.5B-Q4", "score": 9.7, "reason": "Code-specialized models excel at structured JSON generation. Near-perfect MCP capability." }, "scaling_effect": { "observation": "Increasing parameters from 0.5B to 1.5B more than doubled MCP score (4.3 → 9.7)", "conclusion": "Parameter scaling works exceptionally well for code-specialized models" }, "mcp_specialized_disappointment": { "observation": "MCP-Instruct models completely failed. MCPR-L models scored only 3.7/10 at best.", "conclusion": "MCP specialization alone is insufficient. Code training provides better foundation." }, "quantization_impact": { "observation": "Q2 quantization caused failures or poor performance. Q4 and Q8 showed no quantization-related failures; the Q4/Q8 models that failed (MCP-Instruct-v1, Yi-Coder-1.5B) hit runtime or prompt-format errors instead.", "conclusion": "Avoid Q2 quantization for MCP tasks. Q4 offers best size/quality tradeoff."
}, "size_efficiency": { "observation": "Gemma-3-270M (428MB) matched the 3.7/10 score of the 3133MB MCPR-L-3B-Exa-Q8", "conclusion": "Larger models don't guarantee better MCP performance" } }, "recommendations": { "primary": { "model": "Qwen2.5-Coder-1.5B-Q4", "use_case": "Production deployments requiring high-quality MCP", "requirement": "JSON extraction logic to handle markdown code blocks" }, "budget": { "model": "Qwen2.5-Coder-0.5B", "use_case": "Resource-constrained environments", "advantage": "About 2.5x smaller and 2x faster than Qwen2.5-Coder-1.5B-Q4, still 4.3/10 performance" }, "ultra_lightweight": { "model": "Gemma-3-270M", "use_case": "Edge devices, embedded systems", "advantage": "Only 428MB, decent 3.7/10 performance" }, "avoid": [ { "model": "MCP-Instruct-v1 (all versions)", "reason": "Technical incompatibility with llama.cpp" }, { "model": "Yi-Coder-1.5B", "reason": "Prompt format incompatibility" }, { "model": "Any Q2 quantization", "reason": "Too aggressive, causes failures or gibberish" } ] } }