---
# Architecture parameters for a set of Hugging Face models, keyed by model ID.
# Field names mirror each model's `config.json` (hidden_size, num_hidden_layers,
# num_attention_heads, num_key_value_heads, vocab_size, max_position_embeddings,
# intermediate_size). Values are copied verbatim from the upstream configs and
# should be updated only against those configs.
#
# NOTE(review): MoE metadata is recorded inconsistently across entries — Mixtral
# and the Qwen3-VL 235B entries keep it under `notes.moe`, while the gpt-oss
# entries carry `num_local_experts` / `num_experts_per_tok` inline. Preserved
# as-is; confirm which shape the consumer expects before normalizing.
models:
  "meta-llama/Meta-Llama-3-70B-Instruct":
    hidden_size: 8192
    num_hidden_layers: 80
    num_attention_heads: 64
    num_key_value_heads: 8
    vocab_size: 128256
    max_position_embeddings: 8192
    intermediate_size: 28672

  "meta-llama/Meta-Llama-3-8B-Instruct":
    hidden_size: 4096
    num_hidden_layers: 32
    num_attention_heads: 32
    num_key_value_heads: 8
    vocab_size: 128256
    max_position_embeddings: 8192
    intermediate_size: 14336

  "mistralai/Mixtral-8x7B-Instruct-v0.1":
    hidden_size: 4096
    num_hidden_layers: 32
    num_attention_heads: 32
    num_key_value_heads: 8
    vocab_size: 32000
    max_position_embeddings: 32768
    intermediate_size: 14336
    notes:
      moe:
        num_local_experts: 8
        num_experts_per_tok: 2

  "mistralai/Mistral-7B-Instruct-v0.3":
    hidden_size: 4096
    num_hidden_layers: 32
    num_attention_heads: 32
    num_key_value_heads: 8
    vocab_size: 32768
    max_position_embeddings: 32768
    intermediate_size: 14336

  "google/gemma-7b":
    hidden_size: 3072
    num_hidden_layers: 28
    num_attention_heads: 16
    num_key_value_heads: 16
    vocab_size: 256000
    max_position_embeddings: 8192
    intermediate_size: 24576

  "openai/gpt-oss-20b":
    hidden_size: 2880
    num_hidden_layers: 24
    num_attention_heads: 64
    num_key_value_heads: 8
    vocab_size: 201088
    max_position_embeddings: 131072
    intermediate_size: 2880
    num_local_experts: 32
    num_experts_per_tok: 4

  "openai/gpt-oss-120b":
    hidden_size: 2880
    num_hidden_layers: 36
    num_attention_heads: 64
    num_key_value_heads: 8
    vocab_size: 201088
    max_position_embeddings: 131072
    intermediate_size: 2880
    num_local_experts: 128
    num_experts_per_tok: 4

  # Multimodal Qwen entries nest the language-model parameters under
  # `text_config`, matching the HF VL config layout.
  # NOTE(review): the flattened source made the nesting level of `torch_dtype`
  # ambiguous; placed at the model level (sibling of `text_config`) per HF
  # config convention — confirm against the upstream config.json.
  "Qwen/Qwen3-VL-235B-A22B-Thinking":
    text_config:
      hidden_size: 8192
      num_hidden_layers: 96
      num_attention_heads: 64
      num_key_value_heads: 8
      vocab_size: 151936
      max_position_embeddings: 262144
      intermediate_size: 24576
    torch_dtype: bfloat16
    notes:
      moe:
        num_experts: 512
        num_experts_per_tok: 10

  "Qwen/Qwen3-VL-235B-A22B-Instruct":
    text_config:
      hidden_size: 8192
      num_hidden_layers: 96
      num_attention_heads: 64
      num_key_value_heads: 8
      vocab_size: 151936
      max_position_embeddings: 262144
      intermediate_size: 24576
    torch_dtype: bfloat16
    notes:
      moe:
        num_experts: 512
        num_experts_per_tok: 10

  # NOTE(review): the "A3B" naming suggests these are MoE models, but the
  # source carried no `notes.moe` block for them; left absent — verify.
  "Qwen/Qwen3-VL-30B-A3B-Thinking":
    text_config:
      hidden_size: 6144
      num_hidden_layers: 80
      num_attention_heads: 48
      num_key_value_heads: 8
      vocab_size: 151936
      max_position_embeddings: 262144
      intermediate_size: 16384
    torch_dtype: bfloat16

  "Qwen/Qwen3-VL-30B-A3B-Instruct":
    text_config:
      hidden_size: 6144
      num_hidden_layers: 80
      num_attention_heads: 48
      num_key_value_heads: 8
      vocab_size: 151936
      max_position_embeddings: 262144
      intermediate_size: 16384
    torch_dtype: bfloat16