```yaml
models:
  "meta-llama/Meta-Llama-3-70B-Instruct":
    hidden_size: 8192
    num_hidden_layers: 80
    num_attention_heads: 64
    num_key_value_heads: 8
    vocab_size: 128256
    max_position_embeddings: 8192
    intermediate_size: 28672
  "meta-llama/Meta-Llama-3-8B-Instruct":
    hidden_size: 4096
    num_hidden_layers: 32
    num_attention_heads: 32
    num_key_value_heads: 8
    vocab_size: 128256
    max_position_embeddings: 8192
    intermediate_size: 14336
```
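These fields mirror what each checkpoint publishes in its `config.json`, so they can be read programmatically rather than copied by hand. A minimal sketch using `transformers.AutoConfig` (assumes the `transformers` package is installed and, for gated repositories such as the Llama 3 checkpoints, an authenticated Hugging Face token):

```python
from transformers import AutoConfig

# Fetches only config.json, not the model weights.
cfg = AutoConfig.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
print(cfg.hidden_size)               # 4096
print(cfg.num_hidden_layers)         # 32
print(cfg.num_attention_heads)       # 32
print(cfg.num_key_value_heads)       # 8
print(cfg.vocab_size)                # 128256
print(cfg.max_position_embeddings)   # 8192
print(cfg.intermediate_size)         # 14336
```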
| "mistralai/Mixtral-8x7B-Instruct-v0.1": | |
| hidden_size: 4096 | |
| num_hidden_layers: 32 | |
| num_attention_heads: 32 | |
| num_key_value_heads: 8 | |
| vocab_size: 32000 | |
| max_position_embeddings: 32768 | |
| intermediate_size: 14336 | |
| notes: | |
| moe: | |
| num_local_experts: 8 | |
| num_experts_per_tok: 2 | |
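For the mixture-of-experts entries, the expert counts determine the gap between total and per-token (active) parameters: each token is routed to `num_experts_per_tok` of the `num_local_experts` feed-forward blocks. A rough back-of-the-envelope sketch, which ignores norm, bias, and router weights and assumes Llama/Mistral-style attention projections with untied embeddings:

```python
def estimate_params(hidden, layers, heads, kv_heads, vocab, inter,
                    num_experts=1, experts_per_tok=1):
    # Rough estimate: embeddings + per-layer attention + SwiGLU FFN(s).
    # Assumes head_dim = hidden // heads (true for the Llama/Mistral entries above)
    # and untied input/output embeddings; norms and the MoE router are ignored.
    head_dim = hidden // heads
    attn = 2 * hidden * hidden + 2 * hidden * kv_heads * head_dim  # q, o + k, v
    ffn = 3 * hidden * inter                                       # gate, up, down
    embed = 2 * vocab * hidden
    total = embed + layers * (attn + num_experts * ffn)
    active = embed + layers * (attn + experts_per_tok * ffn)
    return total, active

total, active = estimate_params(4096, 32, 32, 8, 32000, 14336,
                                num_experts=8, experts_per_tok=2)
print(f"Mixtral-8x7B: ~{total/1e9:.1f}B total, ~{active/1e9:.1f}B active per token")
# Lands close to the commonly cited ~46.7B total / ~12.9B active figures.
```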
| "mistralai/Mistral-7B-Instruct-v0.3": | |
| hidden_size: 4096 | |
| num_hidden_layers: 32 | |
| num_attention_heads: 32 | |
| num_key_value_heads: 8 | |
| vocab_size: 32768 | |
| max_position_embeddings: 32768 | |
| intermediate_size: 14336 | |
| "google/gemma-7b": | |
| hidden_size: 3072 | |
| num_hidden_layers: 28 | |
| num_attention_heads: 16 | |
| num_key_value_heads: 16 | |
| vocab_size: 256000 | |
| max_position_embeddings: 8192 | |
| intermediate_size: 24576 | |
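The `num_key_value_heads` field is what separates grouped-query attention from full multi-head attention: gemma-7b keeps one KV head per query head (16/16), while the Llama 3 and Mistral entries share their 8 KV heads across many query heads. That ratio drives the KV-cache footprint. A sketch of the per-token cost, assuming bf16 and a head_dim read from each published config (hidden_size / num_attention_heads happens to be 128 for the Llama/Mistral entries, but gemma-7b uses a head_dim of 256, so the value should not be derived blindly):

```python
def kv_cache_bytes_per_token(layers, kv_heads, head_dim, dtype_bytes=2):
    # One K and one V tensor per layer, bf16 (2 bytes) by default.
    return 2 * layers * kv_heads * head_dim * dtype_bytes

# head_dim values taken from the published configs, not derived from hidden_size.
print(kv_cache_bytes_per_token(80, 8, 128) / 1024)   # Meta-Llama-3-70B: 320 KiB/token
print(kv_cache_bytes_per_token(32, 8, 128) / 1024)   # Mistral-7B-v0.3:  128 KiB/token
print(kv_cache_bytes_per_token(28, 16, 256) / 1024)  # gemma-7b:         448 KiB/token
```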
| "openai/gpt-oss-20b": | |
| hidden_size: 2880 | |
| num_hidden_layers: 24 | |
| num_attention_heads: 64 | |
| num_key_value_heads: 8 | |
| vocab_size: 201088 | |
| max_position_embeddings: 131072 | |
| intermediate_size: 2880 | |
| num_local_experts: 32 | |
| num_experts_per_tok: 4 | |
| "openai/gpt-oss-120b": | |
| hidden_size: 2880 | |
| num_hidden_layers: 36 | |
| num_attention_heads: 64 | |
| num_key_value_heads: 8 | |
| vocab_size: 201088 | |
| max_position_embeddings: 131072 | |
| intermediate_size: 2880 | |
| num_local_experts: 128 | |
| num_experts_per_tok: 4 | |
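The two gpt-oss checkpoints share the same hidden and expert width (2880) and the same top-4 routing; the 120b variant scales mainly by adding layers and experts, so its per-token active fraction drops. A quick sketch of the routing sparsity:

```python
# Fraction of expert FFN weights touched per token, from the fields above.
for name, experts, per_tok in [("gpt-oss-20b", 32, 4), ("gpt-oss-120b", 128, 4)]:
    print(f"{name}: {per_tok}/{experts} experts active per token "
          f"({per_tok / experts:.1%} of the expert FFN weights)")
```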
| "Qwen/Qwen3-VL-235B-A22B-Thinking": | |
| text_config: | |
| hidden_size: 8192 | |
| num_hidden_layers: 96 | |
| num_attention_heads: 64 | |
| num_key_value_heads: 8 | |
| vocab_size: 151936 | |
| max_position_embeddings: 262144 | |
| intermediate_size: 24576 | |
| torch_dtype: bfloat16 | |
| notes: | |
| moe: | |
| num_experts: 512 | |
| num_experts_per_tok: 10 | |
| "Qwen/Qwen3-VL-235B-A22B-Instruct": | |
| text_config: | |
| hidden_size: 8192 | |
| num_hidden_layers: 96 | |
| num_attention_heads: 64 | |
| num_key_value_heads: 8 | |
| vocab_size: 151936 | |
| max_position_embeddings: 262144 | |
| intermediate_size: 24576 | |
| torch_dtype: bfloat16 | |
| notes: | |
| moe: | |
| num_experts: 512 | |
| num_experts_per_tok: 10 | |
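The Qwen3-VL entries are multimodal, so the language-model fields sit under a nested `text_config` rather than at the top level of the config. A sketch of reading them (assumes a `transformers` release recent enough to ship the Qwen3-VL configuration class):

```python
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("Qwen/Qwen3-VL-235B-A22B-Instruct")
text = cfg.text_config  # nested language-model config
print(text.hidden_size, text.num_hidden_layers,
      text.num_attention_heads, text.num_key_value_heads)
```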
| "Qwen/Qwen3-VL-30B-A3B-Thinking": | |
| text_config: | |
| hidden_size: 6144 | |
| num_hidden_layers: 80 | |
| num_attention_heads: 48 | |
| num_key_value_heads: 8 | |
| vocab_size: 151936 | |
| max_position_embeddings: 262144 | |
| intermediate_size: 16384 | |
| torch_dtype: bfloat16 | |
| "Qwen/Qwen3-VL-30B-A3B-Instruct": | |
| text_config: | |
| hidden_size: 6144 | |
| num_hidden_layers: 80 | |
| num_attention_heads: 48 | |
| num_key_value_heads: 8 | |
| vocab_size: 151936 | |
| max_position_embeddings: 262144 | |
| intermediate_size: 16384 | |
| torch_dtype: bfloat16 | |
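Because the listing is plain YAML, it can also be consumed directly, for example to compare the grouped-query-attention ratio across every entry. A sketch (assumes the full listing is saved as `models.yaml`, a hypothetical filename, and that PyYAML is installed):

```python
import yaml

with open("models.yaml") as f:
    spec = yaml.safe_load(f)

for name, entry in spec["models"].items():
    cfg = entry.get("text_config", entry)  # Qwen3-VL nests fields under text_config
    heads = cfg["num_attention_heads"]
    kv_heads = cfg["num_key_value_heads"]
    print(f"{name}: {heads // kv_heads} query heads per KV head")
```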