---
# Model architecture reference: per-model transformer hyperparameters,
# keyed by Hugging Face repository ID.
# Per-model transformer architecture parameters, keyed by Hugging Face repo ID.
# Values mirror each model's published config (hidden width, depth, attention
# head counts, vocabulary, max context, FFN width).
#
# NOTE(review): the schema is inconsistent across entries and consumers must
# handle three shapes — TODO consider normalizing:
#   1. flat fields at the entry root (Llama, Mistral, Gemma);
#   2. MoE fields at the root (gpt-oss: num_local_experts) vs. nested under
#      notes.moe (Mixtral: num_local_experts; Qwen: num_experts — the key name
#      also differs);
#   3. Qwen entries nest dimensions under text_config and add torch_dtype.
models:
  # Grouped-query attention: 64 query heads share 8 KV heads
  # (num_key_value_heads < num_attention_heads).
  "meta-llama/Meta-Llama-3-70B-Instruct":
    hidden_size: 8192
    num_hidden_layers: 80
    num_attention_heads: 64
    num_key_value_heads: 8
    vocab_size: 128256
    max_position_embeddings: 8192
    intermediate_size: 28672

  # Same vocab/context as the 70B variant; smaller width and depth.
  "meta-llama/Meta-Llama-3-8B-Instruct":
    hidden_size: 4096
    num_hidden_layers: 32
    num_attention_heads: 32
    num_key_value_heads: 8
    vocab_size: 128256
    max_position_embeddings: 8192
    intermediate_size: 14336

  # Sparse MoE model; expert counts recorded under notes.moe here,
  # unlike the gpt-oss entries below which keep them at the entry root.
  "mistralai/Mixtral-8x7B-Instruct-v0.1":
    hidden_size: 4096
    num_hidden_layers: 32
    num_attention_heads: 32
    num_key_value_heads: 8
    vocab_size: 32000
    max_position_embeddings: 32768
    intermediate_size: 14336
    notes:
      moe:
        num_local_experts: 8
        num_experts_per_tok: 2

  "mistralai/Mistral-7B-Instruct-v0.3":
    hidden_size: 4096
    num_hidden_layers: 32
    num_attention_heads: 32
    num_key_value_heads: 8
    vocab_size: 32768
    max_position_embeddings: 32768
    intermediate_size: 14336

  # No GQA: num_key_value_heads == num_attention_heads (multi-head attention).
  "google/gemma-7b":
    hidden_size: 3072
    num_hidden_layers: 28
    num_attention_heads: 16
    num_key_value_heads: 16
    vocab_size: 256000
    max_position_embeddings: 8192
    intermediate_size: 24576

  # MoE fields at the entry root (not under notes.moe) — see schema note above.
  # NOTE(review): intermediate_size equals hidden_size (2880); presumably this
  # is the per-expert FFN width for the MoE layers — confirm against the
  # model's published config.
  "openai/gpt-oss-20b":
    hidden_size: 2880
    num_hidden_layers: 24
    num_attention_heads: 64
    num_key_value_heads: 8
    vocab_size: 201088
    max_position_embeddings: 131072
    intermediate_size: 2880
    num_local_experts: 32
    num_experts_per_tok: 4

  # Same width/heads/vocab as gpt-oss-20b; deeper (36 layers) and 4x experts.
  "openai/gpt-oss-120b":
    hidden_size: 2880
    num_hidden_layers: 36
    num_attention_heads: 64
    num_key_value_heads: 8
    vocab_size: 201088
    max_position_embeddings: 131072
    intermediate_size: 2880
    num_local_experts: 128
    num_experts_per_tok: 4

  # Vision-language model: text-tower dimensions nested under text_config;
  # MoE fields under notes.moe but keyed num_experts (not num_local_experts).
  "Qwen/Qwen3-VL-235B-A22B-Thinking":
    text_config:
      hidden_size: 8192
      num_hidden_layers: 96
      num_attention_heads: 64
      num_key_value_heads: 8
      vocab_size: 151936
      max_position_embeddings: 262144
      intermediate_size: 24576
    torch_dtype: bfloat16
    notes:
      moe:
        num_experts: 512
        num_experts_per_tok: 10

  # Identical architecture to the Thinking variant above.
  "Qwen/Qwen3-VL-235B-A22B-Instruct":
    text_config:
      hidden_size: 8192
      num_hidden_layers: 96
      num_attention_heads: 64
      num_key_value_heads: 8
      vocab_size: 151936
      max_position_embeddings: 262144
      intermediate_size: 24576
    torch_dtype: bfloat16
    notes:
      moe:
        num_experts: 512
        num_experts_per_tok: 10

  # NOTE(review): the "A3B" suffix suggests an MoE model with ~3B active
  # parameters, yet no expert counts are recorded here — verify these
  # dimensions and the missing MoE fields against the published config.
  "Qwen/Qwen3-VL-30B-A3B-Thinking":
    text_config:
      hidden_size: 6144
      num_hidden_layers: 80
      num_attention_heads: 48
      num_key_value_heads: 8
      vocab_size: 151936
      max_position_embeddings: 262144
      intermediate_size: 16384
    torch_dtype: bfloat16

  # Identical architecture to the Thinking variant above; same MoE-field
  # caveat applies.
  "Qwen/Qwen3-VL-30B-A3B-Instruct":
    text_config:
      hidden_size: 6144
      num_hidden_layers: 80
      num_attention_heads: 48
      num_key_value_heads: 8
      vocab_size: 151936
      max_position_embeddings: 262144
      intermediate_size: 16384
    torch_dtype: bfloat16