tclf90
committed on
Commit
·
ea92b81
1
Parent(s):
cc1a0fe
'decrease gptq group size'
Browse files- README.md +11 -3
- config.json +3 -3
- generation_config.json +1 -0
- model-00001-of-00002.safetensors +2 -2
- model-00002-of-00002.safetensors +2 -2
- model.safetensors.index.json +29 -29
- modeling_chatglm.py +1 -1
README.md
CHANGED
|
@@ -16,15 +16,23 @@ tags:
|
|
| 16 |
|
| 17 |
|
| 18 |
### 【模型更新日期】
|
| 19 |
-
``` 2024-06-05 ```
|
| 20 |
|
| 21 |
### 【模型大小】
|
| 22 |
-
`6.
|
| 23 |
|
| 24 |
### 【06-05 临时情况告知】
|
| 25 |
|
| 26 |
1. 目前需要用vllm entrypoint的方式来启动模型。
|
| 27 |
-
2.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
### 【介绍】
|
| 30 |
|
|
|
|
| 16 |
|
| 17 |
|
| 18 |
### 【模型更新日期】
|
| 19 |
+
``` 2024-06-05 21:00 ```
|
| 20 |
|
| 21 |
### 【模型大小】
|
| 22 |
+
`6.5GB`
|
| 23 |
|
| 24 |
### 【06-05 临时情况告知】
|
| 25 |
|
| 26 |
1. 目前需要用vllm entrypoint的方式来启动模型。
|
| 27 |
+
2. 如果出现感叹号,请留言告知,并告知显卡型号。
|
| 28 |
+
|
| 29 |
+
### 【更新日志】
|
| 30 |
+
|
| 31 |
+
```
|
| 32 |
+
2024-06-05 21:00
|
| 33 |
+
1. 尝试修复!!!感叹号吐字问题。
|
| 34 |
+
2. group_size 调整为64,减少量化精度损失。
|
| 35 |
+
```
|
| 36 |
|
| 37 |
### 【介绍】
|
| 38 |
|
config.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
{
|
| 2 |
-
"_name_or_path": "tclf90/glm-4-9b-chat-GPTQ-Int4
|
| 3 |
"add_bias_linear": false,
|
| 4 |
"add_qkv_bias": true,
|
| 5 |
"apply_query_key_layer_scaling": true,
|
|
@@ -28,7 +28,7 @@
|
|
| 28 |
"hidden_dropout": 0.0,
|
| 29 |
"hidden_size": 4096,
|
| 30 |
"kv_channels": 128,
|
| 31 |
-
"layernorm_epsilon":
|
| 32 |
"model_type": "chatglm",
|
| 33 |
"multi_query_attention": true,
|
| 34 |
"multi_query_group_num": 2,
|
|
@@ -50,7 +50,7 @@
|
|
| 50 |
"exllama_config": {
|
| 51 |
"version": 1
|
| 52 |
},
|
| 53 |
-
"group_size":
|
| 54 |
"max_input_length": null,
|
| 55 |
"model_seqlen": null,
|
| 56 |
"module_name_preceding_first_block": null,
|
|
|
|
| 1 |
{
|
| 2 |
+
"_name_or_path": "tclf90/glm-4-9b-chat-GPTQ-Int4",
|
| 3 |
"add_bias_linear": false,
|
| 4 |
"add_qkv_bias": true,
|
| 5 |
"apply_query_key_layer_scaling": true,
|
|
|
|
| 28 |
"hidden_dropout": 0.0,
|
| 29 |
"hidden_size": 4096,
|
| 30 |
"kv_channels": 128,
|
| 31 |
+
"layernorm_epsilon": 1e-05,
|
| 32 |
"model_type": "chatglm",
|
| 33 |
"multi_query_attention": true,
|
| 34 |
"multi_query_group_num": 2,
|
|
|
|
| 50 |
"exllama_config": {
|
| 51 |
"version": 1
|
| 52 |
},
|
| 53 |
+
"group_size": 64,
|
| 54 |
"max_input_length": null,
|
| 55 |
"model_seqlen": null,
|
| 56 |
"module_name_preceding_first_block": null,
|
generation_config.json
CHANGED
|
@@ -5,5 +5,6 @@
|
|
| 5 |
151336,
|
| 6 |
151338
|
| 7 |
],
|
|
|
|
| 8 |
"transformers_version": "4.40.2"
|
| 9 |
}
|
|
|
|
| 5 |
151336,
|
| 6 |
151338
|
| 7 |
],
|
| 8 |
+
"pad_token_id": 151329,
|
| 9 |
"transformers_version": "4.40.2"
|
| 10 |
}
|
model-00001-of-00002.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:65eadf64a1c6f70038b02fb1a6526e11a115459936ea74f6939b06a9bfe3990f
|
| 3 |
+
size 4995499432
|
model-00002-of-00002.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ad97ab92f6ce8fa9f9fc3ab470e930c9aa837fb471d9dda9dd05a53cc0f294d1
|
| 3 |
+
size 1893310768
|
model.safetensors.index.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
{
|
| 2 |
"metadata": {
|
| 3 |
-
"total_size":
|
| 4 |
},
|
| 5 |
"weight_map": {
|
| 6 |
"transformer.embedding.word_embeddings.weight": "model-00001-of-00002.safetensors",
|
|
@@ -622,28 +622,28 @@
|
|
| 622 |
"transformer.encoder.layers.33.self_attention.query_key_value.qzeros": "model-00001-of-00002.safetensors",
|
| 623 |
"transformer.encoder.layers.33.self_attention.query_key_value.scales": "model-00001-of-00002.safetensors",
|
| 624 |
"transformer.encoder.layers.34.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 625 |
-
"transformer.encoder.layers.34.mlp.dense_4h_to_h.bias": "model-
|
| 626 |
-
"transformer.encoder.layers.34.mlp.dense_4h_to_h.g_idx": "model-
|
| 627 |
-
"transformer.encoder.layers.34.mlp.dense_4h_to_h.qweight": "model-
|
| 628 |
-
"transformer.encoder.layers.34.mlp.dense_4h_to_h.qzeros": "model-
|
| 629 |
-
"transformer.encoder.layers.34.mlp.dense_4h_to_h.scales": "model-
|
| 630 |
-
"transformer.encoder.layers.34.mlp.dense_h_to_4h.bias": "model-
|
| 631 |
-
"transformer.encoder.layers.34.mlp.dense_h_to_4h.g_idx": "model-
|
| 632 |
-
"transformer.encoder.layers.34.mlp.dense_h_to_4h.qweight": "model-
|
| 633 |
-
"transformer.encoder.layers.34.mlp.dense_h_to_4h.qzeros": "model-
|
| 634 |
-
"transformer.encoder.layers.34.mlp.dense_h_to_4h.scales": "model-
|
| 635 |
-
"transformer.encoder.layers.34.post_attention_layernorm.weight": "model-
|
| 636 |
"transformer.encoder.layers.34.self_attention.dense.bias": "model-00001-of-00002.safetensors",
|
| 637 |
"transformer.encoder.layers.34.self_attention.dense.g_idx": "model-00001-of-00002.safetensors",
|
| 638 |
"transformer.encoder.layers.34.self_attention.dense.qweight": "model-00001-of-00002.safetensors",
|
| 639 |
"transformer.encoder.layers.34.self_attention.dense.qzeros": "model-00001-of-00002.safetensors",
|
| 640 |
"transformer.encoder.layers.34.self_attention.dense.scales": "model-00001-of-00002.safetensors",
|
| 641 |
-
"transformer.encoder.layers.34.self_attention.query_key_value.bias": "model-
|
| 642 |
-
"transformer.encoder.layers.34.self_attention.query_key_value.g_idx": "model-
|
| 643 |
-
"transformer.encoder.layers.34.self_attention.query_key_value.qweight": "model-
|
| 644 |
-
"transformer.encoder.layers.34.self_attention.query_key_value.qzeros": "model-
|
| 645 |
-
"transformer.encoder.layers.34.self_attention.query_key_value.scales": "model-
|
| 646 |
-
"transformer.encoder.layers.35.input_layernorm.weight": "model-
|
| 647 |
"transformer.encoder.layers.35.mlp.dense_4h_to_h.bias": "model-00002-of-00002.safetensors",
|
| 648 |
"transformer.encoder.layers.35.mlp.dense_4h_to_h.g_idx": "model-00002-of-00002.safetensors",
|
| 649 |
"transformer.encoder.layers.35.mlp.dense_4h_to_h.qweight": "model-00002-of-00002.safetensors",
|
|
@@ -654,17 +654,17 @@
|
|
| 654 |
"transformer.encoder.layers.35.mlp.dense_h_to_4h.qweight": "model-00002-of-00002.safetensors",
|
| 655 |
"transformer.encoder.layers.35.mlp.dense_h_to_4h.qzeros": "model-00002-of-00002.safetensors",
|
| 656 |
"transformer.encoder.layers.35.mlp.dense_h_to_4h.scales": "model-00002-of-00002.safetensors",
|
| 657 |
-
"transformer.encoder.layers.35.post_attention_layernorm.weight": "model-
|
| 658 |
-
"transformer.encoder.layers.35.self_attention.dense.bias": "model-
|
| 659 |
-
"transformer.encoder.layers.35.self_attention.dense.g_idx": "model-
|
| 660 |
-
"transformer.encoder.layers.35.self_attention.dense.qweight": "model-
|
| 661 |
-
"transformer.encoder.layers.35.self_attention.dense.qzeros": "model-
|
| 662 |
-
"transformer.encoder.layers.35.self_attention.dense.scales": "model-
|
| 663 |
-
"transformer.encoder.layers.35.self_attention.query_key_value.bias": "model-
|
| 664 |
-
"transformer.encoder.layers.35.self_attention.query_key_value.g_idx": "model-
|
| 665 |
-
"transformer.encoder.layers.35.self_attention.query_key_value.qweight": "model-
|
| 666 |
-
"transformer.encoder.layers.35.self_attention.query_key_value.qzeros": "model-
|
| 667 |
-
"transformer.encoder.layers.35.self_attention.query_key_value.scales": "model-
|
| 668 |
"transformer.encoder.layers.36.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 669 |
"transformer.encoder.layers.36.mlp.dense_4h_to_h.bias": "model-00002-of-00002.safetensors",
|
| 670 |
"transformer.encoder.layers.36.mlp.dense_4h_to_h.g_idx": "model-00002-of-00002.safetensors",
|
|
|
|
| 1 |
{
|
| 2 |
"metadata": {
|
| 3 |
+
"total_size": 6888693824
|
| 4 |
},
|
| 5 |
"weight_map": {
|
| 6 |
"transformer.embedding.word_embeddings.weight": "model-00001-of-00002.safetensors",
|
|
|
|
| 622 |
"transformer.encoder.layers.33.self_attention.query_key_value.qzeros": "model-00001-of-00002.safetensors",
|
| 623 |
"transformer.encoder.layers.33.self_attention.query_key_value.scales": "model-00001-of-00002.safetensors",
|
| 624 |
"transformer.encoder.layers.34.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 625 |
+
"transformer.encoder.layers.34.mlp.dense_4h_to_h.bias": "model-00002-of-00002.safetensors",
|
| 626 |
+
"transformer.encoder.layers.34.mlp.dense_4h_to_h.g_idx": "model-00002-of-00002.safetensors",
|
| 627 |
+
"transformer.encoder.layers.34.mlp.dense_4h_to_h.qweight": "model-00002-of-00002.safetensors",
|
| 628 |
+
"transformer.encoder.layers.34.mlp.dense_4h_to_h.qzeros": "model-00002-of-00002.safetensors",
|
| 629 |
+
"transformer.encoder.layers.34.mlp.dense_4h_to_h.scales": "model-00002-of-00002.safetensors",
|
| 630 |
+
"transformer.encoder.layers.34.mlp.dense_h_to_4h.bias": "model-00002-of-00002.safetensors",
|
| 631 |
+
"transformer.encoder.layers.34.mlp.dense_h_to_4h.g_idx": "model-00002-of-00002.safetensors",
|
| 632 |
+
"transformer.encoder.layers.34.mlp.dense_h_to_4h.qweight": "model-00002-of-00002.safetensors",
|
| 633 |
+
"transformer.encoder.layers.34.mlp.dense_h_to_4h.qzeros": "model-00002-of-00002.safetensors",
|
| 634 |
+
"transformer.encoder.layers.34.mlp.dense_h_to_4h.scales": "model-00002-of-00002.safetensors",
|
| 635 |
+
"transformer.encoder.layers.34.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 636 |
"transformer.encoder.layers.34.self_attention.dense.bias": "model-00001-of-00002.safetensors",
|
| 637 |
"transformer.encoder.layers.34.self_attention.dense.g_idx": "model-00001-of-00002.safetensors",
|
| 638 |
"transformer.encoder.layers.34.self_attention.dense.qweight": "model-00001-of-00002.safetensors",
|
| 639 |
"transformer.encoder.layers.34.self_attention.dense.qzeros": "model-00001-of-00002.safetensors",
|
| 640 |
"transformer.encoder.layers.34.self_attention.dense.scales": "model-00001-of-00002.safetensors",
|
| 641 |
+
"transformer.encoder.layers.34.self_attention.query_key_value.bias": "model-00002-of-00002.safetensors",
|
| 642 |
+
"transformer.encoder.layers.34.self_attention.query_key_value.g_idx": "model-00002-of-00002.safetensors",
|
| 643 |
+
"transformer.encoder.layers.34.self_attention.query_key_value.qweight": "model-00002-of-00002.safetensors",
|
| 644 |
+
"transformer.encoder.layers.34.self_attention.query_key_value.qzeros": "model-00002-of-00002.safetensors",
|
| 645 |
+
"transformer.encoder.layers.34.self_attention.query_key_value.scales": "model-00002-of-00002.safetensors",
|
| 646 |
+
"transformer.encoder.layers.35.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 647 |
"transformer.encoder.layers.35.mlp.dense_4h_to_h.bias": "model-00002-of-00002.safetensors",
|
| 648 |
"transformer.encoder.layers.35.mlp.dense_4h_to_h.g_idx": "model-00002-of-00002.safetensors",
|
| 649 |
"transformer.encoder.layers.35.mlp.dense_4h_to_h.qweight": "model-00002-of-00002.safetensors",
|
|
|
|
| 654 |
"transformer.encoder.layers.35.mlp.dense_h_to_4h.qweight": "model-00002-of-00002.safetensors",
|
| 655 |
"transformer.encoder.layers.35.mlp.dense_h_to_4h.qzeros": "model-00002-of-00002.safetensors",
|
| 656 |
"transformer.encoder.layers.35.mlp.dense_h_to_4h.scales": "model-00002-of-00002.safetensors",
|
| 657 |
+
"transformer.encoder.layers.35.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 658 |
+
"transformer.encoder.layers.35.self_attention.dense.bias": "model-00002-of-00002.safetensors",
|
| 659 |
+
"transformer.encoder.layers.35.self_attention.dense.g_idx": "model-00002-of-00002.safetensors",
|
| 660 |
+
"transformer.encoder.layers.35.self_attention.dense.qweight": "model-00002-of-00002.safetensors",
|
| 661 |
+
"transformer.encoder.layers.35.self_attention.dense.qzeros": "model-00002-of-00002.safetensors",
|
| 662 |
+
"transformer.encoder.layers.35.self_attention.dense.scales": "model-00002-of-00002.safetensors",
|
| 663 |
+
"transformer.encoder.layers.35.self_attention.query_key_value.bias": "model-00002-of-00002.safetensors",
|
| 664 |
+
"transformer.encoder.layers.35.self_attention.query_key_value.g_idx": "model-00002-of-00002.safetensors",
|
| 665 |
+
"transformer.encoder.layers.35.self_attention.query_key_value.qweight": "model-00002-of-00002.safetensors",
|
| 666 |
+
"transformer.encoder.layers.35.self_attention.query_key_value.qzeros": "model-00002-of-00002.safetensors",
|
| 667 |
+
"transformer.encoder.layers.35.self_attention.query_key_value.scales": "model-00002-of-00002.safetensors",
|
| 668 |
"transformer.encoder.layers.36.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 669 |
"transformer.encoder.layers.36.mlp.dense_4h_to_h.bias": "model-00002-of-00002.safetensors",
|
| 670 |
"transformer.encoder.layers.36.mlp.dense_4h_to_h.g_idx": "model-00002-of-00002.safetensors",
|
modeling_chatglm.py
CHANGED
|
@@ -324,7 +324,7 @@ class SelfAttention(torch.nn.Module):
|
|
| 324 |
)
|
| 325 |
|
| 326 |
def forward(
|
| 327 |
-
self, hidden_states, attention_mask, rotary_pos_emb
|
| 328 |
):
|
| 329 |
# hidden_states: [b, sq, h]
|
| 330 |
|
|
|
|
| 324 |
)
|
| 325 |
|
| 326 |
def forward(
|
| 327 |
+
self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True
|
| 328 |
):
|
| 329 |
# hidden_states: [b, sq, h]
|
| 330 |
|