Update README.md
Browse files
README.md
CHANGED
|
@@ -60,7 +60,7 @@ see [Official vLLM Deepseek-V3.2 Guide](https://docs.vllm.ai/projects/recipes/en
|
|
| 60 |
export VLLM_USE_DEEP_GEMM=0 # ATM, this line is a "must" for Hopper devices
|
| 61 |
CONTEXT_LENGTH=32768
|
| 62 |
vllm serve \
|
| 63 |
-
__YOUR_PATH__/
|
| 64 |
--served-model-name MY_MODEL_NAME \
|
| 65 |
--enable-auto-tool-choice \
|
| 66 |
--tool-call-parser deepseek_v31 \
|
|
@@ -71,7 +71,7 @@ vllm serve \
|
|
| 71 |
--gpu-memory-utilization 0.9 \
|
| 72 |
--tensor-parallel-size 8 \
|
| 73 |
--enable-expert-parallel `# optional` \
|
| 74 |
-
--speculative-config '{"model": "__YOUR_PATH__/
|
| 75 |
--trust-remote-code \
|
| 76 |
--host 0.0.0.0 \
|
| 77 |
--port 8000
|
|
@@ -90,8 +90,8 @@ vllm serve \
|
|
| 90 |
|
| 91 |
### 【Model Download】
|
| 92 |
```python
|
| 93 |
-
from
|
| 94 |
-
snapshot_download('
|
| 95 |
```
|
| 96 |
|
| 97 |
### 【Overview】
|
|
|
|
| 60 |
export VLLM_USE_DEEP_GEMM=0 # ATM, this line is a "must" for Hopper devices
|
| 61 |
CONTEXT_LENGTH=32768
|
| 62 |
vllm serve \
|
| 63 |
+
__YOUR_PATH__/QuantTrio/DeepSeek-V3.2-AWQ \
|
| 64 |
--served-model-name MY_MODEL_NAME \
|
| 65 |
--enable-auto-tool-choice \
|
| 66 |
--tool-call-parser deepseek_v31 \
|
|
|
|
| 71 |
--gpu-memory-utilization 0.9 \
|
| 72 |
--tensor-parallel-size 8 \
|
| 73 |
--enable-expert-parallel `# optional` \
|
| 74 |
+
--speculative-config '{"model": "__YOUR_PATH__/QuantTrio/DeepSeek-V3.2-AWQ", "num_speculative_tokens": 1}' `# optional; ~50% throughput increase observed` \
|
| 75 |
--trust-remote-code \
|
| 76 |
--host 0.0.0.0 \
|
| 77 |
--port 8000
|
|
|
|
| 90 |
|
| 91 |
### 【Model Download】
|
| 92 |
```python
|
| 93 |
+
from huggingface_hub import snapshot_download
|
| 94 |
+
snapshot_download('QuantTrio/DeepSeek-V3.2-AWQ', cache_dir="your_local_path")
|
| 95 |
```
|
| 96 |
|
| 97 |
### 【Overview】
|