Update README.md
Browse files
README.md
CHANGED
|
@@ -60,7 +60,7 @@ see [Official vLLM Deepseek-V3.2 Guide](https://docs.vllm.ai/projects/recipes/en
|
|
| 60 |
export VLLM_USE_DEEP_GEMM=0 # ATM, this line is a "must" for Hopper devices
|
| 61 |
CONTEXT_LENGTH=32768
|
| 62 |
vllm serve \
|
| 63 |
-
__YOUR_PATH__/
|
| 64 |
--served-model-name MY_MODEL_NAME \
|
| 65 |
--enable-auto-tool-choice \
|
| 66 |
--tool-call-parser deepseek_v31 \
|
|
@@ -71,7 +71,7 @@ vllm serve \
|
|
| 71 |
--gpu-memory-utilization 0.9 \
|
| 72 |
--tensor-parallel-size 8 \
|
| 73 |
--enable-expert-parallel `# optional` \
|
| 74 |
-
--speculative-config '{"model": "__YOUR_PATH__/
|
| 75 |
--trust-remote-code \
|
| 76 |
--host 0.0.0.0 \
|
| 77 |
--port 8000
|
|
@@ -90,8 +90,8 @@ vllm serve \
|
|
| 90 |
|
| 91 |
### 【Model Download】
|
| 92 |
```python
|
| 93 |
-
from
|
| 94 |
-
snapshot_download('
|
| 95 |
```
|
| 96 |
|
| 97 |
### 【Overview】
|
|
|
|
| 60 |
export VLLM_USE_DEEP_GEMM=0 # ATM, this line is a "must" for Hopper devices
|
| 61 |
CONTEXT_LENGTH=32768
|
| 62 |
vllm serve \
|
| 63 |
+
__YOUR_PATH__/QuantTrio/DeepSeek-V3.2-AWQ \
|
| 64 |
--served-model-name MY_MODEL_NAME \
|
| 65 |
--enable-auto-tool-choice \
|
| 66 |
--tool-call-parser deepseek_v31 \
|
|
|
|
| 71 |
--gpu-memory-utilization 0.9 \
|
| 72 |
--tensor-parallel-size 8 \
|
| 73 |
--enable-expert-parallel `# optional` \
|
| 74 |
+
--speculative-config '{"model": "__YOUR_PATH__/QuantTrio/DeepSeek-V3.2-AWQ", "num_speculative_tokens": 1}' `# optional; ~50% throughput increase observed` \
|
| 75 |
--trust-remote-code \
|
| 76 |
--host 0.0.0.0 \
|
| 77 |
--port 8000
|
|
|
|
| 90 |
|
| 91 |
### 【Model Download】
|
| 92 |
```python
|
| 93 |
+
from huggingface_hub import snapshot_download
|
| 94 |
+
snapshot_download('QuantTrio/DeepSeek-V3.2-AWQ', cache_dir="your_local_path")
|
| 95 |
```
|
| 96 |
|
| 97 |
### 【Overview】
|