AgenticRAG / start.sh
Sumkh's picture
Update start.sh
872a46f verified
raw
history blame
545 Bytes
#!/bin/bash
# Start a local vLLM OpenAI-compatible server in the background, wait for it
# to become healthy, then launch the Gradio chatbot front-end (app.py).
set -euo pipefail

# BUG FIX: the original command passed '--runtime nvidia --gpus all --ipc=host'
# to vllm — those are 'docker run' flags, not vLLM CLI flags, and vLLM's
# argument parser rejects them. The vLLM CLI also takes the model as a
# positional argument to the 'serve' subcommand, not via a bare '--model' flag.
vllm serve unsloth/llama-3-8b-Instruct-bnb-4bit \
  --enable-auto-tool-choice \
  --tool-call-parser llama3_json \
  --chat-template examples/tool_chat_template_llama3.1_json.jinja \
  --quantization bitsandbytes \
  --load-format bitsandbytes \
  --dtype half \
  --max-model-len 8192 &
vllm_pid=$!

# Stop the background vLLM server when this script exits (any exit path),
# so it is not orphaned after the Gradio app terminates.
cleanup() { kill "$vllm_pid" 2>/dev/null || true; }
trap cleanup EXIT

# Wait for the server to come up. Poll the health endpoint (vLLM's OpenAI
# server listens on port 8000 by default) when curl is available; otherwise
# fall back to the original fixed delay.
if command -v curl >/dev/null; then
  for _ in {1..60}; do
    if curl -fsS http://localhost:8000/health >/dev/null 2>&1; then
      break
    fi
    sleep 2
  done
else
  sleep 10
fi

# Launch the Gradio chatbot application (foreground; script lives as long as it does).
python3 app.py