#!/bin/bash
#
# Start a vLLM OpenAI-compatible server in the background, wait until it
# reports healthy, then run the Gradio chatbot (app.py) in the foreground.
# The server is stopped again when this script exits for any reason.
#
# Optional environment overrides:
#   VLLM_PORT             port the vLLM server listens on (default: 8000)
#   VLLM_STARTUP_TIMEOUT  max seconds to wait for readiness (default: 600)
#
# NOTE: container options such as `--runtime nvidia --gpus all --ipc=host`
# are `docker run` flags. Pass them to the `docker run` command that starts
# the container; the vllm CLI does not accept them.

set -euo pipefail

readonly VLLM_PORT="${VLLM_PORT:-8000}"
readonly VLLM_STARTUP_TIMEOUT="${VLLM_STARTUP_TIMEOUT:-600}"
readonly HEALTH_URL="http://127.0.0.1:${VLLM_PORT}/health"

# PID of the background vLLM server; empty until it has been started.
server_pid=""

# Print a message to stderr and abort the script.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# Stop the background vLLM server if it is still running.
cleanup() {
  if [[ -n "$server_pid" ]] && kill -0 "$server_pid" 2>/dev/null; then
    kill "$server_pid" 2>/dev/null || true
    wait "$server_pid" 2>/dev/null || true
  fi
}

# Return 0 when the URL in $1 answers with a successful HTTP status.
# python3 is used for the probe because the script already requires it to
# run app.py, so no extra tool (curl/wget) is needed.
server_is_healthy() {
  python3 -c '
import sys, urllib.request
with urllib.request.urlopen(sys.argv[1], timeout=2) as resp:
    sys.exit(0 if resp.status == 200 else 1)
' "$1" >/dev/null 2>&1
}

# Block until the server is healthy; abort if it dies or the timeout expires.
wait_for_server() {
  local deadline=$((SECONDS + VLLM_STARTUP_TIMEOUT))

  until server_is_healthy "$HEALTH_URL"; do
    if ! kill -0 "$server_pid" 2>/dev/null; then
      die "vLLM server exited before becoming ready"
    fi
    if ((SECONDS >= deadline)); then
      die "vLLM server not ready after ${VLLM_STARTUP_TIMEOUT}s (${HEALTH_URL})"
    fi
    sleep 2
  done
}

main() {
  [[ "$VLLM_STARTUP_TIMEOUT" =~ ^[0-9]+$ ]] \
    || die "VLLM_STARTUP_TIMEOUT must be a non-negative integer"
  command -v vllm >/dev/null || die "vllm executable not found in PATH"

  trap cleanup EXIT

  # Start the vLLM server in the background with the custom flags.
  vllm serve unsloth/llama-3-8b-Instruct-bnb-4bit \
    --port "$VLLM_PORT" \
    --enable-auto-tool-choice \
    --tool-call-parser llama3_json \
    --chat-template examples/tool_chat_template_llama3.1_json.jinja \
    --quantization bitsandbytes \
    --load-format bitsandbytes \
    --dtype half \
    --max-model-len 8192 &
  server_pid=$!

  # Model loading time varies widely, so poll for readiness instead of
  # sleeping for a fixed interval.
  wait_for_server

  # Launch the Gradio chatbot application.
  python3 app.py
}

main "$@"