#!/bin/bash
#
# Start a vLLM OpenAI-compatible server in the background, wait until it
# answers health checks, then run the Gradio chatbot (app.py) in the
# foreground. The server is stopped again when this script exits.
#
# Optional environment:
#   VLLM_SERVER_PORT      port the server listens on      (default: 8000)
#   VLLM_STARTUP_TIMEOUT  seconds to wait for readiness   (default: 600)

set -euo pipefail

readonly MODEL='unsloth/llama-3-8b-Instruct-bnb-4bit'
readonly CHAT_TEMPLATE='examples/tool_chat_template_llama3.1_json.jinja'
readonly SERVER_PORT="${VLLM_SERVER_PORT:-8000}"
readonly STARTUP_TIMEOUT="${VLLM_STARTUP_TIMEOUT:-600}"
readonly HEALTH_URL="http://127.0.0.1:${SERVER_PORT}/health"

server_pid=''

# Print a message to stderr and abort the script.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# Stop the background server (if it is still running) on any exit path.
cleanup() {
  if [[ -n "${server_pid}" ]] && kill -0 "${server_pid}" 2>/dev/null; then
    kill "${server_pid}" 2>/dev/null || true
    wait "${server_pid}" 2>/dev/null || true
  fi
}
trap cleanup EXIT
# Turn Ctrl-C / SIGTERM into a normal exit so the EXIT trap always runs.
trap 'exit 130' INT
trap 'exit 143' TERM

#######################################
# Block until the server answers its health endpoint.
# Globals:   server_pid, HEALTH_URL, STARTUP_TIMEOUT (read)
# Returns:   0 once the endpoint responds; exits the script if the server
#            process dies or STARTUP_TIMEOUT seconds elapse first.
#######################################
wait_for_server() {
  local deadline
  deadline=$(( SECONDS + STARTUP_TIMEOUT ))

  # python3 is already required for app.py, so it doubles as the HTTP probe.
  # Probe errors are expected while the model loads; stderr is silenced on
  # purpose.
  until python3 -c \
    'import sys, urllib.request; urllib.request.urlopen(sys.argv[1], timeout=2)' \
    "${HEALTH_URL}" 2>/dev/null; do
    kill -0 "${server_pid}" 2>/dev/null \
      || die "vLLM server exited before becoming ready"
    (( SECONDS < deadline )) \
      || die "vLLM server not ready after ${STARTUP_TIMEOUT}s"
    sleep 2
  done
}

# Start the vLLM server in the background with the custom flags.
# NOTE(review): the previous command also passed `--runtime nvidia --gpus all
# --ipc=host`. Those are `docker run` options, not vLLM options, so they are
# dropped here; if the server is meant to run in a container they belong on
# the `docker run` command line instead.
vllm serve "${MODEL}" \
  --port "${SERVER_PORT}" \
  --enable-auto-tool-choice \
  --tool-call-parser llama3_json \
  --chat-template "${CHAT_TEMPLATE}" \
  --quantization bitsandbytes \
  --load-format bitsandbytes \
  --dtype half \
  --max-model-len 8192 &
server_pid=$!

# Wait for real readiness instead of sleeping for a fixed interval.
wait_for_server

# Launch the Gradio chatbot application.
python3 app.py