---
# Docker Compose stack: vLLM OpenAI-compatible inference server on NVIDIA GPU.
# Requires the NVIDIA Container Toolkit on the host for the GPU reservation
# below to take effect.

# NOTE(review): `version` is obsolete under the Compose Specification
# (Compose v2 ignores it with a warning); kept only for legacy v1 tooling.
version: "3.8"

services:
  vllm:
    image: vllm/vllm-openai:latest
    container_name: vllm
    restart: unless-stopped
    ports:
      # Host port configurable via VLLM_PORT; container listens on 8000.
      # Quoted to avoid YAML's sexagesimal port-mapping trap.
      - "${VLLM_PORT:-8000}:8000"
    volumes:
      # Persist downloaded model weights across container restarts.
      - vllm_cache:/root/.cache/huggingface
    environment:
      # Empty default so Compose does not warn when HF_TOKEN is unset;
      # a token is only needed for gated models (e.g. the Llama default).
      - HUGGING_FACE_HUB_TOKEN=${HF_TOKEN:-}
    # Folded block scalar (`>-`): rendered as one argument string appended
    # to the image entrypoint; `-` strips the trailing newline.
    command: >-
      --model ${MODEL_NAME:-meta-llama/Llama-3.1-8B-Instruct}
      --max-model-len ${MAX_MODEL_LEN:-4096}
      --gpu-memory-utilization ${GPU_MEM_UTIL:-0.90}
      --tensor-parallel-size ${TENSOR_PARALLEL:-1}
    deploy:
      resources:
        reservations:
          devices:
            # Compose GPU device request: reserve all host NVIDIA GPUs.
            - driver: nvidia
              count: all
              capabilities: [gpu]
    # NOTE(review): host IPC is presumably here to lift the default 64 MB
    # /dev/shm limit for vLLM's worker shared memory — confirm against
    # vLLM's deployment docs before removing.
    ipc: host

volumes:
  # Named volume backing the Hugging Face cache mount above.
  vllm_cache: