Files
portainer_scripts/ai-templates/stacks/vllm/docker-compose.yml

30 lines
708 B
YAML

---
# Docker Compose stack that runs one vLLM container (image vllm/vllm-openai).
#
# Values read through ${...} interpolation; the default after ":-" is used
# when the variable is unset or empty:
#   VLLM_PORT        host port published to container port 8000 (8000)
#   HF_TOKEN         exported to the container as HUGGING_FACE_HUB_TOKEN (empty)
#   MODEL_NAME       --model (meta-llama/Llama-3.1-8B-Instruct)
#   MAX_MODEL_LEN    --max-model-len (4096)
#   GPU_MEM_UTIL     --gpu-memory-utilization (0.90)
#   TENSOR_PARALLEL  --tensor-parallel-size (1)

# NOTE(review): the top-level `version` key is obsolete in the current Compose
# Specification; kept here so older tooling that still reads it is unaffected.
version: "3.8"

services:
  vllm:
    # NOTE(review): `latest` is a moving tag; consider pinning a release tag
    # for reproducible deployments.
    image: vllm/vllm-openai:latest
    container_name: vllm
    restart: unless-stopped
    ports:
      # Quoted so the host:container pair is always read as a string.
      - "${VLLM_PORT:-8000}:8000"
    volumes:
      # Named volume mounted at the Hugging Face cache path, so its contents
      # outlive container re-creation.
      - vllm_cache:/root/.cache/huggingface
    environment:
      # Empty by default; presumably only needed for gated models
      # (the default MODEL_NAME may be one) - confirm before deploying.
      - HUGGING_FACE_HUB_TOKEN=${HF_TOKEN:-}
    # Folded scalar: the lines below are joined with spaces into one argument
    # string; ">-" strips the trailing newline.
    # NOTE(review): TENSOR_PARALLEL presumably must not exceed the number of
    # GPUs visible to the container - confirm against vLLM docs.
    command: >-
      --model ${MODEL_NAME:-meta-llama/Llama-3.1-8B-Instruct}
      --max-model-len ${MAX_MODEL_LEN:-4096}
      --gpu-memory-utilization ${GPU_MEM_UTIL:-0.90}
      --tensor-parallel-size ${TENSOR_PARALLEL:-1}
    deploy:
      resources:
        reservations:
          devices:
            # Reserve every NVIDIA GPU on the host for this container.
            - driver: nvidia
              count: all
              capabilities: [gpu]
    # Share the host IPC namespace (and therefore host shared memory) with
    # the container.
    ipc: host

volumes:
  # Declares the named volume referenced by the vllm service above.
  vllm_cache: