version: "3.8" services: triton: image: nvcr.io/nvidia/tritonserver:${TRITON_VERSION:-24.08}-py3 container_name: triton-inference-server restart: unless-stopped ports: - "${HTTP_PORT:-8000}:8000" - "${GRPC_PORT:-8001}:8001" - "${METRICS_PORT:-8002}:8002" volumes: - triton_models:/models command: > tritonserver --model-repository=/models --strict-model-config=${STRICT_CONFIG:-false} --log-verbose=${LOG_VERBOSE:-0} --exit-on-error=${EXIT_ON_ERROR:-false} --rate-limit=${RATE_LIMIT:-off} --model-control-mode=${MODEL_CONTROL:-poll} --repository-poll-secs=${POLL_INTERVAL:-30} environment: - CUDA_VISIBLE_DEVICES=${CUDA_DEVICES:-all} deploy: resources: reservations: devices: - driver: nvidia count: all capabilities: [gpu] shm_size: ${SHM_SIZE:-1g} ulimits: memlock: -1 stack: 67108864 healthcheck: test: ["CMD", "curl", "-f", "http://localhost:8000/v2/health/ready"] interval: 30s timeout: 10s retries: 5 volumes: triton_models: