version: "3.8"

services:
  triton:
    image: nvcr.io/nvidia/tritonserver:${TRITON_VERSION:-24.08}-py3
    container_name: triton-inference-server
    restart: unless-stopped
    ports:
      - "${HTTP_PORT:-8000}:8000"
      - "${GRPC_PORT:-8001}:8001"
      - "${METRICS_PORT:-8002}:8002"
    volumes:
      - triton_models:/models
    command: >
      tritonserver
      --model-repository=/models
      --strict-model-config=${STRICT_CONFIG:-false}
      --log-verbose=${LOG_VERBOSE:-0}
      --exit-on-error=${EXIT_ON_ERROR:-false}
      --rate-limit=${RATE_LIMIT:-off}
      --model-control-mode=${MODEL_CONTROL:-poll}
      --repository-poll-secs=${POLL_INTERVAL:-30}
    environment:
      - CUDA_VISIBLE_DEVICES=${CUDA_DEVICES:-all}
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    shm_size: ${SHM_SIZE:-1g}
    ulimits:
      memlock: -1
      stack: 67108864
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/v2/health/ready"]
      interval: 30s
      timeout: 10s
      retries: 5

volumes:
  triton_models: