# Compose file format version, quoted so YAML keeps it a string, not a float.
# NOTE(review): recent Docker Compose releases reportedly ignore this key and
# warn that it is obsolete - confirm which Compose versions must be supported
# before removing it.
version: "3.8"

services:
  # GPU-backed NIM container. Every tunable below is read from the host
  # environment (or a .env file) through Compose interpolation; the text after
  # ":-" is the fallback used when the variable is unset or empty.
  nim:
    # Both the model repository and the tag can be overridden. The default tag
    # is the floating "latest"; set NIM_VERSION to pin a specific release for
    # reproducible deployments.
    image: "nvcr.io/nim/${NIM_MODEL:-meta/llama-3.1-8b-instruct}:${NIM_VERSION:-latest}"
    container_name: nvidia-nim
    restart: unless-stopped
    ports:
      # Host port is configurable; the container side is fixed at 8000, the
      # same port the healthcheck below probes.
      - "${NIM_PORT:-8000}:8000"
    volumes:
      # Named volume (declared under the top-level "volumes" key) so whatever
      # the container writes to /opt/nim/.cache outlives container re-creation.
      - nim_cache:/opt/nim/.cache
    environment:
      # Explicit empty fallback: the container receives the same blank value as
      # before when the key is not set, but Compose no longer emits an
      # "unset variable" warning.
      # NOTE(review): presumably the key is needed to fetch model assets on
      # first start - confirm whether startup should hard-fail (":?") instead.
      - NGC_API_KEY=${NGC_API_KEY:-}
      # Host-side variable names are unprefixed (MAX_MODEL_LEN, GPU_MEM_UTIL,
      # MAX_BATCH, LOG_LEVEL) and are mapped onto the NIM_* names here.
      - NIM_MAX_MODEL_LEN=${MAX_MODEL_LEN:-4096}
      - NIM_GPU_MEMORY_UTILIZATION=${GPU_MEM_UTIL:-0.9}
      - NIM_MAX_BATCH_SIZE=${MAX_BATCH:-256}
      - NIM_LOG_LEVEL=${LOG_LEVEL:-INFO}
    deploy:
      resources:
        reservations:
          devices:
            # Reserve every GPU exposed through the nvidia driver.
            - driver: nvidia
              count: all
              capabilities: [gpu]
    # Size of /dev/shm inside the container; quoted like the other
    # interpolated values in this file.
    shm_size: "${SHM_SIZE:-16g}"
    ulimits:
      # -1 removes the limit on locked memory (soft and hard).
      memlock: -1
    healthcheck:
      # "curl -f" exits non-zero on HTTP error statuses, so the container is
      # reported healthy only once the readiness endpoint answers successfully.
      # NOTE(review): this requires curl to be present inside the image -
      # confirm, otherwise the check fails regardless of service state.
      test: ["CMD", "curl", "-f", "http://localhost:8000/v1/health/ready"]
      interval: 30s
      timeout: 10s
      retries: 10
      # Failures during the first 120s after start do not count toward the
      # retry budget.
      start_period: 120s

volumes:
  # Named volume mounted by the nim service at /opt/nim/.cache. The empty
  # mapping means no custom driver or options are configured.
  nim_cache: {}