---
# Docker Compose definition for a single NVIDIA NIM container ("nim") plus
# the named volume it mounts. Every tunable is read from the environment
# (or an .env file) with the defaults shown inline.
version: "3.8"

services:
  nim:
    # Image repository and tag are both overridable:
    #   NIM_MODEL   (default: meta/llama-3.1-8b-instruct)
    #   NIM_VERSION (default: latest)
    image: "nvcr.io/nim/${NIM_MODEL:-meta/llama-3.1-8b-instruct}:${NIM_VERSION:-latest}"
    container_name: nvidia-nim
    restart: unless-stopped

    ports:
      # Host port comes from NIM_PORT (default 8000); the container side is
      # fixed at 8000, matching the port the healthcheck probes.
      - "${NIM_PORT:-8000}:8000"

    volumes:
      # Named volume mounted at /opt/nim/.cache so its contents survive
      # container re-creation (presumably the model cache; confirm).
      - "nim_cache:/opt/nim/.cache"

    environment:
      # No default: if NGC_API_KEY is unset on the host, an empty value is
      # passed into the container.
      - NGC_API_KEY=${NGC_API_KEY}
      # NOTE: the host-side variable names below differ from the NIM_*
      # names passed to the container (e.g. MAX_MODEL_LEN ->
      # NIM_MAX_MODEL_LEN).
      - NIM_MAX_MODEL_LEN=${MAX_MODEL_LEN:-4096}
      - NIM_GPU_MEMORY_UTILIZATION=${GPU_MEM_UTIL:-0.9}
      - NIM_MAX_BATCH_SIZE=${MAX_BATCH:-256}
      - NIM_LOG_LEVEL=${LOG_LEVEL:-INFO}

    deploy:
      resources:
        reservations:
          devices:
            # Reserve every device exposed by the nvidia driver that has
            # the "gpu" capability.
            - driver: nvidia
              count: all
              capabilities: [gpu]

    # Size of /dev/shm inside the container; override with SHM_SIZE.
    shm_size: "${SHM_SIZE:-16g}"

    ulimits:
      # -1 is Docker's value for "unlimited".
      memlock: -1

    healthcheck:
      # Probes the readiness endpoint from inside the container, so the
      # image must ship curl (TODO confirm for the chosen image).
      test: ["CMD", "curl", "-f", "http://localhost:8000/v1/health/ready"]
      interval: 30s
      timeout: 10s
      retries: 10
      # Failures during the first 120s do not count toward `retries`.
      start_period: 120s

volumes:
  # Named volume with default settings; explicit empty mapping rather than
  # a bare key (which YAML reads as null).
  nim_cache: {}