# Docker Compose deployment for NVIDIA Triton Inference Server.
# All operational knobs are environment-variable driven with safe defaults
# (override via a .env file or the shell environment).

# NOTE: the top-level `version` key is obsolete under Compose v2 (ignored with
# a warning) but is kept for compatibility with legacy docker-compose v1.
version: "3.8"

services:
  triton:
    # Pin the Triton release via TRITON_VERSION; defaults to the 24.08 CUDA image.
    image: nvcr.io/nvidia/tritonserver:${TRITON_VERSION:-24.08}-py3
    container_name: triton-inference-server
    restart: unless-stopped
    ports:
      # Quoted mappings avoid the YAML sexagesimal/int trap on port values.
      - "${HTTP_PORT:-8000}:8000"    # HTTP/REST inference API
      - "${GRPC_PORT:-8001}:8001"    # gRPC inference API
      - "${METRICS_PORT:-8002}:8002" # Prometheus metrics endpoint
    volumes:
      # Named volume holding the model repository served by Triton.
      - triton_models:/models
    # Folded scalar (>) joins these lines into one command string.
    # NOTE(review): --strict-model-config is deprecated in recent Triton
    # releases in favor of --disable-auto-complete-config; kept here because
    # 24.08 still accepts it — confirm before upgrading the image tag.
    command: >
      tritonserver
      --model-repository=/models
      --strict-model-config=${STRICT_CONFIG:-false}
      --log-verbose=${LOG_VERBOSE:-0}
      --exit-on-error=${EXIT_ON_ERROR:-false}
      --rate-limit=${RATE_LIMIT:-off}
      --model-control-mode=${MODEL_CONTROL:-poll}
      --repository-poll-secs=${POLL_INTERVAL:-30}
    environment:
      # Restrict visible GPUs if needed, e.g. CUDA_DEVICES=0,1
      - CUDA_VISIBLE_DEVICES=${CUDA_DEVICES:-all}
    deploy:
      resources:
        reservations:
          # Reserve all host NVIDIA GPUs for this container
          # (requires the NVIDIA Container Toolkit).
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    # Shared memory for CUDA IPC / shared-memory tensors; quoted so the
    # interpolated value is always treated as a string by the YAML parser.
    shm_size: "${SHM_SIZE:-1g}"
    ulimits:
      # Unlimited locked memory — required for CUDA pinned allocations.
      memlock: -1
      # 64 MiB stack, per NVIDIA's recommended container settings.
      stack: 67108864
    healthcheck:
      # Triton's readiness endpoint returns 200 once models are loaded.
      test: ["CMD", "curl", "-f", "http://localhost:8000/v2/health/ready"]
      interval: 30s
      timeout: 10s
      retries: 5
      # Grace period for model loading at startup; without this, slow model
      # loads exhaust the retries and the container is flagged unhealthy.
      start_period: ${HEALTH_START_PERIOD:-120s}

volumes:
  # Persistent model repository, mounted at /models inside the container.
  triton_models: