mirror of
https://github.com/NVIDIA/dgx-spark-playbooks.git
synced 2026-04-22 18:13:52 +00:00
chore: Regenerate all playbooks
This commit is contained in:
parent
35995f8384
commit
7ad00de39e
@ -1,48 +0,0 @@
|
||||
version: '3.8'
|
||||
|
||||
services:
|
||||
trtllm:
|
||||
image: nvcr.io/nvidia/tensorrt-llm/release:1.0.0rc3
|
||||
deploy:
|
||||
replicas: 2
|
||||
restart_policy:
|
||||
condition: any
|
||||
delay: 5s
|
||||
max_attempts: 3
|
||||
window: 120s
|
||||
resources:
|
||||
reservations:
|
||||
generic_resources:
|
||||
- discrete_resource_spec:
|
||||
kind: 'NVIDIA_GPU'
|
||||
value: 1
|
||||
environment:
|
||||
- UCX_NET_DEVICES=enp1s0f0np0,enp1s0f1np1
|
||||
- NCCL_SOCKET_IFNAME=enp1s0f0np0,enp1s0f1np1
|
||||
- OMPI_MCA_btl_tcp_if_include=enp1s0f0np0,enp1s0f1np1
|
||||
- OMPI_MCA_orte_default_hostfile=/etc/openmpi-hostfile
|
||||
- OMPI_MCA_rmaps_ppr_n_pernode=1
|
||||
- OMPI_ALLOW_RUN_AS_ROOT=1
|
||||
- OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1
|
||||
entrypoint: /opt/trtllm-mn-entrypoint.sh
|
||||
volumes:
|
||||
- ~/.cache/huggingface/:/root/.cache/huggingface/
|
||||
- ~/trtllm-mn-entrypoint.sh:/opt/trtllm-mn-entrypoint.sh
|
||||
- ~/.ssh:/tmp/.ssh:ro
|
||||
ulimits:
|
||||
memlock: -1
|
||||
stack: 67108864
|
||||
devices:
|
||||
- /dev/infiniband:/dev/infiniband
|
||||
networks:
|
||||
- host
|
||||
healthcheck:
|
||||
test: ["CMD", "service", "ssh", "status"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 10
|
||||
|
||||
networks:
|
||||
host:
|
||||
name: host
|
||||
external: true
|
||||
@ -47,7 +47,7 @@ sed -i.bak \
|
||||
echo "root:root" | chpasswd
|
||||
|
||||
# Configure SSH client for root to disable host key checks within *
|
||||
echo -e '\nHost *\n StrictHostKeyChecking no\n Port '$SSH_PORT'\n UserKnownHostsFile=/dev/null' > /etc/ssh/ssh_config.d/trt-llm.conf && \
|
||||
printf '\nHost *\n StrictHostKeyChecking no\n Port %s\n UserKnownHostsFile=/dev/null\n' "$SSH_PORT" > /etc/ssh/ssh_config.d/trt-llm.conf && \
|
||||
chmod 600 /etc/ssh/ssh_config.d/trt-llm.conf
|
||||
|
||||
# Fix login session for container
|
||||
|
||||
Loading…
Reference in New Issue
Block a user