mirror of
https://github.com/NVIDIA/dgx-spark-playbooks.git
synced 2026-04-25 19:33:53 +00:00
chore: Regenerate all playbooks
This commit is contained in:
parent
35995f8384
commit
7ad00de39e
@ -1,48 +0,0 @@
|
|||||||
version: '3.8'
|
|
||||||
|
|
||||||
services:
|
|
||||||
trtllm:
|
|
||||||
image: nvcr.io/nvidia/tensorrt-llm/release:1.0.0rc3
|
|
||||||
deploy:
|
|
||||||
replicas: 2
|
|
||||||
restart_policy:
|
|
||||||
condition: any
|
|
||||||
delay: 5s
|
|
||||||
max_attempts: 3
|
|
||||||
window: 120s
|
|
||||||
resources:
|
|
||||||
reservations:
|
|
||||||
generic_resources:
|
|
||||||
- discrete_resource_spec:
|
|
||||||
kind: 'NVIDIA_GPU'
|
|
||||||
value: 1
|
|
||||||
environment:
|
|
||||||
- UCX_NET_DEVICES=enp1s0f0np0,enp1s0f1np1
|
|
||||||
- NCCL_SOCKET_IFNAME=enp1s0f0np0,enp1s0f1np1
|
|
||||||
- OMPI_MCA_btl_tcp_if_include=enp1s0f0np0,enp1s0f1np1
|
|
||||||
- OMPI_MCA_orte_default_hostfile=/etc/openmpi-hostfile
|
|
||||||
- OMPI_MCA_rmaps_ppr_n_pernode=1
|
|
||||||
- OMPI_ALLOW_RUN_AS_ROOT=1
|
|
||||||
- OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1
|
|
||||||
entrypoint: /opt/trtllm-mn-entrypoint.sh
|
|
||||||
volumes:
|
|
||||||
- ~/.cache/huggingface/:/root/.cache/huggingface/
|
|
||||||
- ~/trtllm-mn-entrypoint.sh:/opt/trtllm-mn-entrypoint.sh
|
|
||||||
- ~/.ssh:/tmp/.ssh:ro
|
|
||||||
ulimits:
|
|
||||||
memlock: -1
|
|
||||||
stack: 67108864
|
|
||||||
devices:
|
|
||||||
- /dev/infiniband:/dev/infiniband
|
|
||||||
networks:
|
|
||||||
- host
|
|
||||||
healthcheck:
|
|
||||||
test: ["CMD", "service", "ssh", "status"]
|
|
||||||
interval: 30s
|
|
||||||
timeout: 10s
|
|
||||||
retries: 10
|
|
||||||
|
|
||||||
networks:
|
|
||||||
host:
|
|
||||||
name: host
|
|
||||||
external: true
|
|
||||||
@ -47,7 +47,7 @@ sed -i.bak \
|
|||||||
echo "root:root" | chpasswd
|
echo "root:root" | chpasswd
|
||||||
|
|
||||||
# Configure SSH client for root to disable host key checks within *
|
# Configure SSH client for root to disable host key checks within *
|
||||||
echo -e '\nHost *\n StrictHostKeyChecking no\n Port '$SSH_PORT'\n UserKnownHostsFile=/dev/null' > /etc/ssh/ssh_config.d/trt-llm.conf && \
|
printf '\nHost *\n StrictHostKeyChecking no\n Port %s\n UserKnownHostsFile=/dev/null\n' "$SSH_PORT" > /etc/ssh/ssh_config.d/trt-llm.conf && \
|
||||||
chmod 600 /etc/ssh/ssh_config.d/trt-llm.conf
|
chmod 600 /etc/ssh/ssh_config.d/trt-llm.conf
|
||||||
|
|
||||||
# Fix login session for container
|
# Fix login session for container
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user