mirror of
https://github.com/NVIDIA/dgx-spark-playbooks.git
synced 2026-04-22 10:03:54 +00:00
chore: Regenerate all playbooks
This commit is contained in:
parent
e07330f8dc
commit
ab2ca0fcf1
@ -24,6 +24,12 @@
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# Check if running as root
|
||||
if [[ $EUID -eq 0 ]]; then
|
||||
echo "Error: This script should not be run as root"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Dynamically get interface names from ibdev2netdev output
|
||||
# Use ibdev2netdev to list Infiniband devices and their network interfaces.
|
||||
# The awk command searches for lines containing 'Up)' (i.e., interfaces that are up)
|
||||
@ -34,7 +40,7 @@ if [ ${#INTERFACES[@]} -eq 0 ]; then
|
||||
echo "ERROR: No active interfaces found via ibdev2netdev."
|
||||
exit 1
|
||||
fi
|
||||
OUTPUT_FILE="/tmp/stacked-sparks-hostfile"
|
||||
# Hostfile location in the invoking user's home directory.
# FIX: "~" does not expand inside double quotes, so the original created a
# literal "~" path; $HOME expands correctly.
OUTPUT_FILE="$HOME/.stacked-sparks-hostfile"
|
||||
|
||||
# Check if avahi-browse is available
|
||||
if ! command -v avahi-browse &> /dev/null; then
|
||||
|
||||
@ -1,247 +0,0 @@
|
||||
<!--
|
||||
#!/bin/bash
|
||||
#
|
||||
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
-->
|
||||
|
||||
# TensorRT-LLM on Stacked Spark Instructions
|
||||
|
||||
## Step 1. Setup networking between nodes
|
||||
Configure network interfaces using netplan on both DGX Spark nodes:
|
||||
|
||||
```bash
|
||||
# On both nodes, create the netplan configuration file (also available in cx7-netplan.yaml in this repository)
|
||||
sudo tee /etc/netplan/40-cx7.yaml > /dev/null <<EOF
|
||||
network:
|
||||
version: 2
|
||||
ethernets:
|
||||
enp1s0f0np0:
|
||||
link-local: [ ipv4 ]
|
||||
enp1s0f1np1:
|
||||
link-local: [ ipv4 ]
|
||||
EOF
|
||||
|
||||
# On both nodes, set appropriate permissions
|
||||
sudo chmod 600 /etc/netplan/40-cx7.yaml
|
||||
|
||||
# On both nodes, apply the netplan configuration
|
||||
sudo netplan apply
|
||||
```
|
||||
|
||||
## Step 2: Run the DGX Spark discovery script
|
||||
Automatically identify DGX Spark systems interconnected, and setup SSH passwordless authentication.
|
||||
```
|
||||
# On either node, run the following
|
||||
$ ./discover-sparks
|
||||
Found: 192.168.100.10 (spark-1b3b.local)
|
||||
Found: 192.168.100.11 (spark-1d84.local)
|
||||
|
||||
Copying your SSH public key to all discovered nodes using ssh-copy-id.
|
||||
You may be prompted for your password on each node.
|
||||
Copying SSH key to 192.168.100.10 ...
|
||||
Copying SSH key to 192.168.100.11 ...
|
||||
nvidia@192.168.100.11's password:
|
||||
|
||||
SSH key copy process complete. These two sparks can now talk to each other.
|
||||
```
|
||||
|
||||
## Step 3: Setup Docker Swarm with GPU support
|
||||
|
||||
### Substep A: Install NVIDIA Container Toolkit
|
||||
Ensure the NVIDIA drivers and the NVIDIA Container Toolkit are installed on each node (both manager and workers) that will provide GPU resources. This package enables Docker containers to access the host's GPU hardware. Ensure you complete the [installation steps](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html), including the [Docker configuration](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#configuring-docker) for NVIDIA Container Toolkit.
|
||||
|
||||
### Substep B: Enable resource advertising
|
||||
Modify the NVIDIA Container Runtime to advertise the GPUs to the Swarm by uncommenting the swarm-resource line in the config.toml file. You can do this either with your preferred text editor (e.g., vim, nano...) or with the following command:
|
||||
```bash
|
||||
sudo sed -i 's/^#\s*\(swarm-resource\s*=\s*".*"\)/\1/' /etc/nvidia-container-runtime/config.toml
|
||||
```
|
||||
To apply the changes, restart the Docker daemon
|
||||
```
|
||||
sudo systemctl restart docker
|
||||
```
|
||||
|
||||
## Step 4: Run inference in TRT-LLM
|
||||
### Substep A: Initialize Docker Swarm
|
||||
On whichever node you want to use as primary, run the following swarm initialization command
|
||||
```bash
|
||||
docker swarm init --advertise-addr $(ip -o -4 addr show enp1s0f0np0 | awk '{print $4}' | cut -d/ -f1) $(ip -o -4 addr show enp1s0f1np1 | awk '{print $4}' | cut -d/ -f1)
|
||||
```
|
||||
|
||||
The typical output of the above would be similar to the following:
|
||||
```
|
||||
Swarm initialized: current node (node-id) is now a manager.
|
||||
|
||||
To add a worker to this swarm, run the following command:
|
||||
|
||||
docker swarm join --token <worker-token> <advertise-addr>:<port>
|
||||
|
||||
To add a manager to this swarm, run 'docker swarm join-token manager' and follow the instructions.
|
||||
```
|
||||
|
||||
### Substep B: Join worker nodes and deploy
|
||||
Now we can proceed with setting up other nodes of your cluster:
|
||||
|
||||
```bash
|
||||
# Run the command suggested by the docker swarm init on each worker node to join the Docker swarm
|
||||
docker swarm join --token <worker-token> <advertise-addr>:<port>
|
||||
|
||||
# On your primary node, deploy the stack using the following command
|
||||
docker stack deploy -c docker-compose.yml trtllm-multinode
|
||||
|
||||
# You can verify the status of your worker nodes using the following
|
||||
docker stack ps trtllm-multinode
|
||||
|
||||
# In case you see any errors reported by docker ps for any node, you can verify using
|
||||
docker service logs <ID>
|
||||
```
|
||||
|
||||
If everything is healthy, you should see a similar output to the following:
|
||||
```
|
||||
nvidia@spark-1b3b:~/draft-playbooks/trt-llm-on-stacked-spark$ docker stack ps trtllm-multinode
|
||||
ID NAME IMAGE NODE DESIRED STATE CURRENT STATE ERROR PORTS
|
||||
oe9k5o6w41le trtllm-multinode_trtllm.1 nvcr.io/nvidia/tensorrt-llm/release:1.0.0rc3 spark-1d84 Running Running 2 minutes ago
|
||||
phszqzk97p83 trtllm-multinode_trtllm.2 nvcr.io/nvidia/tensorrt-llm/release:1.0.0rc3 spark-1b3b Running Running 2 minutes ago
|
||||
```
|
||||
|
||||
### Substep C. Create hosts file
|
||||
|
||||
|
||||
You can check the available nodes using `docker node ls`
|
||||
```
|
||||
nvidia@spark-1b3b:~$ docker node ls
|
||||
ID HOSTNAME STATUS AVAILABILITY MANAGER STATUS ENGINE VERSION
|
||||
hza2b7yisatqiezo33zx4in4i * spark-1b3b Ready Active Leader 28.3.3
|
||||
m1k22g3ktgnx36qz4jg5fzhr4 spark-1d84 Ready Active 28.3.3
|
||||
```
|
||||
|
||||
Generate a file containing all Docker Swarm node addresses for MPI operations, and then copy it over to your container:
|
||||
```bash
|
||||
docker node ls --format '{{.ID}}' | xargs -n1 docker node inspect --format '{{ .Status.Addr }}' > ~/openmpi-hostfile
|
||||
docker cp ~/openmpi-hostfile $(docker ps -q -f name=trtllm-multinode):/etc/openmpi-hostfile
|
||||
```
|
||||
|
||||
### Substep D. Find your Docker container ID
|
||||
You can use `docker ps` to find your Docker container ID. Alternatively, you can save the container ID in a variable:
|
||||
```
|
||||
export TRTLLM_MN_CONTAINER=$(docker ps -q -f name=trtllm-multinode)
|
||||
```
|
||||
|
||||
### Substep E. Generate configuration file
|
||||
|
||||
```bash
|
||||
docker exec $TRTLLM_MN_CONTAINER bash -c 'cat <<EOF > /tmp/extra-llm-api-config.yml
|
||||
print_iter_log: false
|
||||
kv_cache_config:
|
||||
dtype: "fp8"
|
||||
free_gpu_memory_fraction: 0.9
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
EOF'
|
||||
```
|
||||
|
||||
### Substep F. Download model
|
||||
|
||||
```bash
|
||||
docker exec \
|
||||
-e MODEL="nvidia/Qwen3-235B-A22B-FP4" \
|
||||
-e HF_TOKEN="hf_..." \
|
||||
-it $TRTLLM_MN_CONTAINER bash -c 'mpirun -x HF_TOKEN bash -c "huggingface-cli download $MODEL"'
|
||||
```
|
||||
|
||||
### Substep G. Prepare dataset and benchmark
|
||||
|
||||
```bash
|
||||
docker exec \
|
||||
-e ISL=128 -e OSL=128 \
|
||||
-e MODEL="nvidia/Qwen3-235B-A22B-FP4" \
|
||||
-e HF_TOKEN="" \
|
||||
-it $TRTLLM_MN_CONTAINER bash -c '
|
||||
mpirun -x HF_TOKEN bash -c "python benchmarks/cpp/prepare_dataset.py --tokenizer=$MODEL --stdout token-norm-dist --num-requests=1 --input-mean=$ISL --output-mean=$OSL --input-stdev=0 --output-stdev=0 > /tmp/dataset.txt" && \
|
||||
mpirun -x HF_TOKEN trtllm-llmapi-launch trtllm-bench -m $MODEL throughput \
|
||||
--tp 2 \
|
||||
--dataset /tmp/dataset.txt \
|
||||
--backend pytorch \
|
||||
--max_num_tokens 4096 \
|
||||
--concurrency 1 \
|
||||
--max_batch_size 4 \
|
||||
--extra_llm_api_options /tmp/extra-llm-api-config.yml \
|
||||
--streaming'
|
||||
```
|
||||
|
||||
### Substep H. Serve the model
|
||||
|
||||
```bash
|
||||
docker exec \
|
||||
-e MODEL="nvidia/Qwen3-235B-A22B-FP4" \
|
||||
-e HF_TOKEN="" \
|
||||
-it $TRTLLM_MN_CONTAINER bash -c '
|
||||
mpirun -x HF_TOKEN trtllm-llmapi-launch trtllm-serve $MODEL \
|
||||
--tp_size 2 \
|
||||
--backend pytorch \
|
||||
--max_num_tokens 32768 \
|
||||
--max_batch_size 4 \
|
||||
--extra_llm_api_options /tmp/extra-llm-api-config.yml \
|
||||
--port 8000'
|
||||
```
|
||||
|
||||
This will start the TensorRT-LLM server on port 8000. You can then make inference requests to `http://localhost:8000` using the OpenAI-compatible API format.
|
||||
|
||||
**Expected output:** Server startup logs and ready message.
|
||||
|
||||
### Example inference request
|
||||
|
||||
Once the server is running, you can test it with a CURL request. Please ensure the CURL request is run on the primary node where you previously ran Substep H.
|
||||
|
||||
```bash
|
||||
curl -X POST http://localhost:8000/v1/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "nvidia/Qwen3-235B-A22B-FP4",
|
||||
"prompt": "What is artificial intelligence?",
|
||||
"max_tokens": 100,
|
||||
"temperature": 0.7,
|
||||
"stream": false
|
||||
}'
|
||||
```
|
||||
|
||||
## Step 6. Troubleshooting
|
||||
|
||||
| Symptom | Cause | Fix |
|
||||
|---------|-------|-----|
|
||||
| MPI hostname test returns single hostname | Network connectivity issues | Verify both nodes are on 192.168.100.0/24 subnet |
|
||||
| "Permission denied" on HuggingFace download | Invalid or missing HF_TOKEN | Set valid token: `export HF_TOKEN=<TOKEN>` |
|
||||
| "CUDA out of memory" errors | Insufficient GPU memory | Reduce `--max_batch_size` or `--max_num_tokens` |
|
||||
| Container exits immediately | Missing entrypoint script | Ensure `trtllm-mn-entrypoint.sh` download succeeded and has executable permissions |
|
||||
|
||||
## Step 7. Cleanup and rollback
|
||||
|
||||
Stop and remove containers by using the following command on the leader node:
|
||||
|
||||
```bash
|
||||
docker stack rm trtllm-multinode
|
||||
```
|
||||
|
||||
> **Warning:** This removes all inference data and performance reports. Copy `/opt/*perf-report.json` files before cleanup if needed.
|
||||
|
||||
Remove downloaded models to free disk space:
|
||||
|
||||
```bash
|
||||
rm -rf $HOME/.cache/huggingface/hub/models--nvidia--Qwen3*
|
||||
```
|
||||
|
||||
## Step 8. Next steps
|
||||
|
||||
Compare performance metrics between speculative decoding and baseline reports to quantify speed improvements. Use the multi-node setup as a foundation for deploying other large models requiring tensor parallelism, or scale to additional nodes for higher throughput workloads.
|
||||
@ -17,13 +17,14 @@
|
||||
- [Step 9. Troubleshooting](#step-9-troubleshooting)
|
||||
- [Step 10. Cleanup and rollback](#step-10-cleanup-and-rollback)
|
||||
- [Run on two Sparks](#run-on-two-sparks)
|
||||
- [Step 1. Verify connectivity and SSH setup](#step-1-verify-connectivity-and-ssh-setup)
|
||||
- [Step 2. Install NVIDIA Container Toolkit](#step-2-install-nvidia-container-toolkit)
|
||||
- [Step 3. Enable resource advertising](#step-3-enable-resource-advertising)
|
||||
- [Step 4. Initialize Docker Swarm](#step-4-initialize-docker-swarm)
|
||||
- [Step 5. Join worker nodes and deploy](#step-5-join-worker-nodes-and-deploy)
|
||||
- [Step 6. Create hosts file](#step-6-create-hosts-file)
|
||||
- [Step 7. Find your Docker container ID](#step-7-find-your-docker-container-id)
|
||||
- [Step 1. Configure Docker permissions](#step-1-configure-docker-permissions)
|
||||
- [Step 2. Configure network connectivity](#step-2-configure-network-connectivity)
|
||||
- [Step 3. Install NVIDIA Container Toolkit](#step-3-install-nvidia-container-toolkit)
|
||||
- [Step 4. Enable resource advertising](#step-4-enable-resource-advertising)
|
||||
- [Step 5. Initialize Docker Swarm](#step-5-initialize-docker-swarm)
|
||||
- [Step 6. Join worker nodes and deploy](#step-6-join-worker-nodes-and-deploy)
|
||||
- [Step 7. Create hosts file](#step-7-create-hosts-file)
|
||||
- [Step 8. Find your Docker container ID](#step-8-find-your-docker-container-id)
|
||||
- [Step 8. Generate configuration file](#step-8-generate-configuration-file)
|
||||
- [Step 9. Download model](#step-9-download-model)
|
||||
- [Step 10. Serve the model](#step-10-serve-the-model)
|
||||
@ -38,9 +39,9 @@
|
||||
|
||||
## Basic idea
|
||||
|
||||
**NVIDIA TensorRT-LLM (TRT-LLM)** is an open-source library for optimizing and accelerating large language model (LLM) inference on NVIDIA GPUs.
|
||||
**NVIDIA TensorRT-LLM (TRT-LLM)** is an open-source library for optimizing and accelerating large language model (LLM) inference on NVIDIA GPUs.
|
||||
|
||||
It provides highly efficient kernels, memory management, and parallelism strategies—like tensor, pipeline, and sequence parallelism—so developers can serve LLMs with lower latency and higher throughput.
|
||||
It provides highly efficient kernels, memory management, and parallelism strategies—like tensor, pipeline, and sequence parallelism—so developers can serve LLMs with lower latency and higher throughput.
|
||||
|
||||
TRT-LLM integrates with frameworks like Hugging Face and PyTorch, making it easier to deploy state-of-the-art models at scale.
|
||||
|
||||
@ -69,6 +70,14 @@ inference through kernel-level optimizations, efficient memory layouts, and adva
|
||||
- Internet connectivity for downloading models and container images
|
||||
- Network: open TCP ports 8355 (LLM) and 8356 (VLM) on host for OpenAI-compatible serving
|
||||
|
||||
## Ancillary files
|
||||
|
||||
All required assets can be found [here on GitLab](https://gitlab.com/nvidia/dgx-spark/temp-external-playbook-assets/dgx-spark-playbook-assets/-/blob/main)
|
||||
|
||||
- [**discover-sparks.sh**](https://gitlab.com/nvidia/dgx-spark/temp-external-playbook-assets/dgx-spark-playbook-assets/-/blob/main/${MODEL}/assets/discover-sparks.sh) — script to automatically discover and configure SSH between Spark nodes
|
||||
- [**trtllm-mn-entrypoint.sh**](https://gitlab.com/nvidia/dgx-spark/temp-external-playbook-assets/dgx-spark-playbook-assets/-/blob/main/${MODEL}/assets/trtllm-mn-entrypoint.sh) — container entrypoint script for multi-node setup
|
||||
- [**docker-compose.yml**](https://gitlab.com/nvidia/dgx-spark/temp-external-playbook-assets/dgx-spark-playbook-assets/-/blob/main/${MODEL}/assets/docker-compose.yml) — Docker Compose configuration for multi-node deployment
|
||||
|
||||
## Model Support Matrix
|
||||
|
||||
The following models are supported with TensorRT-LLM on Spark. All listed models are available and ready to use:
|
||||
@ -95,7 +104,7 @@ The following models are supported with TensorRT-LLM on Spark. All listed models
|
||||
| **Llama-4-Scout-17B-16E-Instruct** | NVFP4 | ✅ | `nvidia/Llama-4-Scout-17B-16E-Instruct-FP4` |
|
||||
| **Qwen3-235B-A22B (two Sparks only)** | NVFP4 | ✅ | `nvidia/Qwen3-235B-A22B-FP4` |
|
||||
|
||||
**Note:** You can use the NVFP4 Quantization documentation to generate your own NVFP4-quantized checkpoints for your favorite models. This enables you to take advantage of the performance and memory benefits of NVFP4 quantization even for models not already published by NVIDIA.
|
||||
**Note:** You can use the NVFP4 Quantization documentation to generate your own NVFP4-quantized checkpoints for your favorite models. This enables you to take advantage of the performance and memory benefits of NVFP4 quantization even for models not already published by NVIDIA.
|
||||
|
||||
Reminder: not all model architectures are supported for NVFP4 quantization.
|
||||
|
||||
@ -413,29 +422,113 @@ docker rmi nvcr.io/nvidia/tensorrt-llm/release:spark-single-gpu-dev
|
||||
|
||||
## Run on two Sparks
|
||||
|
||||
### Step 1. Verify connectivity and SSH setup
|
||||
### Step 1. Configure Docker permissions
|
||||
|
||||
Verify that the two Spark nodes can communicate with each other using ping and that SSH passwordless authentication is properly configured.
|
||||
To easily manage containers without sudo, you must be in the `docker` group. If you choose to skip this step, you will need to run Docker commands with sudo.
|
||||
|
||||
Open a new terminal and test Docker access. In the terminal, run:
|
||||
|
||||
```bash
|
||||
## Test network connectivity between nodes (replace with your actual node IPs)
|
||||
ping -c 3 <other-node-ip>
|
||||
docker ps
|
||||
```
|
||||
|
||||
If you see a permission denied error (something like `permission denied while trying to connect to the Docker daemon socket`), add your user to the docker group:
|
||||
|
||||
```bash
|
||||
## Test SSH passwordless authentication (replace with your actual node IP)
|
||||
ssh nvidia@<other-node-ip> hostname
|
||||
sudo usermod -aG docker $USER
|
||||
```
|
||||
|
||||
**Expected results:**
|
||||
- Ping should show successful packet transmission with 0% packet loss
|
||||
- SSH command should execute without prompting for a password and return the remote hostname
|
||||
> **Warning**: After running usermod, you must log out and log back in to start a new
|
||||
> session with updated group permissions.
|
||||
|
||||
### Step 2. Install NVIDIA Container Toolkit
|
||||
### Step 2. Configure network connectivity
|
||||
|
||||
You have two options for configuring network connectivity between your DGX Spark nodes:
|
||||
|
||||
#### Option 1: Automatic IP assignment (recommended)
|
||||
|
||||
Follow these steps on both DGX Spark nodes to configure network interfaces using netplan:
|
||||
|
||||
```bash
|
||||
## Create the netplan configuration file
|
||||
sudo tee /etc/netplan/40-cx7.yaml > /dev/null <<EOF
|
||||
network:
|
||||
version: 2
|
||||
ethernets:
|
||||
enp1s0f0np0:
|
||||
link-local: [ ipv4 ]
|
||||
enp1s0f1np1:
|
||||
link-local: [ ipv4 ]
|
||||
EOF
|
||||
|
||||
## Set appropriate permissions
|
||||
sudo chmod 600 /etc/netplan/40-cx7.yaml
|
||||
|
||||
## Apply the configuration
|
||||
sudo netplan apply
|
||||
```
|
||||
|
||||
#### Option 2: Manual IP assignment (advanced)
|
||||
|
||||
First, identify which network ports are available and up:
|
||||
|
||||
```bash
|
||||
## Check network port status
|
||||
ibdev2netdev
|
||||
```
|
||||
|
||||
Example output:
|
||||
```
|
||||
roceP2p1s0f0 port 1 ==> enP2p1s0f0np0 (Up)
|
||||
roceP2p1s0f1 port 1 ==> enP2p1s0f1np1 (Down)
|
||||
rocep1s0f0 port 1 ==> enp1s0f0np0 (Up)
|
||||
rocep1s0f1 port 1 ==> enp1s0f1np1 (Down)
|
||||
```
|
||||
|
||||
Use an interface that shows as "(Up)" in your output. In this example, we'll use enP2p1s0f0np0.
|
||||
|
||||
On Node 1:
|
||||
```bash
|
||||
## Assign static IP and bring up interface
|
||||
sudo ip addr add 192.168.100.10/24 dev enp1s0f0np0
|
||||
sudo ip link set enp1s0f0np0 up
|
||||
```
|
||||
|
||||
On Node 2:
|
||||
```bash
|
||||
## Assign static IP and bring up interface
|
||||
sudo ip addr add 192.168.100.11/24 dev enp1s0f0np0
|
||||
sudo ip link set enp1s0f0np0 up
|
||||
```
|
||||
|
||||
|
||||
#### Set up passwordless SSH authentication
|
||||
|
||||
Run the DGX Spark [**discover-sparks.sh**](https://gitlab.com/nvidia/dgx-spark/temp-external-playbook-assets/dgx-spark-playbook-assets/-/blob/main/${MODEL}/assets/discover-sparks.sh) script on both nodes to automatically configure SSH:
|
||||
|
||||
```bash
|
||||
bash ./discover-sparks.sh
|
||||
```
|
||||
|
||||
Expected output similar to the below, with different IPs and node names. The first time you run the script, you'll be prompted for your password for each node.
|
||||
```
|
||||
Found: 192.168.100.10 (spark-1b3b.local)
|
||||
Found: 192.168.100.11 (spark-1d84.local)
|
||||
|
||||
Copying your SSH public key to all discovered nodes using ssh-copy-id.
|
||||
You may be prompted for your password on each node.
|
||||
Copying SSH key to 192.168.100.10 ...
|
||||
Copying SSH key to 192.168.100.11 ...
|
||||
nvidia@192.168.100.11's password:
|
||||
|
||||
SSH key copy process complete. These two sparks can now talk to each other.
|
||||
```
|
||||
|
||||
### Step 3. Install NVIDIA Container Toolkit
|
||||
|
||||
Ensure the NVIDIA drivers and the NVIDIA Container Toolkit are installed on each node (both manager and workers) that will provide GPU resources. This package enables Docker containers to access the host's GPU hardware. Ensure you complete the [installation steps](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html), including the [Docker configuration](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#configuring-docker) for NVIDIA Container Toolkit.
|
||||
|
||||
### Step 3. Enable resource advertising
|
||||
### Step 4. Enable resource advertising
|
||||
|
||||
Modify the NVIDIA Container Runtime to advertise the GPUs to the Swarm by uncommenting the swarm-resource line in the config.toml file. You can do this either with your preferred text editor (e.g., vim, nano...) or with the following command:
|
||||
```bash
|
||||
@ -446,7 +539,7 @@ To apply the changes, restart the Docker daemon
|
||||
sudo systemctl restart docker
|
||||
```
|
||||
|
||||
### Step 4. Initialize Docker Swarm
|
||||
### Step 5. Initialize Docker Swarm
|
||||
|
||||
On whichever node you want to use as primary, run the following swarm initialization command
|
||||
```bash
|
||||
@ -464,34 +557,37 @@ To add a worker to this swarm, run the following command:
|
||||
To add a manager to this swarm, run 'docker swarm join-token manager' and follow the instructions.
|
||||
```
|
||||
|
||||
### Step 5. Join worker nodes and deploy
|
||||
### Step 6. Join worker nodes and deploy
|
||||
|
||||
Now we can proceed with setting up other nodes of your cluster:
|
||||
Now we can proceed with setting up other nodes of your cluster.
|
||||
|
||||
Run the command suggested by the docker swarm init on each worker node to join the Docker swarm
|
||||
```bash
|
||||
docker swarm join --token <worker-token> <advertise-addr>:<port>
|
||||
```
|
||||
|
||||
On your primary node, deploy the TRT-LLM multi-node stack by downloading the [**docker-compose.yml**](https://gitlab.com/nvidia/dgx-spark/temp-external-playbook-assets/dgx-spark-playbook-assets/-/blob/main/${MODEL}/assets/docker-compose.yml) and [**trtllm-mn-entrypoint.sh**](https://gitlab.com/nvidia/dgx-spark/temp-external-playbook-assets/dgx-spark-playbook-assets/-/blob/main/${MODEL}/assets/trtllm-mn-entrypoint.sh) files into your home directory and running the following command:
|
||||
|
||||
```bash
|
||||
## Run the command suggested by the docker swarm init on each worker node to join the Docker swarm
|
||||
docker swarm join --token <worker-token> <advertise-addr>:<port>
|
||||
chmod +x $HOME/trtllm-mn-entrypoint.sh
|
||||
docker stack deploy -c $HOME/docker-compose.yml trtllm-multinode
|
||||
```
|
||||
Note: Ensure you download both files into the same directory from which you are running the command.
|
||||
|
||||
## On your primary node, deploy the stack using the following command
|
||||
## Note: You'll need a docker-compose.yml file for TRT-LLM deployment
|
||||
docker stack deploy -c docker-compose.yml trtllm-multinode
|
||||
|
||||
## You can verify the status of your worker nodes using the following
|
||||
You can verify the status of your worker nodes using the following
|
||||
```bash
|
||||
docker stack ps trtllm-multinode
|
||||
|
||||
## In case you see any errors reported by docker ps for any node, you can verify using
|
||||
docker service logs <ID>
|
||||
```
|
||||
|
||||
If everything is healthy, you should see a similar output to the following:
|
||||
```
|
||||
nvidia@spark-1b3b:~/draft-playbooks/trt-llm-on-stacked-spark$ docker stack ps trtllm-multinode
|
||||
nvidia@spark-1b3b:~$ docker stack ps trtllm-multinode
|
||||
ID NAME IMAGE NODE DESIRED STATE CURRENT STATE ERROR PORTS
|
||||
oe9k5o6w41le trtllm-multinode_trtllm.1 nvcr.io/nvidia/tensorrt-llm/release:spark-single-gpu-dev spark-1d84 Running Running 2 minutes ago
|
||||
phszqzk97p83 trtllm-multinode_trtllm.2 nvcr.io/nvidia/tensorrt-llm/release:spark-single-gpu-dev spark-1b3b Running Running 2 minutes ago
|
||||
oe9k5o6w41le trtllm-multinode_trtllm.1 nvcr.io/nvidia/tensorrt-llm/release:1.0.0rc3 spark-1d84 Running Running 2 minutes ago
|
||||
phszqzk97p83 trtllm-multinode_trtllm.2 nvcr.io/nvidia/tensorrt-llm/release:1.0.0rc3 spark-1b3b Running Running 2 minutes ago
|
||||
```
|
||||
|
||||
### Step 6. Create hosts file
|
||||
### Step 7. Create hosts file
|
||||
|
||||
You can check the available nodes using `docker node ls`
|
||||
```
|
||||
@ -507,7 +603,7 @@ docker node ls --format '{{.ID}}' | xargs -n1 docker node inspect --format '{{ .
|
||||
docker cp ~/openmpi-hostfile $(docker ps -q -f name=trtllm-multinode):/etc/openmpi-hostfile
|
||||
```
|
||||
|
||||
### Step 7. Find your Docker container ID
|
||||
### Step 8. Find your Docker container ID
|
||||
|
||||
You can use `docker ps` to find your Docker container ID. Alternatively, you can save the container ID in a variable:
|
||||
```bash
|
||||
@ -592,7 +688,7 @@ curl -X POST http://localhost:8000/v1/chat/completions \
|
||||
| MPI hostname test returns single hostname | Network connectivity issues | Verify both nodes are on reachable IP addresses |
|
||||
| "Permission denied" on HuggingFace download | Invalid or missing HF_TOKEN | Set valid token: `export HF_TOKEN=<TOKEN>` |
|
||||
| "CUDA out of memory" errors | Insufficient GPU memory | Reduce `--max_batch_size` or `--max_num_tokens` |
|
||||
| Container exits immediately | Missing entrypoint script | Ensure `trtllm-mn-entrypoint.sh` download succeeded and has executable permissions |
|
||||
| Container exits immediately | Missing entrypoint script | Ensure `trtllm-mn-entrypoint.sh` download succeeded and has executable permissions, also ensure you are not running the container already on your node. If port 2233 is already utilized, the entrypoint script will not start. |
|
||||
|
||||
### Step 14. Cleanup and rollback
|
||||
|
||||
|
||||
180
nvidia/trt-llm/assets/discover-sparks.sh
Executable file
180
nvidia/trt-llm/assets/discover-sparks.sh
Executable file
@ -0,0 +1,180 @@
|
||||
#!/usr/bin/env bash
#
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# discover-sparks.sh
# Discover available systems using avahi-browse and generate an MPI hosts file,
# then set up bidirectional passwordless SSH between all discovered nodes.
# Searches all active interfaces automatically.
#
# Usage: bash ./discover-sparks.sh

set -euo pipefail

# Refuse to run as root: SSH keys and the hostfile must belong to the normal user.
if [[ $EUID -eq 0 ]]; then
    echo "Error: This script should not be run as root"
    exit 1
fi

# Dynamically get interface names from ibdev2netdev output.
# ibdev2netdev lists InfiniBand/RoCE devices and their network interfaces;
# awk keeps only lines for interfaces that are up ('Up)') and prints the 5th
# field (the interface name, e.g. enp1s0f0np0); tr strips any parentheses.
mapfile -t INTERFACES < <(ibdev2netdev | awk '/Up\)/ {print $5}' | tr -d '()')
if [ ${#INTERFACES[@]} -eq 0 ]; then
    echo "ERROR: No active interfaces found via ibdev2netdev."
    exit 1
fi

# FIX: "~" does not expand inside double quotes; use $HOME so the hostfile
# really lands in the user's home directory instead of a literal "~" path.
OUTPUT_FILE="$HOME/.stacked-sparks-hostfile"

# Check if avahi-browse is available
if ! command -v avahi-browse &> /dev/null; then
    echo "Error: avahi-browse not found. Please install avahi-utils package."
    exit 1
fi

# Check if ssh-copy-id is available
if ! command -v ssh-copy-id &> /dev/null; then
    echo "Error: ssh-copy-id not found. Please install openssh-client package."
    exit 1
fi

# Create temporary file for processing; removed on any exit path.
TEMP_FILE=$(mktemp)
trap 'rm -f "$TEMP_FILE"' EXIT

# Run avahi-browse and filter for SSH services on the detected interfaces.
# -p: parseable output
# -r: resolve host names and addresses
# -f -t: terminate after dumping all entries available at startup
avahi_output=$(avahi-browse -p -r -f -t _ssh._tcp 2>/dev/null)

# Keep only the entries seen on one of the active interfaces.
found_services=false
for interface in "${INTERFACES[@]}"; do
    if echo "$avahi_output" | grep "$interface" >> "$TEMP_FILE"; then
        found_services=true
    fi
done

if [ "$found_services" = false ]; then
    echo "Warning: No services found on any specified interface"
    touch "$OUTPUT_FILE"
    echo "Created empty hosts file: $OUTPUT_FILE"
    exit 0
fi

# Extract IPv4 addresses from the avahi-browse output.
# Record format: =;interface;IPv4;hostname\032service;description;local;fqdn;ip_address;port;

# Clear the output file
> "$OUTPUT_FILE"

# Parse IPv4 entries and extract IP addresses
grep "^=" "$TEMP_FILE" | grep "IPv4" | while IFS=';' read -r prefix interface protocol hostname_service description local fqdn ip_address port rest; do
    # Clean up any trailing data after the address field
    clean_ip=$(echo "$ip_address" | sed 's/;.*$//')

    # Validate dotted-quad IP address format
    if [[ $clean_ip =~ ^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]]; then
        echo "$clean_ip" >> "$OUTPUT_FILE"
        echo "Found: $clean_ip ($fqdn)"
    else
        echo "Warning: Invalid IP format: $clean_ip"
    fi
done

# Sort and remove duplicates
if [[ -s "$OUTPUT_FILE" ]]; then
    sort -u "$OUTPUT_FILE" -o "$OUTPUT_FILE"
else
    echo "No IPv4 addresses found."
    exit 1
fi

# Check if a local SSH key exists; generate one silently if not.
if [[ ! -f "$HOME/.ssh/id_rsa.pub" && ! -f "$HOME/.ssh/id_ed25519.pub" ]]; then
    ssh-keygen -t ed25519 -N "" -f "$HOME/.ssh/id_ed25519" -q
fi

echo ""
echo "Setting up bidirectional SSH access (local <-> remote nodes)..."
echo "You may be prompted for your password on each node."

# Ensure authorized_keys exists with safe permissions.
mkdir -p "$HOME/.ssh"
touch "$HOME/.ssh/authorized_keys"
chmod 700 "$HOME/.ssh"
chmod 600 "$HOME/.ssh/authorized_keys"

# FIX: read the hostfile on fd 3 so ssh/ssh-copy-id inside the loop cannot
# swallow the remaining node list from stdin (previously only the first node
# could be processed when ssh consumed stdin).
while read -r -u 3 node_ip; do
    if [[ -n "$node_ip" ]]; then
        echo ""
        echo "Setting up SSH access for $node_ip ..."

        # Step 1: Copy local SSH key to remote node
        echo " Copying local SSH key to $node_ip ..."
        if ssh-copy-id -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=accept-new "$USER@$node_ip" &>/dev/null; then
            echo " ✓ Successfully copied local key to $node_ip"

            # Step 2: Set up reverse SSH access (remote -> local)
            echo " Setting up reverse SSH access from $node_ip ..."

            # Generate an SSH key on the remote node if it doesn't exist and
            # fetch its public key. -n keeps the remote command off our stdin.
            remote_pubkey=$(ssh -n -o StrictHostKeyChecking=accept-new "$USER@$node_ip" '
                # Ensure SSH directory exists
                mkdir -p ~/.ssh
                chmod 700 ~/.ssh

                # Generate key if it doesn'"'"'t exist
                if [[ ! -f ~/.ssh/id_ed25519.pub ]]; then
                    ssh-keygen -t ed25519 -N "" -f ~/.ssh/id_ed25519 -q
                fi

                # Output the public key
                cat ~/.ssh/id_ed25519.pub
            ' 2>/dev/null)

            if [[ -n "$remote_pubkey" ]]; then
                # Add remote public key to local authorized_keys if not already present.
                # FIX: -F matches the key as a fixed string — base64 key material
                # contains regex metacharacters (+, /) that plain grep misreads.
                if ! grep -qF "$remote_pubkey" "$HOME/.ssh/authorized_keys" 2>/dev/null; then
                    echo "$remote_pubkey" >> "$HOME/.ssh/authorized_keys"
                    echo " ✓ Added $node_ip's public key to local authorized_keys"
                else
                    echo " ✓ $node_ip's public key already in local authorized_keys"
                fi
            else
                echo " ✗ Failed to get public key from $node_ip"
            fi
        else
            echo " ✗ Failed to copy local SSH key to $node_ip as $USER"
        fi
    fi
done 3< "$OUTPUT_FILE"

# Distribute the hostfile to every remote node (fd 3 again so scp cannot eat
# the node list from stdin).
# NOTE(review): assumes the remote user's home path matches the local one —
# confirm this holds when account names differ across nodes.
while read -r -u 3 node_ip; do
    if [[ -n "$node_ip" ]]; then
        echo " Adding hostfile to $node_ip ..."
        scp "$OUTPUT_FILE" "$USER@$node_ip:$OUTPUT_FILE"
    fi
done 3< "$OUTPUT_FILE"

echo ""
echo "Bidirectional SSH setup complete!"
echo "Both local and remote nodes can now SSH to each other without passwords."
|
||||
@ -1,24 +1,3 @@
|
||||
|
||||
#!/bin/bash
|
||||
#
|
||||
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
|
||||
|
||||
version: '3.8'
|
||||
|
||||
services:
|
||||
@ -1,4 +1,3 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
@ -14,6 +13,8 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -e
|
||||
|
||||
Loading…
Reference in New Issue
Block a user