From be2db4604f022e64157c74b5c2b2b8fa9292f0d1 Mon Sep 17 00:00:00 2001 From: Ev Lacey Date: Tue, 2 Dec 2025 16:25:52 -0800 Subject: [PATCH 1/8] Update vLLM container version to 25.11-py3 --- nvidia/vllm/README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/nvidia/vllm/README.md b/nvidia/vllm/README.md index 25e733c..efc1bed 100644 --- a/nvidia/vllm/README.md +++ b/nvidia/vllm/README.md @@ -57,9 +57,9 @@ support for ARM64. ## Step 1. Pull vLLM container image -Find the latest container build from https://catalog.ngc.nvidia.com/orgs/nvidia/containers/vllm?version=25.09-py3 +Find the latest container build from https://catalog.ngc.nvidia.com/orgs/nvidia/containers/vllm?version=25.11-py3 ``` -docker pull nvcr.io/nvidia/vllm:25.09-py3 +docker pull nvcr.io/nvidia/vllm:25.11-py3 ``` ## Step 2. Test vLLM in container @@ -68,7 +68,7 @@ Launch the container and start vLLM server with a test model to verify basic fun ```bash docker run -it --gpus all -p 8000:8000 \ -nvcr.io/nvidia/vllm:25.09-py3 \ +nvcr.io/nvidia/vllm:25.11-py3 \ vllm serve "Qwen/Qwen2.5-Math-1.5B-Instruct" ``` @@ -96,7 +96,7 @@ Expected response should contain `"content": "204"` or similar mathematical calc For container approach (non-destructive): ```bash -docker rm $(docker ps -aq --filter ancestor=nvcr.io/nvidia/vllm:25.09-py3) +docker rm $(docker ps -aq --filter ancestor=nvcr.io/nvidia/vllm:25.11-py3) docker rmi nvcr.io/nvidia/vllm ``` @@ -150,8 +150,8 @@ After this, you should be able to run docker commands without using `sudo`. 
```bash -docker pull nvcr.io/nvidia/vllm:25.09-py3 -export VLLM_IMAGE=nvcr.io/nvidia/vllm:25.09-py3 +docker pull nvcr.io/nvidia/vllm:25.11-py3 +export VLLM_IMAGE=nvcr.io/nvidia/vllm:25.11-py3 ``` From 3b53b7aeca665cca0cd10c3dca01c5fc9df0236b Mon Sep 17 00:00:00 2001 From: Ev Lacey Date: Tue, 2 Dec 2025 16:33:36 -0800 Subject: [PATCH 2/8] Refine Ray cluster setup instructions in README Updated instructions for starting Ray cluster nodes with dynamic IP retrieval in the run cluster script --- nvidia/vllm/README.md | 66 ++++++++++++++++++++++++++++--------------- 1 file changed, 43 insertions(+), 23 deletions(-) diff --git a/nvidia/vllm/README.md b/nvidia/vllm/README.md index efc1bed..390d2d6 100644 --- a/nvidia/vllm/README.md +++ b/nvidia/vllm/README.md @@ -160,38 +160,58 @@ export VLLM_IMAGE=nvcr.io/nvidia/vllm:25.11-py3 Launch the Ray cluster head node on Node 1. This node coordinates the distributed inference and serves the API endpoint. ```bash -## On Node 1, start head node -export MN_IF_NAME=enP2p1s0f1np1 -bash run_cluster.sh $VLLM_IMAGE 192.168.100.10 --head ~/.cache/huggingface \ --e VLLM_HOST_IP=192.168.100.10 \ --e UCX_NET_DEVICES=$MN_IF_NAME \ --e NCCL_SOCKET_IFNAME=$MN_IF_NAME \ --e OMPI_MCA_btl_tcp_if_include=$MN_IF_NAME \ --e GLOO_SOCKET_IFNAME=$MN_IF_NAME \ --e TP_SOCKET_IFNAME=$MN_IF_NAME \ --e RAY_memory_monitor_refresh_ms=0 \ --e MASTER_ADDR=192.168.100.10 +# On Node 1, start head node + +# Get the IP address of the high-speed interface +# Use the interface that shows "(Up)" from ibdev2netdev (enp1s0f0np0 or enp1s0f1np1) +export MN_IF_NAME=enp1s0f1np1 +export VLLM_HOST_IP=$(ip -4 addr show $MN_IF_NAME | grep -oP '(?<=inet\s)\d+(\.\d+){3}') + +echo "Using interface $MN_IF_NAME with IP $VLLM_HOST_IP" + +bash run_cluster.sh $VLLM_IMAGE $VLLM_HOST_IP --head ~/.cache/huggingface \ + -e VLLM_HOST_IP=$VLLM_HOST_IP \ + -e UCX_NET_DEVICES=$MN_IF_NAME \ + -e NCCL_SOCKET_IFNAME=$MN_IF_NAME \ + -e OMPI_MCA_btl_tcp_if_include=$MN_IF_NAME \ + -e 
GLOO_SOCKET_IFNAME=$MN_IF_NAME \ + -e TP_SOCKET_IFNAME=$MN_IF_NAME \ + -e RAY_memory_monitor_refresh_ms=0 \ + -e MASTER_ADDR=$VLLM_HOST_IP ``` ## Step 5. Start Ray worker node Connect Node 2 to the Ray cluster as a worker node. This provides additional GPU resources for tensor parallelism. - ```bash -## On Node 2, join as worker -export MN_IF_NAME=enP2p1s0f1np1 -bash run_cluster.sh $VLLM_IMAGE 192.168.100.10 --worker ~/.cache/huggingface \ --e VLLM_HOST_IP=192.168.100.11 \ --e UCX_NET_DEVICES=$MN_IF_NAME \ --e NCCL_SOCKET_IFNAME=$MN_IF_NAME \ --e OMPI_MCA_btl_tcp_if_include=$MN_IF_NAME \ --e GLOO_SOCKET_IFNAME=$MN_IF_NAME \ --e TP_SOCKET_IFNAME=$MN_IF_NAME \ --e RAY_memory_monitor_refresh_ms=0 \ --e MASTER_ADDR=192.168.100.10 +# On Node 2, join as worker + +# Set the interface name (same as Node 1) +export MN_IF_NAME=enp1s0f1np1 + +# Get Node 2's own IP address +export VLLM_HOST_IP=$(ip -4 addr show $MN_IF_NAME | grep -oP '(?<=inet\s)\d+(\.\d+){3}') + +# IMPORTANT: Set HEAD_NODE_IP to Node 1's IP address +# You must get this value from Node 1 (run: echo $VLLM_HOST_IP on Node 1) +export HEAD_NODE_IP=<NODE_1_IP_ADDRESS> + +echo "Worker IP: $VLLM_HOST_IP, connecting to head node at: $HEAD_NODE_IP" + +bash run_cluster.sh $VLLM_IMAGE $HEAD_NODE_IP --worker ~/.cache/huggingface \ + -e VLLM_HOST_IP=$VLLM_HOST_IP \ + -e UCX_NET_DEVICES=$MN_IF_NAME \ + -e NCCL_SOCKET_IFNAME=$MN_IF_NAME \ + -e OMPI_MCA_btl_tcp_if_include=$MN_IF_NAME \ + -e GLOO_SOCKET_IFNAME=$MN_IF_NAME \ + -e TP_SOCKET_IFNAME=$MN_IF_NAME \ + -e RAY_memory_monitor_refresh_ms=0 \ + -e MASTER_ADDR=$HEAD_NODE_IP ``` +> **Note:** Replace `<NODE_1_IP_ADDRESS>` with the actual IP address from Node 1. If using automatic link-local addressing, this will be something like `169.254.x.x`. If using manual static IPs, it will be `192.168.100.10`. + ## Step 6. Verify cluster status Confirm both nodes are recognized and available in the Ray cluster. 
From c2414e48e6ffc82fe6319c6c7c5453f72399770b Mon Sep 17 00:00:00 2001 From: Ev Lacey Date: Tue, 2 Dec 2025 16:34:35 -0800 Subject: [PATCH 3/8] Clarify cluster status verification steps Updated instructions for verifying cluster status and finding the vLLM container name. --- nvidia/vllm/README.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/nvidia/vllm/README.md b/nvidia/vllm/README.md index 390d2d6..658519f 100644 --- a/nvidia/vllm/README.md +++ b/nvidia/vllm/README.md @@ -213,12 +213,14 @@ bash run_cluster.sh $VLLM_IMAGE $HEAD_NODE_IP --worker ~/.cache/huggingface \ > **Note:** Replace `<NODE_1_IP_ADDRESS>` with the actual IP address from Node 1. If using automatic link-local addressing, this will be something like `169.254.x.x`. If using manual static IPs, it will be `192.168.100.10`. ## Step 6. Verify cluster status - Confirm both nodes are recognized and available in the Ray cluster. - ```bash -## On Node 1 (head node) -docker exec node ray status +# On Node 1 (head node) +# Find the vLLM container name (it will be node-<number>) +export VLLM_CONTAINER=$(docker ps --format '{{.Names}}' | grep -E '^node-[0-9]+$') +echo "Found container: $VLLM_CONTAINER" + +docker exec $VLLM_CONTAINER ray status ``` Expected output shows 2 nodes with available GPU resources. From 4982987b097edf1054b713f0e0fedbe81f10ecc1 Mon Sep 17 00:00:00 2001 From: Ev Lacey Date: Tue, 2 Dec 2025 16:37:27 -0800 Subject: [PATCH 4/8] Change Ray dashboard URL to use placeholder Updated the Ray dashboard URL placeholder in README. 
--- nvidia/vllm/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nvidia/vllm/README.md b/nvidia/vllm/README.md index 658519f..70ee63b 100644 --- a/nvidia/vllm/README.md +++ b/nvidia/vllm/README.md @@ -343,7 +343,7 @@ Access the Ray dashboard for cluster monitoring and explore additional features: ```bash ## Ray dashboard available at: -http://192.168.100.10:8265 +http://<NODE_1_IP_ADDRESS>:8265 ## Consider implementing for production: ## - Health checks and automatic restarts From eae85d0e5ee41d49a6bbc1bb0c0e9aa0cdd33100 Mon Sep 17 00:00:00 2001 From: Ev Lacey Date: Tue, 2 Dec 2025 16:38:12 -0800 Subject: [PATCH 5/8] Revise cleanup section and update next steps Updated the README to remove cleanup instructions and clarify next steps. Do not remove IPs that we set up in separate guide within this playbook --- nvidia/vllm/README.md | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/nvidia/vllm/README.md b/nvidia/vllm/README.md index 70ee63b..d52a709 100644 --- a/nvidia/vllm/README.md +++ b/nvidia/vllm/README.md @@ -319,25 +319,7 @@ nvidia-smi docker exec node nvidia-smi --query-gpu=memory.used,memory.total --format=csv ``` -## Step 14. Cleanup and rollback - -Remove temporary configurations and containers when testing is complete. - -> [!WARNING] -> This will stop all inference services and remove cluster configuration. - -```bash -## Stop containers on both nodes -docker stop node -docker rm node - -## Remove network configuration on both nodes -sudo ip addr del 192.168.100.10/24 dev enP2p1s0f1np1 # Node 1 -sudo ip addr del 192.168.100.11/24 dev enP2p1s0f1np1 # Node 2 -sudo ip link set enP2p1s0f1np1 down -``` - -## Step 15. Next steps +## Step 14. 
Next steps Access the Ray dashboard for cluster monitoring and explore additional features: From 39d8426e6f86bcd58e1ba70efc61bfa10675e4bf Mon Sep 17 00:00:00 2001 From: Ev Lacey Date: Tue, 2 Dec 2025 16:40:50 -0800 Subject: [PATCH 6/8] Update README with specific IP address instructions Clarified instructions for replacing NODE_1_IP_ADDRESS with specific interface details. --- nvidia/vllm/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nvidia/vllm/README.md b/nvidia/vllm/README.md index d52a709..60d6cc4 100644 --- a/nvidia/vllm/README.md +++ b/nvidia/vllm/README.md @@ -210,7 +210,7 @@ bash run_cluster.sh $VLLM_IMAGE $HEAD_NODE_IP --worker ~/.cache/huggingface \ -e MASTER_ADDR=$HEAD_NODE_IP ``` -> **Note:** Replace `<NODE_1_IP_ADDRESS>` with the actual IP address from Node 1. If using automatic link-local addressing, this will be something like `169.254.x.x`. If using manual static IPs, it will be `192.168.100.10`. +> **Note:** Replace `<NODE_1_IP_ADDRESS>` with the actual IP address from Node 1, specifically the QSFP interface enp1s0f1np1 configured in the [Connect two Sparks](https://build.nvidia.com/spark/connect-two-sparks) playbook. ## Step 6. Verify cluster status Confirm both nodes are recognized and available in the Ray cluster. From 35afed3f7cdcc8bce522bdb2a0a2243f82cc2c8c Mon Sep 17 00:00:00 2001 From: Ev Lacey Date: Tue, 2 Dec 2025 16:43:18 -0800 Subject: [PATCH 7/8] Update README with dynamic container name retrieval --- nvidia/vllm/README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/nvidia/vllm/README.md b/nvidia/vllm/README.md index 60d6cc4..7828553 100644 --- a/nvidia/vllm/README.md +++ b/nvidia/vllm/README.md @@ -241,7 +241,8 @@ Start the vLLM inference server with tensor parallelism across both nodes. 
```bash ## On Node 1, enter container and start server -docker exec -it node /bin/bash +export VLLM_CONTAINER=$(docker ps --format '{{.Names}}' | grep -E '^node-[0-9]+$') +docker exec -it $VLLM_CONTAINER /bin/bash vllm serve meta-llama/Llama-3.3-70B-Instruct \ --tensor-parallel-size 2 --max_model_len 2048 ``` @@ -282,7 +283,8 @@ Start the server with memory-constrained parameters for the large model. ```bash ## On Node 1, launch with restricted parameters -docker exec -it node /bin/bash +export VLLM_CONTAINER=$(docker ps --format '{{.Names}}' | grep -E '^node-[0-9]+$') +docker exec -it $VLLM_CONTAINER /bin/bash vllm serve hugging-quants/Meta-Llama-3.1-405B-Instruct-AWQ-INT4 \ --tensor-parallel-size 2 --max-model-len 256 --gpu-memory-utilization 1.0 \ --max-num-seqs 1 --max_num_batched_tokens 256 From 621fd1c09b55af3564b8161c5dcee30db003c53e Mon Sep 17 00:00:00 2001 From: Ev Lacey Date: Tue, 2 Dec 2025 16:44:25 -0800 Subject: [PATCH 8/8] Update README with interface name instructions Added important note about interface name and identification. --- nvidia/vllm/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nvidia/vllm/README.md b/nvidia/vllm/README.md index 7828553..19a9f14 100644 --- a/nvidia/vllm/README.md +++ b/nvidia/vllm/README.md @@ -55,6 +55,8 @@ support for ARM64. ## Instructions +> **Important:** This guide uses `enp1s0f1np1` as the interface name. Your actual interface may differ depending on which QSFP port you connected. Run `ibdev2netdev` to identify which interface shows "(Up)" + ## Step 1. Pull vLLM container image Find the latest container build from https://catalog.ngc.nvidia.com/orgs/nvidia/containers/vllm?version=25.11-py3