chore: Regenerate all playbooks

This commit is contained in:
GitLab CI 2025-10-11 23:26:38 +00:00
parent 1597e519e3
commit 0a7238d651
4 changed files with 63 additions and 260 deletions

View File

@ -46,7 +46,7 @@ a functional distributed computing environment.
All required files for this playbook can be found [here on GitLab](https://gitlab.com/nvidia/dgx-spark/temp-external-playbook-assets/dgx-spark-playbook-assets/-/blob/main/${MODEL}/)
- **discover-sparks.sh**](https://gitlab.com/nvidia/dgx-spark/temp-external-playbook-assets/dgx-spark-playbook-assets/-/blob/main/${MODEL}/assets/discover-sparks) script for automatic node discovery and SSH key distribution
- [**discover-sparks.sh**](https://gitlab.com/nvidia/dgx-spark/temp-external-playbook-assets/dgx-spark-playbook-assets/-/blob/main/${MODEL}/assets/discover-sparks) script for automatic node discovery and SSH key distribution
## Time & risk
@ -190,7 +190,7 @@ You may be prompted for your password for each node.
SSH setup complete! Both local and remote nodes can now SSH to each other without passwords.
```
Note: If you encoutner any errors, please follow Option 2 below to manually configure SSH and debug the issue.
Note: If you encounter any errors, please follow Option 2 below to manually configure SSH and debug the issue.
### Option 2: Manually discover and configure SSH
@ -217,8 +217,8 @@ In this example, the IP address for Node 1 is **169.254.35.62**. Repeat the proc
On both nodes, run the following commands to enable passwordless SSH:
```bash
## Copy your SSH public key to both nodes. Please replace the IP addresses with the ones you found in the previous step.
ssh-copy-id -i ~/.ssh/id_rsa.pub nvidia@<IP for Node 1>
ssh-copy-id -i ~/.ssh/id_rsa.pub nvidia@<IP for Node 2>
ssh-copy-id -i ~/.ssh/id_rsa.pub <username>@<IP for Node 1>
ssh-copy-id -i ~/.ssh/id_rsa.pub <username>@<IP for Node 2>
```
## Step 4. Verify Multi-Node Communication

View File

@ -16,7 +16,7 @@
#
#!/bin/env bash
# discover-sparks.sh
# discover-sparks
# Discover available systems using avahi-browse and generate MPI hosts file
# Searches all active interfaces automatically
#
@ -24,9 +24,10 @@
set -euo pipefail
# Check if running as root
if [[ $EUID -eq 0 ]]; then
echo "Error: This script should not be run as root"
# Check if running as root or with sudo
if [[ $EUID -eq 0 ]] || [[ -n "${SUDO_USER:-}" ]]; then
echo "Error: This script should not be run as root or with sudo"
echo "Please run as a regular user"
exit 1
fi
@ -40,7 +41,10 @@ if [ ${#INTERFACES[@]} -eq 0 ]; then
echo "ERROR: No active interfaces found via ibdev2netdev."
exit 1
fi
OUTPUT_FILE="~/.stacked-sparks-hostfile"
# Create temporary file for processing
TEMP_FILE=$(mktemp)
trap 'rm -f "$TEMP_FILE"' EXIT
# Check if avahi-browse is available
if ! command -v avahi-browse &> /dev/null; then
@ -48,16 +52,6 @@ if ! command -v avahi-browse &> /dev/null; then
exit 1
fi
# Check if ssh-copy-id is available
if ! command -v ssh-copy-id &> /dev/null; then
echo "Error: ssh-copy-id not found. Please install openssh-client package."
exit 1
fi
# Create temporary file for processing
TEMP_FILE=$(mktemp)
trap 'rm -f "$TEMP_FILE"' EXIT
# Run avahi-browse and filter for SSH services on specified interfaces
# -p: parseable output
# -r: resolve host names and addresses
@ -74,25 +68,18 @@ done
if [ "$found_services" = false ]; then
echo "Warning: No services found on any specified interface"
touch "$OUTPUT_FILE"
echo "Created empty hosts file: $OUTPUT_FILE"
exit 0
fi
# Extract IPv4 addresses from the avahi-browse output
# Format: =;interface;IPv4;hostname\032service;description;local;fqdn;ip_address;port;
# Clear the output file
> "$OUTPUT_FILE"
# Parse IPv4 entries and extract IP addresses
grep "^=" "$TEMP_FILE" | grep "IPv4" | while IFS=';' read -r prefix interface protocol hostname_service description local fqdn ip_address port rest; do
# Clean up any trailing data
clean_ip=$(echo "$ip_address" | sed 's/;.*$//')
# Validate IP address format
if [[ $clean_ip =~ ^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]]; then
echo "$clean_ip" >> "$OUTPUT_FILE"
echo "$clean_ip" >> "$TEMP_FILE.sorted"
echo "Found: $clean_ip ($fqdn)"
else
echo "Warning: Invalid IP format: $clean_ip"
@ -100,81 +87,77 @@ grep "^=" "$TEMP_FILE" | grep "IPv4" | while IFS=';' read -r prefix interface pr
done
# Sort and remove duplicates
if [[ -s "$OUTPUT_FILE" ]]; then
sort -u "$OUTPUT_FILE" -o "$OUTPUT_FILE"
if [[ -s "$TEMP_FILE.sorted" ]]; then
sort -u "$TEMP_FILE.sorted" -o "$TEMP_FILE.sorted"
else
echo "No IPv4 addresses found."
exit 1
fi
# Check if SSH key exists, if not, prompt to generate
if [[ ! -f "$HOME/.ssh/id_rsa.pub" && ! -f "$HOME/.ssh/id_ed25519.pub" ]]; then
ssh-keygen -t ed25519 -N "" -f "$HOME/.ssh/id_ed25519" -q
# Generate a shared SSH key if it doesn't exist
SHARED_KEY="$HOME/.ssh/id_ed25519_shared"
if [[ ! -f "$SHARED_KEY" ]]; then
echo "Generating shared SSH key for all nodes..."
ssh-keygen -t ed25519 -N "" -f "$SHARED_KEY" -q -C "shared-cluster-key"
fi
echo ""
echo "Setting up bidirectional SSH access (local <-> remote nodes)..."
echo "Setting up shared SSH access across all nodes..."
echo "You may be prompted for your password on each node."
# Ensure authorized_keys file exists
# Ensure local .ssh directory exists with correct permissions
mkdir -p "$HOME/.ssh"
touch "$HOME/.ssh/authorized_keys"
chmod 700 "$HOME/.ssh"
chmod 600 "$HOME/.ssh/authorized_keys"
# Add shared public key to local authorized_keys
if ! grep -qF "$(cat "$SHARED_KEY.pub")" "$HOME/.ssh/authorized_keys" 2>/dev/null; then
cat "$SHARED_KEY.pub" >> "$HOME/.ssh/authorized_keys"
chmod 600 "$HOME/.ssh/authorized_keys"
echo " ✓ Added shared public key to local authorized_keys"
fi
# Distribute shared key to all remote nodes
while read -r node_ip; do
if [[ -n "$node_ip" ]]; then
echo ""
echo "Setting up SSH access for $node_ip ..."
echo "Configuring $node_ip..."
# Step 1: Copy local SSH key to remote node
echo " Copying local SSH key to $node_ip ..."
if ssh-copy-id -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=accept-new "$USER@$node_ip" &>/dev/null; then
echo " ✓ Successfully copied local key to $node_ip"
# Step 2: Set up reverse SSH access (remote -> local)
echo " Setting up reverse SSH access from $node_ip ..."
# Generate SSH key on remote node if it doesn't exist and get its public key
remote_pubkey=$(ssh -o StrictHostKeyChecking=accept-new "$USER@$node_ip" '
# Ensure SSH directory exists
mkdir -p ~/.ssh
# Copy shared key to remote node and set up authorized_keys
if scp -o StrictHostKeyChecking=accept-new "$SHARED_KEY" "$SHARED_KEY.pub" "$USER@$node_ip:~/.ssh/" &>/dev/null; then
ssh -n -o StrictHostKeyChecking=accept-new "$USER@$node_ip" "
chmod 700 ~/.ssh
chmod 600 ~/.ssh/id_ed25519_shared
chmod 644 ~/.ssh/id_ed25519_shared.pub
# Generate key if it doesn'"'"'t exist
if [[ ! -f ~/.ssh/id_ed25519.pub ]]; then
ssh-keygen -t ed25519 -N "" -f ~/.ssh/id_ed25519 -q
# Add shared public key to authorized_keys if not present
if ! grep -qF \"\$(cat ~/.ssh/id_ed25519_shared.pub)\" ~/.ssh/authorized_keys 2>/dev/null; then
cat ~/.ssh/id_ed25519_shared.pub >> ~/.ssh/authorized_keys
chmod 600 ~/.ssh/authorized_keys
fi
# Output the public key
cat ~/.ssh/id_ed25519.pub
' 2>/dev/null)
if [[ -n "$remote_pubkey" ]]; then
# Add remote public key to local authorized_keys if not already present
if ! grep -q "$remote_pubkey" "$HOME/.ssh/authorized_keys" 2>/dev/null; then
echo "$remote_pubkey" >> "$HOME/.ssh/authorized_keys"
echo " ✓ Added $node_ip's public key to local authorized_keys"
else
echo " ✓ $node_ip's public key already in local authorized_keys"
# Create/update SSH config to use shared key by default
if ! grep -q 'IdentityFile.*id_ed25519_shared' ~/.ssh/config 2>/dev/null; then
echo 'Host *' >> ~/.ssh/config
echo ' IdentityFile ~/.ssh/id_ed25519_shared' >> ~/.ssh/config
chmod 600 ~/.ssh/config
fi
else
echo " ✗ Failed to get public key from $node_ip"
fi
" &>/dev/null
echo " ✓ Successfully configured $node_ip with shared key"
else
echo " ✗ Failed to copy local SSH key to $node_ip as $USER"
echo " ✗ Failed to configure $node_ip"
fi
fi
done < "$OUTPUT_FILE"
done < "$TEMP_FILE.sorted"
# Add hostfile to remote nodes
while read -r node_ip; do
if [[ -n "$node_ip" ]]; then
echo " Adding hostfile to $node_ip ..."
scp "$OUTPUT_FILE" "$USER@$node_ip:$OUTPUT_FILE"
fi
done < "$OUTPUT_FILE"
# Update local SSH config to use shared key
if ! grep -q 'IdentityFile.*id_ed25519_shared' "$HOME/.ssh/config" 2>/dev/null; then
touch "$HOME/.ssh/config"
echo 'Host *' >> "$HOME/.ssh/config"
echo ' IdentityFile ~/.ssh/id_ed25519_shared' >> "$HOME/.ssh/config"
chmod 600 "$HOME/.ssh/config"
echo " ✓ Updated local SSH config to use shared key"
fi
echo ""
echo "Bidirectional SSH setup complete!"
echo "Both local and remote nodes can now SSH to each other without passwords."
echo "Shared SSH setup complete!"
echo "All nodes can now SSH to each other using the shared key (id_ed25519_shared)."

View File

@ -1,180 +0,0 @@
#
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#!/bin/env bash
# discover-sparks.sh
# Discover available systems using avahi-browse and generate MPI hosts file
# Searches all active interfaces automatically
#
# Usage: bash ./discover-sparks
set -euo pipefail

# Overview:
#   1. Find the network interfaces that ibdev2netdev reports as "Up".
#   2. Ask avahi-browse for _ssh._tcp services and keep the IPv4 addresses
#      announced on those interfaces.
#   3. Write the de-duplicated addresses to the hosts file.
#   4. Exchange SSH public keys with every discovered node (both directions).
#   5. Copy the hosts file to every node.
# Exit status: 0 on success (also when no services were announced),
#              1 on a missing prerequisite or when no IPv4 address was found.

# Check if running as root
if [[ $EUID -eq 0 ]]; then
  echo "Error: This script should not be run as root" >&2
  exit 1
fi

# Dynamically get interface names from ibdev2netdev output.
# awk keeps the lines containing 'Up)' (interfaces that are up), strips any
# parentheses from the 5th field (the interface name, e.g. enp1s0f0np0) and
# prints it when non-empty. mapfile stores one interface per array element
# without relying on unquoted word splitting.
mapfile -t INTERFACES < <(
  ibdev2netdev \
    | awk '/Up\)/ { gsub(/[()]/, "", $5); if ($5 != "") print $5 }'
)
if [[ ${#INTERFACES[@]} -eq 0 ]]; then
  echo "ERROR: No active interfaces found via ibdev2netdev." >&2
  exit 1
fi

# A tilde inside double quotes is NOT expanded by the shell, so the path is
# built from $HOME instead.
OUTPUT_FILE="$HOME/.stacked-sparks-hostfile"
# scp resolves a relative remote path against the remote home directory, so
# the remote copy lands in the remote user's home even if it differs from ours.
REMOTE_HOSTFILE="${OUTPUT_FILE##*/}"
# Account used on the remote nodes: the current user ($USER may be unset in
# minimal environments, which would trip 'set -u').
REMOTE_USER="${USER:-$(id -un)}"

# Check if avahi-browse is available
if ! command -v avahi-browse &>/dev/null; then
  echo "Error: avahi-browse not found. Please install avahi-utils package." >&2
  exit 1
fi

# Check if ssh-copy-id is available
if ! command -v ssh-copy-id &>/dev/null; then
  echo "Error: ssh-copy-id not found. Please install openssh-client package." >&2
  exit 1
fi

# Create temporary file for processing; removed on every exit path.
TEMP_FILE=$(mktemp)
trap 'rm -f "$TEMP_FILE"' EXIT

# Run avahi-browse for SSH services
# -p: parseable output
# -r: resolve host names and addresses
# -f: do not fail if the avahi daemon is not available
# -t: terminate after dumping the entries available at startup
avahi_output=$(avahi-browse -p -r -f -t _ssh._tcp 2>/dev/null)

# Keep only the records announced on one of the active interfaces. The
# interface is the second ';'-separated field, so match it with its
# delimiters as a fixed string instead of a substring anywhere in the line.
found_services=false
for interface in "${INTERFACES[@]}"; do
  if grep -F -- ";${interface};" <<<"$avahi_output" >>"$TEMP_FILE"; then
    found_services=true
  fi
done

if [[ "$found_services" == false ]]; then
  echo "Warning: No services found on any specified interface"
  touch "$OUTPUT_FILE"
  echo "Created empty hosts file: $OUTPUT_FILE"
  exit 0
fi

# Extract IPv4 addresses from the avahi-browse output
# Format: =;interface;IPv4;hostname\032service;description;local;fqdn;ip_address;port;
# Clear the output file
: >"$OUTPUT_FILE"

# Parse resolved ('=') IPv4 records. The filtering happens inside the loop
# rather than in a grep pipeline: under 'set -o pipefail' a grep without
# matches would abort the script before the "No IPv4 addresses found."
# diagnostic below could be printed.
while IFS=';' read -r prefix _ protocol _ _ _ fqdn ip_address _; do
  [[ "$prefix" == "=" && "$protocol" == "IPv4" ]] || continue

  # Validate IP address format (dotted quad; octet range is not checked)
  if [[ $ip_address =~ ^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]]; then
    echo "$ip_address" >>"$OUTPUT_FILE"
    echo "Found: $ip_address ($fqdn)"
  else
    echo "Warning: Invalid IP format: $ip_address"
  fi
done <"$TEMP_FILE"

# Sort and remove duplicates
if [[ -s "$OUTPUT_FILE" ]]; then
  sort -u -o "$OUTPUT_FILE" "$OUTPUT_FILE"
else
  echo "No IPv4 addresses found." >&2
  exit 1
fi

# Load the node list into an array. Iterating an array (instead of
# 'while read ... done < file') keeps ssh/scp from consuming the remaining
# addresses through the loop's stdin.
mapfile -t NODE_IPS <"$OUTPUT_FILE"

# Ensure ~/.ssh and authorized_keys exist with the permissions sshd expects.
# This runs before ssh-keygen so the key's target directory is present.
mkdir -p "$HOME/.ssh"
touch "$HOME/.ssh/authorized_keys"
chmod 700 "$HOME/.ssh"
chmod 600 "$HOME/.ssh/authorized_keys"

# Pick the public key to distribute: reuse an existing ed25519 or RSA key,
# otherwise generate a new ed25519 key without passphrase.
if [[ -f "$HOME/.ssh/id_ed25519.pub" ]]; then
  LOCAL_PUBKEY="$HOME/.ssh/id_ed25519.pub"
elif [[ -f "$HOME/.ssh/id_rsa.pub" ]]; then
  LOCAL_PUBKEY="$HOME/.ssh/id_rsa.pub"
else
  ssh-keygen -t ed25519 -N "" -f "$HOME/.ssh/id_ed25519" -q
  LOCAL_PUBKEY="$HOME/.ssh/id_ed25519.pub"
fi

echo ""
echo "Setting up bidirectional SSH access (local <-> remote nodes)..."
echo "You may be prompted for your password on each node."

for node_ip in "${NODE_IPS[@]}"; do
  [[ -n "$node_ip" ]] || continue

  echo ""
  echo "Setting up SSH access for $node_ip ..."

  # Step 1: Copy local SSH key to remote node
  echo " Copying local SSH key to $node_ip ..."
  if ! ssh-copy-id -i "$LOCAL_PUBKEY" -o StrictHostKeyChecking=accept-new \
    "$REMOTE_USER@$node_ip" &>/dev/null; then
    echo " ✗ Failed to copy local SSH key to $node_ip as $REMOTE_USER"
    continue
  fi
  echo " ✓ Successfully copied local key to $node_ip"

  # Step 2: Set up reverse SSH access (remote -> local)
  echo " Setting up reverse SSH access from $node_ip ..."

  # Generate an SSH key on the remote node if it has none and fetch its
  # public key. The remote snippet sticks to POSIX sh syntax because it is
  # executed by the remote user's login shell. '|| remote_pubkey=""' keeps a
  # failed ssh from aborting the whole run under 'set -e'.
  remote_pubkey=$(ssh -n -o StrictHostKeyChecking=accept-new \
    "$REMOTE_USER@$node_ip" '
      mkdir -p ~/.ssh
      chmod 700 ~/.ssh
      if [ ! -f ~/.ssh/id_ed25519.pub ]; then
        ssh-keygen -t ed25519 -N "" -f ~/.ssh/id_ed25519 -q
      fi
      cat ~/.ssh/id_ed25519.pub
    ' 2>/dev/null) || remote_pubkey=""

  if [[ -z "$remote_pubkey" ]]; then
    echo " ✗ Failed to get public key from $node_ip"
    continue
  fi

  # Add remote public key to local authorized_keys if not already present.
  # -F: the key is compared as a literal string, not as a regular expression.
  if grep -qF -- "$remote_pubkey" "$HOME/.ssh/authorized_keys" 2>/dev/null; then
    echo " ✓ $node_ip's public key already in local authorized_keys"
  else
    echo "$remote_pubkey" >>"$HOME/.ssh/authorized_keys"
    echo " ✓ Added $node_ip's public key to local authorized_keys"
  fi
done

# Add hostfile to remote nodes (best effort: a node that cannot be reached
# is reported and skipped instead of aborting the remaining copies).
for node_ip in "${NODE_IPS[@]}"; do
  [[ -n "$node_ip" ]] || continue

  echo " Adding hostfile to $node_ip ..."
  if ! scp "$OUTPUT_FILE" "$REMOTE_USER@$node_ip:$REMOTE_HOSTFILE"; then
    echo " ✗ Failed to copy hostfile to $node_ip" >&2
  fi
done

echo ""
echo "Bidirectional SSH setup complete!"
echo "Both local and remote nodes can now SSH to each other without passwords."

View File

@ -22,7 +22,7 @@ While NVIDIA NIMs are not yet widely supported, this guide uses **Ollama** with
### What You'll Accomplish
Youll have a fully configured DGX Spark system capable of:
You'll have a fully configured DGX Spark system capable of:
- Running local code assistance through Ollama.
- Serving models remotely for Continue.dev and VSCode integration.
- Hosting large LLMs like GPT-OSS 120B using unified memory.
@ -144,7 +144,7 @@ Add additional model entries for any other Ollama models you wish to host remote
- Verify Docker and GPU drivers are installed correctly.
- Run `ollama serve` manually to view errors.
**2. VSCode cant connect**
**2. VSCode can't connect**
- Ensure port 11434 is open and accessible from your workstation.
- Check `OLLAMA_HOST` and `OLLAMA_ORIGINS` in `/etc/systemd/system/ollama.service.d/override.conf`.