mirror of
https://github.com/NVIDIA/dgx-spark-playbooks.git
synced 2026-04-25 19:33:53 +00:00
chore: Regenerate all playbooks
This commit is contained in:
parent
1597e519e3
commit
0a7238d651
@ -46,7 +46,7 @@ a functional distributed computing environment.
|
|||||||
|
|
||||||
All required files for this playbook can be found [here on GitLab](https://gitlab.com/nvidia/dgx-spark/temp-external-playbook-assets/dgx-spark-playbook-assets/-/blob/main/${MODEL}/)
|
All required files for this playbook can be found [here on GitLab](https://gitlab.com/nvidia/dgx-spark/temp-external-playbook-assets/dgx-spark-playbook-assets/-/blob/main/${MODEL}/)
|
||||||
|
|
||||||
- **discover-sparks.sh**](https://gitlab.com/nvidia/dgx-spark/temp-external-playbook-assets/dgx-spark-playbook-assets/-/blob/main/${MODEL}/assets/discover-sparks) script for automatic node discovery and SSH key distribution
|
- [**discover-sparks.sh**](https://gitlab.com/nvidia/dgx-spark/temp-external-playbook-assets/dgx-spark-playbook-assets/-/blob/main/${MODEL}/assets/discover-sparks) script for automatic node discovery and SSH key distribution
|
||||||
|
|
||||||
## Time & risk
|
## Time & risk
|
||||||
|
|
||||||
@ -190,7 +190,7 @@ You may be prompted for your password for each node.
|
|||||||
SSH setup complete! Both local and remote nodes can now SSH to each other without passwords.
|
SSH setup complete! Both local and remote nodes can now SSH to each other without passwords.
|
||||||
```
|
```
|
||||||
|
|
||||||
Note: If you encoutner any errors, please follow Option 2 below to manually configure SSH and debug the issue.
|
Note: If you encounter any errors, please follow Option 2 below to manually configure SSH and debug the issue.
|
||||||
|
|
||||||
### Option 2: Manually discover and configure SSH
|
### Option 2: Manually discover and configure SSH
|
||||||
|
|
||||||
@ -217,8 +217,8 @@ In this example, the IP address for Node 1 is **169.254.35.62**. Repeat the proc
|
|||||||
On both nodes, run the following commands to enable passwordless SSH:
|
On both nodes, run the following commands to enable passwordless SSH:
|
||||||
```bash
|
```bash
|
||||||
## Copy your SSH public key to both nodes. Please replace the IP addresses with the ones you found in the previous step.
|
## Copy your SSH public key to both nodes. Please replace the IP addresses with the ones you found in the previous step.
|
||||||
ssh-copy-id -i ~/.ssh/id_rsa.pub nvidia@<IP for Node 1>
|
ssh-copy-id -i ~/.ssh/id_rsa.pub <username>@<IP for Node 1>
|
||||||
ssh-copy-id -i ~/.ssh/id_rsa.pub nvidia@<IP for Node 2>
|
ssh-copy-id -i ~/.ssh/id_rsa.pub <username>@<IP for Node 2>
|
||||||
```
|
```
|
||||||
|
|
||||||
## Step 4. Verify Multi-Node Communication
|
## Step 4. Verify Multi-Node Communication
|
||||||
|
|||||||
@ -16,7 +16,7 @@
|
|||||||
#
|
#
|
||||||
#!/bin/env bash
|
#!/bin/env bash
|
||||||
|
|
||||||
# discover-sparks.sh
|
# discover-sparks
|
||||||
# Discover available systems using avahi-browse and generate MPI hosts file
|
# Discover available systems using avahi-browse and generate MPI hosts file
|
||||||
# Searches all active interfaces automatically
|
# Searches all active interfaces automatically
|
||||||
#
|
#
|
||||||
@ -24,9 +24,10 @@
|
|||||||
|
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
# Check if running as root
|
# Check if running as root or with sudo
|
||||||
if [[ $EUID -eq 0 ]]; then
|
if [[ $EUID -eq 0 ]] || [[ -n "${SUDO_USER:-}" ]]; then
|
||||||
echo "Error: This script should not be run as root"
|
echo "Error: This script should not be run as root or with sudo"
|
||||||
|
echo "Please run as a regular user"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@ -40,7 +41,10 @@ if [ ${#INTERFACES[@]} -eq 0 ]; then
|
|||||||
echo "ERROR: No active interfaces found via ibdev2netdev."
|
echo "ERROR: No active interfaces found via ibdev2netdev."
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
OUTPUT_FILE="~/.stacked-sparks-hostfile"
|
|
||||||
|
# Create temporary file for processing
|
||||||
|
TEMP_FILE=$(mktemp)
|
||||||
|
trap 'rm -f "$TEMP_FILE"' EXIT
|
||||||
|
|
||||||
# Check if avahi-browse is available
|
# Check if avahi-browse is available
|
||||||
if ! command -v avahi-browse &> /dev/null; then
|
if ! command -v avahi-browse &> /dev/null; then
|
||||||
@ -48,16 +52,6 @@ if ! command -v avahi-browse &> /dev/null; then
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Check if ssh-copy-id is available
|
|
||||||
if ! command -v ssh-copy-id &> /dev/null; then
|
|
||||||
echo "Error: ssh-copy-id not found. Please install openssh-client package."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Create temporary file for processing
|
|
||||||
TEMP_FILE=$(mktemp)
|
|
||||||
trap 'rm -f "$TEMP_FILE"' EXIT
|
|
||||||
|
|
||||||
# Run avahi-browse and filter for SSH services on specified interfaces
|
# Run avahi-browse and filter for SSH services on specified interfaces
|
||||||
# -p: parseable output
|
# -p: parseable output
|
||||||
# -r: resolve host names and addresses
|
# -r: resolve host names and addresses
|
||||||
@ -74,25 +68,18 @@ done
|
|||||||
|
|
||||||
if [ "$found_services" = false ]; then
|
if [ "$found_services" = false ]; then
|
||||||
echo "Warning: No services found on any specified interface"
|
echo "Warning: No services found on any specified interface"
|
||||||
touch "$OUTPUT_FILE"
|
|
||||||
echo "Created empty hosts file: $OUTPUT_FILE"
|
|
||||||
exit 0
|
exit 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Extract IPv4 addresses from the avahi-browse output
|
# Extract IPv4 addresses from the avahi-browse output
|
||||||
# Format: =;interface;IPv4;hostname\032service;description;local;fqdn;ip_address;port;
|
# Format: =;interface;IPv4;hostname\032service;description;local;fqdn;ip_address;port;
|
||||||
|
|
||||||
# Clear the output file
|
|
||||||
> "$OUTPUT_FILE"
|
|
||||||
|
|
||||||
# Parse IPv4 entries and extract IP addresses
|
|
||||||
grep "^=" "$TEMP_FILE" | grep "IPv4" | while IFS=';' read -r prefix interface protocol hostname_service description local fqdn ip_address port rest; do
|
grep "^=" "$TEMP_FILE" | grep "IPv4" | while IFS=';' read -r prefix interface protocol hostname_service description local fqdn ip_address port rest; do
|
||||||
# Clean up any trailing data
|
# Clean up any trailing data
|
||||||
clean_ip=$(echo "$ip_address" | sed 's/;.*$//')
|
clean_ip=$(echo "$ip_address" | sed 's/;.*$//')
|
||||||
|
|
||||||
# Validate IP address format
|
# Validate IP address format
|
||||||
if [[ $clean_ip =~ ^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]]; then
|
if [[ $clean_ip =~ ^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]]; then
|
||||||
echo "$clean_ip" >> "$OUTPUT_FILE"
|
echo "$clean_ip" >> "$TEMP_FILE.sorted"
|
||||||
echo "Found: $clean_ip ($fqdn)"
|
echo "Found: $clean_ip ($fqdn)"
|
||||||
else
|
else
|
||||||
echo "Warning: Invalid IP format: $clean_ip"
|
echo "Warning: Invalid IP format: $clean_ip"
|
||||||
@ -100,81 +87,77 @@ grep "^=" "$TEMP_FILE" | grep "IPv4" | while IFS=';' read -r prefix interface pr
|
|||||||
done
|
done
|
||||||
|
|
||||||
# Sort and remove duplicates
|
# Sort and remove duplicates
|
||||||
if [[ -s "$OUTPUT_FILE" ]]; then
|
if [[ -s "$TEMP_FILE.sorted" ]]; then
|
||||||
sort -u "$OUTPUT_FILE" -o "$OUTPUT_FILE"
|
sort -u "$TEMP_FILE.sorted" -o "$TEMP_FILE.sorted"
|
||||||
else
|
else
|
||||||
echo "No IPv4 addresses found."
|
echo "No IPv4 addresses found."
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Check if SSH key exists, if not, prompt to generate
|
# Generate a shared SSH key if it doesn't exist
|
||||||
if [[ ! -f "$HOME/.ssh/id_rsa.pub" && ! -f "$HOME/.ssh/id_ed25519.pub" ]]; then
|
SHARED_KEY="$HOME/.ssh/id_ed25519_shared"
|
||||||
ssh-keygen -t ed25519 -N "" -f "$HOME/.ssh/id_ed25519" -q
|
if [[ ! -f "$SHARED_KEY" ]]; then
|
||||||
|
echo "Generating shared SSH key for all nodes..."
|
||||||
|
ssh-keygen -t ed25519 -N "" -f "$SHARED_KEY" -q -C "shared-cluster-key"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo ""
|
echo ""
|
||||||
echo "Setting up bidirectional SSH access (local <-> remote nodes)..."
|
echo "Setting up shared SSH access across all nodes..."
|
||||||
echo "You may be prompted for your password on each node."
|
echo "You may be prompted for your password on each node."
|
||||||
|
|
||||||
# Ensure authorized_keys file exists
|
# Ensure local .ssh directory exists with correct permissions
|
||||||
mkdir -p "$HOME/.ssh"
|
mkdir -p "$HOME/.ssh"
|
||||||
touch "$HOME/.ssh/authorized_keys"
|
|
||||||
chmod 700 "$HOME/.ssh"
|
chmod 700 "$HOME/.ssh"
|
||||||
chmod 600 "$HOME/.ssh/authorized_keys"
|
|
||||||
|
|
||||||
|
# Add shared public key to local authorized_keys
|
||||||
|
if ! grep -qF "$(cat "$SHARED_KEY.pub")" "$HOME/.ssh/authorized_keys" 2>/dev/null; then
|
||||||
|
cat "$SHARED_KEY.pub" >> "$HOME/.ssh/authorized_keys"
|
||||||
|
chmod 600 "$HOME/.ssh/authorized_keys"
|
||||||
|
echo " ✓ Added shared public key to local authorized_keys"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Distribute shared key to all remote nodes
|
||||||
while read -r node_ip; do
|
while read -r node_ip; do
|
||||||
if [[ -n "$node_ip" ]]; then
|
if [[ -n "$node_ip" ]]; then
|
||||||
echo ""
|
echo "Configuring $node_ip..."
|
||||||
echo "Setting up SSH access for $node_ip ..."
|
|
||||||
|
|
||||||
# Step 1: Copy local SSH key to remote node
|
# Copy shared key to remote node and set up authorized_keys
|
||||||
echo " Copying local SSH key to $node_ip ..."
|
if scp -o StrictHostKeyChecking=accept-new "$SHARED_KEY" "$SHARED_KEY.pub" "$USER@$node_ip:~/.ssh/" &>/dev/null; then
|
||||||
if ssh-copy-id -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=accept-new "$USER@$node_ip" &>/dev/null; then
|
ssh -n -o StrictHostKeyChecking=accept-new "$USER@$node_ip" "
|
||||||
echo " ✓ Successfully copied local key to $node_ip"
|
|
||||||
|
|
||||||
# Step 2: Set up reverse SSH access (remote -> local)
|
|
||||||
echo " Setting up reverse SSH access from $node_ip ..."
|
|
||||||
|
|
||||||
# Generate SSH key on remote node if it doesn't exist and get its public key
|
|
||||||
remote_pubkey=$(ssh -o StrictHostKeyChecking=accept-new "$USER@$node_ip" '
|
|
||||||
# Ensure SSH directory exists
|
|
||||||
mkdir -p ~/.ssh
|
|
||||||
chmod 700 ~/.ssh
|
chmod 700 ~/.ssh
|
||||||
|
chmod 600 ~/.ssh/id_ed25519_shared
|
||||||
|
chmod 644 ~/.ssh/id_ed25519_shared.pub
|
||||||
|
|
||||||
# Generate key if it doesn'"'"'t exist
|
# Add shared public key to authorized_keys if not present
|
||||||
if [[ ! -f ~/.ssh/id_ed25519.pub ]]; then
|
if ! grep -qF \"\$(cat ~/.ssh/id_ed25519_shared.pub)\" ~/.ssh/authorized_keys 2>/dev/null; then
|
||||||
ssh-keygen -t ed25519 -N "" -f ~/.ssh/id_ed25519 -q
|
cat ~/.ssh/id_ed25519_shared.pub >> ~/.ssh/authorized_keys
|
||||||
|
chmod 600 ~/.ssh/authorized_keys
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Output the public key
|
# Create/update SSH config to use shared key by default
|
||||||
cat ~/.ssh/id_ed25519.pub
|
if ! grep -q 'IdentityFile.*id_ed25519_shared' ~/.ssh/config 2>/dev/null; then
|
||||||
' 2>/dev/null)
|
echo 'Host *' >> ~/.ssh/config
|
||||||
|
echo ' IdentityFile ~/.ssh/id_ed25519_shared' >> ~/.ssh/config
|
||||||
|
chmod 600 ~/.ssh/config
|
||||||
|
fi
|
||||||
|
" &>/dev/null
|
||||||
|
|
||||||
if [[ -n "$remote_pubkey" ]]; then
|
echo " ✓ Successfully configured $node_ip with shared key"
|
||||||
# Add remote public key to local authorized_keys if not already present
|
|
||||||
if ! grep -q "$remote_pubkey" "$HOME/.ssh/authorized_keys" 2>/dev/null; then
|
|
||||||
echo "$remote_pubkey" >> "$HOME/.ssh/authorized_keys"
|
|
||||||
echo " ✓ Added $node_ip's public key to local authorized_keys"
|
|
||||||
else
|
else
|
||||||
echo " ✓ $node_ip's public key already in local authorized_keys"
|
echo " ✗ Failed to configure $node_ip"
|
||||||
fi
|
|
||||||
else
|
|
||||||
echo " ✗ Failed to get public key from $node_ip"
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
echo " ✗ Failed to copy local SSH key to $node_ip as $USER"
|
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
done < "$OUTPUT_FILE"
|
done < "$TEMP_FILE.sorted"
|
||||||
|
|
||||||
# Add hostfile to remote nodes
|
# Update local SSH config to use shared key
|
||||||
while read -r node_ip; do
|
if ! grep -q 'IdentityFile.*id_ed25519_shared' "$HOME/.ssh/config" 2>/dev/null; then
|
||||||
if [[ -n "$node_ip" ]]; then
|
touch "$HOME/.ssh/config"
|
||||||
echo " Adding hostfile to $node_ip ..."
|
echo 'Host *' >> "$HOME/.ssh/config"
|
||||||
scp "$OUTPUT_FILE" "$USER@$node_ip:$OUTPUT_FILE"
|
echo ' IdentityFile ~/.ssh/id_ed25519_shared' >> "$HOME/.ssh/config"
|
||||||
fi
|
chmod 600 "$HOME/.ssh/config"
|
||||||
done < "$OUTPUT_FILE"
|
echo " ✓ Updated local SSH config to use shared key"
|
||||||
|
fi
|
||||||
|
|
||||||
echo ""
|
echo ""
|
||||||
echo "Bidirectional SSH setup complete!"
|
echo "Shared SSH setup complete!"
|
||||||
echo "Both local and remote nodes can now SSH to each other without passwords."
|
echo "All nodes can now SSH to each other using the shared key (id_ed25519_shared)."
|
||||||
|
|||||||
@ -1,180 +0,0 @@
|
|||||||
#
|
|
||||||
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
#
|
|
||||||
#!/bin/env bash
|
|
||||||
|
|
||||||
# discover-sparks.sh
|
|
||||||
# Discover available systems using avahi-browse and generate MPI hosts file
|
|
||||||
# Searches all active interfaces automatically
|
|
||||||
#
|
|
||||||
# Usage: bash ./discover-sparks
|
|
||||||
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
# Check if running as root
|
|
||||||
if [[ $EUID -eq 0 ]]; then
|
|
||||||
echo "Error: This script should not be run as root"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Dynamically get interface names from ibdev2netdev output
|
|
||||||
# Use ibdev2netdev to list Infiniband devices and their network interfaces.
|
|
||||||
# The awk command searches for lines containing 'Up)' (i.e., interfaces that are up)
|
|
||||||
# and prints the 5th field, which is the interface name (e.g., enp1s0f0np0).
|
|
||||||
# The tr command removes any parentheses from the output.
|
|
||||||
INTERFACES=($(ibdev2netdev | awk '/Up\)/ {print $5}' | tr -d '()'))
|
|
||||||
if [ ${#INTERFACES[@]} -eq 0 ]; then
|
|
||||||
echo "ERROR: No active interfaces found via ibdev2netdev."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
OUTPUT_FILE="~/.stacked-sparks-hostfile"
|
|
||||||
|
|
||||||
# Check if avahi-browse is available
|
|
||||||
if ! command -v avahi-browse &> /dev/null; then
|
|
||||||
echo "Error: avahi-browse not found. Please install avahi-utils package."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Check if ssh-copy-id is available
|
|
||||||
if ! command -v ssh-copy-id &> /dev/null; then
|
|
||||||
echo "Error: ssh-copy-id not found. Please install openssh-client package."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Create temporary file for processing
|
|
||||||
TEMP_FILE=$(mktemp)
|
|
||||||
trap 'rm -f "$TEMP_FILE"' EXIT
|
|
||||||
|
|
||||||
# Run avahi-browse and filter for SSH services on specified interfaces
|
|
||||||
# -p: parseable output
|
|
||||||
# -r: resolve host names and addresses
|
|
||||||
# -f: terminate after dumping all entries available at startup
|
|
||||||
avahi_output=$(avahi-browse -p -r -f -t _ssh._tcp 2>/dev/null)
|
|
||||||
|
|
||||||
# Filter for both interfaces
|
|
||||||
found_services=false
|
|
||||||
for interface in "${INTERFACES[@]}"; do
|
|
||||||
if echo "$avahi_output" | grep "$interface" >> "$TEMP_FILE"; then
|
|
||||||
found_services=true
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
if [ "$found_services" = false ]; then
|
|
||||||
echo "Warning: No services found on any specified interface"
|
|
||||||
touch "$OUTPUT_FILE"
|
|
||||||
echo "Created empty hosts file: $OUTPUT_FILE"
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Extract IPv4 addresses from the avahi-browse output
|
|
||||||
# Format: =;interface;IPv4;hostname\032service;description;local;fqdn;ip_address;port;
|
|
||||||
|
|
||||||
# Clear the output file
|
|
||||||
> "$OUTPUT_FILE"
|
|
||||||
|
|
||||||
# Parse IPv4 entries and extract IP addresses
|
|
||||||
grep "^=" "$TEMP_FILE" | grep "IPv4" | while IFS=';' read -r prefix interface protocol hostname_service description local fqdn ip_address port rest; do
|
|
||||||
# Clean up any trailing data
|
|
||||||
clean_ip=$(echo "$ip_address" | sed 's/;.*$//')
|
|
||||||
|
|
||||||
# Validate IP address format
|
|
||||||
if [[ $clean_ip =~ ^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]]; then
|
|
||||||
echo "$clean_ip" >> "$OUTPUT_FILE"
|
|
||||||
echo "Found: $clean_ip ($fqdn)"
|
|
||||||
else
|
|
||||||
echo "Warning: Invalid IP format: $clean_ip"
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
# Sort and remove duplicates
|
|
||||||
if [[ -s "$OUTPUT_FILE" ]]; then
|
|
||||||
sort -u "$OUTPUT_FILE" -o "$OUTPUT_FILE"
|
|
||||||
else
|
|
||||||
echo "No IPv4 addresses found."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Check if SSH key exists, if not, prompt to generate
|
|
||||||
if [[ ! -f "$HOME/.ssh/id_rsa.pub" && ! -f "$HOME/.ssh/id_ed25519.pub" ]]; then
|
|
||||||
ssh-keygen -t ed25519 -N "" -f "$HOME/.ssh/id_ed25519" -q
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo ""
|
|
||||||
echo "Setting up bidirectional SSH access (local <-> remote nodes)..."
|
|
||||||
echo "You may be prompted for your password on each node."
|
|
||||||
|
|
||||||
# Ensure authorized_keys file exists
|
|
||||||
mkdir -p "$HOME/.ssh"
|
|
||||||
touch "$HOME/.ssh/authorized_keys"
|
|
||||||
chmod 700 "$HOME/.ssh"
|
|
||||||
chmod 600 "$HOME/.ssh/authorized_keys"
|
|
||||||
|
|
||||||
while read -r node_ip; do
|
|
||||||
if [[ -n "$node_ip" ]]; then
|
|
||||||
echo ""
|
|
||||||
echo "Setting up SSH access for $node_ip ..."
|
|
||||||
|
|
||||||
# Step 1: Copy local SSH key to remote node
|
|
||||||
echo " Copying local SSH key to $node_ip ..."
|
|
||||||
if ssh-copy-id -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=accept-new "$USER@$node_ip" &>/dev/null; then
|
|
||||||
echo " ✓ Successfully copied local key to $node_ip"
|
|
||||||
|
|
||||||
# Step 2: Set up reverse SSH access (remote -> local)
|
|
||||||
echo " Setting up reverse SSH access from $node_ip ..."
|
|
||||||
|
|
||||||
# Generate SSH key on remote node if it doesn't exist and get its public key
|
|
||||||
remote_pubkey=$(ssh -o StrictHostKeyChecking=accept-new "$USER@$node_ip" '
|
|
||||||
# Ensure SSH directory exists
|
|
||||||
mkdir -p ~/.ssh
|
|
||||||
chmod 700 ~/.ssh
|
|
||||||
|
|
||||||
# Generate key if it doesn'"'"'t exist
|
|
||||||
if [[ ! -f ~/.ssh/id_ed25519.pub ]]; then
|
|
||||||
ssh-keygen -t ed25519 -N "" -f ~/.ssh/id_ed25519 -q
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Output the public key
|
|
||||||
cat ~/.ssh/id_ed25519.pub
|
|
||||||
' 2>/dev/null)
|
|
||||||
|
|
||||||
if [[ -n "$remote_pubkey" ]]; then
|
|
||||||
# Add remote public key to local authorized_keys if not already present
|
|
||||||
if ! grep -q "$remote_pubkey" "$HOME/.ssh/authorized_keys" 2>/dev/null; then
|
|
||||||
echo "$remote_pubkey" >> "$HOME/.ssh/authorized_keys"
|
|
||||||
echo " ✓ Added $node_ip's public key to local authorized_keys"
|
|
||||||
else
|
|
||||||
echo " ✓ $node_ip's public key already in local authorized_keys"
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
echo " ✗ Failed to get public key from $node_ip"
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
echo " ✗ Failed to copy local SSH key to $node_ip as $USER"
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
done < "$OUTPUT_FILE"
|
|
||||||
|
|
||||||
# Add hostfile to remote nodes
|
|
||||||
while read -r node_ip; do
|
|
||||||
if [[ -n "$node_ip" ]]; then
|
|
||||||
echo " Adding hostfile to $node_ip ..."
|
|
||||||
scp "$OUTPUT_FILE" "$USER@$node_ip:$OUTPUT_FILE"
|
|
||||||
fi
|
|
||||||
done < "$OUTPUT_FILE"
|
|
||||||
|
|
||||||
echo ""
|
|
||||||
echo "Bidirectional SSH setup complete!"
|
|
||||||
echo "Both local and remote nodes can now SSH to each other without passwords."
|
|
||||||
@ -22,7 +22,7 @@ While NVIDIA NIMs are not yet widely supported, this guide uses **Ollama** with
|
|||||||
|
|
||||||
### What You'll Accomplish
|
### What You'll Accomplish
|
||||||
|
|
||||||
You’ll have a fully configured DGX Spark system capable of:
|
You'll have a fully configured DGX Spark system capable of:
|
||||||
- Running local code assistance through Ollama.
|
- Running local code assistance through Ollama.
|
||||||
- Serving models remotely for Continue.dev and VSCode integration.
|
- Serving models remotely for Continue.dev and VSCode integration.
|
||||||
- Hosting large LLMs like GPT-OSS 120B using unified memory.
|
- Hosting large LLMs like GPT-OSS 120B using unified memory.
|
||||||
@ -144,7 +144,7 @@ Add additional model entries for any other Ollama models you wish to host remote
|
|||||||
- Verify Docker and GPU drivers are installed correctly.
|
- Verify Docker and GPU drivers are installed correctly.
|
||||||
- Run `ollama serve` manually to view errors.
|
- Run `ollama serve` manually to view errors.
|
||||||
|
|
||||||
**2. VSCode can’t connect**
|
**2. VSCode can't connect**
|
||||||
- Ensure port 11434 is open and accessible from your workstation.
|
- Ensure port 11434 is open and accessible from your workstation.
|
||||||
- Check `OLLAMA_HOST` and `OLLAMA_ORIGINS` in `/etc/systemd/system/ollama.service.d/override.conf`.
|
- Check `OLLAMA_HOST` and `OLLAMA_ORIGINS` in `/etc/systemd/system/ollama.service.d/override.conf`.
|
||||||
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user