dgx-spark-playbooks/nvidia/stack-sparks/assets/discover-sparks

#
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#!/bin/env bash

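# discover-sparks
# Discover peer systems that advertise SSH over mDNS (via avahi-browse) on
# every active interface reported by ibdev2netdev, and write their IPv4
# addresses to an MPI hosts file. The script then sets up passwordless SSH in
# both directions with each discovered node and copies the hosts file to it.
#
# Usage: bash ./discover-sparks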

set -euo pipefail

# Refuse to run as root: the SSH keys and the hosts file are written under the
# invoking user's $HOME and remote logins use $USER.
if [[ $EUID -eq 0 ]]; then
    echo "Error: This script should not be run as root" >&2
    exit 1
fi

# Check if ibdev2netdev is available (used below to find active interfaces)
if ! command -v ibdev2netdev &> /dev/null; then
    echo "Error: ibdev2netdev not found." >&2
    exit 1
fi

# Dynamically get interface names from ibdev2netdev output.
# ibdev2netdev lists Infiniband devices and their network interfaces.
# The awk command keeps lines containing 'Up)' (i.e., interfaces that are up)
# and prints the 5th field, which is the interface name (e.g., enp1s0f0np0).
# The tr command removes any parentheses from the output.
# mapfile stores one interface per array element without relying on
# unquoted word splitting.
mapfile -t INTERFACES < <(ibdev2netdev | awk '/Up\)/ && NF >= 5 {print $5}' | tr -d '()')
if [[ ${#INTERFACES[@]} -eq 0 ]]; then
    echo "ERROR: No active interfaces found via ibdev2netdev." >&2
    exit 1
fi

# Hosts file path. Use $HOME rather than "~": a tilde inside double quotes is
# not expanded, so the path would point into a literal "~" directory.
OUTPUT_FILE="$HOME/.stacked-sparks-hostfile"

# Check if avahi-browse is available
if ! command -v avahi-browse &> /dev/null; then
    echo "Error: avahi-browse not found. Please install avahi-utils package." >&2
    exit 1
fi

# Check if ssh-copy-id is available
if ! command -v ssh-copy-id &> /dev/null; then
    echo "Error: ssh-copy-id not found. Please install openssh-client package." >&2
    exit 1
fi

# Create temporary file for processing; the EXIT trap removes it on every
# exit path.
TEMP_FILE=$(mktemp)
trap 'rm -f "$TEMP_FILE"' EXIT

# Run avahi-browse for SSH services (_ssh._tcp).
# -p: parsable output (semicolon-separated fields)
# -r: resolve host names and addresses
# -f: do not fail if the avahi daemon is not available
# -t: terminate after dumping the entries available at startup
# The failure is handled explicitly: stderr is discarded, so letting `set -e`
# abort here would end the script without any message.
if ! avahi_output=$(avahi-browse -p -r -f -t _ssh._tcp 2>/dev/null); then
    echo "Error: avahi-browse failed to list _ssh._tcp services" >&2
    exit 1
fi

# Keep only the records whose interface field (2nd ';'-separated field) is one
# of the active interfaces. Comparing the field exactly avoids matching the
# interface name as a regex somewhere else in the line.
found_services=false
for interface in "${INTERFACES[@]}"; do
    matches=$(awk -F';' -v iface="$interface" '$2 == iface' <<<"$avahi_output")
    if [[ -n "$matches" ]]; then
        printf '%s\n' "$matches" >> "$TEMP_FILE"
        found_services=true
    fi
done

if [[ "$found_services" == false ]]; then
    echo "Warning: No services found on any specified interface" >&2
    # Truncate rather than touch so a stale hosts file from an earlier run
    # really ends up empty, as the message below states.
    : > "$OUTPUT_FILE"
    echo "Created empty hosts file: $OUTPUT_FILE"
    exit 0
fi

# Extract IPv4 addresses from the avahi-browse output
# Format: =;interface;IPv4;hostname\032service;description;local;fqdn;ip_address;port;

# Clear the output file
: > "$OUTPUT_FILE"

# Parse resolved ("=") records and keep the address of every IPv4 entry.
# The records are fed through process substitution instead of a
# `grep | grep | while` pipeline: with `set -euo pipefail` a grep that matches
# nothing would fail the pipeline and end the script before the
# "No IPv4 addresses found." check below could report it.
while IFS=';' read -r _ _ protocol _ _ _ fqdn ip_address _; do
    [[ "$protocol" == "IPv4" ]] || continue

    # Validate IP address format (dotted quad; octet ranges are not checked)
    if [[ $ip_address =~ ^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]]; then
        printf '%s\n' "$ip_address" >> "$OUTPUT_FILE"
        echo "Found: $ip_address ($fqdn)"
    else
        echo "Warning: Invalid IP format: $ip_address" >&2
    fi
done < <(grep '^=' "$TEMP_FILE")

# Sort and remove duplicates (the same host can be seen on several interfaces)
if [[ -s "$OUTPUT_FILE" ]]; then
    sort -u "$OUTPUT_FILE" -o "$OUTPUT_FILE"
else
    echo "No IPv4 addresses found." >&2
    exit 1
fi

# Make sure the local ed25519 key pair exists. The key copy step later in this
# script always uses $HOME/.ssh/id_ed25519, so the presence of an RSA key must
# not suppress its creation.
mkdir -p "$HOME/.ssh"
chmod 700 "$HOME/.ssh"
if [[ ! -f "$HOME/.ssh/id_ed25519" ]]; then
    # No private key yet: generate a new pair without a passphrase.
    ssh-keygen -t ed25519 -N "" -f "$HOME/.ssh/id_ed25519" -q
elif [[ ! -f "$HOME/.ssh/id_ed25519.pub" ]]; then
    # Private key exists but its public half is missing: derive it instead of
    # overwriting the existing key.
    if ! ssh-keygen -y -f "$HOME/.ssh/id_ed25519" > "$HOME/.ssh/id_ed25519.pub"; then
        rm -f "$HOME/.ssh/id_ed25519.pub"
        echo "Error: could not derive public key from $HOME/.ssh/id_ed25519" >&2
        exit 1
    fi
fi

echo ""
echo "Setting up bidirectional SSH access (local <-> remote nodes)..."
echo "You may be prompted for your password on each node."

# Ensure authorized_keys file exists with the permissions sshd expects
touch "$HOME/.ssh/authorized_keys"
chmod 600 "$HOME/.ssh/authorized_keys"

# Load the node list into an array before looping. Looping with
# `while read ... done < "$OUTPUT_FILE"` would hand the hosts file to ssh as
# its stdin; ssh reads stdin, so the first call would swallow the remaining
# lines and only one node would ever be processed.
mapfile -t node_ips < "$OUTPUT_FILE"

# Number of failed steps (key copy, reverse key fetch, hostfile copy).
failed_steps=0

for node_ip in "${node_ips[@]}"; do
    [[ -n "$node_ip" ]] || continue

    echo ""
    echo "Setting up SSH access for $node_ip ..."

    # Step 1: Copy local SSH key to remote node
    echo "  Copying local SSH key to $node_ip ..."
    if ! ssh-copy-id -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=accept-new "$USER@$node_ip" &>/dev/null; then
        echo "  ✗ Failed to copy local SSH key to $node_ip as $USER"
        failed_steps=$((failed_steps + 1))
        continue
    fi
    echo "  ✓ Successfully copied local key to $node_ip"

    # Step 2: Set up reverse SSH access (remote -> local)
    echo "  Setting up reverse SSH access from $node_ip ..."

    # Generate an SSH key on the remote node if none exists and fetch its
    # public key. The remote snippet uses POSIX `[` so it does not depend on
    # the remote login shell being bash. `|| remote_pubkey=""` keeps `set -e`
    # from aborting the whole script when this one node fails.
    remote_pubkey=$(ssh -o StrictHostKeyChecking=accept-new "$USER@$node_ip" '
        # Ensure SSH directory exists
        mkdir -p ~/.ssh
        chmod 700 ~/.ssh

        # Generate key if it does not exist
        if [ ! -f ~/.ssh/id_ed25519.pub ]; then
            ssh-keygen -t ed25519 -N "" -f ~/.ssh/id_ed25519 -q
        fi

        # Output the public key
        cat ~/.ssh/id_ed25519.pub
    ' 2>/dev/null) || remote_pubkey=""

    if [[ -z "$remote_pubkey" ]]; then
        echo "  ✗ Failed to get public key from $node_ip"
        failed_steps=$((failed_steps + 1))
        continue
    fi

    # Add remote public key to local authorized_keys if not already present.
    # -F compares the key as a fixed string; as a regex, characters such as
    # '.' in the key comment would match loosely.
    if ! grep -qF -- "$remote_pubkey" "$HOME/.ssh/authorized_keys" 2>/dev/null; then
        printf '%s\n' "$remote_pubkey" >> "$HOME/.ssh/authorized_keys"
        echo "  ✓ Added $node_ip's public key to local authorized_keys"
    else
        echo "  ✓ $node_ip's public key already in local authorized_keys"
    fi
done

# Add hostfile to remote nodes. The destination is given relative to the
# remote user's home directory (scp resolves relative remote paths there), so
# it does not depend on how the local path is spelled.
remote_hostfile="${OUTPUT_FILE##*/}"
for node_ip in "${node_ips[@]}"; do
    [[ -n "$node_ip" ]] || continue

    echo "  Adding hostfile to $node_ip ..."
    # Report a failed copy and continue with the remaining nodes instead of
    # letting `set -e` stop at the first unreachable node.
    if ! scp "$OUTPUT_FILE" "$USER@$node_ip:$remote_hostfile"; then
        echo "  ✗ Failed to copy hostfile to $node_ip"
        failed_steps=$((failed_steps + 1))
    fi
done

echo ""
if (( failed_steps > 0 )); then
    echo "Warning: SSH setup finished with $failed_steps failure(s); see the messages above." >&2
    exit 1
fi
echo "Bidirectional SSH setup complete!"
echo "Both local and remote nodes can now SSH to each other without passwords."
chore: Regenerate all playbooks 2025-10-03 20:46:11 +00:00			`#`
			`# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.`
			`# SPDX-License-Identifier: Apache-2.0`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`#`
			`#!/bin/env bash`

			`# discover-sparks.sh`
			`# Discover available systems using avahi-browse and generate MPI hosts file`
			`# Searches all active interfaces automatically`
			`#`
			`# Usage: bash ./discover-sparks`

			`set -euo pipefail`

chore: Regenerate all playbooks 2025-10-07 23:45:29 +00:00			`# Check if running as root`
			`if [[ $EUID -eq 0 ]]; then`
			`echo "Error: This script should not be run as root"`
			`exit 1`
			`fi`

chore: Regenerate all playbooks 2025-10-03 20:46:11 +00:00			`# Dynamically get interface names from ibdev2netdev output`
			`# Use ibdev2netdev to list Infiniband devices and their network interfaces.`
			`# The awk command searches for lines containing 'Up)' (i.e., interfaces that are up)`
			`# and prints the 5th field, which is the interface name (e.g., enp1s0f0np0).`
			`# The tr command removes any parentheses from the output.`
			`INTERFACES=($(ibdev2netdev \| awk '/Up\)/ {print $5}' \| tr -d '()'))`
			`if [ ${#INTERFACES[@]} -eq 0 ]; then`
			`echo "ERROR: No active interfaces found via ibdev2netdev."`
			`exit 1`
			`fi`
chore: Regenerate all playbooks 2025-10-07 23:45:29 +00:00			`OUTPUT_FILE="~/.stacked-sparks-hostfile"`
chore: Regenerate all playbooks 2025-10-03 20:46:11 +00:00
			`# Check if avahi-browse is available`
			`if ! command -v avahi-browse &> /dev/null; then`
			`echo "Error: avahi-browse not found. Please install avahi-utils package."`
			`exit 1`
			`fi`

			`# Check if ssh-copy-id is available`
			`if ! command -v ssh-copy-id &> /dev/null; then`
			`echo "Error: ssh-copy-id not found. Please install openssh-client package."`
			`exit 1`
			`fi`

			`# Create temporary file for processing`
			`TEMP_FILE=$(mktemp)`
			`trap 'rm -f "$TEMP_FILE"' EXIT`

			`# Run avahi-browse and filter for SSH services on specified interfaces`
			`# -p: parseable output`
			`# -r: resolve host names and addresses`
			`# -f: terminate after dumping all entries available at startup`
			`avahi_output=$(avahi-browse -p -r -f -t _ssh._tcp 2>/dev/null)`

			`# Filter for both interfaces`
			`found_services=false`
			`for interface in "${INTERFACES[@]}"; do`
			`if echo "$avahi_output" \| grep "$interface" >> "$TEMP_FILE"; then`
			`found_services=true`
			`fi`
			`done`

			`if [ "$found_services" = false ]; then`
			`echo "Warning: No services found on any specified interface"`
			`touch "$OUTPUT_FILE"`
			`echo "Created empty hosts file: $OUTPUT_FILE"`
			`exit 0`
			`fi`

			`# Extract IPv4 addresses from the avahi-browse output`
			`# Format: =;interface;IPv4;hostname\032service;description;local;fqdn;ip_address;port;`

			`# Clear the output file`
			`> "$OUTPUT_FILE"`

			`# Parse IPv4 entries and extract IP addresses`
			`grep "^=" "$TEMP_FILE" \| grep "IPv4" \| while IFS=';' read -r prefix interface protocol hostname_service description local fqdn ip_address port rest; do`
			`# Clean up any trailing data`
			`clean_ip=$(echo "$ip_address" \| sed 's/;.*$//')`

			`# Validate IP address format`
			`if [[ $clean_ip =~ ^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]]; then`
			`echo "$clean_ip" >> "$OUTPUT_FILE"`
			`echo "Found: $clean_ip ($fqdn)"`
			`else`
			`echo "Warning: Invalid IP format: $clean_ip"`
			`fi`
			`done`

			`# Sort and remove duplicates`
			`if [[ -s "$OUTPUT_FILE" ]]; then`
			`sort -u "$OUTPUT_FILE" -o "$OUTPUT_FILE"`
			`else`
			`echo "No IPv4 addresses found."`
			`exit 1`
			`fi`

			`# Check if SSH key exists, if not, prompt to generate`
			`if [[ ! -f "$HOME/.ssh/id_rsa.pub" && ! -f "$HOME/.ssh/id_ed25519.pub" ]]; then`
			`ssh-keygen -t ed25519 -N "" -f "$HOME/.ssh/id_ed25519" -q`
			`fi`

			`echo ""`
			`echo "Setting up bidirectional SSH access (local <-> remote nodes)..."`
			`echo "You may be prompted for your password on each node."`

			`# Ensure authorized_keys file exists`
			`mkdir -p "$HOME/.ssh"`
			`touch "$HOME/.ssh/authorized_keys"`
			`chmod 700 "$HOME/.ssh"`
			`chmod 600 "$HOME/.ssh/authorized_keys"`

			`while read -r node_ip; do`
			`if [[ -n "$node_ip" ]]; then`
			`echo ""`
			`echo "Setting up SSH access for $node_ip ..."`

			`# Step 1: Copy local SSH key to remote node`
			`echo " Copying local SSH key to $node_ip ..."`
			`if ssh-copy-id -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=accept-new "$USER@$node_ip" &>/dev/null; then`
			`echo " ✓ Successfully copied local key to $node_ip"`

			`# Step 2: Set up reverse SSH access (remote -> local)`
			`echo " Setting up reverse SSH access from $node_ip ..."`

			`# Generate SSH key on remote node if it doesn't exist and get its public key`
			`remote_pubkey=$(ssh -o StrictHostKeyChecking=accept-new "$USER@$node_ip" '`
			`# Ensure SSH directory exists`
			`mkdir -p ~/.ssh`
			`chmod 700 ~/.ssh`

			`# Generate key if it doesn'"'"'t exist`
			`if [[ ! -f ~/.ssh/id_ed25519.pub ]]; then`
			`ssh-keygen -t ed25519 -N "" -f ~/.ssh/id_ed25519 -q`
			`fi`

			`# Output the public key`
			`cat ~/.ssh/id_ed25519.pub`
			`' 2>/dev/null)`

			`if [[ -n "$remote_pubkey" ]]; then`
			`# Add remote public key to local authorized_keys if not already present`
			`if ! grep -q "$remote_pubkey" "$HOME/.ssh/authorized_keys" 2>/dev/null; then`
			`echo "$remote_pubkey" >> "$HOME/.ssh/authorized_keys"`
			`echo " ✓ Added $node_ip's public key to local authorized_keys"`
			`else`
			`echo " ✓ $node_ip's public key already in local authorized_keys"`
			`fi`
			`else`
			`echo " ✗ Failed to get public key from $node_ip"`
			`fi`
			`else`
			`echo " ✗ Failed to copy local SSH key to $node_ip as $USER"`
			`fi`
			`fi`
			`done < "$OUTPUT_FILE"`

			`# Add hostfile to remote nodes`
			`while read -r node_ip; do`
			`if [[ -n "$node_ip" ]]; then`
			`echo " Adding hostfile to $node_ip ..."`
			`scp "$OUTPUT_FILE" "$USER@$node_ip:$OUTPUT_FILE"`
			`fi`
			`done < "$OUTPUT_FILE"`

			`echo ""`
			`echo "Bidirectional SSH setup complete!"`
			`echo "Both local and remote nodes can now SSH to each other without passwords."`