#!/usr/bin/env bash
#
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# discover-sparks.sh
# Discover available systems using avahi-browse and generate MPI hosts file
# Searches all active interfaces automatically
#
# Usage: bash ./discover-sparks

set -euo pipefail

# Check if running as root
if [[ $EUID -eq 0 ]]; then
  echo "Error: This script should not be run as root" >&2
  exit 1
fi

# ibdev2netdev is required to enumerate the interfaces; fail with a clear
# message instead of a bare "command not found".
if ! command -v ibdev2netdev &> /dev/null; then
  echo "Error: ibdev2netdev not found in PATH." >&2
  exit 1
fi

# Dynamically get interface names from ibdev2netdev output.
# The awk command selects lines containing 'Up)' (i.e., interfaces that are up)
# and prints the 5th field, which is the interface name (e.g., enp1s0f0np0).
# The tr command removes any parentheses from the output.
# mapfile stores one interface per line without word-splitting or globbing.
mapfile -t INTERFACES < <(ibdev2netdev | awk '/Up\)/ {print $5}' | tr -d '()')
if [[ ${#INTERFACES[@]} -eq 0 ]]; then
  echo "ERROR: No active interfaces found via ibdev2netdev." >&2
  exit 1
fi

# A tilde inside double quotes is never expanded, so build the path from $HOME.
OUTPUT_FILE="$HOME/.stacked-sparks-hostfile"

# Check if avahi-browse is available
if ! command -v avahi-browse &> /dev/null; then
  echo "Error: avahi-browse not found. Please install avahi-utils package." >&2
  exit 1
fi

# Check if ssh-copy-id is available
if ! command -v ssh-copy-id &> /dev/null; then
  echo "Error: ssh-copy-id not found. Please install openssh-client package." >&2
  exit 1
fi

# Create temporary file for processing; removed on every exit path
TEMP_FILE=$(mktemp)
trap 'rm -f "$TEMP_FILE"' EXIT

# Run avahi-browse and collect SSH service announcements
# -p: parseable output
# -r: resolve host names and addresses
# -f: do not fail if the avahi daemon is not available
# -t: terminate after dumping a more or less complete list
if ! avahi_output=$(avahi-browse -p -r -f -t _ssh._tcp 2>/dev/null); then
  echo "Error: avahi-browse failed to list _ssh._tcp services." >&2
  exit 1
fi

# Keep only records announced on one of the active interfaces. The interface
# is a ';'-delimited field, so match it literally with its delimiters; this
# keeps e.g. "eth1" from also selecting "eth10".
found_services=false
for interface in "${INTERFACES[@]}"; do
  if grep -F -- ";${interface};" <<< "$avahi_output" >> "$TEMP_FILE"; then
    found_services=true
  fi
done

if [[ "$found_services" == false ]]; then
  echo "Warning: No services found on any specified interface" >&2
  # Truncate (not just touch) so a stale host list is not left behind.
  : > "$OUTPUT_FILE"
  echo "Created empty hosts file: $OUTPUT_FILE"
  exit 0
fi

# Extract IPv4 addresses from the avahi-browse output
# Format: =;interface;IPv4;hostname\032service;description;local;fqdn;ip_address;port;

# Clear the output file
: > "$OUTPUT_FILE"

# Parse resolved ("=") IPv4 records and extract the address field. The file is
# read directly rather than through a grep pipeline: under `set -eo pipefail`
# a grep with no matches would abort the script before the "No IPv4 addresses
# found." message below could be printed.
while IFS=';' read -r record_type _iface protocol _name _type _domain fqdn ip_address _rest; do
  if [[ "$record_type" != "=" || "$protocol" != "IPv4" ]]; then
    continue
  fi
  # Validate IP address format
  if [[ $ip_address =~ ^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]]; then
    echo "$ip_address" >> "$OUTPUT_FILE"
    echo "Found: $ip_address ($fqdn)"
  else
    echo "Warning: Invalid IP format: $ip_address" >&2
  fi
done < "$TEMP_FILE"

# Sort and remove duplicates
if [[ -s "$OUTPUT_FILE" ]]; then
  sort -u "$OUTPUT_FILE" -o "$OUTPUT_FILE"
else
  echo "No IPv4 addresses found." >&2
  exit 1
fi

# Make sure an ed25519 key pair exists. The key exchange that follows always
# uses ~/.ssh/id_ed25519, so the presence of an RSA key alone is not enough.
mkdir -p "$HOME/.ssh"
chmod 700 "$HOME/.ssh"
if [[ ! -f "$HOME/.ssh/id_ed25519" ]]; then
  ssh-keygen -t ed25519 -N "" -f "$HOME/.ssh/id_ed25519" -q
elif [[ ! -f "$HOME/.ssh/id_ed25519.pub" ]]; then
  # Private key present but public half missing: re-derive it.
  ssh-keygen -y -f "$HOME/.ssh/id_ed25519" > "$HOME/.ssh/id_ed25519.pub"
fi

echo ""
echo "Setting up bidirectional SSH access (local <-> remote nodes)..."
echo "You may be prompted for your password on each node."
# Ensure authorized_keys file exists with the permissions sshd expects
mkdir -p "$HOME/.ssh"
touch "$HOME/.ssh/authorized_keys"
chmod 700 "$HOME/.ssh"
chmod 600 "$HOME/.ssh/authorized_keys"

# Resolve the local and remote locations of the hostfile. OUTPUT_FILE may carry
# a literal leading "~/" (a tilde does not expand inside quotes), so expand it
# for local use. For the remote side, a path under the local home directory is
# expressed relative to "~" so it lands in the remote user's home directory.
local_hostfile="${OUTPUT_FILE:?OUTPUT_FILE must be set}"
if [[ "$local_hostfile" == "~/"* ]]; then
  local_hostfile="$HOME/${local_hostfile#"~/"}"
fi
remote_hostfile="$local_hostfile"
if [[ "$local_hostfile" == "$HOME/"* ]]; then
  remote_hostfile="~/${local_hostfile#"$HOME/"}"
fi

ssh_opts=(-o StrictHostKeyChecking=accept-new)
failed_steps=0

# The host list is read on file descriptor 3, not stdin: ssh and ssh-copy-id
# read from stdin and would otherwise swallow the remaining lines of the list,
# silently skipping every node after the first.
while read -r node_ip <&3; do
  if [[ -z "$node_ip" ]]; then
    continue
  fi

  echo ""
  echo "Setting up SSH access for $node_ip ..."

  # Step 1: Copy local SSH key to remote node
  echo "  Copying local SSH key to $node_ip ..."
  if ! ssh-copy-id -i "$HOME/.ssh/id_ed25519" "${ssh_opts[@]}" "$USER@$node_ip" &>/dev/null; then
    echo "  ✗ Failed to copy local SSH key to $node_ip as $USER" >&2
    failed_steps=$((failed_steps + 1))
    continue
  fi
  echo "  ✓ Successfully copied local key to $node_ip"

  # Step 2: Set up reverse SSH access (remote -> local)
  echo "  Setting up reverse SSH access from $node_ip ..."

  # Generate an SSH key on the remote node if it has none and fetch its public
  # key. The remote snippet sticks to POSIX sh syntax because the remote login
  # shell is not necessarily bash. The `|| remote_pubkey=""` keeps `set -e`
  # from aborting the whole run when one node fails.
  remote_pubkey=$(ssh "${ssh_opts[@]}" "$USER@$node_ip" '
    mkdir -p ~/.ssh
    chmod 700 ~/.ssh
    if [ ! -f ~/.ssh/id_ed25519.pub ]; then
      ssh-keygen -t ed25519 -N "" -f ~/.ssh/id_ed25519 -q
    fi
    cat ~/.ssh/id_ed25519.pub
  ' 2>/dev/null) || remote_pubkey=""

  if [[ -z "$remote_pubkey" ]]; then
    echo "  ✗ Failed to get public key from $node_ip" >&2
    failed_steps=$((failed_steps + 1))
    continue
  fi

  # Add remote public key to local authorized_keys if not already present.
  # -F matches the key as a literal string rather than a regular expression.
  if grep -qF -- "$remote_pubkey" "$HOME/.ssh/authorized_keys" 2>/dev/null; then
    echo "  ✓ $node_ip's public key already in local authorized_keys"
  else
    echo "$remote_pubkey" >> "$HOME/.ssh/authorized_keys"
    echo "  ✓ Added $node_ip's public key to local authorized_keys"
  fi
done 3< "$local_hostfile"

# Add hostfile to remote nodes. A failed copy is reported and counted instead
# of aborting, so the remaining nodes still receive the file.
while read -r node_ip <&3; do
  if [[ -z "$node_ip" ]]; then
    continue
  fi
  echo "  Adding hostfile to $node_ip ..."
  if ! scp "$local_hostfile" "$USER@$node_ip:$remote_hostfile"; then
    echo "  ✗ Failed to copy hostfile to $node_ip" >&2
    failed_steps=$((failed_steps + 1))
  fi
done 3< "$local_hostfile"

echo ""
if ((failed_steps > 0)); then
  # Do not claim success when any node could not be fully configured.
  echo "SSH setup finished with $failed_steps failed step(s); see messages above." >&2
  exit 1
fi
echo "Bidirectional SSH setup complete!"
echo "Both local and remote nodes can now SSH to each other without passwords."