From 0a7238d65190affddd354f94b67228d58b513527 Mon Sep 17 00:00:00 2001 From: GitLab CI Date: Sat, 11 Oct 2025 23:26:38 +0000 Subject: [PATCH] chore: Regenerate all playbooks --- nvidia/stack-sparks/README.md | 8 +- nvidia/stack-sparks/assets/discover-sparks | 131 +++++++-------- nvidia/trt-llm/assets/discover-sparks.sh | 180 --------------------- nvidia/vibe-coding/README.md | 4 +- 4 files changed, 63 insertions(+), 260 deletions(-) delete mode 100755 nvidia/trt-llm/assets/discover-sparks.sh diff --git a/nvidia/stack-sparks/README.md b/nvidia/stack-sparks/README.md index 3624d77..392b034 100644 --- a/nvidia/stack-sparks/README.md +++ b/nvidia/stack-sparks/README.md @@ -46,7 +46,7 @@ a functional distributed computing environment. All required files for this playbook can be found [here on GitHub](https://gitlab.com/nvidia/dgx-spark/temp-external-playbook-assets/dgx-spark-playbook-assets/-/blob/main/${MODEL}/) -- **discover-sparks.sh**](https://gitlab.com/nvidia/dgx-spark/temp-external-playbook-assets/dgx-spark-playbook-assets/-/blob/main/${MODEL}/assets/discover-sparks) script for automatic node discovery and SSH key distribution +- [**discover-sparks.sh**](https://gitlab.com/nvidia/dgx-spark/temp-external-playbook-assets/dgx-spark-playbook-assets/-/blob/main/${MODEL}/assets/discover-sparks) script for automatic node discovery and SSH key distribution ## Time & risk @@ -190,7 +190,7 @@ You may be prompted for your password for each node. SSH setup complete! Both local and remote nodes can now SSH to each other without passwords. ``` -Note: If you encoutner any errors, please follow Option 2 below to manually configure SSH and debug the issue. +Note: If you encounter any errors, please follow Option 2 below to manually configure SSH and debug the issue. ### Option 2: Manually discover and configure SSH @@ -217,8 +217,8 @@ In this example, the IP address for Node 1 is **169.254.35.62**. Repeat the proc On both nodes, run the following commands to enable passwordless SSH: ```bash ## Copy your SSH public key to both nodes. Please replace the IP addresses with the ones you found in the previous step. -ssh-copy-id -i ~/.ssh/id_rsa.pub nvidia@ -ssh-copy-id -i ~/.ssh/id_rsa.pub nvidia@ +ssh-copy-id -i ~/.ssh/id_rsa.pub @ +ssh-copy-id -i ~/.ssh/id_rsa.pub @ ``` ## Step 4. Verify Multi-Node Communication diff --git a/nvidia/stack-sparks/assets/discover-sparks b/nvidia/stack-sparks/assets/discover-sparks index d204d68..208426b 100755 --- a/nvidia/stack-sparks/assets/discover-sparks +++ b/nvidia/stack-sparks/assets/discover-sparks @@ -16,7 +16,7 @@ # #!/bin/env bash -# discover-sparks.sh +# discover-sparks # Discover available systems using avahi-browse and generate MPI hosts file # Searches all active interfaces automatically # @@ -24,9 +24,10 @@ set -euo pipefail -# Check if running as root -if [[ $EUID -eq 0 ]]; then - echo "Error: This script should not be run as root" +# Check if running as root or with sudo +if [[ $EUID -eq 0 ]] || [[ -n "${SUDO_USER:-}" ]]; then + echo "Error: This script should not be run as root or with sudo" + echo "Please run as a regular user" exit 1 fi @@ -40,7 +41,10 @@ if [ ${#INTERFACES[@]} -eq 0 ]; then echo "ERROR: No active interfaces found via ibdev2netdev." exit 1 fi -OUTPUT_FILE="~/.stacked-sparks-hostfile" + +# Create temporary file for processing +TEMP_FILE=$(mktemp) +trap 'rm -f "$TEMP_FILE"' EXIT # Check if avahi-browse is available if ! command -v avahi-browse &> /dev/null; then @@ -48,16 +52,6 @@ if ! command -v avahi-browse &> /dev/null; then exit 1 fi -# Check if ssh-copy-id is available -if ! command -v ssh-copy-id &> /dev/null; then - echo "Error: ssh-copy-id not found. Please install openssh-client package." - exit 1 -fi - -# Create temporary file for processing -TEMP_FILE=$(mktemp) -trap 'rm -f "$TEMP_FILE"' EXIT - # Run avahi-browse and filter for SSH services on specified interfaces # -p: parseable output # -r: resolve host names and addresses @@ -74,25 +68,18 @@ done if [ "$found_services" = false ]; then echo "Warning: No services found on any specified interface" - touch "$OUTPUT_FILE" - echo "Created empty hosts file: $OUTPUT_FILE" exit 0 fi # Extract IPv4 addresses from the avahi-browse output # Format: =;interface;IPv4;hostname\032service;description;local;fqdn;ip_address;port; - -# Clear the output file -> "$OUTPUT_FILE" - -# Parse IPv4 entries and extract IP addresses grep "^=" "$TEMP_FILE" | grep "IPv4" | while IFS=';' read -r prefix interface protocol hostname_service description local fqdn ip_address port rest; do # Clean up any trailing data clean_ip=$(echo "$ip_address" | sed 's/;.*$//') # Validate IP address format if [[ $clean_ip =~ ^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]]; then - echo "$clean_ip" >> "$OUTPUT_FILE" + echo "$clean_ip" >> "$TEMP_FILE.sorted" echo "Found: $clean_ip ($fqdn)" else echo "Warning: Invalid IP format: $clean_ip" @@ -100,81 +87,77 @@ grep "^=" "$TEMP_FILE" | grep "IPv4" | while IFS=';' read -r prefix interface pr done # Sort and remove duplicates -if [[ -s "$OUTPUT_FILE" ]]; then - sort -u "$OUTPUT_FILE" -o "$OUTPUT_FILE" +if [[ -s "$TEMP_FILE.sorted" ]]; then + sort -u "$TEMP_FILE.sorted" -o "$TEMP_FILE.sorted" else echo "No IPv4 addresses found." exit 1 fi -# Check if SSH key exists, if not, prompt to generate -if [[ ! -f "$HOME/.ssh/id_rsa.pub" && ! -f "$HOME/.ssh/id_ed25519.pub" ]]; then - ssh-keygen -t ed25519 -N "" -f "$HOME/.ssh/id_ed25519" -q +# Generate a shared SSH key if it doesn't exist +SHARED_KEY="$HOME/.ssh/id_ed25519_shared" +if [[ ! -f "$SHARED_KEY" ]]; then + echo "Generating shared SSH key for all nodes..." + ssh-keygen -t ed25519 -N "" -f "$SHARED_KEY" -q -C "shared-cluster-key" fi echo "" -echo "Setting up bidirectional SSH access (local <-> remote nodes)..." +echo "Setting up shared SSH access across all nodes..." echo "You may be prompted for your password on each node." -# Ensure authorized_keys file exists +# Ensure local .ssh directory exists with correct permissions mkdir -p "$HOME/.ssh" -touch "$HOME/.ssh/authorized_keys" chmod 700 "$HOME/.ssh" -chmod 600 "$HOME/.ssh/authorized_keys" +# Add shared public key to local authorized_keys +if ! grep -qF "$(cat "$SHARED_KEY.pub")" "$HOME/.ssh/authorized_keys" 2>/dev/null; then + cat "$SHARED_KEY.pub" >> "$HOME/.ssh/authorized_keys" + chmod 600 "$HOME/.ssh/authorized_keys" + echo " ✓ Added shared public key to local authorized_keys" +fi + +# Distribute shared key to all remote nodes while read -r node_ip; do if [[ -n "$node_ip" ]]; then - echo "" - echo "Setting up SSH access for $node_ip ..." + echo "Configuring $node_ip..." - # Step 1: Copy local SSH key to remote node - echo " Copying local SSH key to $node_ip ..." - if ssh-copy-id -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=accept-new "$USER@$node_ip" &>/dev/null; then - echo " ✓ Successfully copied local key to $node_ip" - - # Step 2: Set up reverse SSH access (remote -> local) - echo " Setting up reverse SSH access from $node_ip ..." - - # Generate SSH key on remote node if it doesn't exist and get its public key - remote_pubkey=$(ssh -o StrictHostKeyChecking=accept-new "$USER@$node_ip" ' - # Ensure SSH directory exists - mkdir -p ~/.ssh + # Copy shared key to remote node and set up authorized_keys + if scp -o StrictHostKeyChecking=accept-new "$SHARED_KEY" "$SHARED_KEY.pub" "$USER@$node_ip:~/.ssh/" &>/dev/null; then + ssh -n -o StrictHostKeyChecking=accept-new "$USER@$node_ip" " chmod 700 ~/.ssh + chmod 600 ~/.ssh/id_ed25519_shared + chmod 644 ~/.ssh/id_ed25519_shared.pub - # Generate key if it doesn'"'"'t exist - if [[ ! -f ~/.ssh/id_ed25519.pub ]]; then - ssh-keygen -t ed25519 -N "" -f ~/.ssh/id_ed25519 -q + # Add shared public key to authorized_keys if not present + if ! grep -qF \"\$(cat ~/.ssh/id_ed25519_shared.pub)\" ~/.ssh/authorized_keys 2>/dev/null; then + cat ~/.ssh/id_ed25519_shared.pub >> ~/.ssh/authorized_keys + chmod 600 ~/.ssh/authorized_keys fi - # Output the public key - cat ~/.ssh/id_ed25519.pub - ' 2>/dev/null) - - if [[ -n "$remote_pubkey" ]]; then - # Add remote public key to local authorized_keys if not already present - if ! grep -q "$remote_pubkey" "$HOME/.ssh/authorized_keys" 2>/dev/null; then - echo "$remote_pubkey" >> "$HOME/.ssh/authorized_keys" - echo " ✓ Added $node_ip's public key to local authorized_keys" - else - echo " ✓ $node_ip's public key already in local authorized_keys" + # Create/update SSH config to use shared key by default + if ! grep -q 'IdentityFile.*id_ed25519_shared' ~/.ssh/config 2>/dev/null; then + echo 'Host *' >> ~/.ssh/config + echo ' IdentityFile ~/.ssh/id_ed25519_shared' >> ~/.ssh/config + chmod 600 ~/.ssh/config fi - else - echo " ✗ Failed to get public key from $node_ip" - fi + " &>/dev/null + + echo " ✓ Successfully configured $node_ip with shared key" else - echo " ✗ Failed to copy local SSH key to $node_ip as $USER" + echo " ✗ Failed to configure $node_ip" fi fi -done < "$OUTPUT_FILE" +done < "$TEMP_FILE.sorted" -# Add hostfile to remote nodes -while read -r node_ip; do - if [[ -n "$node_ip" ]]; then - echo " Adding hostfile to $node_ip ..." - scp "$OUTPUT_FILE" "$USER@$node_ip:$OUTPUT_FILE" - fi -done < "$OUTPUT_FILE" +# Update local SSH config to use shared key +if ! grep -q 'IdentityFile.*id_ed25519_shared' "$HOME/.ssh/config" 2>/dev/null; then + touch "$HOME/.ssh/config" + echo 'Host *' >> "$HOME/.ssh/config" + echo ' IdentityFile ~/.ssh/id_ed25519_shared' >> "$HOME/.ssh/config" + chmod 600 "$HOME/.ssh/config" + echo " ✓ Updated local SSH config to use shared key" +fi echo "" -echo "Bidirectional SSH setup complete!" -echo "Both local and remote nodes can now SSH to each other without passwords." +echo "Shared SSH setup complete!" +echo "All nodes can now SSH to each other using the shared key (id_ed25519_shared)." diff --git a/nvidia/trt-llm/assets/discover-sparks.sh b/nvidia/trt-llm/assets/discover-sparks.sh deleted file mode 100755 index d204d68..0000000 --- a/nvidia/trt-llm/assets/discover-sparks.sh +++ /dev/null @@ -1,180 +0,0 @@ -# -# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -#!/bin/env bash - -# discover-sparks.sh -# Discover available systems using avahi-browse and generate MPI hosts file -# Searches all active interfaces automatically -# -# Usage: bash ./discover-sparks - -set -euo pipefail - -# Check if running as root -if [[ $EUID -eq 0 ]]; then - echo "Error: This script should not be run as root" - exit 1 -fi - -# Dynamically get interface names from ibdev2netdev output -# Use ibdev2netdev to list Infiniband devices and their network interfaces. -# The awk command searches for lines containing 'Up)' (i.e., interfaces that are up) -# and prints the 5th field, which is the interface name (e.g., enp1s0f0np0). -# The tr command removes any parentheses from the output. -INTERFACES=($(ibdev2netdev | awk '/Up\)/ {print $5}' | tr -d '()')) -if [ ${#INTERFACES[@]} -eq 0 ]; then - echo "ERROR: No active interfaces found via ibdev2netdev." - exit 1 -fi -OUTPUT_FILE="~/.stacked-sparks-hostfile" - -# Check if avahi-browse is available -if ! command -v avahi-browse &> /dev/null; then - echo "Error: avahi-browse not found. Please install avahi-utils package." - exit 1 -fi - -# Check if ssh-copy-id is available -if ! command -v ssh-copy-id &> /dev/null; then - echo "Error: ssh-copy-id not found. Please install openssh-client package." - exit 1 -fi - -# Create temporary file for processing -TEMP_FILE=$(mktemp) -trap 'rm -f "$TEMP_FILE"' EXIT - -# Run avahi-browse and filter for SSH services on specified interfaces -# -p: parseable output -# -r: resolve host names and addresses -# -f: terminate after dumping all entries available at startup -avahi_output=$(avahi-browse -p -r -f -t _ssh._tcp 2>/dev/null) - -# Filter for both interfaces -found_services=false -for interface in "${INTERFACES[@]}"; do - if echo "$avahi_output" | grep "$interface" >> "$TEMP_FILE"; then - found_services=true - fi -done - -if [ "$found_services" = false ]; then - echo "Warning: No services found on any specified interface" - touch "$OUTPUT_FILE" - echo "Created empty hosts file: $OUTPUT_FILE" - exit 0 -fi - -# Extract IPv4 addresses from the avahi-browse output -# Format: =;interface;IPv4;hostname\032service;description;local;fqdn;ip_address;port; - -# Clear the output file -> "$OUTPUT_FILE" - -# Parse IPv4 entries and extract IP addresses -grep "^=" "$TEMP_FILE" | grep "IPv4" | while IFS=';' read -r prefix interface protocol hostname_service description local fqdn ip_address port rest; do - # Clean up any trailing data - clean_ip=$(echo "$ip_address" | sed 's/;.*$//') - - # Validate IP address format - if [[ $clean_ip =~ ^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]]; then - echo "$clean_ip" >> "$OUTPUT_FILE" - echo "Found: $clean_ip ($fqdn)" - else - echo "Warning: Invalid IP format: $clean_ip" - fi -done - -# Sort and remove duplicates -if [[ -s "$OUTPUT_FILE" ]]; then - sort -u "$OUTPUT_FILE" -o "$OUTPUT_FILE" -else - echo "No IPv4 addresses found." - exit 1 -fi - -# Check if SSH key exists, if not, prompt to generate -if [[ ! -f "$HOME/.ssh/id_rsa.pub" && ! -f "$HOME/.ssh/id_ed25519.pub" ]]; then - ssh-keygen -t ed25519 -N "" -f "$HOME/.ssh/id_ed25519" -q -fi - -echo "" -echo "Setting up bidirectional SSH access (local <-> remote nodes)..." -echo "You may be prompted for your password on each node." - -# Ensure authorized_keys file exists -mkdir -p "$HOME/.ssh" -touch "$HOME/.ssh/authorized_keys" -chmod 700 "$HOME/.ssh" -chmod 600 "$HOME/.ssh/authorized_keys" - -while read -r node_ip; do - if [[ -n "$node_ip" ]]; then - echo "" - echo "Setting up SSH access for $node_ip ..." - - # Step 1: Copy local SSH key to remote node - echo " Copying local SSH key to $node_ip ..." - if ssh-copy-id -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=accept-new "$USER@$node_ip" &>/dev/null; then - echo " ✓ Successfully copied local key to $node_ip" - - # Step 2: Set up reverse SSH access (remote -> local) - echo " Setting up reverse SSH access from $node_ip ..." - - # Generate SSH key on remote node if it doesn't exist and get its public key - remote_pubkey=$(ssh -o StrictHostKeyChecking=accept-new "$USER@$node_ip" ' - # Ensure SSH directory exists - mkdir -p ~/.ssh - chmod 700 ~/.ssh - - # Generate key if it doesn'"'"'t exist - if [[ ! -f ~/.ssh/id_ed25519.pub ]]; then - ssh-keygen -t ed25519 -N "" -f ~/.ssh/id_ed25519 -q - fi - - # Output the public key - cat ~/.ssh/id_ed25519.pub - ' 2>/dev/null) - - if [[ -n "$remote_pubkey" ]]; then - # Add remote public key to local authorized_keys if not already present - if ! grep -q "$remote_pubkey" "$HOME/.ssh/authorized_keys" 2>/dev/null; then - echo "$remote_pubkey" >> "$HOME/.ssh/authorized_keys" - echo " ✓ Added $node_ip's public key to local authorized_keys" - else - echo " ✓ $node_ip's public key already in local authorized_keys" - fi - else - echo " ✗ Failed to get public key from $node_ip" - fi - else - echo " ✗ Failed to copy local SSH key to $node_ip as $USER" - fi - fi -done < "$OUTPUT_FILE" - -# Add hostfile to remote nodes -while read -r node_ip; do - if [[ -n "$node_ip" ]]; then - echo " Adding hostfile to $node_ip ..." - scp "$OUTPUT_FILE" "$USER@$node_ip:$OUTPUT_FILE" - fi -done < "$OUTPUT_FILE" - -echo "" -echo "Bidirectional SSH setup complete!" -echo "Both local and remote nodes can now SSH to each other without passwords." diff --git a/nvidia/vibe-coding/README.md b/nvidia/vibe-coding/README.md index 67a7627..e817e26 100644 --- a/nvidia/vibe-coding/README.md +++ b/nvidia/vibe-coding/README.md @@ -22,7 +22,7 @@ While NVIDIA NIMs are not yet widely supported, this guide uses **Ollama** with ### What You'll Accomplish -You’ll have a fully configured DGX Spark system capable of: +You'll have a fully configured DGX Spark system capable of: - Running local code assistance through Ollama. - Serving models remotely for Continue.dev and VSCode integration. - Hosting large LLMs like GPT-OSS 120B using unified memory. @@ -144,7 +144,7 @@ Add additional model entries for any other Ollama models you wish to host remote - Verify Docker and GPU drivers are installed correctly. - Run `ollama serve` manually to view errors. -**2. VSCode can’t connect** +**2. VSCode can't connect** - Ensure port 11434 is open and accessible from your workstation. - Check `OLLAMA_HOST` and `OLLAMA_ORIGINS` in `/etc/systemd/system/ollama.service.d/override.conf`.