From c3770ec3c7fe0afd239a3e85ef80669919af4587 Mon Sep 17 00:00:00 2001 From: GitLab CI Date: Mon, 30 Mar 2026 15:12:21 +0000 Subject: [PATCH] chore: Regenerate all playbooks --- nvidia/connect-three-sparks/README.md | 26 +++++++++---------- ...detect_and_configure_cluster_networking.py | 23 +++++++++++++++- 2 files changed, 35 insertions(+), 14 deletions(-) diff --git a/nvidia/connect-three-sparks/README.md b/nvidia/connect-three-sparks/README.md index f6923a5..1ed489f 100644 --- a/nvidia/connect-three-sparks/README.md +++ b/nvidia/connect-three-sparks/README.md @@ -158,15 +158,15 @@ network: enP2p1s0f0np0: dhcp4: false addresses: - - 192.168.0.2/24 + - 192.168.1.1/24 enp1s0f1np1: dhcp4: false addresses: - - 192.168.1.1/24 + - 192.168.2.1/24 enP2p1s0f1np1: dhcp4: false addresses: - - 192.168.1.2/24 + - 192.168.3.1/24 EOF ## Set appropriate permissions @@ -186,19 +186,19 @@ network: enp1s0f0np0: dhcp4: false addresses: - - 192.168.2.1/24 + - 192.168.4.1/24 enP2p1s0f0np0: dhcp4: false addresses: - - 192.168.2.2/24 + - 192.168.5.1/24 enp1s0f1np1: dhcp4: false addresses: - - 192.168.0.3/24 + - 192.168.0.2/24 enP2p1s0f1np1: dhcp4: false addresses: - - 192.168.0.4/24 + - 192.168.1.2/24 EOF ## Set appropriate permissions @@ -218,19 +218,19 @@ network: enp1s0f0np0: dhcp4: false addresses: - - 192.168.1.3/24 + - 192.168.2.2/24 enP2p1s0f0np0: dhcp4: false addresses: - - 192.168.1.4/24 + - 192.168.3.2/24 enp1s0f1np1: dhcp4: false addresses: - - 192.168.2.3/24 + - 192.168.4.2/24 enP2p1s0f1np1: dhcp4: false addresses: - - 192.168.2.4/24 + - 192.168.5.2/24 EOF ## Set appropriate permissions @@ -254,8 +254,8 @@ bash ./discover-sparks Expected output similar to the below, with different IPs and node names. You may see more than one IP for each node as four interfaces (**enp1s0f0np0**, **enP2p1s0f0np0**, **enp1s0f1np1** and **enP2p1s0f1np1**) have IP addresses assigned. This is expected and does not cause any issues. The first time you run the script, you'll be prompted for your password for each node. ``` Found: 192.168.0.1 (dgx-spark-1.local) -Found: 192.168.0.3 (dgx-spark-2.local) -Found: 192.168.1.3 (dgx-spark-3.local) +Found: 192.168.0.2 (dgx-spark-2.local) +Found: 192.168.3.2 (dgx-spark-3.local) Setting up bidirectional SSH access (local <-> remote nodes)... You may be prompted for your password for each node. diff --git a/nvidia/multi-sparks-through-switch/assets/spark_cluster_setup/node_scripts/detect_and_configure_cluster_networking.py b/nvidia/multi-sparks-through-switch/assets/spark_cluster_setup/node_scripts/detect_and_configure_cluster_networking.py index 47cc0f5..68430bd 100644 --- a/nvidia/multi-sparks-through-switch/assets/spark_cluster_setup/node_scripts/detect_and_configure_cluster_networking.py +++ b/nvidia/multi-sparks-through-switch/assets/spark_cluster_setup/node_scripts/detect_and_configure_cluster_networking.py @@ -268,6 +268,27 @@ def ip_for_2node_link(link_index: int, node_id: int, local_index_in_pair: int) - host = 1 + (0 if node_id == 1 else 2) + local_index_in_pair return f"192.168.{link_index}.{host}/24" +def ip_for_3node_ring_link(link_index: int, node_id: int, local_index_in_pair: int) -> str: + """ + /24 scheme for 3-node ring topology. + + For each node_id: + network = 192.168.third_octet.node_id/24 + third_octet = link_index * 2 + local_index_in_pair + + Node 1: + 192.168.[0, 1].1/24 -> Node 2 + 192.168.[2, 3].1/24 -> Node 3 + + Node 2: + 192.168.[4, 5].1/24 -> Node 3 + 192.168.[0, 1].2/24 -> Node 1 + + Node 3: + 192.168.[2, 3].2/24 -> Node 1 + 192.168.[4, 5].2/24 -> Node 2 + """ + return f"192.168.{link_index * 2 + local_index_in_pair}.{node_id}/24" def ip_for_switch_link(link_index: int, node_index: int, local_index_in_pair: int) -> str: """ @@ -602,7 +623,7 @@ def main() -> bool: node_id_link = 1 if local_machine_id < neighbor_machine else 2 for local_idx, cfg_iface in enumerate(config_ifaces): - ip_cidr = ip_for_2node_link(link_index, node_id_link, local_idx) + ip_cidr = ip_for_3node_ring_link(link_index, node_id_link, local_idx) iface_to_ip[cfg_iface] = ip_cidr print(