mirror of
https://github.com/NVIDIA/dgx-spark-playbooks.git
synced 2026-04-28 12:43:52 +00:00
Compare commits
5 Commits
d2409f74ac
...
f8af25586f
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f8af25586f | ||
|
|
8452a1c5b1 | ||
|
|
9414a5141f | ||
|
|
911ca6db8b | ||
|
|
2d52e1aab3 |
84
nvidia/dgx-spark-mcp-server/README.md
Normal file
84
nvidia/dgx-spark-mcp-server/README.md
Normal file
@ -0,0 +1,84 @@
|
|||||||
|
# DGX Spark MCP Server Playbook
|
||||||
|
|
||||||
|
This playbook installs and configures the **DGX Spark MCP Server**, a tool that provides hardware-aware Apache Spark optimization for NVIDIA DGX systems via the Model Context Protocol (MCP).
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
The DGX Spark MCP Server enables MCP clients (like Claude Desktop or Claude Code) to:
|
||||||
|
* **Detect Hardware**: Automatically read DGX GPU topology, memory, and CPU specs.
|
||||||
|
* **Optimize Spark**: Generate tuned Spark configurations (`spark-submit` args) based on detected hardware and workload type (ETL, ML Training, Inference).
|
||||||
|
* **Monitor**: Check real-time GPU availability before submitting jobs.
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
* **NVIDIA DGX System** (or compatible GPU server)
|
||||||
|
* **NVIDIA Drivers** installed (`nvidia-smi` available)
|
||||||
|
* **Node.js 18+**
|
||||||
|
* **Root access** (for systemd service installation)
|
||||||
|
|
||||||
|
## Directory Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
.
|
||||||
|
├── config/
|
||||||
|
│ └── default.json # Default configuration
|
||||||
|
├── deploy/
|
||||||
|
│ └── dgx-spark-mcp.service # Systemd service file
|
||||||
|
└── scripts/
|
||||||
|
└── install.sh # Automated installer
|
||||||
|
```
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
1. **Run the installer**:
|
||||||
|
```bash
|
||||||
|
sudo ./scripts/install.sh
|
||||||
|
```
|
||||||
|
This script will:
|
||||||
|
* Install `dgx-spark-mcp` globally via `npm`.
|
||||||
|
* Create a dedicated system user (`dgx`).
|
||||||
|
* Setup logging directory `/var/log/dgx-spark-mcp`.
|
||||||
|
* Install and start the systemd service.
|
||||||
|
|
||||||
|
2. **Verify Installation**:
|
||||||
|
```bash
|
||||||
|
systemctl status dgx-spark-mcp
|
||||||
|
```
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
The configuration file is located at `/etc/dgx-spark-mcp/config.json`.
|
||||||
|
|
||||||
|
### Key Settings
|
||||||
|
|
||||||
|
* **`mcp.transport`**: `stdio` (default) or `sse`.
|
||||||
|
* **`hardware.enableGpuMonitoring`**: Set to `true` to enable real-time `nvidia-smi` queries.
|
||||||
|
* **`logging.level`**: `info` or `debug`.
|
||||||
|
|
||||||
|
## Usage with Claude Desktop
|
||||||
|
|
||||||
|
Add the following to your `claude_desktop_config.json`:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"mcpServers": {
|
||||||
|
"dgx-spark": {
|
||||||
|
"command": "dgx-spark-mcp"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
**Service fails to start?**
|
||||||
|
Check logs:
|
||||||
|
```bash
|
||||||
|
journalctl -u dgx-spark-mcp -f
|
||||||
|
```
|
||||||
|
|
||||||
|
**Permission denied?**
|
||||||
|
Ensure the `dgx` user has permissions to access `nvidia-smi`. You may need to add the user to the `video` group:
|
||||||
|
```bash
|
||||||
|
sudo usermod -a -G video dgx
|
||||||
|
```
|
||||||
33
nvidia/dgx-spark-mcp-server/config/default.json
Normal file
33
nvidia/dgx-spark-mcp-server/config/default.json
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
{
|
||||||
|
"server": {
|
||||||
|
"port": 3000,
|
||||||
|
"host": "localhost",
|
||||||
|
"nodeEnv": "production"
|
||||||
|
},
|
||||||
|
"logging": {
|
||||||
|
"level": "info",
|
||||||
|
"format": "json",
|
||||||
|
"dir": "/var/log/dgx-spark-mcp",
|
||||||
|
"maxFiles": 10,
|
||||||
|
"maxSize": "10m"
|
||||||
|
},
|
||||||
|
"mcp": {
|
||||||
|
"serverName": "dgx-spark-mcp",
|
||||||
|
"serverVersion": "0.1.0",
|
||||||
|
"transport": "stdio"
|
||||||
|
},
|
||||||
|
"hardware": {
|
||||||
|
"nvidiaSmiPath": "/usr/bin/nvidia-smi",
|
||||||
|
"cacheTTL": 30000,
|
||||||
|
"enableGpuMonitoring": true
|
||||||
|
},
|
||||||
|
"spark": {},
|
||||||
|
"performance": {
|
||||||
|
"enableMetrics": true,
|
||||||
|
"metricsInterval": 60000,
|
||||||
|
"healthCheckInterval": 30000
|
||||||
|
},
|
||||||
|
"security": {
|
||||||
|
"enableAuth": false
|
||||||
|
}
|
||||||
|
}
|
||||||
48
nvidia/dgx-spark-mcp-server/deploy/dgx-spark-mcp.service
Normal file
48
nvidia/dgx-spark-mcp-server/deploy/dgx-spark-mcp.service
Normal file
@ -0,0 +1,48 @@
|
|||||||
|
[Unit]
Description=DGX Spark MCP Server
Documentation=https://github.com/raibid-labs/dgx-spark-mcp
# Wants= alone gives no ordering; pair it with After=network-online.target
# so startup actually waits for full network readiness (systemd.special(7)).
Wants=network-online.target
After=network.target network-online.target
# Restart rate limit: at most 5 starts per 600 s, paired with Restart= below.
# These directives belong in [Unit]; the old [Service] spellings
# (StartLimitInterval=) are deprecated per systemd.unit(5).
StartLimitIntervalSec=600
StartLimitBurst=5

[Service]
Type=simple
User=dgx
Group=dgx
# Environment variables
Environment="NODE_ENV=production"
Environment="DGX_MCP_CONFIG_PATH=/etc/dgx-spark-mcp/config.json"

# Start the service
# Assumes installed globally via npm
ExecStart=/usr/local/bin/dgx-spark-mcp

# Restart policy
Restart=on-failure
RestartSec=10

# Resource limits
LimitNOFILE=65536
LimitNPROC=4096

# Security hardening
NoNewPrivileges=true
PrivateTmp=true
ProtectSystem=strict
ProtectHome=true
# Allow write access to logs (required because ProtectSystem=strict
# mounts the whole filesystem read-only for this service)
ReadWritePaths=/var/log/dgx-spark-mcp

# Logging
StandardOutput=journal
StandardError=journal
SyslogIdentifier=dgx-spark-mcp

# Process management
KillMode=mixed
KillSignal=SIGTERM
TimeoutStopSec=30

[Install]
WantedBy=multi-user.target
|
||||||
78
nvidia/dgx-spark-mcp-server/scripts/install.sh
Executable file
78
nvidia/dgx-spark-mcp-server/scripts/install.sh
Executable file
@ -0,0 +1,78 @@
|
|||||||
|
#!/bin/bash
set -euo pipefail

# DGX Spark MCP Server - Playbook Installation Script
# Installs the server from NPM and configures systemd.
# Must be run as root; exits non-zero on any failure (set -e).

# Configuration
PACKAGE_NAME="dgx-spark-mcp"
SERVICE_NAME="dgx-spark-mcp"
CONFIG_DIR="/etc/dgx-spark-mcp"
LOG_DIR="/var/log/dgx-spark-mcp"
# NOTE: named SERVICE_USER/SERVICE_GROUP instead of USER/GROUP so we do not
# clobber the shell's standard $USER environment variable.
SERVICE_USER="dgx"
SERVICE_GROUP="dgx"

# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
NC='\033[0m'

log_info() { echo -e "${GREEN}[INFO]${NC} $1"; }
log_error() { echo -e "${RED}[ERROR]${NC} $1"; }

# Check root
if [[ $EUID -ne 0 ]]; then
    log_error "This script must be run as root"
    exit 1
fi

# 1. Check runtime prerequisites.
# Both node and npm are required: npm is invoked in step 2.
if ! command -v node &> /dev/null; then
    log_error "Node.js not found. Please install Node.js 18+."
    exit 1
fi
if ! command -v npm &> /dev/null; then
    log_error "npm not found. Please install npm (bundled with Node.js 18+)."
    exit 1
fi

# 2. Install Package
log_info "Installing $PACKAGE_NAME from registry..."
npm install -g "$PACKAGE_NAME"

# 3. Create a dedicated system user (no home, no login shell)
if ! id -u "$SERVICE_USER" &>/dev/null; then
    log_info "Creating user $SERVICE_USER..."
    useradd --system --no-create-home --shell /bin/false "$SERVICE_USER"
fi

# 4. Setup Directories
log_info "Setting up directories..."
mkdir -p "$CONFIG_DIR"
mkdir -p "$LOG_DIR"

# Copy config if provided in playbook; otherwise the server falls back to
# its built-in defaults.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
if [[ -f "$SCRIPT_DIR/../config/default.json" ]]; then
    cp "$SCRIPT_DIR/../config/default.json" "$CONFIG_DIR/config.json"
else
    log_info "No default config found, using internal defaults."
fi

# Permissions: service user owns logs and config; world-readable dirs (755)
chown -R "$SERVICE_USER:$SERVICE_GROUP" "$LOG_DIR"
chown -R "$SERVICE_USER:$SERVICE_GROUP" "$CONFIG_DIR"
chmod 755 "$LOG_DIR"
chmod 755 "$CONFIG_DIR"

# 5. Setup Service: install the unit file, then enable + (re)start it.
log_info "Configuring systemd service..."
if [[ -f "$SCRIPT_DIR/../deploy/$SERVICE_NAME.service" ]]; then
    cp "$SCRIPT_DIR/../deploy/$SERVICE_NAME.service" "/etc/systemd/system/$SERVICE_NAME.service"
    systemctl daemon-reload
    systemctl enable "$SERVICE_NAME"
    systemctl restart "$SERVICE_NAME"
    log_info "Service started."
else
    log_error "Service file not found."
    exit 1
fi

log_info "Installation complete."
log_info "Status: systemctl status $SERVICE_NAME"
||||||
@ -47,8 +47,8 @@ All necessary files for the playbook can be found [here on GitHub](https://githu
|
|||||||
* **Duration:** 45-90 minutes for complete setup and initial model fine-tuning
|
* **Duration:** 45-90 minutes for complete setup and initial model fine-tuning
|
||||||
* **Risks:** Model downloads can be large (several GB), ARM64 package compatibility issues may require troubleshooting, distributed training setup complexity increases with multi-node configurations
|
* **Risks:** Model downloads can be large (several GB), ARM64 package compatibility issues may require troubleshooting, distributed training setup complexity increases with multi-node configurations
|
||||||
* **Rollback:** Virtual environments can be completely removed; no system-level changes are made to the host system beyond package installations.
|
* **Rollback:** Virtual environments can be completely removed; no system-level changes are made to the host system beyond package installations.
|
||||||
* **Last Updated:** 01/15/2026
|
* **Last Updated:** 03/04/2026
|
||||||
* Fix qLoRA fine-tuning workflow
|
* Recommend running Nemo finetune workflow via Docker
|
||||||
|
|
||||||
## Instructions
|
## Instructions
|
||||||
|
|
||||||
|
|||||||
@ -172,12 +172,15 @@ Verify the NVIDIA runtime works:
|
|||||||
docker run --rm --runtime=nvidia --gpus all ubuntu nvidia-smi
|
docker run --rm --runtime=nvidia --gpus all ubuntu nvidia-smi
|
||||||
```
|
```
|
||||||
|
|
||||||
If you get a permission denied error on `docker`, add your user to the Docker group and log out/in:
|
If you get a permission denied error on `docker`, add your user to the Docker group and activate the new group in your current session:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
sudo usermod -aG docker $USER
|
sudo usermod -aG docker $USER
|
||||||
|
newgrp docker
|
||||||
```
|
```
|
||||||
|
|
||||||
|
This applies the group change immediately. Alternatively, you can log out and back in instead of running `newgrp docker`.
|
||||||
|
|
||||||
> [!NOTE]
|
> [!NOTE]
|
||||||
> DGX Spark uses cgroup v2. OpenShell's gateway embeds k3s inside Docker and needs host cgroup namespace access. Without `default-cgroupns-mode: host`, the gateway can fail with "Failed to start ContainerManager" errors.
|
> DGX Spark uses cgroup v2. OpenShell's gateway embeds k3s inside Docker and needs host cgroup namespace access. Without `default-cgroupns-mode: host`, the gateway can fail with "Failed to start ContainerManager" errors.
|
||||||
|
|
||||||
@ -322,13 +325,21 @@ http://127.0.0.1:18789/#token=<long-token-here>
|
|||||||
|
|
||||||
**If accessing the Web UI from a remote machine**, you need to set up port forwarding.
|
**If accessing the Web UI from a remote machine**, you need to set up port forwarding.
|
||||||
|
|
||||||
|
First, find your Spark's IP address. On the Spark, run:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
hostname -I | awk '{print $1}'
|
||||||
|
```
|
||||||
|
|
||||||
|
This prints the primary IP address (e.g. `192.168.1.42`). You can also find it in **Settings > Wi-Fi** or **Settings > Network** on the Spark's desktop, or check your router's connected-devices list.
|
||||||
|
|
||||||
Start the port forward on the Spark host:
|
Start the port forward on the Spark host:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
openshell forward start 18789 my-assistant --background
|
openshell forward start 18789 my-assistant --background
|
||||||
```
|
```
|
||||||
|
|
||||||
Then from your remote machine, create an SSH tunnel to the Spark:
|
Then from your remote machine, create an SSH tunnel to the Spark (replace `<your-spark-ip>` with the IP address from above):
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
ssh -L 18789:127.0.0.1:18789 <your-user>@<your-spark-ip>
|
ssh -L 18789:127.0.0.1:18789 <your-user>@<your-spark-ip>
|
||||||
|
|||||||
@ -27,8 +27,8 @@ services:
|
|||||||
# Ollama configuration
|
# Ollama configuration
|
||||||
- OLLAMA_BASE_URL=http://ollama:11434/v1
|
- OLLAMA_BASE_URL=http://ollama:11434/v1
|
||||||
- OLLAMA_MODEL=llama3.1:8b
|
- OLLAMA_MODEL=llama3.1:8b
|
||||||
# Disable vLLM
|
# vLLM disabled in default Ollama mode
|
||||||
- VLLM_BASE_URL=http://localhost:8001/v1
|
# - VLLM_BASE_URL=http://localhost:8001/v1
|
||||||
- VLLM_MODEL=disabled
|
- VLLM_MODEL=disabled
|
||||||
# Vector DB configuration
|
# Vector DB configuration
|
||||||
- QDRANT_URL=http://qdrant:6333
|
- QDRANT_URL=http://qdrant:6333
|
||||||
|
|||||||
@ -108,7 +108,7 @@ export class TextProcessor {
|
|||||||
|
|
||||||
// Determine which LLM provider to use based on configuration
|
// Determine which LLM provider to use based on configuration
|
||||||
// Priority: vLLM > NVIDIA > Ollama
|
// Priority: vLLM > NVIDIA > Ollama
|
||||||
if (process.env.VLLM_BASE_URL) {
|
if (process.env.VLLM_BASE_URL && process.env.VLLM_MODEL && process.env.VLLM_MODEL !== 'disabled') {
|
||||||
this.selectedLLMProvider = 'vllm';
|
this.selectedLLMProvider = 'vllm';
|
||||||
} else if (process.env.NVIDIA_API_KEY) {
|
} else if (process.env.NVIDIA_API_KEY) {
|
||||||
this.selectedLLMProvider = 'nvidia';
|
this.selectedLLMProvider = 'nvidia';
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user