mirror of
https://github.com/NVIDIA/dgx-spark-playbooks.git
synced 2026-04-28 12:43:52 +00:00
Compare commits
5 Commits
d2409f74ac
...
f8af25586f
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f8af25586f | ||
|
|
8452a1c5b1 | ||
|
|
9414a5141f | ||
|
|
911ca6db8b | ||
|
|
2d52e1aab3 |
84
nvidia/dgx-spark-mcp-server/README.md
Normal file
84
nvidia/dgx-spark-mcp-server/README.md
Normal file
@ -0,0 +1,84 @@
|
|||||||
|
# DGX Spark MCP Server Playbook
|
||||||
|
|
||||||
|
This playbook installs and configures the **DGX Spark MCP Server**, a tool that provides hardware-aware Apache Spark optimization for NVIDIA DGX systems via the Model Context Protocol (MCP).
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
The DGX Spark MCP Server enables MCP clients (like Claude Desktop or Claude Code) to:
|
||||||
|
* **Detect Hardware**: Automatically read DGX GPU topology, memory, and CPU specs.
|
||||||
|
* **Optimize Spark**: Generate tuned Spark configurations (`spark-submit` args) based on detected hardware and workload type (ETL, ML Training, Inference).
|
||||||
|
* **Monitor**: Check real-time GPU availability before submitting jobs.
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
* **NVIDIA DGX System** (or compatible GPU server)
|
||||||
|
* **NVIDIA Drivers** installed (`nvidia-smi` available)
|
||||||
|
* **Node.js 18+**
|
||||||
|
* **Root access** (for systemd service installation)
|
||||||
|
|
||||||
|
## Directory Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
.
|
||||||
|
├── config/
|
||||||
|
│ └── default.json # Default configuration
|
||||||
|
├── deploy/
|
||||||
|
│ └── dgx-spark-mcp.service # Systemd service file
|
||||||
|
└── scripts/
|
||||||
|
└── install.sh # Automated installer
|
||||||
|
```
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
1. **Run the installer**:
|
||||||
|
```bash
|
||||||
|
sudo ./scripts/install.sh
|
||||||
|
```
|
||||||
|
This script will:
|
||||||
|
* Install `dgx-spark-mcp` globally via `npm`.
|
||||||
|
* Create a dedicated system user (`dgx`).
|
||||||
|
* Setup logging directory `/var/log/dgx-spark-mcp`.
|
||||||
|
* Install and start the systemd service.
|
||||||
|
|
||||||
|
2. **Verify Installation**:
|
||||||
|
```bash
|
||||||
|
systemctl status dgx-spark-mcp
|
||||||
|
```
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
The configuration file is located at `/etc/dgx-spark-mcp/config.json`.
|
||||||
|
|
||||||
|
### Key Settings
|
||||||
|
|
||||||
|
* **`mcp.transport`**: `stdio` (default) or `sse`.
|
||||||
|
* **`hardware.enableGpuMonitoring`**: Set to `true` to enable real-time `nvidia-smi` queries.
|
||||||
|
* **`logging.level`**: `info` or `debug`.
|
||||||
|
|
||||||
|
## Usage with Claude Desktop
|
||||||
|
|
||||||
|
Add the following to your `claude_desktop_config.json`:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"mcpServers": {
|
||||||
|
"dgx-spark": {
|
||||||
|
"command": "dgx-spark-mcp"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
**Service fails to start?**
|
||||||
|
Check logs:
|
||||||
|
```bash
|
||||||
|
journalctl -u dgx-spark-mcp -f
|
||||||
|
```
|
||||||
|
|
||||||
|
**Permission denied?**
|
||||||
|
Ensure the `dgx` user has permissions to access `nvidia-smi`. You may need to add the user to the `video` group:
|
||||||
|
```bash
|
||||||
|
sudo usermod -a -G video dgx
|
||||||
|
```
|
||||||
33
nvidia/dgx-spark-mcp-server/config/default.json
Normal file
33
nvidia/dgx-spark-mcp-server/config/default.json
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
{
|
||||||
|
"server": {
|
||||||
|
"port": 3000,
|
||||||
|
"host": "localhost",
|
||||||
|
"nodeEnv": "production"
|
||||||
|
},
|
||||||
|
"logging": {
|
||||||
|
"level": "info",
|
||||||
|
"format": "json",
|
||||||
|
"dir": "/var/log/dgx-spark-mcp",
|
||||||
|
"maxFiles": 10,
|
||||||
|
"maxSize": "10m"
|
||||||
|
},
|
||||||
|
"mcp": {
|
||||||
|
"serverName": "dgx-spark-mcp",
|
||||||
|
"serverVersion": "0.1.0",
|
||||||
|
"transport": "stdio"
|
||||||
|
},
|
||||||
|
"hardware": {
|
||||||
|
"nvidiaSmiPath": "/usr/bin/nvidia-smi",
|
||||||
|
"cacheTTL": 30000,
|
||||||
|
"enableGpuMonitoring": true
|
||||||
|
},
|
||||||
|
"spark": {},
|
||||||
|
"performance": {
|
||||||
|
"enableMetrics": true,
|
||||||
|
"metricsInterval": 60000,
|
||||||
|
"healthCheckInterval": 30000
|
||||||
|
},
|
||||||
|
"security": {
|
||||||
|
"enableAuth": false
|
||||||
|
}
|
||||||
|
}
|
||||||
48
nvidia/dgx-spark-mcp-server/deploy/dgx-spark-mcp.service
Normal file
48
nvidia/dgx-spark-mcp-server/deploy/dgx-spark-mcp.service
Normal file
@ -0,0 +1,48 @@
|
|||||||
|
[Unit]
Description=DGX Spark MCP Server
Documentation=https://github.com/raibid-labs/dgx-spark-mcp
# Wants= alone gives no ordering; pair it with After=network-online.target
# so startup actually waits for full network readiness (systemd.special(7)).
Wants=network-online.target
After=network.target network-online.target
# Restart rate limit: at most 5 starts per 600 s, paired with Restart= below.
# These directives belong in [Unit]; the old [Service] spellings
# (StartLimitInterval=) are deprecated per systemd.unit(5).
StartLimitIntervalSec=600
StartLimitBurst=5

[Service]
Type=simple
User=dgx
Group=dgx
# Environment variables
Environment="NODE_ENV=production"
Environment="DGX_MCP_CONFIG_PATH=/etc/dgx-spark-mcp/config.json"

# Start the service
# Assumes installed globally via npm
ExecStart=/usr/local/bin/dgx-spark-mcp

# Restart policy
Restart=on-failure
RestartSec=10

# Resource limits
LimitNOFILE=65536
LimitNPROC=4096

# Security hardening
NoNewPrivileges=true
PrivateTmp=true
ProtectSystem=strict
ProtectHome=true
# Allow write access to logs (required because ProtectSystem=strict
# mounts the whole filesystem read-only for this service)
ReadWritePaths=/var/log/dgx-spark-mcp

# Logging
StandardOutput=journal
StandardError=journal
SyslogIdentifier=dgx-spark-mcp

# Process management
KillMode=mixed
KillSignal=SIGTERM
TimeoutStopSec=30

[Install]
WantedBy=multi-user.target
|
||||||
78
nvidia/dgx-spark-mcp-server/scripts/install.sh
Executable file
78
nvidia/dgx-spark-mcp-server/scripts/install.sh
Executable file
@ -0,0 +1,78 @@
|
|||||||
|
#!/bin/bash
set -euo pipefail

# DGX Spark MCP Server - Playbook Installation Script
# Installs the server from NPM and configures systemd.
# Must be run as root; exits non-zero on any failure (set -e).

# Configuration
PACKAGE_NAME="dgx-spark-mcp"
SERVICE_NAME="dgx-spark-mcp"
CONFIG_DIR="/etc/dgx-spark-mcp"
LOG_DIR="/var/log/dgx-spark-mcp"
# NOTE: named SERVICE_USER/SERVICE_GROUP instead of USER/GROUP so we do not
# clobber the shell's standard $USER environment variable.
SERVICE_USER="dgx"
SERVICE_GROUP="dgx"

# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
NC='\033[0m'

log_info() { echo -e "${GREEN}[INFO]${NC} $1"; }
log_error() { echo -e "${RED}[ERROR]${NC} $1"; }

# Check root
if [[ $EUID -ne 0 ]]; then
    log_error "This script must be run as root"
    exit 1
fi

# 1. Check runtime prerequisites.
# Both node and npm are required: npm is invoked in step 2.
if ! command -v node &> /dev/null; then
    log_error "Node.js not found. Please install Node.js 18+."
    exit 1
fi
if ! command -v npm &> /dev/null; then
    log_error "npm not found. Please install npm (bundled with Node.js 18+)."
    exit 1
fi

# 2. Install Package
log_info "Installing $PACKAGE_NAME from registry..."
npm install -g "$PACKAGE_NAME"

# 3. Create a dedicated system user (no home, no login shell)
if ! id -u "$SERVICE_USER" &>/dev/null; then
    log_info "Creating user $SERVICE_USER..."
    useradd --system --no-create-home --shell /bin/false "$SERVICE_USER"
fi

# 4. Setup Directories
log_info "Setting up directories..."
mkdir -p "$CONFIG_DIR"
mkdir -p "$LOG_DIR"

# Copy config if provided in playbook; otherwise the server falls back to
# its built-in defaults.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
if [[ -f "$SCRIPT_DIR/../config/default.json" ]]; then
    cp "$SCRIPT_DIR/../config/default.json" "$CONFIG_DIR/config.json"
else
    log_info "No default config found, using internal defaults."
fi

# Permissions: service user owns logs and config; world-readable dirs (755)
chown -R "$SERVICE_USER:$SERVICE_GROUP" "$LOG_DIR"
chown -R "$SERVICE_USER:$SERVICE_GROUP" "$CONFIG_DIR"
chmod 755 "$LOG_DIR"
chmod 755 "$CONFIG_DIR"

# 5. Setup Service: install the unit file, then enable + (re)start it.
log_info "Configuring systemd service..."
if [[ -f "$SCRIPT_DIR/../deploy/$SERVICE_NAME.service" ]]; then
    cp "$SCRIPT_DIR/../deploy/$SERVICE_NAME.service" "/etc/systemd/system/$SERVICE_NAME.service"
    systemctl daemon-reload
    systemctl enable "$SERVICE_NAME"
    systemctl restart "$SERVICE_NAME"
    log_info "Service started."
else
    log_error "Service file not found."
    exit 1
fi

log_info "Installation complete."
log_info "Status: systemctl status $SERVICE_NAME"
||||||
@ -47,8 +47,8 @@ All necessary files for the playbook can be found [here on GitHub](https://githu
|
|||||||
* **Duration:** 45-90 minutes for complete setup and initial model fine-tuning
|
* **Duration:** 45-90 minutes for complete setup and initial model fine-tuning
|
||||||
* **Risks:** Model downloads can be large (several GB), ARM64 package compatibility issues may require troubleshooting, distributed training setup complexity increases with multi-node configurations
|
* **Risks:** Model downloads can be large (several GB), ARM64 package compatibility issues may require troubleshooting, distributed training setup complexity increases with multi-node configurations
|
||||||
* **Rollback:** Virtual environments can be completely removed; no system-level changes are made to the host system beyond package installations.
|
* **Rollback:** Virtual environments can be completely removed; no system-level changes are made to the host system beyond package installations.
|
||||||
* **Last Updated:** 01/15/2026
|
* **Last Updated:** 03/04/2026
|
||||||
* Fix qLoRA fine-tuning workflow
|
* Recommend running Nemo finetune workflow via Docker
|
||||||
|
|
||||||
## Instructions
|
## Instructions
|
||||||
|
|
||||||
|
|||||||
@ -172,12 +172,15 @@ Verify the NVIDIA runtime works:
|
|||||||
docker run --rm --runtime=nvidia --gpus all ubuntu nvidia-smi
|
docker run --rm --runtime=nvidia --gpus all ubuntu nvidia-smi
|
||||||
```
|
```
|
||||||
|
|
||||||
If you get a permission denied error on `docker`, add your user to the Docker group and log out/in:
|
If you get a permission denied error on `docker`, add your user to the Docker group and activate the new group in your current session:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
sudo usermod -aG docker $USER
|
sudo usermod -aG docker $USER
|
||||||
|
newgrp docker
|
||||||
```
|
```
|
||||||
|
|
||||||
|
This applies the group change immediately. Alternatively, you can log out and back in instead of running `newgrp docker`.
|
||||||
|
|
||||||
> [!NOTE]
|
> [!NOTE]
|
||||||
> DGX Spark uses cgroup v2. OpenShell's gateway embeds k3s inside Docker and needs host cgroup namespace access. Without `default-cgroupns-mode: host`, the gateway can fail with "Failed to start ContainerManager" errors.
|
> DGX Spark uses cgroup v2. OpenShell's gateway embeds k3s inside Docker and needs host cgroup namespace access. Without `default-cgroupns-mode: host`, the gateway can fail with "Failed to start ContainerManager" errors.
|
||||||
|
|
||||||
@ -322,13 +325,21 @@ http://127.0.0.1:18789/#token=<long-token-here>
|
|||||||
|
|
||||||
**If accessing the Web UI from a remote machine**, you need to set up port forwarding.
|
**If accessing the Web UI from a remote machine**, you need to set up port forwarding.
|
||||||
|
|
||||||
|
First, find your Spark's IP address. On the Spark, run:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
hostname -I | awk '{print $1}'
|
||||||
|
```
|
||||||
|
|
||||||
|
This prints the primary IP address (e.g. `192.168.1.42`). You can also find it in **Settings > Wi-Fi** or **Settings > Network** on the Spark's desktop, or check your router's connected-devices list.
|
||||||
|
|
||||||
Start the port forward on the Spark host:
|
Start the port forward on the Spark host:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
openshell forward start 18789 my-assistant --background
|
openshell forward start 18789 my-assistant --background
|
||||||
```
|
```
|
||||||
|
|
||||||
Then from your remote machine, create an SSH tunnel to the Spark:
|
Then from your remote machine, create an SSH tunnel to the Spark (replace `<your-spark-ip>` with the IP address from above):
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
ssh -L 18789:127.0.0.1:18789 <your-user>@<your-spark-ip>
|
ssh -L 18789:127.0.0.1:18789 <your-user>@<your-spark-ip>
|
||||||
|
|||||||
@ -27,8 +27,8 @@ services:
|
|||||||
# Ollama configuration
|
# Ollama configuration
|
||||||
- OLLAMA_BASE_URL=http://ollama:11434/v1
|
- OLLAMA_BASE_URL=http://ollama:11434/v1
|
||||||
- OLLAMA_MODEL=llama3.1:8b
|
- OLLAMA_MODEL=llama3.1:8b
|
||||||
# Disable vLLM
|
# vLLM disabled in default Ollama mode
|
||||||
- VLLM_BASE_URL=http://localhost:8001/v1
|
# - VLLM_BASE_URL=http://localhost:8001/v1
|
||||||
- VLLM_MODEL=disabled
|
- VLLM_MODEL=disabled
|
||||||
# Vector DB configuration
|
# Vector DB configuration
|
||||||
- QDRANT_URL=http://qdrant:6333
|
- QDRANT_URL=http://qdrant:6333
|
||||||
|
|||||||
@ -108,7 +108,7 @@ export class TextProcessor {
|
|||||||
|
|
||||||
// Determine which LLM provider to use based on configuration
|
// Determine which LLM provider to use based on configuration
|
||||||
// Priority: vLLM > NVIDIA > Ollama
|
// Priority: vLLM > NVIDIA > Ollama
|
||||||
if (process.env.VLLM_BASE_URL) {
|
if (process.env.VLLM_BASE_URL && process.env.VLLM_MODEL && process.env.VLLM_MODEL !== 'disabled') {
|
||||||
this.selectedLLMProvider = 'vllm';
|
this.selectedLLMProvider = 'vllm';
|
||||||
} else if (process.env.NVIDIA_API_KEY) {
|
} else if (process.env.NVIDIA_API_KEY) {
|
||||||
this.selectedLLMProvider = 'nvidia';
|
this.selectedLLMProvider = 'nvidia';
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user