mirror of
https://github.com/NVIDIA/dgx-spark-playbooks.git
synced 2026-04-21 17:43:52 +00:00
feat: add DGX Spark MCP Server playbook
This playbook installs the DGX Spark MCP Server, a tool for hardware-aware Spark optimization on DGX systems. It includes an npm-based installation script, a systemd service configuration, a default configuration file, and documentation.
This commit is contained in:
parent
699df25ee3
commit
2d52e1aab3
84
nvidia/dgx-spark-mcp-server/README.md
Normal file
84
nvidia/dgx-spark-mcp-server/README.md
Normal file
@ -0,0 +1,84 @@
|
||||
# DGX Spark MCP Server Playbook
|
||||
|
||||
This playbook installs and configures the **DGX Spark MCP Server**, a tool that provides hardware-aware Apache Spark optimization for NVIDIA DGX systems via the Model Context Protocol (MCP).
|
||||
|
||||
## Overview
|
||||
|
||||
The DGX Spark MCP Server enables MCP clients (like Claude Desktop or Claude Code) to:
|
||||
* **Detect Hardware**: Automatically read DGX GPU topology, memory, and CPU specs.
|
||||
* **Optimize Spark**: Generate tuned Spark configurations (`spark-submit` args) based on detected hardware and workload type (ETL, ML Training, Inference).
|
||||
* **Monitor**: Check real-time GPU availability before submitting jobs.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
* **NVIDIA DGX System** (or compatible GPU server)
|
||||
* **NVIDIA Drivers** installed (`nvidia-smi` available)
|
||||
* **Node.js 18+**
|
||||
* **Root access** (for systemd service installation)
|
||||
|
||||
## Directory Structure
|
||||
|
||||
```
|
||||
.
|
||||
├── config/
|
||||
│ └── default.json # Default configuration
|
||||
├── deploy/
|
||||
│ └── dgx-spark-mcp.service # Systemd service file
|
||||
└── scripts/
|
||||
└── install.sh # Automated installer
|
||||
```
|
||||
|
||||
## Installation
|
||||
|
||||
1. **Run the installer**:
|
||||
```bash
|
||||
sudo ./scripts/install.sh
|
||||
```
|
||||
This script will:
|
||||
* Install `dgx-spark-mcp` globally via `npm`.
|
||||
* Create a dedicated system user (`dgx`).
|
||||
* Set up the configuration directory `/etc/dgx-spark-mcp` and the logging directory `/var/log/dgx-spark-mcp`.
|
||||
* Install and start the systemd service.
|
||||
|
||||
2. **Verify Installation**:
|
||||
```bash
|
||||
systemctl status dgx-spark-mcp
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
The configuration file is located at `/etc/dgx-spark-mcp/config.json`.
|
||||
|
||||
### Key Settings
|
||||
|
||||
* **`mcp.transport`**: `stdio` (default) or `sse`.
|
||||
* **`hardware.enableGpuMonitoring`**: Set to `true` to enable real-time `nvidia-smi` queries.
|
||||
* **`logging.level`**: `info` or `debug`.
|
||||
|
||||
## Usage with Claude Desktop
|
||||
|
||||
Add the following to your `claude_desktop_config.json`:
|
||||
|
||||
```json
|
||||
{
|
||||
"mcpServers": {
|
||||
"dgx-spark": {
|
||||
"command": "dgx-spark-mcp"
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
**Service fails to start?**
|
||||
Check logs:
|
||||
```bash
|
||||
journalctl -u dgx-spark-mcp -f
|
||||
```
|
||||
|
||||
**Permission denied?**
|
||||
Ensure the `dgx` user has permissions to access `nvidia-smi`. You may need to add the user to the `video` group:
|
||||
```bash
|
||||
sudo usermod -a -G video dgx
sudo systemctl restart dgx-spark-mcp
|
||||
```
|
||||
33
nvidia/dgx-spark-mcp-server/config/default.json
Normal file
33
nvidia/dgx-spark-mcp-server/config/default.json
Normal file
@ -0,0 +1,33 @@
|
||||
{
|
||||
"server": {
|
||||
"port": 3000,
|
||||
"host": "localhost",
|
||||
"nodeEnv": "production"
|
||||
},
|
||||
"logging": {
|
||||
"level": "info",
|
||||
"format": "json",
|
||||
"dir": "/var/log/dgx-spark-mcp",
|
||||
"maxFiles": 10,
|
||||
"maxSize": "10m"
|
||||
},
|
||||
"mcp": {
|
||||
"serverName": "dgx-spark-mcp",
|
||||
"serverVersion": "0.1.0",
|
||||
"transport": "stdio"
|
||||
},
|
||||
"hardware": {
|
||||
"nvidiaSmiPath": "/usr/bin/nvidia-smi",
|
||||
"cacheTTL": 30000,
|
||||
"enableGpuMonitoring": true
|
||||
},
|
||||
"spark": {},
|
||||
"performance": {
|
||||
"enableMetrics": true,
|
||||
"metricsInterval": 60000,
|
||||
"healthCheckInterval": 30000
|
||||
},
|
||||
"security": {
|
||||
"enableAuth": false
|
||||
}
|
||||
}
|
||||
48
nvidia/dgx-spark-mcp-server/deploy/dgx-spark-mcp.service
Normal file
48
nvidia/dgx-spark-mcp-server/deploy/dgx-spark-mcp.service
Normal file
@ -0,0 +1,48 @@
|
||||
[Unit]
Description=DGX Spark MCP Server
Documentation=https://github.com/raibid-labs/dgx-spark-mcp
# Wants= only pulls network-online.target into the transaction; it does not
# order this unit relative to it. List the target in After= as well so the
# service starts once the network is reported as online.
Wants=network-online.target
After=network.target network-online.target
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
User=dgx
|
||||
Group=dgx
|
||||
# Environment variables
|
||||
Environment="NODE_ENV=production"
|
||||
Environment="DGX_MCP_CONFIG_PATH=/etc/dgx-spark-mcp/config.json"
|
||||
|
||||
# Start the service
|
||||
# Assumes installed globally via npm
|
||||
ExecStart=/usr/local/bin/dgx-spark-mcp
|
||||
|
||||
# Restart policy
|
||||
Restart=on-failure
|
||||
RestartSec=10
|
||||
StartLimitInterval=600
|
||||
StartLimitBurst=5
|
||||
|
||||
# Resource limits
|
||||
LimitNOFILE=65536
|
||||
LimitNPROC=4096
|
||||
|
||||
# Security hardening
|
||||
NoNewPrivileges=true
|
||||
PrivateTmp=true
|
||||
ProtectSystem=strict
|
||||
ProtectHome=true
|
||||
# Allow write access to logs
|
||||
ReadWritePaths=/var/log/dgx-spark-mcp
|
||||
|
||||
# Logging
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
SyslogIdentifier=dgx-spark-mcp
|
||||
|
||||
# Process management
|
||||
KillMode=mixed
|
||||
KillSignal=SIGTERM
|
||||
TimeoutStopSec=30
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
78
nvidia/dgx-spark-mcp-server/scripts/install.sh
Executable file
78
nvidia/dgx-spark-mcp-server/scripts/install.sh
Executable file
@ -0,0 +1,78 @@
|
||||
#!/bin/bash
|
||||
set -euo pipefail
|
||||
|
||||
# DGX Spark MCP Server - Playbook Installation Script
|
||||
# Installs the server from NPM and configures systemd
|
||||
|
||||
# Configuration
|
||||
PACKAGE_NAME="dgx-spark-mcp"
|
||||
SERVICE_NAME="dgx-spark-mcp"
|
||||
CONFIG_DIR="/etc/dgx-spark-mcp"
|
||||
LOG_DIR="/var/log/dgx-spark-mcp"
|
||||
USER="dgx"
|
||||
GROUP="dgx"
|
||||
|
||||
# Colors
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
NC='\033[0m'
|
||||
|
||||
# Print an informational message to stdout, prefixed with a green "[INFO]" tag.
# Globals:   GREEN, NC (read)
# Arguments: $1 - message text (backslash escapes are interpreted by echo -e)
log_info() {
  local message=$1
  echo -e "${GREEN}[INFO]${NC} ${message}"
}
|
||||
# Print an error message to stderr, prefixed with a red "[ERROR]" tag.
# Diagnostics go to stderr so they never mix into captured or piped stdout.
# Globals:   RED, NC (read)
# Arguments: $1 - message text (backslash escapes are interpreted by echo -e)
log_error() { echo -e "${RED}[ERROR]${NC} $1" >&2; }
|
||||
|
||||
# Abort early unless the effective UID is 0 (root).
if (( EUID != 0 )); then
  log_error "This script must be run as root"
  exit 1
fi
|
||||
|
||||
# 1. Check prerequisites: Node.js 18+ and npm.
# Nothing is installed here; the script only verifies that the tools the next
# step relies on are present, and stops with an error otherwise.
if ! command -v node &> /dev/null; then
  # A missing prerequisite is fatal, so report it as an error, not as info.
  log_error "Node.js not found. Please install Node.js 18+."
  exit 1
fi

# The package is installed with npm below, so npm must be present as well.
if ! command -v npm &> /dev/null; then
  log_error "npm not found. Please install npm (it ships with Node.js 18+)."
  exit 1
fi

# Enforce the documented minimum version instead of only mentioning it.
node_major="$(node -p 'process.versions.node.split(".")[0]')"
if (( node_major < 18 )); then
  log_error "Node.js ${node_major} found, but Node.js 18+ is required."
  exit 1
fi
|
||||
|
||||
# 2. Install Package
# Installs the package globally with npm.
log_info "Installing $PACKAGE_NAME from registry..."
# Quote the expansion so the package name is always passed as one argument.
npm install -g "$PACKAGE_NAME"
|
||||
|
||||
# 3. Create User
# Create the service group explicitly. useradd only creates a same-named group
# when USERGROUPS_ENAB is enabled in /etc/login.defs, and the chown calls later
# in this script need "$GROUP" to exist.
if ! getent group "$GROUP" &>/dev/null; then
  log_info "Creating group $GROUP..."
  groupadd --system "$GROUP"
fi

# Create the system account (no home directory, no login shell) only if it
# does not exist yet, so the installer can be re-run safely.
if ! id -u "$USER" &>/dev/null; then
  log_info "Creating user $USER..."
  useradd --system --gid "$GROUP" --no-create-home --shell /bin/false "$USER"
fi
|
||||
|
||||
# 4. Setup Directories
# Create the configuration and log directories (parents included); existing
# directories are left as they are.
log_info "Setting up directories..."
for dir in "$CONFIG_DIR" "$LOG_DIR"; do
  mkdir -p "$dir"
done
|
||||
|
||||
# Copy config if provided in playbook
# Resolve this script's own directory so the playbook files are found no
# matter which working directory the installer is launched from.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
if [[ -f "$CONFIG_DIR/config.json" ]]; then
  # Re-running the installer must not clobber local edits to the config.
  log_info "Existing config found at $CONFIG_DIR/config.json, leaving it unchanged."
elif [[ -f "$SCRIPT_DIR/../config/default.json" ]]; then
  cp -- "$SCRIPT_DIR/../config/default.json" "$CONFIG_DIR/config.json"
else
  log_info "No default config found, using internal defaults."
fi
|
||||
|
||||
# Permissions
# Hand both directories (recursively) to the service account, then set mode
# 755 on the directories themselves.
for dir in "$LOG_DIR" "$CONFIG_DIR"; do
  chown -R "$USER:$GROUP" "$dir"
done
for dir in "$LOG_DIR" "$CONFIG_DIR"; do
  chmod 755 "$dir"
done
|
||||
|
||||
# 5. Setup Service
# Install the unit file shipped with the playbook, then enable and (re)start
# the service. The installer stops with an error if the unit file is missing.
log_info "Configuring systemd service..."
unit_src="$SCRIPT_DIR/../deploy/$SERVICE_NAME.service"
if [[ ! -f "$unit_src" ]]; then
  log_error "Service file not found."
  exit 1
fi

cp "$unit_src" "/etc/systemd/system/$SERVICE_NAME.service"
# Make systemd re-read unit files before enabling and restarting.
systemctl daemon-reload
systemctl enable "$SERVICE_NAME"
systemctl restart "$SERVICE_NAME"
log_info "Service started."

log_info "Installation complete."
log_info "Status: systemctl status $SERVICE_NAME"
|
||||
Loading…
Reference in New Issue
Block a user