mirror of
https://github.com/NVIDIA/dgx-spark-playbooks.git
synced 2026-04-24 02:43:55 +00:00
Merge 2d52e1aab3 into 36ac5b74eb
This commit is contained in:
commit
25fe85b411
84
nvidia/dgx-spark-mcp-server/README.md
Normal file
84
nvidia/dgx-spark-mcp-server/README.md
Normal file
@ -0,0 +1,84 @@
|
|||||||
|
# DGX Spark MCP Server Playbook
|
||||||
|
|
||||||
|
This playbook installs and configures the **DGX Spark MCP Server**, a tool that provides hardware-aware Apache Spark optimization for NVIDIA DGX systems via the Model Context Protocol (MCP).
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
The DGX Spark MCP Server enables MCP clients (like Claude Desktop or Claude Code) to:
|
||||||
|
* **Detect Hardware**: Automatically read DGX GPU topology, memory, and CPU specs.
|
||||||
|
* **Optimize Spark**: Generate tuned Spark configurations (`spark-submit` args) based on detected hardware and workload type (ETL, ML Training, Inference).
|
||||||
|
* **Monitor**: Check real-time GPU availability before submitting jobs.
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
* **NVIDIA DGX System** (or compatible GPU server)
|
||||||
|
* **NVIDIA Drivers** installed (`nvidia-smi` available)
|
||||||
|
* **Node.js 18+**
|
||||||
|
* **Root access** (for systemd service installation)
|
||||||
|
|
||||||
|
## Directory Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
.
|
||||||
|
├── config/
|
||||||
|
│ └── default.json # Default configuration
|
||||||
|
├── deploy/
|
||||||
|
│ └── dgx-spark-mcp.service # Systemd service file
|
||||||
|
└── scripts/
|
||||||
|
└── install.sh # Automated installer
|
||||||
|
```
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
1. **Run the installer**:
|
||||||
|
```bash
|
||||||
|
sudo ./scripts/install.sh
|
||||||
|
```
|
||||||
|
This script will:
|
||||||
|
* Install `dgx-spark-mcp` globally via `npm`.
|
||||||
|
* Create a dedicated system user (`dgx`).
|
||||||
|
* Set up the logging directory `/var/log/dgx-spark-mcp`.
|
||||||
|
* Install and start the systemd service.
|
||||||
|
|
||||||
|
2. **Verify Installation**:
|
||||||
|
```bash
|
||||||
|
systemctl status dgx-spark-mcp
|
||||||
|
```
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
The configuration file is located at `/etc/dgx-spark-mcp/config.json`.
|
||||||
|
|
||||||
|
### Key Settings
|
||||||
|
|
||||||
|
* **`mcp.transport`**: `stdio` (default) or `sse`.
|
||||||
|
* **`hardware.enableGpuMonitoring`**: Set to `true` to enable real-time `nvidia-smi` queries.
|
||||||
|
* **`logging.level`**: `info` or `debug`.
|
||||||
|
|
||||||
|
## Usage with Claude Desktop
|
||||||
|
|
||||||
|
Add the following to your `claude_desktop_config.json`:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"mcpServers": {
|
||||||
|
"dgx-spark": {
|
||||||
|
"command": "dgx-spark-mcp"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
**Service fails to start?**
|
||||||
|
Check logs:
|
||||||
|
```bash
|
||||||
|
journalctl -u dgx-spark-mcp -f
|
||||||
|
```
|
||||||
|
|
||||||
|
**Permission denied?**
|
||||||
|
Ensure the `dgx` user has permissions to access `nvidia-smi`. You may need to add the user to the `video` group:
|
||||||
|
```bash
|
||||||
|
sudo usermod -a -G video dgx
sudo systemctl restart dgx-spark-mcp
|
||||||
|
```
|
||||||
33
nvidia/dgx-spark-mcp-server/config/default.json
Normal file
33
nvidia/dgx-spark-mcp-server/config/default.json
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
{
|
||||||
|
"server": {
|
||||||
|
"port": 3000,
|
||||||
|
"host": "localhost",
|
||||||
|
"nodeEnv": "production"
|
||||||
|
},
|
||||||
|
"logging": {
|
||||||
|
"level": "info",
|
||||||
|
"format": "json",
|
||||||
|
"dir": "/var/log/dgx-spark-mcp",
|
||||||
|
"maxFiles": 10,
|
||||||
|
"maxSize": "10m"
|
||||||
|
},
|
||||||
|
"mcp": {
|
||||||
|
"serverName": "dgx-spark-mcp",
|
||||||
|
"serverVersion": "0.1.0",
|
||||||
|
"transport": "stdio"
|
||||||
|
},
|
||||||
|
"hardware": {
|
||||||
|
"nvidiaSmiPath": "/usr/bin/nvidia-smi",
|
||||||
|
"cacheTTL": 30000,
|
||||||
|
"enableGpuMonitoring": true
|
||||||
|
},
|
||||||
|
"spark": {},
|
||||||
|
"performance": {
|
||||||
|
"enableMetrics": true,
|
||||||
|
"metricsInterval": 60000,
|
||||||
|
"healthCheckInterval": 30000
|
||||||
|
},
|
||||||
|
"security": {
|
||||||
|
"enableAuth": false
|
||||||
|
}
|
||||||
|
}
|
||||||
48
nvidia/dgx-spark-mcp-server/deploy/dgx-spark-mcp.service
Normal file
48
nvidia/dgx-spark-mcp-server/deploy/dgx-spark-mcp.service
Normal file
@ -0,0 +1,48 @@
|
|||||||
|
# systemd unit for the DGX Spark MCP Server.
# Runs the globally installed `dgx-spark-mcp` binary as the unprivileged
# `dgx` user with a read-only view of the system except for its log directory.

[Unit]
Description=DGX Spark MCP Server
Documentation=https://github.com/raibid-labs/dgx-spark-mcp
# Wants= only pulls network-online.target in; After= is what actually delays
# startup until it has been reached.
After=network.target network-online.target
Wants=network-online.target
# Start rate limiting: give up after 5 start attempts within 600 seconds.
# These settings belong to [Unit]; StartLimitInterval= in [Service] is the
# legacy spelling.
StartLimitIntervalSec=600
StartLimitBurst=5

[Service]
Type=simple
User=dgx
Group=dgx
# Environment variables
Environment="NODE_ENV=production"
Environment="DGX_MCP_CONFIG_PATH=/etc/dgx-spark-mcp/config.json"

# Start the service
# Assumes installed globally via npm
ExecStart=/usr/local/bin/dgx-spark-mcp

# Restart policy
Restart=on-failure
RestartSec=10

# Resource limits
LimitNOFILE=65536
LimitNPROC=4096

# Security hardening
NoNewPrivileges=true
PrivateTmp=true
ProtectSystem=strict
ProtectHome=true
# Allow write access to logs
ReadWritePaths=/var/log/dgx-spark-mcp

# Logging
StandardOutput=journal
StandardError=journal
SyslogIdentifier=dgx-spark-mcp

# Process management
KillMode=mixed
KillSignal=SIGTERM
TimeoutStopSec=30

[Install]
WantedBy=multi-user.target
|
||||||
78
nvidia/dgx-spark-mcp-server/scripts/install.sh
Executable file
78
nvidia/dgx-spark-mcp-server/scripts/install.sh
Executable file
@ -0,0 +1,78 @@
|
|||||||
|
#!/bin/bash
# Abort on the first failing command, on use of an unset variable, and when
# any stage of a pipeline fails.
set -euo pipefail

# DGX Spark MCP Server - Playbook Installation Script
# Installs the server from NPM and configures systemd

# Configuration (constants; nothing below reassigns them)
readonly PACKAGE_NAME="dgx-spark-mcp"       # npm package to install globally
readonly SERVICE_NAME="dgx-spark-mcp"       # systemd unit name (without .service)
readonly CONFIG_DIR="/etc/dgx-spark-mcp"    # destination for config.json
readonly LOG_DIR="/var/log/dgx-spark-mcp"   # log directory owned by the service user
# NOTE(review): USER is usually an exported login variable; if it is exported in
# the caller's environment, child processes started below also see "dgx".
readonly USER="dgx"
readonly GROUP="dgx"

# Colors: backslash escapes are kept literal here and are expanded by the log
# helpers at print time.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly NC='\033[0m'
|
||||||
|
|
||||||
|
# Print an informational message to stdout with a green "[INFO]" tag.
# Arguments: $1 - message text (backslash escapes are expanded)
log_info() {
    local message=$1
    printf '%b\n' "${GREEN}[INFO]${NC} ${message}"
}
|
||||||
|
# Print an error message with a red "[ERROR]" tag.
# Arguments: $1 - message text (backslash escapes are expanded)
# Outputs:   writes to stderr, so diagnostics stay out of captured or piped
#            stdout.
log_error() {
    printf '%b\n' "${RED}[ERROR]${NC} $1" >&2
}
|
||||||
|
|
||||||
|
# Check root
# Stop immediately unless the effective UID is 0.
[[ $EUID -eq 0 ]] || {
    log_error "This script must be run as root"
    exit 1
}
|
||||||
|
|
||||||
|
# 1. Check Node.js / npm prerequisites
# This script does not install Node.js; it only verifies that `node` and `npm`
# are on PATH and that the Node.js major version is at least 18.
if ! command -v node &> /dev/null; then
    log_error "Node.js not found. Please install Node.js 18+."
    exit 1
fi

if ! command -v npm &> /dev/null; then
    log_error "npm not found. Please install npm."
    exit 1
fi

# `node --version` prints e.g. "v18.19.0"; strip the leading "v" and everything
# after the first dot to get the major version.
node_version="$(node --version)"
node_major="${node_version#v}"
node_major="${node_major%%.*}"
# Only reject when the major version parsed cleanly as a number.
if [[ "$node_major" =~ ^[0-9]+$ ]] && (( node_major < 18 )); then
    log_error "Node.js 18+ is required (found $node_version)."
    exit 1
fi
|
||||||
|
|
||||||
|
# 2. Install Package
# Installs the package globally from the npm registry. The name is quoted so it
# is always passed to npm as a single argument.
log_info "Installing $PACKAGE_NAME from registry..."
npm install -g "$PACKAGE_NAME"
|
||||||
|
|
||||||
|
# 3. Create User
# Creates the service group and user if they do not exist yet. The group is
# created explicitly because later steps chown to "$USER:$GROUP", and useradd
# only creates a same-named group when the distribution is configured to.
if ! getent group "$GROUP" &>/dev/null; then
    log_info "Creating group $GROUP..."
    groupadd --system "$GROUP"
fi

if ! id -u "$USER" &>/dev/null; then
    log_info "Creating user $USER..."
    # System account: no home directory and no login shell.
    useradd --system --gid "$GROUP" --no-create-home --shell /bin/false "$USER"
fi
|
||||||
|
|
||||||
|
# 4. Setup Directories
# Creates the config and log directories, seeds the config file from the
# playbook's default when none exists yet, and hands both directories to the
# service user.
log_info "Setting up directories..."
mkdir -p "$CONFIG_DIR"
mkdir -p "$LOG_DIR"

# Copy config if provided in playbook
# SCRIPT_DIR is the absolute directory containing this script; the playbook's
# config/ and deploy/ directories are resolved relative to it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
default_config="$SCRIPT_DIR/../config/default.json"
target_config="$CONFIG_DIR/config.json"
if [[ -f "$target_config" ]]; then
    # Re-running the installer must not discard an administrator's edits.
    log_info "Existing config found at $target_config, leaving it unchanged."
elif [[ -f "$default_config" ]]; then
    cp -- "$default_config" "$target_config"
else
    log_info "No default config found, using internal defaults."
fi

# Permissions
chown -R "$USER:$GROUP" "$LOG_DIR"
chown -R "$USER:$GROUP" "$CONFIG_DIR"
chmod 755 "$LOG_DIR"
chmod 755 "$CONFIG_DIR"
|
||||||
|
|
||||||
|
# 5. Setup Service
# Installs the unit file shipped in ../deploy (relative to this script), makes
# sure its ExecStart points at the binary npm actually installed, then enables
# and (re)starts the service.
log_info "Configuring systemd service..."
unit_source="$SCRIPT_DIR/../deploy/$SERVICE_NAME.service"
unit_target="/etc/systemd/system/$SERVICE_NAME.service"

if [[ ! -f "$unit_source" ]]; then
    log_error "Service file not found."
    exit 1
fi

# install(1) writes the unit with an explicit mode instead of inheriting
# whatever mode the source file has.
install -m 0644 "$unit_source" "$unit_target"

# The shipped unit starts /usr/local/bin/<package>. npm's global prefix is not
# always /usr/local, so when the binary is not there, look it up on PATH and
# rewrite ExecStart in the installed copy of the unit.
expected_bin="/usr/local/bin/$PACKAGE_NAME"
if [[ ! -x "$expected_bin" ]]; then
    installed_bin="$(command -v "$PACKAGE_NAME" || true)"
    if [[ -z "$installed_bin" ]]; then
        log_error "$PACKAGE_NAME executable not found on PATH; cannot start service."
        exit 1
    fi
    log_info "Using $installed_bin for ExecStart (not found at $expected_bin)."
    sed -i "s|^ExecStart=${expected_bin}\$|ExecStart=${installed_bin}|" "$unit_target"
fi

systemctl daemon-reload
systemctl enable "$SERVICE_NAME"
systemctl restart "$SERVICE_NAME"
log_info "Service started."

log_info "Installation complete."
log_info "Status: systemctl status $SERVICE_NAME"
|
||||||
Loading…
Reference in New Issue
Block a user