dgx-spark-playbooks/nvidia/txt2kg/assets/deploy/services/vllm/launch_server.sh
2026-01-14 16:05:35 +00:00

150 lines
5.6 KiB
Bash
Executable File

#!/bin/bash
#
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Launch vLLM from an NVIDIA-optimized build (described by the original
# author as the Triton Inference Server build), which is expected to support
# compute capability 12.1 (DGX Spark).
#
# Runtime environment for the CUDA / PyTorch / vLLM stack. Values are assigned
# by theme below and exported together at the end of this section.

# Memory: settings the original author chose for unified-memory use on
# DGX Spark, plus expandable segments for the PyTorch allocator.
CUDA_MANAGED_FORCE_DEVICE_ALLOC=1
PYTORCH_ALLOC_CONF=expandable_segments:True
# NOTE(review): 0 presumably leaves PyTorch's CUDA memory caching switched on
# (i.e. this is not an "enable oversubscription" switch) - confirm.
PYTORCH_NO_CUDA_MEMORY_CACHING=0

# Logging: INFO-level vLLM logs and unbuffered Python output.
VLLM_LOGGING_LEVEL=INFO
PYTHONUNBUFFERED=1

# Model source: ModelScope is turned off.
VLLM_USE_MODELSCOPE=false

# Kernels: FlashInfer FP8/FP4 MoE paths (for Nemotron and other MoE models)
# and the FlashInfer attention backend.
VLLM_USE_FLASHINFER_MOE_FP8=1
VLLM_USE_FLASHINFER_MOE_FP4=1
VLLM_ATTENTION_BACKEND=FLASHINFER

export CUDA_MANAGED_FORCE_DEVICE_ALLOC PYTORCH_ALLOC_CONF \
  PYTORCH_NO_CUDA_MEMORY_CACHING \
  VLLM_LOGGING_LEVEL PYTHONUNBUFFERED \
  VLLM_USE_MODELSCOPE \
  VLLM_USE_FLASHINFER_MOE_FP8 VLLM_USE_FLASHINFER_MOE_FP4 \
  VLLM_ATTENTION_BACKEND
# Preflight diagnostic: report the PyTorch/CUDA versions and, for every
# visible GPU, its name and compute capability plus the outcome of a tiny
# matmul. Purely informational - a failure here does not stop the launch.
echo "=== Testing CUDA functionality ==="
# The heredoc delimiter is quoted so the shell passes the Python text through
# without any expansion.
python3 - <<'PYEOF'
import torch


def probe_gpu(index):
    """Describe one GPU and run a small matmul on it."""
    props = torch.cuda.get_device_properties(index)
    print(f'GPU {index}: {props.name} (compute capability {props.major}.{props.minor})')
    # Try basic CUDA operation
    try:
        x = torch.randn(10, 10).cuda(index)
        y = torch.matmul(x, x.T)
        print(f'GPU {index}: Basic CUDA operations work')
    except Exception as e:
        print(f'GPU {index}: CUDA operation failed: {e}')


print(f'PyTorch version: {torch.__version__}')
cuda_ok = torch.cuda.is_available()
print(f'CUDA available: {cuda_ok}')
if cuda_ok:
    print(f'CUDA version: {torch.version.cuda}')
    gpu_total = torch.cuda.device_count()
    print(f'GPU count: {gpu_total}')
    for gpu_index in range(gpu_total):
        probe_gpu(gpu_index)
PYEOF
echo "=== Starting optimized vLLM server ==="

#######################################
# Print the compute capability of GPU 0 as "<major>.<minor>", or the literal
# word "unknown" when it cannot be determined.
#
# The nvidia-smi output is only trusted when the command exits 0 AND its
# first line looks like a version number. This keeps diagnostic text that
# nvidia-smi may write to stdout (driver not loaded, unsupported query field,
# ...) out of COMPUTE_CAPABILITY.
# Globals:   none
# Arguments: none
# Outputs:   the capability string (or "unknown") on stdout
# Returns:   0 always
#######################################
detect_compute_capability() {
  local raw
  if raw=$(nvidia-smi -i 0 --query-gpu=compute_cap --format=csv,noheader,nounits 2>/dev/null); then
    raw=${raw%%$'\n'*}          # first line only
    raw=${raw//[[:space:]]/}    # drop padding / stray CR
    if [[ "$raw" =~ ^[0-9]+\.[0-9]+$ ]]; then
      printf '%s\n' "$raw"
      return 0
    fi
  fi
  printf '%s\n' "unknown"
  return 0
}

# Check GPU compute capability for optimal settings
COMPUTE_CAPABILITY=$(detect_compute_capability)
echo "Detected GPU compute capability: $COMPUTE_CAPABILITY"

# Model selection: VLLM_MODEL from the environment wins; otherwise fall back
# to Qwen 2.5 7B, which is not gated and needs no HuggingFace token.
if [[ -n "${VLLM_MODEL:-}" ]]; then
  MODEL_TO_USE="$VLLM_MODEL"
  echo "Using model from environment: $MODEL_TO_USE"
else
  MODEL_TO_USE="Qwen/Qwen2.5-7B-Instruct"
  echo "Using default model: $MODEL_TO_USE"
fi
# Configure settings based on model size and GPU architecture.

#######################################
# Decide whether a model name advertises a parameter count of 8B or less.
#
# The name is searched for the first size token of the form "<number>B"
# (case-insensitive, optional decimal part) that is delimited on both sides
# by a non-alphanumeric character or the string boundary. Requiring the
# delimiter keeps fragments of other tokens from matching: "13B" is not read
# as "3B", "A3B" (active parameters of an MoE model) and "8x7B" are ignored.
# Names without any size token are treated as NOT small, so they receive the
# conservative large-model profile.
# Arguments: $1 - model name or path
# Returns:   0 if the advertised size is <= 8B, 1 otherwise
#######################################
is_small_model() {
  local name=$1
  local size_re='(^|[^[:alnum:].])([0-9]+)(\.([0-9]+))?[Bb]($|[^[:alnum:]])'
  local whole frac

  [[ "$name" =~ $size_re ]] || return 1
  whole=$((10#${BASH_REMATCH[2]}))   # 10# guards against octal parsing
  frac=${BASH_REMATCH[4]:-}

  if ((whole < 8)); then
    return 0
  fi
  # Exactly 8 (8, 8.0, 8.00, ...) still counts as small; 8.x with x > 0 does not.
  if ((whole == 8)) && [[ -z "${frac//0/}" ]]; then
    return 0
  fi
  return 1
}

#######################################
# Pick the vLLM tuning profile for the selected model and GPU.
# Every value can be overridden through its VLLM_* environment variable.
# Globals:
#   read:    MODEL_TO_USE, COMPUTE_CAPABILITY, VLLM_GPU_MEMORY_UTILIZATION,
#            VLLM_MAX_MODEL_LEN, VLLM_MAX_NUM_SEQS,
#            VLLM_MAX_NUM_BATCHED_TOKENS, VLLM_CPU_OFFLOAD_GB
#   written: QUANTIZATION_FLAG, GPU_MEMORY_UTIL, MAX_MODEL_LEN, MAX_NUM_SEQS,
#            MAX_BATCHED_TOKENS, CPU_OFFLOAD_GB
# Outputs:   one line naming the chosen profile on stdout
#######################################
configure_vllm_settings() {
  local util_default len_default seqs_default tokens_default offload_default

  QUANTIZATION_FLAG=""

  if is_small_model "$MODEL_TO_USE"; then
    echo "Configuring for smaller model (8B or less)"
    util_default=0.9
    len_default=8192
    seqs_default=64
    tokens_default=8192
    offload_default=0
  else
    # Larger models share one profile; only the CPU offload budget depends
    # on the GPU architecture.
    util_default=0.7
    len_default=4096
    seqs_default=16
    tokens_default=4096
    if [[ "$COMPUTE_CAPABILITY" == "12.1" || "$COMPUTE_CAPABILITY" == "10.0" ]]; then
      # Blackwell/DGX Spark architecture with larger model - use CPU offloading
      echo "Configuring for large model on Blackwell/DGX Spark with CPU offloading"
      offload_default=50
    else
      # Other architectures with larger model
      echo "Configuring for large model on GPU architecture: $COMPUTE_CAPABILITY"
      offload_default=40
    fi
  fi

  GPU_MEMORY_UTIL="${VLLM_GPU_MEMORY_UTILIZATION:-$util_default}"
  MAX_MODEL_LEN="${VLLM_MAX_MODEL_LEN:-$len_default}"
  MAX_NUM_SEQS="${VLLM_MAX_NUM_SEQS:-$seqs_default}"
  MAX_BATCHED_TOKENS="${VLLM_MAX_NUM_BATCHED_TOKENS:-$tokens_default}"
  CPU_OFFLOAD_GB="${VLLM_CPU_OFFLOAD_GB:-$offload_default}"
}

configure_vllm_settings
# Show the effective configuration, framed by blank lines. A single printf
# emits one output line per argument.
printf '%s\n' \
  "" \
  "=== vLLM Configuration ===" \
  "Model: $MODEL_TO_USE" \
  "GPU memory utilization: $GPU_MEMORY_UTIL" \
  "Max model length: $MAX_MODEL_LEN" \
  "Max num seqs: $MAX_NUM_SEQS" \
  "Max batched tokens: $MAX_BATCHED_TOKENS" \
  "CPU Offload: ${CPU_OFFLOAD_GB}GB" \
  "Quantization: ${QUANTIZATION_FLAG:-'none'}" \
  ""
#######################################
# Assemble the `vllm serve` command line into VLLM_CMD.
#
# VLLM_CMD is a single space-joined string because the launcher word-splits
# it when executing; values containing whitespace are therefore not
# supported.
# Globals:
#   read:    MODEL_TO_USE, MAX_MODEL_LEN, MAX_NUM_SEQS, MAX_BATCHED_TOKENS,
#            GPU_MEMORY_UTIL, CPU_OFFLOAD_GB, QUANTIZATION_FLAG
#   written: VLLM_CMD
# Outputs:   an informational line for FP8 models on stdout; a warning on
#            stderr when CPU_OFFLOAD_GB is not a non-negative number
#######################################
build_vllm_cmd() {
  local -a argv=(
    vllm serve "$MODEL_TO_USE"
    --host 0.0.0.0
    --port 8001
    --tensor-parallel-size 1
    --max-model-len "$MAX_MODEL_LEN"
    --max-num-seqs "$MAX_NUM_SEQS"
    # Forward the batched-token budget that is selected and printed as part
    # of the configuration, so the server really runs with it.
    --max-num-batched-tokens "$MAX_BATCHED_TOKENS"
    --gpu-memory-utilization "$GPU_MEMORY_UTIL"
    --kv-cache-dtype auto
    --trust-remote-code
    --served-model-name "$MODEL_TO_USE"
  )

  # Note: For FP8 models, vLLM auto-detects quantization from model config
  # No need to specify --dtype float8 (not supported in vLLM 0.11.0)
  if [[ "$MODEL_TO_USE" == *"FP8"* || "$MODEL_TO_USE" == *"fp8"* ]]; then
    echo "Detected FP8 model - vLLM will auto-detect FP8 quantization from model config"
  fi

  # Add CPU offload only when a positive amount was requested. Fractional
  # values such as 4.5 are accepted; 0 / 0.0 / empty mean "no offload".
  if [[ "$CPU_OFFLOAD_GB" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
    if [[ "$CPU_OFFLOAD_GB" =~ [1-9] ]]; then
      argv+=(--cpu-offload-gb "$CPU_OFFLOAD_GB")
    fi
  elif [[ -n "$CPU_OFFLOAD_GB" ]]; then
    printf 'Warning: ignoring invalid CPU offload value: %s\n' "$CPU_OFFLOAD_GB" >&2
  fi

  # Add quantization if specified
  if [[ -n "$QUANTIZATION_FLAG" ]]; then
    argv+=("$QUANTIZATION_FLAG")
  fi

  local IFS=' '
  VLLM_CMD="${argv[*]}"
}

build_vllm_cmd
# Log the final command line, then replace this shell with the server process.
printf 'Running: %s\n' "$VLLM_CMD"
# VLLM_CMD is a flat string, so it is deliberately expanded unquoted to split
# it into separate arguments.
# shellcheck disable=SC2086
set -- $VLLM_CMD
exec "$@"