#!/bin/bash
#
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Launch vLLM with the NVIDIA Triton Inference Server optimized build.
# This build should have proper support for compute capability 12.1 (DGX Spark).

set -euo pipefail

# Enable unified memory usage for DGX Spark
export CUDA_MANAGED_FORCE_DEVICE_ALLOC=1
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
# Pin to the first GPU and keep PyTorch's caching allocator enabled
export CUDA_VISIBLE_DEVICES=0
export PYTORCH_NO_CUDA_MEMORY_CACHING=0
# CPU offload budget in GB; mirrors the --cpu-offload-gb flag passed below,
# which is the setting vLLM actually reads
export VLLM_CPU_OFFLOAD_GB=50
# Allow LoRA adapters to be loaded and unloaded at runtime
export VLLM_ALLOW_RUNTIME_LORA_UPDATING=1
# Keep engine warmup enabled (0 = do not skip warmup)
export VLLM_SKIP_WARMUP=0
# Logging and unbuffered output for easier debugging
export VLLM_LOGGING_LEVEL=INFO
export PYTHONUNBUFFERED=1
# Download models from Hugging Face rather than ModelScope
export VLLM_USE_MODELSCOPE=false
# Use the legacy V0 engine, which supports CPU offloading
export VLLM_USE_V1=0
# First, test basic CUDA functionality
echo "=== Testing CUDA functionality ==="
python3 -c "
import torch
print(f'PyTorch version: {torch.__version__}')
print(f'CUDA available: {torch.cuda.is_available()}')
if torch.cuda.is_available():
    print(f'CUDA version: {torch.version.cuda}')
    print(f'GPU count: {torch.cuda.device_count()}')
    for i in range(torch.cuda.device_count()):
        props = torch.cuda.get_device_properties(i)
        print(f'GPU {i}: {props.name} (compute capability {props.major}.{props.minor})')
        # Try a basic CUDA operation on each device
        try:
            x = torch.randn(10, 10).cuda(i)
            y = torch.matmul(x, x.T)
            print(f'GPU {i}: Basic CUDA operations work')
        except Exception as e:
            print(f'GPU {i}: CUDA operation failed: {e}')
"
echo "=== Starting optimized vLLM server ==="
# Model and engine configuration tuned for DGX Spark.
# Candidate models: the FP4/FP8 checkpoints are kept for reference, but the
# script currently launches the standard model on every architecture.
NVFP4_MODEL="nvidia/Llama-3.3-70B-Instruct-FP4"
NVFP8_MODEL="nvidia/Llama-3.1-8B-Instruct-FP8"
STANDARD_MODEL="meta-llama/Llama-3.1-70B-Instruct"
# Check GPU compute capability to pick the model and memory settings
COMPUTE_CAPABILITY=$(nvidia-smi -i 0 --query-gpu=compute_cap --format=csv,noheader,nounits 2>/dev/null || echo "unknown")
echo "Detected GPU compute capability: $COMPUTE_CAPABILITY"
# Configure the model and engine limits based on GPU architecture
if [[ "$COMPUTE_CAPABILITY" == "12.1" ]] || [[ "$COMPUTE_CAPABILITY" == "10.0" ]]; then
    # Blackwell/DGX Spark architecture - standard 70B model with CPU offloading
    echo "Using standard Llama-3.1-70B model for Blackwell/DGX Spark with CPU offloading"
    QUANTIZATION_FLAG=""
    MODEL_TO_USE="$STANDARD_MODEL"
    GPU_MEMORY_UTIL="0.7"      # Lower GPU memory share to leave room for unified memory
    MAX_MODEL_LEN="4096"       # Shorter sequences for memory efficiency
    MAX_NUM_SEQS="16"          # Fewer concurrent sequences for a 70B model
    MAX_BATCHED_TOKENS="4096"
    CPU_OFFLOAD_GB="50"        # Offload 50 GB to CPU/unified memory
elif [[ "$COMPUTE_CAPABILITY" == "9.0" ]]; then
    # Hopper architecture - use the standard model
    echo "Using standard 70B model for Hopper architecture"
    QUANTIZATION_FLAG=""
    MODEL_TO_USE="$STANDARD_MODEL"
    GPU_MEMORY_UTIL="0.7"
    MAX_MODEL_LEN="4096"
    MAX_NUM_SEQS="16"
    MAX_BATCHED_TOKENS="4096"
    CPU_OFFLOAD_GB="40"
else
    # Other architectures - standard precision with conservative limits
    echo "Using standard 70B model for GPU architecture: $COMPUTE_CAPABILITY"
    QUANTIZATION_FLAG=""
    MODEL_TO_USE="$STANDARD_MODEL"
    GPU_MEMORY_UTIL="0.7"
    MAX_MODEL_LEN="2048"
    MAX_NUM_SEQS="16"
    MAX_BATCHED_TOKENS="2048"
    CPU_OFFLOAD_GB="40"
fi
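
# To experiment with the FP4 checkpoint defined above on Blackwell-class GPUs,
# a hedged sketch (recent vLLM builds auto-detect ModelOpt quantization from
# the checkpoint config, so the explicit flag may be unnecessary; verify
# against your vLLM build before relying on it):
#   MODEL_TO_USE="$NVFP4_MODEL"
#   QUANTIZATION_FLAG="--quantization modelopt"
#   CPU_OFFLOAD_GB="0"   # FP4 weights are roughly 4x smaller; offload may be unneeded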
echo "Using model: $MODEL_TO_USE"
echo "Quantization: ${QUANTIZATION_FLAG:-'disabled'}"
echo "GPU memory utilization: $GPU_MEMORY_UTIL"
echo "CPU Offload: ${CPU_OFFLOAD_GB}GB"
vllm serve "$MODEL_TO_USE" \
    --host 0.0.0.0 \
    --port 8001 \
    --tensor-parallel-size 1 \
    --max-model-len "$MAX_MODEL_LEN" \
    --max-num-seqs "$MAX_NUM_SEQS" \
    --max-num-batched-tokens "$MAX_BATCHED_TOKENS" \
    --gpu-memory-utilization "$GPU_MEMORY_UTIL" \
    --cpu-offload-gb "$CPU_OFFLOAD_GB" \
    --kv-cache-dtype auto \
    --trust-remote-code \
    --served-model-name "$MODEL_TO_USE" \
    --enable-chunked-prefill \
    --disable-custom-all-reduce \
    --disable-async-output-proc \
    $QUANTIZATION_FLAG  # intentionally unquoted so an empty flag expands to nothing
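
# Once the server reports it is ready, it can be smoke-tested through vLLM's
# OpenAI-compatible API (a sketch; adjust host/port if the flags above change):
#   curl http://localhost:8001/v1/models
#   curl http://localhost:8001/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "meta-llama/Llama-3.1-70B-Instruct", "messages": [{"role": "user", "content": "Hello"}], "max_tokens": 32}'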