dgx-spark-playbooks/nvidia/txt2kg/assets/deploy/services/vllm/start-vllm.sh
2025-12-02 19:43:52 +00:00

113 lines
4.3 KiB
Bash
Executable File

#!/bin/bash
#
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# vLLM startup script with NVFP4 quantization support for Llama 4 Scout
# Optimized for NVIDIA Blackwell and Hopper architectures
set -e
# Default configuration - using supported Llama 3.1 model for testing
VLLM_MODEL=${VLLM_MODEL:-"meta-llama/Llama-3.1-8B-Instruct"}
VLLM_PORT=${VLLM_PORT:-8001}
VLLM_HOST=${VLLM_HOST:-"0.0.0.0"}
VLLM_TENSOR_PARALLEL_SIZE=${VLLM_TENSOR_PARALLEL_SIZE:-2}
VLLM_MAX_MODEL_LEN=${VLLM_MAX_MODEL_LEN:-8192}
VLLM_GPU_MEMORY_UTILIZATION=${VLLM_GPU_MEMORY_UTILIZATION:-0.9}
VLLM_MAX_NUM_SEQS=${VLLM_MAX_NUM_SEQS:-128}
VLLM_MAX_NUM_BATCHED_TOKENS=${VLLM_MAX_NUM_BATCHED_TOKENS:-8192}
VLLM_KV_CACHE_DTYPE=${VLLM_KV_CACHE_DTYPE:-"auto"}
# Detect GPU compute capability and set optimizations
COMPUTE_CAPABILITY=$(nvidia-smi -i 0 --query-gpu=compute_cap --format=csv,noheader 2>/dev/null || echo "unknown")
echo "Starting vLLM service with the following configuration:"
echo "Model: $VLLM_MODEL"
echo "Port: $VLLM_PORT"
echo "Host: $VLLM_HOST"
echo "Tensor Parallel Size: $VLLM_TENSOR_PARALLEL_SIZE"
echo "Max Model Length: $VLLM_MAX_MODEL_LEN"
echo "Max Num Seqs: $VLLM_MAX_NUM_SEQS"
echo "Max Batched Tokens: $VLLM_MAX_NUM_BATCHED_TOKENS"
echo "GPU Memory Utilization: $VLLM_GPU_MEMORY_UTILIZATION"
echo "KV Cache Dtype: $VLLM_KV_CACHE_DTYPE"
echo "GPU Compute Capability: $COMPUTE_CAPABILITY"
# Set up environment variables for optimal performance based on GPU architecture
if [ "$COMPUTE_CAPABILITY" = "10.0" ]; then
echo "Detected Blackwell architecture - enabling NVFP4 optimizations"
# Use FlashInfer backend for attentions
export VLLM_ATTENTION_BACKEND=FLASHINFER
# Use FlashInfer trtllm-gen attention kernels
export VLLM_USE_TRTLLM_ATTENTION=1
# Use FlashInfer FP8/FP4 MoE
export VLLM_USE_FLASHINFER_MOE_FP8=1
export VLLM_USE_FLASHINFER_MOE_FP4=1
# Use FlashInfer trtllm-gen MoE backend
export VLLM_FLASHINFER_MOE_BACKEND="latency"
# Enable async scheduling
ASYNC_SCHEDULING_FLAG="--async-scheduling"
# Enable FlashInfer fusions
FUSION_FLAG='{"pass_config":{"enable_fi_allreduce_fusion":true,"enable_noop":true},"custom_ops":["+quant_fp8","+rms_norm"],"full_cuda_graph":true}'
elif [ "$COMPUTE_CAPABILITY" = "9.0" ]; then
echo "Detected Hopper architecture - enabling FP8 optimizations"
# Disable async scheduling on Hopper architecture due to vLLM limitations
ASYNC_SCHEDULING_FLAG=""
# Disable FlashInfer fusions since they are not supported on Hopper architecture
FUSION_FLAG="{}"
else
echo "GPU architecture not specifically optimized - using default settings"
ASYNC_SCHEDULING_FLAG=""
FUSION_FLAG="{}"
fi
# Check GPU availability
if ! nvidia-smi > /dev/null 2>&1; then
echo "Warning: NVIDIA GPU not detected. vLLM may not work properly."
fi
# Create model cache directory
mkdir -p /app/models
echo "Starting vLLM's built-in OpenAI API server"
# Build vLLM command with NVFP4 optimizations
VLLM_CMD="vllm serve $VLLM_MODEL \
--host $VLLM_HOST \
--port $VLLM_PORT \
--tensor-parallel-size $VLLM_TENSOR_PARALLEL_SIZE \
--max-model-len $VLLM_MAX_MODEL_LEN \
--max-num-seqs $VLLM_MAX_NUM_SEQS \
--max-num-batched-tokens $VLLM_MAX_NUM_BATCHED_TOKENS \
--gpu-memory-utilization $VLLM_GPU_MEMORY_UTILIZATION \
--kv-cache-dtype $VLLM_KV_CACHE_DTYPE \
--trust-remote-code \
--served-model-name $VLLM_MODEL"
# Add async scheduling if supported
if [ -n "$ASYNC_SCHEDULING_FLAG" ]; then
VLLM_CMD="$VLLM_CMD $ASYNC_SCHEDULING_FLAG"
fi
# Add fusion optimizations if available
if [ "$FUSION_FLAG" != "{}" ]; then
VLLM_CMD="$VLLM_CMD --compilation-config '$FUSION_FLAG'"
fi
# Start vLLM server
exec $VLLM_CMD