#!/bin/bash
#
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# vLLM startup script with NVFP4 quantization support for Llama 4 Scout
# Optimized for NVIDIA Blackwell and Hopper architectures

set -e

# Default configuration - using a supported Llama 3.1 model for testing
VLLM_MODEL=${VLLM_MODEL:-"meta-llama/Llama-3.1-8B-Instruct"}
VLLM_PORT=${VLLM_PORT:-8001}
VLLM_HOST=${VLLM_HOST:-"0.0.0.0"}
VLLM_TENSOR_PARALLEL_SIZE=${VLLM_TENSOR_PARALLEL_SIZE:-2}
VLLM_MAX_MODEL_LEN=${VLLM_MAX_MODEL_LEN:-8192}
VLLM_GPU_MEMORY_UTILIZATION=${VLLM_GPU_MEMORY_UTILIZATION:-0.9}
VLLM_MAX_NUM_SEQS=${VLLM_MAX_NUM_SEQS:-128}
VLLM_MAX_NUM_BATCHED_TOKENS=${VLLM_MAX_NUM_BATCHED_TOKENS:-8192}
VLLM_KV_CACHE_DTYPE=${VLLM_KV_CACHE_DTYPE:-"auto"}

# Detect GPU compute capability to select architecture-specific optimizations
COMPUTE_CAPABILITY=$(nvidia-smi -i 0 --query-gpu=compute_cap --format=csv,noheader 2>/dev/null || echo "unknown")

echo "Starting vLLM service with the following configuration:"
echo "Model: $VLLM_MODEL"
echo "Port: $VLLM_PORT"
echo "Host: $VLLM_HOST"
echo "Tensor Parallel Size: $VLLM_TENSOR_PARALLEL_SIZE"
echo "Max Model Length: $VLLM_MAX_MODEL_LEN"
echo "Max Num Seqs: $VLLM_MAX_NUM_SEQS"
echo "Max Batched Tokens: $VLLM_MAX_NUM_BATCHED_TOKENS"
echo "GPU Memory Utilization: $VLLM_GPU_MEMORY_UTILIZATION"
echo "KV Cache Dtype: $VLLM_KV_CACHE_DTYPE"
echo "GPU Compute Capability: $COMPUTE_CAPABILITY"

# Set up environment variables for optimal performance based on GPU architecture
if [ "$COMPUTE_CAPABILITY" = "10.0" ]; then
    echo "Detected Blackwell architecture - enabling NVFP4 optimizations"
    # Use the FlashInfer backend for attention
    export VLLM_ATTENTION_BACKEND=FLASHINFER
    # Use FlashInfer trtllm-gen attention kernels
    export VLLM_USE_TRTLLM_ATTENTION=1
    # Use FlashInfer FP8/FP4 MoE
    export VLLM_USE_FLASHINFER_MOE_FP8=1
    export VLLM_USE_FLASHINFER_MOE_FP4=1
    # Use the FlashInfer trtllm-gen MoE backend
    export VLLM_FLASHINFER_MOE_BACKEND="latency"
    # Enable async scheduling
    ASYNC_SCHEDULING_FLAG="--async-scheduling"
    # Enable FlashInfer fusions
    FUSION_FLAG='{"pass_config":{"enable_fi_allreduce_fusion":true,"enable_noop":true},"custom_ops":["+quant_fp8","+rms_norm"],"full_cuda_graph":true}'
elif [ "$COMPUTE_CAPABILITY" = "9.0" ]; then
    echo "Detected Hopper architecture - enabling FP8 optimizations"
    # Disable async scheduling on Hopper due to vLLM limitations
    ASYNC_SCHEDULING_FLAG=""
    # Disable FlashInfer fusions since they are not supported on Hopper
    FUSION_FLAG="{}"
else
    echo "GPU architecture not specifically optimized - using default settings"
    ASYNC_SCHEDULING_FLAG=""
    FUSION_FLAG="{}"
fi

# Check GPU availability
if ! nvidia-smi > /dev/null 2>&1; then
    echo "Warning: NVIDIA GPU not detected. vLLM may not work properly."
fi
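
# Optional sanity check (a sketch, not part of the original flow): warn when
# fewer GPUs are visible than the requested tensor parallel size, since vLLM
# cannot shard the model across GPUs it cannot see.
GPU_COUNT=$(nvidia-smi --list-gpus 2>/dev/null | wc -l)
if [ "$GPU_COUNT" -lt "$VLLM_TENSOR_PARALLEL_SIZE" ]; then
    echo "Warning: $GPU_COUNT GPU(s) visible but tensor parallel size is $VLLM_TENSOR_PARALLEL_SIZE."
fi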

# Create model cache directory
mkdir -p /app/models

echo "Starting vLLM's built-in OpenAI-compatible API server"

# Build the vLLM command as an array so that arguments such as the JSON
# compilation config are passed through as single, unmangled words
VLLM_CMD=(vllm serve "$VLLM_MODEL"
    --host "$VLLM_HOST"
    --port "$VLLM_PORT"
    --tensor-parallel-size "$VLLM_TENSOR_PARALLEL_SIZE"
    --max-model-len "$VLLM_MAX_MODEL_LEN"
    --max-num-seqs "$VLLM_MAX_NUM_SEQS"
    --max-num-batched-tokens "$VLLM_MAX_NUM_BATCHED_TOKENS"
    --gpu-memory-utilization "$VLLM_GPU_MEMORY_UTILIZATION"
    --kv-cache-dtype "$VLLM_KV_CACHE_DTYPE"
    --trust-remote-code
    --served-model-name "$VLLM_MODEL")

# Add async scheduling if supported
if [ -n "$ASYNC_SCHEDULING_FLAG" ]; then
    VLLM_CMD+=("$ASYNC_SCHEDULING_FLAG")
fi

# Add fusion optimizations if available
if [ "$FUSION_FLAG" != "{}" ]; then
    VLLM_CMD+=(--compilation-config "$FUSION_FLAG")
fi

# Start vLLM server
exec "${VLLM_CMD[@]}"
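
# Example usage (illustrative; the script name and the Llama 4 Scout model id
# below are assumptions, not values defined by this script):
#   VLLM_MODEL="meta-llama/Llama-4-Scout-17B-16E-Instruct" \
#   VLLM_TENSOR_PARALLEL_SIZE=4 ./start-vllm.sh
#
# Once running, the server exposes vLLM's OpenAI-compatible API, e.g.:
#   curl http://localhost:8001/v1/models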