dgx-spark-playbooks/nvidia/txt2kg/assets/deploy/services/vllm/run_benchmark.sh

#!/bin/bash
#
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# vLLM Llama3 8B Benchmark Runner
# Uses NVIDIA vLLM container for optimal performance
set -e  # exit immediately if any command fails
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
VLLM_URL="http://localhost:8001"
RUNS=3
MAX_TOKENS=512
OUTPUT_FILE=""
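# The defaults above can be overridden via the command-line flags parsed below.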
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
print_header() {
    echo -e "${BLUE}========================================${NC}"
    echo -e "${BLUE} 🚀 vLLM Llama3 8B Benchmark Suite${NC}"
    echo -e "${BLUE}========================================${NC}"
}
print_usage() {
    echo "Usage: $0 [OPTIONS]"
    echo ""
    echo "Options:"
    echo "  -u, --url URL         vLLM service URL (default: $VLLM_URL)"
    echo "  -r, --runs NUMBER     Number of runs per prompt (default: $RUNS)"
    echo "  -t, --max-tokens NUM  Maximum tokens to generate (default: $MAX_TOKENS)"
    echo "  -o, --output FILE     Output file for detailed results (JSON)"
    echo "  -d, --docker          Run using Docker Compose"
    echo "  -s, --start-service   Start the vLLM service first (requires --docker)"
    echo "  -h, --health-check    Only run the health check"
    echo "      --help            Show this help message"
    echo ""
    echo "Examples:"
    echo "  $0                               # Run basic benchmark"
    echo "  $0 --docker --start-service      # Start service and run benchmark in Docker"
    echo "  $0 -r 5 -t 1024 -o results.json  # Custom settings with output file"
    echo "  $0 --health-check                # Check if service is running"
}
check_dependencies() {
    if ! command -v python3 &> /dev/null; then
        echo -e "${RED}❌ Python3 is required but not installed${NC}"
        exit 1
    fi
    # asyncio is part of the Python standard library; only aiohttp needs installing
    # (the PyPI "asyncio" package is an obsolete backport and must not be installed).
    if ! python3 -c "import aiohttp" &> /dev/null; then
        echo -e "${YELLOW}⚠️ Installing required Python packages...${NC}"
        pip3 install aiohttp
    fi
}
check_nvidia_docker() {
    if ! command -v docker &> /dev/null; then
        echo -e "${RED}❌ Docker is required but not installed${NC}"
        exit 1
    fi
    if ! docker info 2>/dev/null | grep -q "nvidia"; then
        echo -e "${YELLOW}⚠️ NVIDIA Docker runtime not detected. Make sure nvidia-container-toolkit is installed${NC}"
    fi
}
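# Note: the grep above is a best-effort heuristic. A stricter check (an assumption,
# not part of the original flow) would inspect the configured runtimes directly:
#   docker info --format '{{json .Runtimes}}' | grep -q nvidia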
start_vllm_service() {
    echo -e "${BLUE}🚀 Starting vLLM Llama3 8B service...${NC}"
    cd "$SCRIPT_DIR"
    docker-compose -f docker-compose.llama3-8b.yml up -d vllm-llama3-8b
    echo -e "${YELLOW}⏳ Waiting for model to load (this may take several minutes)...${NC}"
    # Wait for the service to become healthy
    local max_attempts=60  # 60 attempts x 10s sleep = 10 minutes
    local attempt=1
    while [ $attempt -le $max_attempts ]; do
        if curl -sf "$VLLM_URL/v1/models" > /dev/null 2>&1; then
            echo -e "${GREEN}✅ vLLM service is ready!${NC}"
            return 0
        fi
        echo -e "${YELLOW}⏳ Attempt $attempt/$max_attempts - waiting for service...${NC}"
        sleep 10
        ((attempt++))
    done
    echo -e "${RED}❌ vLLM service failed to start within timeout${NC}"
    echo -e "${YELLOW}📋 Checking service logs:${NC}"
    docker-compose -f docker-compose.llama3-8b.yml logs vllm-llama3-8b
    exit 1
}
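# The readiness probe above can be reproduced by hand: vLLM exposes an
# OpenAI-compatible endpoint that lists served models once loading finishes:
#   curl -sf http://localhost:8001/v1/models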
run_benchmark() {
    local cmd_args=("--url" "$VLLM_URL" "--runs" "$RUNS" "--max-tokens" "$MAX_TOKENS")
    if [ -n "$OUTPUT_FILE" ]; then
        cmd_args+=("--output" "$OUTPUT_FILE")
    fi
    if [ "$HEALTH_CHECK_ONLY" = true ]; then
        cmd_args+=("--health-check-only")
    fi
    echo -e "${BLUE}🧪 Running vLLM Llama3 8B benchmark...${NC}"
    echo -e "${BLUE}URL: $VLLM_URL${NC}"
    echo -e "${BLUE}Runs per prompt: $RUNS${NC}"
    echo -e "${BLUE}Max tokens: $MAX_TOKENS${NC}"
    if [ "$USE_DOCKER" = true ]; then
        # Run the benchmark inside the Docker Compose service
        cd "$SCRIPT_DIR"
        docker-compose -f docker-compose.llama3-8b.yml run --rm vllm-benchmark \
            python /app/vllm_llama3_benchmark.py "${cmd_args[@]}"
    else
        # Run the benchmark locally
        python3 "$SCRIPT_DIR/vllm_llama3_benchmark.py" "${cmd_args[@]}"
    fi
}
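# cmd_args is forwarded verbatim to vllm_llama3_benchmark.py, which is assumed
# to accept --url, --runs, --max-tokens, --output, and --health-check-only.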
# Parse command line arguments
USE_DOCKER=false
START_SERVICE=false
HEALTH_CHECK_ONLY=false
while [[ $# -gt 0 ]]; do
    case $1 in
        -u|--url)
            VLLM_URL="$2"
            shift 2
            ;;
        -r|--runs)
            RUNS="$2"
            shift 2
            ;;
        -t|--max-tokens)
            MAX_TOKENS="$2"
            shift 2
            ;;
        -o|--output)
            OUTPUT_FILE="$2"
            shift 2
            ;;
        -d|--docker)
            USE_DOCKER=true
            shift
            ;;
        -s|--start-service)
            START_SERVICE=true
            shift
            ;;
        -h|--health-check)
            HEALTH_CHECK_ONLY=true
            shift
            ;;
        --help)
            print_usage
            exit 0
            ;;
        *)
            echo -e "${RED}❌ Unknown option: $1${NC}"
            print_usage
            exit 1
            ;;
    esac
done
# Main execution
print_header
if [ "$USE_DOCKER" = true ]; then
    check_nvidia_docker
    if [ "$START_SERVICE" = true ]; then
        start_vllm_service
    fi
    run_benchmark
else
    check_dependencies
    if [ "$START_SERVICE" = true ]; then
        echo -e "${YELLOW}⚠️ --start-service requires the --docker flag${NC}"
        exit 1
    fi
    run_benchmark
fi
echo -e "${GREEN}✅ Benchmark completed successfully!${NC}"
if [ -n "$OUTPUT_FILE" ] && [ -f "$OUTPUT_FILE" ]; then
    echo -e "${BLUE}📊 Detailed results saved to: $OUTPUT_FILE${NC}"
fi