# Mirror of https://github.com/NVIDIA/dgx-spark-playbooks.git
# Synced 2026-04-23 02:23:53 +00:00
# Docker Compose stack: a vLLM OpenAI-compatible API server (Llama 3.1 8B,
# fp8-quantized) plus an optional benchmark runner gated behind a profile.
services:
  vllm-llama3-8b:
    image: nvcr.io/nvidia/vllm:25.09-py3
    container_name: vllm-llama3-8b
    ports:
      - "8001:8001"
    # NOTE(review): an explicit `command:` is given below, so these env vars
    # are only effective if the image's entrypoint/scripts read them —
    # confirm against the nvcr.io vllm image; they may be documentation-only.
    environment:
      # Model configuration - Llama3 8B
      - MODEL_NAME=meta-llama/Llama-3.1-8B-Instruct
      - TENSOR_PARALLEL_SIZE=1
      - MAX_MODEL_LEN=4096
      - GPU_MEMORY_UTILIZATION=0.9

      # Performance optimizations
      - QUANTIZATION=fp8
      - KV_CACHE_DTYPE=fp8
      - ENABLE_CHUNKED_PREFILL=true
      - MAX_NUM_BATCHED_TOKENS=8192
      - MAX_NUM_SEQS=256

      # Service configuration
      # NOTE(review): DISABLE_LOG_STATS=false here, but the command below
      # passes --disable-log-stats — the flag wins; confirm which is intended.
      - HOST=0.0.0.0
      - PORT=8001
      - DISABLE_LOG_STATS=false
      - DISABLE_LOG_REQUESTS=false

      # CUDA settings
      - CUDA_VISIBLE_DEVICES=0
      - NCCL_DEBUG=INFO

      # Hugging Face settings
      - HF_HOME=/app/.cache/huggingface
      - TRANSFORMERS_CACHE=/app/.cache/huggingface/transformers
    volumes:
      # Cache Hugging Face models for faster startup
      - ~/.cache/huggingface:/app/.cache/huggingface
      - /tmp:/tmp
    # Folded block scalar: the multi-line command collapses to one line.
    command: >
      python -m vllm.entrypoints.openai.api_server
      --model meta-llama/Llama-3.1-8B-Instruct
      --host 0.0.0.0
      --port 8001
      --tensor-parallel-size 1
      --max-model-len 4096
      --gpu-memory-utilization 0.9
      --quantization fp8
      --kv-cache-dtype fp8
      --enable-chunked-prefill
      --max-num-batched-tokens 8192
      --max-num-seqs 256
      --disable-log-stats
      --trust-remote-code
    # Reserve one NVIDIA GPU for this container.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8001/v1/models"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 300s  # 5 minutes for model loading
    networks:
      - vllm-network

  # Benchmark runner service
  vllm-benchmark:
    build:
      context: .
      dockerfile: Dockerfile.benchmark
    container_name: vllm-benchmark
    # Wait until the vLLM server's healthcheck passes before starting.
    depends_on:
      vllm-llama3-8b:
        condition: service_healthy
    environment:
      - VLLM_URL=http://vllm-llama3-8b:8001
    volumes:
      - ./benchmark_results:/app/results
    networks:
      - vllm-network
    profiles:
      - benchmark  # Only start when explicitly requested

networks:
  vllm-network:
    driver: bridge

# NOTE(review): vllm_cache is declared but not referenced by any service
# above (the HF cache uses a host bind mount) — confirm it is still needed.
volumes:
  vllm_cache:
    driver: local