dgx-spark-playbooks/nvidia/txt2kg/assets/deploy/services/vllm/docker-compose.llama3-8b.yml
2025-10-06 17:05:41 +00:00

101 lines
2.4 KiB
YAML

services:
  # OpenAI-compatible vLLM server hosting Llama-3.1-8B-Instruct on one GPU.
  vllm-llama3-8b:
    image: nvcr.io/nvidia/vllm:25.09-py3
    container_name: vllm-llama3-8b
    ports:
      - "8001:8001"
    environment:
      # Model configuration - Llama3 8B
      - MODEL_NAME=meta-llama/Llama-3.1-8B-Instruct
      - TENSOR_PARALLEL_SIZE=1
      - MAX_MODEL_LEN=4096
      - GPU_MEMORY_UTILIZATION=0.9
      # Performance optimizations
      - QUANTIZATION=fp8
      - KV_CACHE_DTYPE=fp8
      - ENABLE_CHUNKED_PREFILL=true
      - MAX_NUM_BATCHED_TOKENS=8192
      - MAX_NUM_SEQS=256
      # Service configuration
      - HOST=0.0.0.0
      - PORT=8001
      - DISABLE_LOG_STATS=false
      - DISABLE_LOG_REQUESTS=false
      # CUDA settings
      - CUDA_VISIBLE_DEVICES=0
      - NCCL_DEBUG=INFO
      # Hugging Face settings
      # NOTE(review): meta-llama/Llama-3.1-8B-Instruct is a gated repository;
      # forward a token from the host so the first download can authenticate.
      # Defaults to empty, so existing setups with a pre-populated cache are
      # unaffected.
      - HF_TOKEN=${HF_TOKEN:-}
      - HF_HOME=/app/.cache/huggingface
      - TRANSFORMERS_CACHE=/app/.cache/huggingface/transformers
    volumes:
      # Cache Hugging Face models for faster startup
      - ~/.cache/huggingface:/app/.cache/huggingface
      - /tmp:/tmp
    # The CLI flags below are what actually configure vLLM; the environment
    # variables above mirror them for operator visibility. Keep the two in sync.
    # (--disable-log-stats was dropped: it contradicted DISABLE_LOG_STATS=false
    # and silently turned stats logging off.)
    command: >
      python -m vllm.entrypoints.openai.api_server
      --model meta-llama/Llama-3.1-8B-Instruct
      --host 0.0.0.0
      --port 8001
      --tensor-parallel-size 1
      --max-model-len 4096
      --gpu-memory-utilization 0.9
      --quantization fp8
      --kv-cache-dtype fp8
      --enable-chunked-prefill
      --max-num-batched-tokens 8192
      --max-num-seqs 256
      --trust-remote-code
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8001/v1/models"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 300s  # 5 minutes for model loading
    networks:
      - vllm-network

  # Benchmark runner service
  vllm-benchmark:
    build:
      context: .
      dockerfile: Dockerfile.benchmark
    container_name: vllm-benchmark
    depends_on:
      # Wait for the model server to pass its healthcheck before benchmarking.
      vllm-llama3-8b:
        condition: service_healthy
    environment:
      - VLLM_URL=http://vllm-llama3-8b:8001
    volumes:
      - ./benchmark_results:/app/results
    networks:
      - vllm-network
    profiles:
      - benchmark  # Only start when explicitly requested

networks:
  vllm-network:
    driver: bridge

volumes:
  # NOTE(review): vllm_cache is declared but never mounted by any service
  # above (the HF cache uses a host bind mount instead). Kept for backward
  # compatibility — confirm whether it can be removed.
  vllm_cache:
    driver: local