---
# Docker Compose stack: vLLM OpenAI-compatible server (Llama 3.1 8B, fp8)
# plus an on-demand benchmark runner (enabled via the "benchmark" profile).
services:
  vllm-llama3-8b:
    image: nvcr.io/nvidia/vllm:25.09-py3
    container_name: vllm-llama3-8b
    ports:
      - "8001:8001"
    # NOTE(review): these environment variables mirror the CLI flags passed in
    # `command` below; the explicit flags are what the server actually uses —
    # keep the two in sync (or drop the env duplicates) to avoid drift.
    environment:
      # Model configuration - Llama3 8B
      - MODEL_NAME=meta-llama/Llama-3.1-8B-Instruct
      - TENSOR_PARALLEL_SIZE=1
      - MAX_MODEL_LEN=4096
      - GPU_MEMORY_UTILIZATION=0.9
      # Performance optimizations
      - QUANTIZATION=fp8
      - KV_CACHE_DTYPE=fp8
      - ENABLE_CHUNKED_PREFILL=true
      - MAX_NUM_BATCHED_TOKENS=8192
      - MAX_NUM_SEQS=256
      # Service configuration
      - HOST=0.0.0.0
      - PORT=8001
      # NOTE(review): DISABLE_LOG_STATS=false contradicts the
      # --disable-log-stats flag in `command` — confirm which is intended.
      - DISABLE_LOG_STATS=false
      - DISABLE_LOG_REQUESTS=false
      # CUDA settings
      - CUDA_VISIBLE_DEVICES=0
      - NCCL_DEBUG=INFO
      # Hugging Face settings
      - HF_HOME=/app/.cache/huggingface
      # NOTE(review): TRANSFORMERS_CACHE is deprecated in recent transformers
      # releases in favor of HF_HOME — harmless to keep, verify before removal.
      - TRANSFORMERS_CACHE=/app/.cache/huggingface/transformers
    volumes:
      # Cache Hugging Face models for faster startup
      - ~/.cache/huggingface:/app/.cache/huggingface
      - /tmp:/tmp
    # Folded block scalar: the whole command is one logical line at runtime.
    command: >
      python -m vllm.entrypoints.openai.api_server
      --model meta-llama/Llama-3.1-8B-Instruct
      --host 0.0.0.0
      --port 8001
      --tensor-parallel-size 1
      --max-model-len 4096
      --gpu-memory-utilization 0.9
      --quantization fp8
      --kv-cache-dtype fp8
      --enable-chunked-prefill
      --max-num-batched-tokens 8192
      --max-num-seqs 256
      --disable-log-stats
      --trust-remote-code
    # Reserve one NVIDIA GPU for this container.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8001/v1/models"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 300s  # 5 minutes for model loading
    networks:
      - vllm-network

  # Benchmark runner service
  vllm-benchmark:
    build:
      context: .
      dockerfile: Dockerfile.benchmark
    container_name: vllm-benchmark
    # Wait until the server's healthcheck passes before benchmarking.
    depends_on:
      vllm-llama3-8b:
        condition: service_healthy
    environment:
      - VLLM_URL=http://vllm-llama3-8b:8001
    volumes:
      - ./benchmark_results:/app/results
    networks:
      - vllm-network
    profiles:
      - benchmark  # Only start when explicitly requested

networks:
  vllm-network:
    driver: bridge

# NOTE(review): vllm_cache is declared but not mounted by any service above —
# confirm whether it is used elsewhere or can be removed.
volumes:
  vllm_cache:
    driver: local