# Mirror of https://github.com/NVIDIA/dgx-spark-playbooks.git
# Synced 2026-04-23 02:23:53 +00:00
# 52 lines, 1.1 KiB, YAML
# Docker Compose definition for one GPU-backed service, `vllm-nvfp4`.
# The image is built from the Dockerfile in this directory and the container
# is started through /app/launch_server.sh, which is bind-mounted from the host.
version: '3.8'

services:
  vllm-nvfp4:
    build:
      context: .
      dockerfile: Dockerfile
    container_name: vllm-nvfp4-server
    ports:
      # HOST:CONTAINER, kept quoted so it is always read as a string.
      - "8001:8001"
    environment:
      # HuggingFace configuration
      # HF_TOKEN is interpolated by Compose from the environment it runs in.
      - HF_TOKEN=${HF_TOKEN}
      # Same path the huggingface_cache volume is mounted at below.
      - HF_HOME=/app/models/.cache

    volumes:
      # Cache HuggingFace models locally
      - ./models:/app/models
      # Named volume mounted inside the bind mount above; it is the HF_HOME path.
      - huggingface_cache:/app/models/.cache
      # Mount the launch script. Read-only: the container only executes it
      # (see entrypoint), so it should not be able to modify the host copy.
      - ./launch_server.sh:/app/launch_server.sh:ro

    # NVIDIA recommended settings for PyTorch
    ipc: host
    ulimits:
      # -1 removes the locked-memory limit.
      memlock: -1
      # 67108864 bytes = 64 MiB stack limit.
      stack: 67108864
    # NOTE(review): with `ipc: host` the container presumably shares the host's
    # /dev/shm, so this size may have no effect - confirm before relying on it.
    shm_size: 2gb

    deploy:
      resources:
        reservations:
          devices:
            # Reserve every NVIDIA GPU on the host for this service.
            - driver: nvidia
              count: all
              capabilities: [gpu]

    restart: unless-stopped

    # Run the mounted script with bash explicitly, so the file does not need
    # its executable bit set on the host.
    entrypoint: ["/bin/bash", "/app/launch_server.sh"]

    healthcheck:
      # Probes the same port that is published above.
      # NOTE(review): requires `curl` inside the image - verify in the Dockerfile.
      test: ["CMD", "curl", "-f", "http://localhost:8001/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      # Failures during the first 120s after start do not count toward retries.
      start_period: 120s

volumes:
  huggingface_cache:
    driver: local