# Mirror of https://github.com/NVIDIA/dgx-spark-playbooks.git
# Synced: 2026-04-25 03:13:53 +00:00
# Compose file format version (obsolete in Compose V2+, kept for legacy tooling).
version: '3.8'

services:
  # Ollama inference server with NVIDIA GPU access.
  ollama:
    build:
      context: .
      dockerfile: Dockerfile
    image: ollama-custom:latest
    container_name: ollama-server
    ports:
      - "11434:11434"
    volumes:
      - ollama_models:/root/.ollama
    environment:
      - OLLAMA_HOST=0.0.0.0:11434
      - OLLAMA_FLASH_ATTENTION=1
      - OLLAMA_KEEP_ALIVE=30m
      - OLLAMA_CUDA=1
      # Performance tuning for large models like Llama3 70B
      - OLLAMA_LLM_LIBRARY=cuda
      - OLLAMA_NUM_PARALLEL=1  # Favor latency/stability for 70B; increase for smaller models
      - OLLAMA_MAX_LOADED_MODELS=1  # Avoid VRAM contention
      - OLLAMA_KV_CACHE_TYPE=q8_0  # Reduce KV cache VRAM with minimal perf impact
      # Removed restrictive settings for 70B model testing:
      # - OLLAMA_CONTEXT_LENGTH=8192 (let Ollama auto-detect)
      # - OLLAMA_NUM_PARALLEL=4 (let Ollama decide)
      # - OLLAMA_MAX_LOADED=1 (allow multiple models)
      # - OLLAMA_NUM_THREADS=16 (may force CPU usage)
    runtime: nvidia
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:11434/api/tags"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s

  # GPU Memory Monitor - only for unified memory systems like DGX Spark
  gpu-monitor:
    build:
      context: .
      dockerfile: Dockerfile.monitor
    container_name: ollama-gpu-monitor
    depends_on:
      - ollama
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock:ro
    environment:
      - CHECK_INTERVAL=60  # Check every 60 seconds
      - MIN_AVAILABLE_PERCENT=70  # Alert if less than 70% GPU memory available
      - AUTO_FIX=true  # Automatically fix buffer cache issues
    privileged: true  # Required to clear buffer cache and restart containers
    restart: unless-stopped
    profiles:
      - unified-memory  # Only start with --profile unified-memory

volumes:
  ollama_models:
    driver: local