dgx-spark-playbooks/nvidia/txt2kg/assets/deploy/services/ollama/docker-compose.yml
2025-10-06 17:05:41 +00:00

67 lines
2.1 KiB
YAML

version: '3.8'

services:
  # Ollama inference server with NVIDIA GPU access.
  # Exposes the standard Ollama API on host port 11434.
  ollama:
    build:
      context: .
      dockerfile: Dockerfile
    image: ollama-custom:latest
    container_name: ollama-server
    ports:
      - "11434:11434"
    volumes:
      # Persist downloaded models across container recreation.
      - ollama_models:/root/.ollama
    environment:
      - OLLAMA_HOST=0.0.0.0:11434
      - OLLAMA_FLASH_ATTENTION=1
      - OLLAMA_KEEP_ALIVE=30m
      - OLLAMA_CUDA=1
      # Performance tuning for large models like Llama3 70B
      - OLLAMA_LLM_LIBRARY=cuda
      - OLLAMA_NUM_PARALLEL=1  # Favor latency/stability for 70B; increase for smaller models
      - OLLAMA_MAX_LOADED_MODELS=1  # Avoid VRAM contention
      - OLLAMA_KV_CACHE_TYPE=q8_0  # Reduce KV cache VRAM with minimal perf impact
      # Removed restrictive settings for 70B model testing:
      # - OLLAMA_CONTEXT_LENGTH=8192 (let Ollama auto-detect)
      # - OLLAMA_NUM_PARALLEL=4 (let Ollama decide)
      # - OLLAMA_MAX_LOADED=1 (allow multiple models)
      # - OLLAMA_NUM_THREADS=16 (may force CPU usage)
    # `runtime: nvidia` (legacy) and `deploy.resources` (Compose spec) are
    # both declared so GPU access works across Docker/Compose versions.
    runtime: nvidia
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:11434/api/tags"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s

  # GPU Memory Monitor - only for unified memory systems like DGX Spark
  gpu-monitor:
    build:
      context: .
      dockerfile: Dockerfile.monitor
    container_name: ollama-gpu-monitor
    depends_on:
      - ollama
    volumes:
      # Read-only Docker socket so the monitor can inspect/restart containers.
      - /var/run/docker.sock:/var/run/docker.sock:ro
    environment:
      - CHECK_INTERVAL=60  # Check every 60 seconds
      - MIN_AVAILABLE_PERCENT=70  # Alert if less than 70% GPU memory available
      - AUTO_FIX=true  # Automatically fix buffer cache issues
    privileged: true  # Required to clear buffer cache and restart containers
    restart: unless-stopped
    profiles:
      - unified-memory  # Only start with --profile unified-memory

volumes:
  ollama_models:
    driver: local