# Docker Compose stack: Ollama LLM server (GPU) + optional GPU-memory monitor.
version: '3.8'

services:
  ollama:
    build:
      context: .
      dockerfile: Dockerfile
    image: ollama-custom:latest
    container_name: ollama-server
    ports:
      - "11434:11434"
    volumes:
      - ollama_models:/root/.ollama
    environment:
      - OLLAMA_HOST=0.0.0.0:11434
      - OLLAMA_FLASH_ATTENTION=1
      - OLLAMA_KEEP_ALIVE=30m
      - OLLAMA_CUDA=1
      # Performance tuning for large models like Llama3 70B
      - OLLAMA_LLM_LIBRARY=cuda
      - OLLAMA_NUM_PARALLEL=1  # Favor latency/stability for 70B; increase for smaller models
      - OLLAMA_MAX_LOADED_MODELS=1  # Avoid VRAM contention
      - OLLAMA_KV_CACHE_TYPE=q8_0  # Reduce KV cache VRAM with minimal perf impact
      # Removed restrictive settings for 70B model testing:
      # - OLLAMA_CONTEXT_LENGTH=8192 (let Ollama auto-detect)
      # - OLLAMA_NUM_PARALLEL=4 (let Ollama decide)
      # - OLLAMA_MAX_LOADED=1 (allow multiple models)
      # - OLLAMA_NUM_THREADS=16 (may force CPU usage)
    runtime: nvidia
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:11434/api/tags"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s

  # GPU Memory Monitor - only for unified memory systems like DGX Spark
  gpu-monitor:
    build:
      context: .
      dockerfile: Dockerfile.monitor
    container_name: ollama-gpu-monitor
    depends_on:
      - ollama
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock:ro
    environment:
      - CHECK_INTERVAL=60  # Check every 60 seconds
      - MIN_AVAILABLE_PERCENT=70  # Alert if less than 70% GPU memory available
      - AUTO_FIX=true  # Automatically fix buffer cache issues
    privileged: true  # Required to clear buffer cache and restart containers
    restart: unless-stopped
    profiles:
      - unified-memory  # Only start with --profile unified-memory

volumes:
  ollama_models:
    driver: local