# Docker Compose stack: Ollama LLM server (GPU) + optional GPU-memory monitor.
version: '3.8'

services:
  ollama:
    build:
      context: .
      dockerfile: Dockerfile
    image: ollama-custom:latest
    container_name: ollama-server
    ports:
      - "11434:11434"
    volumes:
      - ollama_models:/root/.ollama
    environment:
      - OLLAMA_HOST=0.0.0.0:11434
      - OLLAMA_FLASH_ATTENTION=1
      - OLLAMA_KEEP_ALIVE=30m
      - OLLAMA_CUDA=1
      # Performance tuning for large models like Llama3 70B
      - OLLAMA_LLM_LIBRARY=cuda
      - OLLAMA_NUM_PARALLEL=1  # Favor latency/stability for 70B; increase for smaller models
      - OLLAMA_MAX_LOADED_MODELS=1  # Avoid VRAM contention
      - OLLAMA_KV_CACHE_TYPE=q8_0  # Reduce KV cache VRAM with minimal perf impact
      # Removed restrictive settings for 70B model testing:
      # - OLLAMA_CONTEXT_LENGTH=8192 (let Ollama auto-detect)
      # - OLLAMA_NUM_PARALLEL=4 (let Ollama decide)
      # - OLLAMA_MAX_LOADED=1 (allow multiple models)
      # - OLLAMA_NUM_THREADS=16 (may force CPU usage)
    runtime: nvidia
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:11434/api/tags"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s

  # GPU Memory Monitor - only for unified memory systems like DGX Spark
  gpu-monitor:
    build:
      context: .
      dockerfile: Dockerfile.monitor
    container_name: ollama-gpu-monitor
    depends_on:
      - ollama
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock:ro
    environment:
      - CHECK_INTERVAL=60  # Check every 60 seconds
      - MIN_AVAILABLE_PERCENT=70  # Alert if less than 70% GPU memory available
      - AUTO_FIX=true  # Automatically fix buffer cache issues
    privileged: true  # Required to clear buffer cache and restart containers
    restart: unless-stopped
    profiles:
      - unified-memory  # Only start with --profile unified-memory

volumes:
  ollama_models:
    driver: local