---
# Compose file format version. Quoted so it stays a string rather than the
# float 3.8. NOTE(review): recent Docker Compose releases are understood to
# treat this key as informational only; it is kept for older tooling.
version: '3.8'

services:
  # Inference server built from the local Dockerfile and started through
  # launch_server.sh; published on port 8001 and probed on /health.
  vllm-nvfp4:
    build:
      context: .
      dockerfile: Dockerfile
    container_name: vllm-nvfp4-server
    ports:
      # host:container. Kept quoted so YAML always reads it as a string.
      - "8001:8001"
    environment:
      # HuggingFace configuration.
      # ":-" gives an empty value when HF_TOKEN is unset on the host, without
      # Compose printing an "unset variable" warning on every command.
      - HF_TOKEN=${HF_TOKEN:-}
      - HF_HOME=/app/models/.cache

    volumes:
      # Cache HuggingFace models locally.
      - ./models:/app/models
      # The named volume is mounted over the .cache subdirectory of the bind
      # mount above, so HF_HOME data lives in the Docker volume rather than
      # in ./models/.cache on the host.
      - huggingface_cache:/app/models/.cache
      # Mount the launch script read-only: the container only executes it.
      - ./launch_server.sh:/app/launch_server.sh:ro

    # NVIDIA recommended settings for PyTorch.
    ipc: host
    ulimits:
      # -1 removes the locked-memory limit.
      memlock: -1
      # 67108864 bytes = 64 MiB stack.
      stack: 67108864
    # With "ipc: host" the container shares the host's /dev/shm, so this size
    # only takes effect if "ipc: host" is ever removed.
    shm_size: 2gb

    # Reserve every NVIDIA GPU on the host for this container.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]

    restart: unless-stopped

    # Replaces the image's entrypoint with the mounted launch script.
    entrypoint: ["/bin/bash", "/app/launch_server.sh"]

    healthcheck:
      # NOTE(review): requires curl to be installed in the image; confirm in
      # the Dockerfile, otherwise the check fails even when the server is up.
      test: ["CMD", "curl", "-f", "http://localhost:8001/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      # Failures during the first 120s are not counted towards "retries",
      # giving the server time to start before it can be marked unhealthy.
      start_period: 120s

# Named volumes declared for this Compose project.
volumes:
  # Mounted by the vllm-nvfp4 service at /app/models/.cache, which is the
  # path that service sets as HF_HOME.
  huggingface_cache:
    driver: local