# dgx-spark-playbooks/nvidia/txt2kg/assets/deploy/services/vllm/docker-compose.yml

# Compose file format version, quoted so it stays a string rather than a float.
# NOTE(review): Docker Compose v2 reportedly treats the top-level `version`
# key as obsolete and ignores it — confirm no legacy docker-compose v1 users
# depend on it before removing.
version: '3.8'

services:
  # Service built from the Dockerfile in this directory; the bind-mounted
  # launch_server.sh script is used as its entrypoint.
  vllm-nvfp4:
    build:
      context: .
      dockerfile: Dockerfile
    container_name: vllm-nvfp4-server
    ports:
      # host:container — the same port the healthcheck below probes.
      - "8001:8001"
    environment:
      # HuggingFace configuration
      # The ":-" form substitutes an empty string when HF_TOKEN is not set on
      # the host, so Compose does not warn about an unset variable.
      - HF_TOKEN=${HF_TOKEN:-}
      # Same path the huggingface_cache named volume is mounted at.
      - HF_HOME=/app/models/.cache

    volumes:
      # Cache HuggingFace models locally
      - ./models:/app/models
      # Named volume mounted inside the ./models bind mount; it covers
      # /app/models/.cache, the directory HF_HOME points to.
      - huggingface_cache:/app/models/.cache
      # Mount the launch script read-only: the container only executes it
      # (see entrypoint) and has no need to modify the host copy.
      - ./launch_server.sh:/app/launch_server.sh:ro

    # NVIDIA recommended settings for PyTorch
    ipc: host
    ulimits:
      # -1 removes the locked-memory limit.
      memlock: -1
      # 67108864 bytes = 64 MiB stack limit.
      stack: 67108864
    # NOTE(review): with `ipc: host` the container shares the host's /dev/shm,
    # so this size is presumably not applied — confirm whether it is needed.
    shm_size: 2gb

    deploy:
      resources:
        reservations:
          devices:
            # Reserve every GPU exposed by the nvidia driver.
            - driver: nvidia
              count: all
              capabilities: [gpu]

    restart: unless-stopped

    # Run the mounted script through bash, so the file does not need its
    # executable bit set on the host.
    entrypoint: ["/bin/bash", "/app/launch_server.sh"]

    healthcheck:
      # Probes /health on port 8001 from inside the container; `-f` makes curl
      # fail on HTTP error responses.
      # NOTE(review): requires curl to be present in the image — confirm the
      # Dockerfile installs it.
      test: ["CMD", "curl", "-f", "http://localhost:8001/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      # Startup grace period before failed probes count toward `retries`.
      start_period: 120s

volumes:
  # Named volume that the vllm-nvfp4 service mounts at /app/models/.cache,
  # the same path its HF_HOME environment variable is set to.
  huggingface_cache:
    driver: local