dgx-spark-playbooks/nvidia/txt2kg/assets/deploy/services/vllm/docker-compose.llama3-8b.yml
2025-10-06 17:05:41 +00:00

101 lines
2.4 KiB
YAML

services:
  # OpenAI-compatible vLLM server hosting Llama-3.1-8B-Instruct on one GPU.
  vllm-llama3-8b:
    image: nvcr.io/nvidia/vllm:25.09-py3
    container_name: vllm-llama3-8b
    ports:
      - "8001:8001"
    environment:
      # Model configuration - Llama3 8B
      - MODEL_NAME=meta-llama/Llama-3.1-8B-Instruct
      - TENSOR_PARALLEL_SIZE=1
      - MAX_MODEL_LEN=4096
      - GPU_MEMORY_UTILIZATION=0.9
      # Performance optimizations
      - QUANTIZATION=fp8
      - KV_CACHE_DTYPE=fp8
      - ENABLE_CHUNKED_PREFILL=true
      - MAX_NUM_BATCHED_TOKENS=8192
      - MAX_NUM_SEQS=256
      # Service configuration
      - HOST=0.0.0.0
      - PORT=8001
      - DISABLE_LOG_STATS=false
      - DISABLE_LOG_REQUESTS=false
      # CUDA settings
      - CUDA_VISIBLE_DEVICES=0
      - NCCL_DEBUG=INFO
      # Hugging Face settings
      # NOTE(review): meta-llama/Llama-3.1-8B-Instruct is a gated repository;
      # forward a token from the host so the first download can authenticate.
      # Defaults to empty, so existing setups with a pre-populated cache are
      # unaffected.
      - HF_TOKEN=${HF_TOKEN:-}
      - HF_HOME=/app/.cache/huggingface
      - TRANSFORMERS_CACHE=/app/.cache/huggingface/transformers
    volumes:
      # Cache Hugging Face models for faster startup
      - ~/.cache/huggingface:/app/.cache/huggingface
      - /tmp:/tmp
    # The CLI flags below are what actually configure vLLM; the environment
    # variables above mirror them for operator visibility. Keep the two in sync.
    # (--disable-log-stats was dropped: it contradicted DISABLE_LOG_STATS=false
    # and silently turned stats logging off.)
    command: >
      python -m vllm.entrypoints.openai.api_server
      --model meta-llama/Llama-3.1-8B-Instruct
      --host 0.0.0.0
      --port 8001
      --tensor-parallel-size 1
      --max-model-len 4096
      --gpu-memory-utilization 0.9
      --quantization fp8
      --kv-cache-dtype fp8
      --enable-chunked-prefill
      --max-num-batched-tokens 8192
      --max-num-seqs 256
      --trust-remote-code
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8001/v1/models"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 300s  # 5 minutes for model loading
    networks:
      - vllm-network

  # Benchmark runner service
  vllm-benchmark:
    build:
      context: .
      dockerfile: Dockerfile.benchmark
    container_name: vllm-benchmark
    depends_on:
      # Wait for the model server to pass its healthcheck before benchmarking.
      vllm-llama3-8b:
        condition: service_healthy
    environment:
      - VLLM_URL=http://vllm-llama3-8b:8001
    volumes:
      - ./benchmark_results:/app/results
    networks:
      - vllm-network
    profiles:
      - benchmark  # Only start when explicitly requested

networks:
  vllm-network:
    driver: bridge

volumes:
  # NOTE(review): vllm_cache is declared but never mounted by any service
  # above (the HF cache uses a host bind mount instead). Kept for backward
  # compatibility — confirm whether it can be removed.
  vllm_cache:
    driver: local