# Source: dgx-spark-playbooks/nvidia/multi-agent-chatbot/assets/docker-compose-models.yml
# (exported 2025-10-06 12:57:08 +00:00)
#
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Shared build settings for every llama.cpp-based service below.
# Compose "extension field" (x- prefix) merged into services via `<<: *build-base`;
# all such services build the same local CUDA-enabled llama.cpp server image.
x-build-base: &build-base
  build:
    context: .
    dockerfile: Dockerfile.llamacpp
  image: local/llama.cpp:server-cuda
services:
  # Vision-language model served by TensorRT-LLM. Exposes an
  # OpenAI-compatible API on container port 8000; no host port is
  # published, so it is reachable only by service name on the
  # compose network (chatbot-net).
  qwen2.5-vl:
    image: nvcr.io/nvidia/tensorrt-llm/release:spark-single-gpu-dev
    container_name: qwen2.5-vl
    shm_size: '1g'
    restart: unless-stopped
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    environment:
      - TOKENIZERS_PARALLELISM=false
      - NCCL_P2P_LEVEL=SYS
      - NCCL_DEBUG=INFO
      - UCX_TLS=tcp,sm,self
      - UCX_MEMTYPE_CACHE=n
      - CUDA_VISIBLE_DEVICES=0
    # Folded block scalar: joined into one command line at runtime.
    command: >
      trtllm-serve serve Qwen/Qwen2-VL-7B-Instruct
      --backend pytorch
      --host 0.0.0.0
      --port 8000
      --trust_remote_code
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 5
      # NOTE(review): 60s may be tight for first-time model load/compile —
      # confirm against observed startup time.
      start_period: 60s
qwen3-embedding:
<<: *build-base
container_name: qwen3-embedding
volumes:
- ./models:/models
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
command:
- "-m"
- "/models/Qwen3-Embedding-4B-Q8_0.gguf"
- "--port"
- "8000"
- "--host"
- "0.0.0.0"
- "--jinja"
- "--embeddings"
  # Uncomment the next block if you want to use gpt-oss-20b
  # (indented so that uncommenting yields valid service config):
  # gpt-oss-20b:
  #   <<: *build-base
  #   container_name: gpt-oss-20b
  #   volumes:
  #     - ./models:/models
  #   deploy:
  #     resources:
  #       reservations:
  #         devices:
  #           - driver: nvidia
  #             count: all
  #             capabilities: [gpu]
  #   command:
  #     - "-m"
  #     - "/models/gpt-oss-20b-mxfp4.gguf"
  #     - "--port"
  #     - "8000"
  #     - "--host"
  #     - "0.0.0.0"
  #     - "-n"
  #     - "2048"
  #     - "--n-gpu-layers"
  #     - "999"
  #     - "--jinja"
  # Comment out the next block if you want to use gpt-oss-20b instead
gpt-oss-120b:
<<: *build-base
container_name: gpt-oss-120b
volumes:
- ./models:/models
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
command:
- "-m"
- "/models/gpt-oss-120b-mxfp4-00001-of-00003.gguf"
- "--port"
- "8000"
- "--host"
- "0.0.0.0"
- "-n"
- "65536"
- "--n-gpu-layers"
- "70"
- "--jinja"
deepseek-coder:
<<: *build-base
container_name: deepseek-coder
volumes:
- ./models:/models
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
command:
- "-m"
- "/models/deepseek-coder-6.7b-instruct.Q8_0.gguf"
- "--port"
- "8000"
- "--host"
- "0.0.0.0"
- "-n"
- "256"
- "--n-gpu-layers"
- "999"
- "--jinja"
# NOTE(review): ollama-data is declared but not mounted by any service in
# this file — possibly a leftover from an earlier Ollama-based setup.
# Kept for backward compatibility; confirm before removing.
volumes:
  ollama-data:

# All services join a single shared network so they can reach each other
# by service name; the fixed name lets other compose files attach to it.
networks:
  default:
    name: chatbot-net