# Source: dgx-spark-playbooks/nvidia/multi-agent-chatbot/assets/docker-compose-models.yml
# (exported 2025-10-06 12:57:08 +00:00)
#
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Shared build settings for every llama.cpp-based service below.
# Compose "extension field" (x- prefix) merged into services via `<<: *build-base`;
# all such services build the same local CUDA-enabled llama.cpp server image.
x-build-base: &build-base
  build:
    context: .
    dockerfile: Dockerfile.llamacpp
  image: local/llama.cpp:server-cuda
services:
  # Vision-language model served by TensorRT-LLM. Exposes an
  # OpenAI-compatible API on container port 8000; no host port is
  # published, so it is reachable only by service name on the
  # compose network (chatbot-net).
  qwen2.5-vl:
    image: nvcr.io/nvidia/tensorrt-llm/release:spark-single-gpu-dev
    container_name: qwen2.5-vl
    shm_size: '1g'
    restart: unless-stopped
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    environment:
      - TOKENIZERS_PARALLELISM=false
      - NCCL_P2P_LEVEL=SYS
      - NCCL_DEBUG=INFO
      - UCX_TLS=tcp,sm,self
      - UCX_MEMTYPE_CACHE=n
      - CUDA_VISIBLE_DEVICES=0
    # Folded block scalar: joined into one command line at runtime.
    command: >
      trtllm-serve serve Qwen/Qwen2-VL-7B-Instruct
      --backend pytorch
      --host 0.0.0.0
      --port 8000
      --trust_remote_code
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 5
      # NOTE(review): 60s may be tight for first-time model load/compile —
      # confirm against observed startup time.
      start_period: 60s
qwen3-embedding:
<<: *build-base
container_name: qwen3-embedding
volumes:
- ./models:/models
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
command:
- "-m"
- "/models/Qwen3-Embedding-4B-Q8_0.gguf"
- "--port"
- "8000"
- "--host"
- "0.0.0.0"
- "--jinja"
- "--embeddings"
  # Uncomment the next block if you want to use gpt-oss-20b
  # (indented so that uncommenting yields valid service config):
  # gpt-oss-20b:
  #   <<: *build-base
  #   container_name: gpt-oss-20b
  #   volumes:
  #     - ./models:/models
  #   deploy:
  #     resources:
  #       reservations:
  #         devices:
  #           - driver: nvidia
  #             count: all
  #             capabilities: [gpu]
  #   command:
  #     - "-m"
  #     - "/models/gpt-oss-20b-mxfp4.gguf"
  #     - "--port"
  #     - "8000"
  #     - "--host"
  #     - "0.0.0.0"
  #     - "-n"
  #     - "2048"
  #     - "--n-gpu-layers"
  #     - "999"
  #     - "--jinja"
  # Comment out the next block if you want to use gpt-oss-20b instead
gpt-oss-120b:
<<: *build-base
container_name: gpt-oss-120b
volumes:
- ./models:/models
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
command:
- "-m"
- "/models/gpt-oss-120b-mxfp4-00001-of-00003.gguf"
- "--port"
- "8000"
- "--host"
- "0.0.0.0"
- "-n"
- "65536"
- "--n-gpu-layers"
- "70"
- "--jinja"
deepseek-coder:
<<: *build-base
container_name: deepseek-coder
volumes:
- ./models:/models
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
command:
- "-m"
- "/models/deepseek-coder-6.7b-instruct.Q8_0.gguf"
- "--port"
- "8000"
- "--host"
- "0.0.0.0"
- "-n"
- "256"
- "--n-gpu-layers"
- "999"
- "--jinja"
# NOTE(review): ollama-data is declared but not mounted by any service in
# this file — possibly a leftover from an earlier Ollama-based setup.
# Kept for backward compatibility; confirm before removing.
volumes:
  ollama-data:

# All services join a single shared network so they can reach each other
# by service name; the fixed name lets other compose files attach to it.
networks:
  default:
    name: chatbot-net