#
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
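
# Reusable llama.cpp server definition; merged into services below with `<<: *build-base`.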
x-build-base: &build-base
  build:
    context: .
    dockerfile: Dockerfile.llamacpp
  image: local/llama.cpp:server-cuda

services:
  qwen2.5-vl:
    image: nvcr.io/nvidia/tensorrt-llm/release:spark-single-gpu-dev
    container_name: qwen2.5-vl
    shm_size: '1g'
    restart: unless-stopped
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    environment:
      - TOKENIZERS_PARALLELISM=false
      - NCCL_P2P_LEVEL=SYS
      - NCCL_DEBUG=INFO
      - UCX_TLS=tcp,sm,self
      - UCX_MEMTYPE_CACHE=n
      - CUDA_VISIBLE_DEVICES=0
    command: >
      trtllm-serve serve Qwen/Qwen2-VL-7B-Instruct
      --backend pytorch
      --host 0.0.0.0
      --port 8000
      --trust_remote_code
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 60s
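
  # qwen2.5-vl publishes no host ports, so it is only reachable on the
  # chatbot-net network (e.g. http://qwen2.5-vl:8000 from another container).
  # A minimal smoke test from the host, reusing the /health endpoint above:
  #   docker exec qwen2.5-vl curl -sf http://localhost:8000/health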

  qwen3-embedding:
    <<: *build-base
    container_name: qwen3-embedding
    volumes:
      - ./models:/models
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    command:
      - "-m"
      - "/models/Qwen3-Embedding-4B-Q8_0.gguf"
      - "--port"
      - "8000"
      - "--host"
      - "0.0.0.0"
      - "--jinja"
      - "--embeddings"
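
  # With --embeddings, llama.cpp's server exposes an OpenAI-compatible
  # /v1/embeddings endpoint. Example request from a container on chatbot-net
  # (a sketch; the input string is illustrative):
  #   curl -s http://qwen3-embedding:8000/v1/embeddings \
  #     -H "Content-Type: application/json" \
  #     -d '{"input": "hello world"}'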

  # Uncomment the next block to use gpt-oss-20b instead of gpt-oss-120b
  # gpt-oss-20b:
  #   <<: *build-base
  #   container_name: gpt-oss-20b
  #   volumes:
  #     - ./models:/models
  #   deploy:
  #     resources:
  #       reservations:
  #         devices:
  #           - driver: nvidia
  #             count: all
  #             capabilities: [gpu]
  #   command:
  #     - "-m"
  #     - "/models/gpt-oss-20b-mxfp4.gguf"
  #     - "--port"
  #     - "8000"
  #     - "--host"
  #     - "0.0.0.0"
  #     - "-n"
  #     - "2048"
  #     - "--n-gpu-layers"
  #     - "999"
  #     - "--jinja"

  # Comment out the next block if you switch to gpt-oss-20b
  gpt-oss-120b:
    <<: *build-base
    container_name: gpt-oss-120b
    volumes:
      - ./models:/models
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    command:
      - "-m"
      - "/models/gpt-oss-120b-mxfp4-00001-of-00003.gguf"
      - "--port"
      - "8000"
      - "--host"
      - "0.0.0.0"
      - "-n"
      - "65536"
      - "--n-gpu-layers"
      - "70"
      - "--jinja"
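
  # llama.cpp flag notes: -n caps the number of tokens generated per request;
  # --n-gpu-layers sets how many model layers are offloaded to the GPU (a value
  # at or above the model's layer count, such as 999, offloads everything).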

  deepseek-coder:
    <<: *build-base
    container_name: deepseek-coder
    volumes:
      - ./models:/models
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    command:
      - "-m"
      - "/models/deepseek-coder-6.7b-instruct.Q8_0.gguf"
      - "--port"
      - "8000"
      - "--host"
      - "0.0.0.0"
      - "-n"
      - "256"
      - "--n-gpu-layers"
      - "999"
      - "--jinja"
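
  # Each llama.cpp service also exposes an OpenAI-compatible chat endpoint.
  # Example request to deepseek-coder from a container on chatbot-net (a sketch):
  #   curl -s http://deepseek-coder:8000/v1/chat/completions \
  #     -H "Content-Type: application/json" \
  #     -d '{"messages": [{"role": "user", "content": "Write hello world in C"}]}'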

volumes:
  ollama-data:
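  # Note: ollama-data is declared but not mounted by any service in this file.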

networks:
  default:
    name: chatbot-net
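
# Other compose projects can join this network by declaring it external, e.g.:
#   networks:
#     default:
#       name: chatbot-net
#       external: true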