# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

x-build-base: &build-base
  build:
    context: .
    dockerfile: Dockerfile.llamacpp
  image: local/llama.cpp:server-cuda

services:
  qwen2.5-vl:
    image: nvcr.io/nvidia/tensorrt-llm/release:spark-single-gpu-dev
    container_name: qwen2.5-vl
    shm_size: '1g'
    restart: unless-stopped
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    environment:
      - TOKENIZERS_PARALLELISM=false
      - NCCL_P2P_LEVEL=SYS
      - NCCL_DEBUG=INFO
      - UCX_TLS=tcp,sm,self
      - UCX_MEMTYPE_CACHE=n
      - CUDA_VISIBLE_DEVICES=0
    command: >
      trtllm-serve serve Qwen/Qwen2-VL-7B-Instruct
      --backend pytorch
      --host 0.0.0.0
      --port 8000
      --trust_remote_code
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 60s

  qwen3-embedding:
    <<: *build-base
    container_name: qwen3-embedding
    volumes:
      - ./models:/models
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    command:
      - "-m"
      - "/models/Qwen3-Embedding-4B-Q8_0.gguf"
      - "--port"
      - "8000"
      - "--host"
      - "0.0.0.0"
      - "--jinja"
      - "--embeddings"

  # Uncomment the next block to use gpt-oss-20b instead of gpt-oss-120b.
  # gpt-oss-20b:
  #   <<: *build-base
  #   container_name: gpt-oss-20b
  #   volumes:
  #     - ./models:/models
  #   deploy:
  #     resources:
  #       reservations:
  #         devices:
  #           - driver: nvidia
  #             count: all
  #             capabilities: [gpu]
  #   command:
  #     - "-m"
  #     - "/models/gpt-oss-20b-mxfp4.gguf"
  #     - "--port"
  #     - "8000"
  #     - "--host"
  #     - "0.0.0.0"
  #     - "-n"
  #     - "2048"
  #     - "--n-gpu-layers"
  #     - "999"
  #     - "--jinja"

  # Comment out the next block if you use gpt-oss-20b above.
  gpt-oss-120b:
    <<: *build-base
    container_name: gpt-oss-120b
    volumes:
      - ./models:/models
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    command:
      - "-m"
      - "/models/gpt-oss-120b-mxfp4-00001-of-00003.gguf"
      - "--port"
      - "8000"
      - "--host"
      - "0.0.0.0"
      - "-n"
      - "65536"
      - "--n-gpu-layers"
      - "70"
      - "--jinja"

  deepseek-coder:
    <<: *build-base
    container_name: deepseek-coder
    volumes:
      - ./models:/models
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    command:
      - "-m"
      - "/models/deepseek-coder-6.7b-instruct.Q8_0.gguf"
      - "--port"
      - "8000"
      - "--host"
      - "0.0.0.0"
      - "-n"
      - "256"
      - "--n-gpu-layers"
      - "999"
      - "--jinja"

volumes:
  ollama-data:

networks:
  default:
    name: chatbot-net
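
# ---------------------------------------------------------------------------
# Usage sketch (not part of the original file; repo IDs and commands below
# are assumptions). The llama.cpp services expect GGUF files under ./models,
# which could be fetched with the Hugging Face CLI, e.g.:
#
#   huggingface-cli download Qwen/Qwen3-Embedding-4B-GGUF \
#     Qwen3-Embedding-4B-Q8_0.gguf --local-dir ./models
#
# No host ports are published, so the servers listening on port 8000 are
# reachable only from other containers on chatbot-net (e.g. at
# http://qwen3-embedding:8000). To probe a service from the host, exec into
# its container (assumes curl is present in the image, as the qwen2.5-vl
# healthcheck already requires):
#
#   docker compose up -d qwen3-embedding
#   docker compose exec qwen3-embedding curl -s http://localhost:8000/health
#
# llama.cpp's server exposes OpenAI-compatible routes; since qwen3-embedding
# is started with --embeddings, it should answer /v1/embeddings:
#
#   docker compose exec qwen3-embedding curl -s \
#     http://localhost:8000/v1/embeddings \
#     -H "Content-Type: application/json" \
#     -d '{"input": "example text"}'
# ---------------------------------------------------------------------------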