From 3af55d4b439aba6005b2d770d78aeceeeb50539c Mon Sep 17 00:00:00 2001 From: Santosh Bhavani Date: Fri, 12 Dec 2025 10:42:01 -0600 Subject: [PATCH] fix(txt2kg): force ollama to use cuda_v13 library to enable GPU usage --- nvidia/txt2kg/assets/deploy/compose/docker-compose.complete.yml | 2 +- nvidia/txt2kg/assets/deploy/compose/docker-compose.yml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/nvidia/txt2kg/assets/deploy/compose/docker-compose.complete.yml b/nvidia/txt2kg/assets/deploy/compose/docker-compose.complete.yml index 3b57d4f..b6e464e 100644 --- a/nvidia/txt2kg/assets/deploy/compose/docker-compose.complete.yml +++ b/nvidia/txt2kg/assets/deploy/compose/docker-compose.complete.yml @@ -104,7 +104,7 @@ services: - OLLAMA_FLASH_ATTENTION=1 - OLLAMA_KEEP_ALIVE=30m - OLLAMA_CUDA=1 - - OLLAMA_LLM_LIBRARY=cuda + - OLLAMA_LLM_LIBRARY=cuda_v13 - OLLAMA_NUM_PARALLEL=1 - OLLAMA_MAX_LOADED_MODELS=1 - OLLAMA_KV_CACHE_TYPE=q8_0 diff --git a/nvidia/txt2kg/assets/deploy/compose/docker-compose.yml b/nvidia/txt2kg/assets/deploy/compose/docker-compose.yml index 07e9e7d..6f8c99c 100644 --- a/nvidia/txt2kg/assets/deploy/compose/docker-compose.yml +++ b/nvidia/txt2kg/assets/deploy/compose/docker-compose.yml @@ -75,6 +75,7 @@ services: - OLLAMA_NUM_PARALLEL=4 # Process 4 requests in parallel - DGX Spark has unified memory - OLLAMA_MAX_LOADED_MODELS=1 # Load only one model at a time to avoid VRAM contention - OLLAMA_KV_CACHE_TYPE=q8_0 # Reduce KV cache VRAM usage with minimal performance impact + - OLLAMA_LLM_LIBRARY=cuda_v13 # Force the CUDA v13 library so Ollama runs on the GPU networks: - default restart: unless-stopped