From cb22754d2f768582de0348f6afd90750d428fb4d Mon Sep 17 00:00:00 2001 From: seongyongkim Date: Thu, 13 Nov 2025 23:06:13 -0800 Subject: [PATCH] FIX: 100% CPU - Change OLLAMA_LLM_LIBRARY to cuda_v13 The current value "cuda" is invalid and causes Ollama to fall back to 100% CPU. Updated OLLAMA_LLM_LIBRARY environment variable for compatibility with DGX Spark with CUDA 13.0. --- nvidia/txt2kg/assets/deploy/compose/docker-compose.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nvidia/txt2kg/assets/deploy/compose/docker-compose.yml b/nvidia/txt2kg/assets/deploy/compose/docker-compose.yml index 5916938..5492be9 100644 --- a/nvidia/txt2kg/assets/deploy/compose/docker-compose.yml +++ b/nvidia/txt2kg/assets/deploy/compose/docker-compose.yml @@ -61,7 +61,9 @@ services: - OLLAMA_FLASH_ATTENTION=1 # Enable flash attention for better performance - OLLAMA_KEEP_ALIVE=30m # Keep models loaded for 30 minutes - OLLAMA_CUDA=1 # Enable CUDA acceleration - - OLLAMA_LLM_LIBRARY=cuda # Use CUDA library for LLM operations + - OLLAMA_LLM_LIBRARY=cuda_v13 # The correct value for DGX Spark is cuda_v13. "cuda" will fall back to 100% CPU. + # Valid values are [cuda_jetpack5, cuda_jetpack6, cuda_v12, cuda_v13]. + # Can be found in /usr/lib/ollama - OLLAMA_NUM_PARALLEL=1 # Process one request at a time for 70B models - OLLAMA_MAX_LOADED_MODELS=1 # Load only one model at a time to avoid VRAM contention - OLLAMA_KV_CACHE_TYPE=q8_0 # Reduce KV cache VRAM usage with minimal performance impact