diff --git a/nvidia/txt2kg/assets/deploy/compose/docker-compose.complete.yml b/nvidia/txt2kg/assets/deploy/compose/docker-compose.complete.yml
index 3b57d4f..b6e464e 100644
--- a/nvidia/txt2kg/assets/deploy/compose/docker-compose.complete.yml
+++ b/nvidia/txt2kg/assets/deploy/compose/docker-compose.complete.yml
@@ -104,7 +104,7 @@ services:
       - OLLAMA_FLASH_ATTENTION=1
       - OLLAMA_KEEP_ALIVE=30m
       - OLLAMA_CUDA=1
-      - OLLAMA_LLM_LIBRARY=cuda
+      - OLLAMA_LLM_LIBRARY=cuda_v13
       - OLLAMA_NUM_PARALLEL=1
       - OLLAMA_MAX_LOADED_MODELS=1
       - OLLAMA_KV_CACHE_TYPE=q8_0
diff --git a/nvidia/txt2kg/assets/deploy/compose/docker-compose.yml b/nvidia/txt2kg/assets/deploy/compose/docker-compose.yml
index 07e9e7d..6f8c99c 100644
--- a/nvidia/txt2kg/assets/deploy/compose/docker-compose.yml
+++ b/nvidia/txt2kg/assets/deploy/compose/docker-compose.yml
@@ -75,6 +75,7 @@ services:
       - OLLAMA_NUM_PARALLEL=4 # Process 4 requests in parallel - DGX Spark has unified memory
       - OLLAMA_MAX_LOADED_MODELS=1 # Load only one model at a time to avoid VRAM contention
       - OLLAMA_KV_CACHE_TYPE=q8_0 # Reduce KV cache VRAM usage with minimal performance impact
+      - OLLAMA_LLM_LIBRARY=cuda_v13 # Force usage of CUDA v13 library
     networks:
       - default
     restart: unless-stopped