mirror of
https://github.com/NVIDIA/dgx-spark-playbooks.git
synced 2026-04-22 18:13:52 +00:00
fix(txt2kg): force ollama to use cuda_v13 library to enable GPU usage
This commit is contained in:
parent
5472c97a8c
commit
3af55d4b43
@@ -104,7 +104,7 @@ services:
|
||||
- OLLAMA_FLASH_ATTENTION=1
|
||||
- OLLAMA_KEEP_ALIVE=30m
|
||||
- OLLAMA_CUDA=1
|
||||
- - OLLAMA_LLM_LIBRARY=cuda
|
||||
+ - OLLAMA_LLM_LIBRARY=cuda_v13
|
||||
- OLLAMA_NUM_PARALLEL=1
|
||||
- OLLAMA_MAX_LOADED_MODELS=1
|
||||
- OLLAMA_KV_CACHE_TYPE=q8_0
|
||||
|
||||
@@ -75,6 +75,7 @@ services:
|
||||
- OLLAMA_NUM_PARALLEL=4 # Process 4 requests in parallel - DGX Spark has unified memory
|
||||
- OLLAMA_MAX_LOADED_MODELS=1 # Load only one model at a time to avoid VRAM contention
|
||||
- OLLAMA_KV_CACHE_TYPE=q8_0 # Reduce KV cache VRAM usage with minimal performance impact
|
||||
+ - OLLAMA_LLM_LIBRARY=cuda_v13 # Force usage of CUDA v13 library
|
||||
networks:
|
||||
- default
|
||||
restart: unless-stopped
|
||||
|
||||
Loading…
Reference in New Issue
Block a user