From 529debb633543035fa2dbe5d577e638b45cc9847 Mon Sep 17 00:00:00 2001
From: Santosh Bhavani
Date: Sun, 19 Oct 2025 20:56:58 -0700
Subject: [PATCH] perf(docker): increase Ollama parallel processing for DGX

- Increase OLLAMA_NUM_PARALLEL from 1 to 4 requests
- Leverage DGX Spark's unified memory architecture
- Improve throughput for concurrent inference requests
---
 nvidia/txt2kg/assets/deploy/compose/docker-compose.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nvidia/txt2kg/assets/deploy/compose/docker-compose.yml b/nvidia/txt2kg/assets/deploy/compose/docker-compose.yml
index 7919997..79cf23c 100644
--- a/nvidia/txt2kg/assets/deploy/compose/docker-compose.yml
+++ b/nvidia/txt2kg/assets/deploy/compose/docker-compose.yml
@@ -74,7 +74,7 @@ services:
       - NVIDIA_DRIVER_CAPABILITIES=compute,utility # Required capabilities for CUDA
       - OLLAMA_FLASH_ATTENTION=1 # Enable flash attention for better performance
       - OLLAMA_KEEP_ALIVE=30m # Keep models loaded for 30 minutes
-      - OLLAMA_NUM_PARALLEL=1 # Process one request at a time for 70B models
+      - OLLAMA_NUM_PARALLEL=4 # Process 4 requests in parallel - DGX Spark has unified memory
       - OLLAMA_MAX_LOADED_MODELS=1 # Load only one model at a time to avoid VRAM contention
       - OLLAMA_KV_CACHE_TYPE=q8_0 # Reduce KV cache VRAM usage with minimal performance impact
     networks: