perf(docker): increase Ollama parallel processing for DGX

- Increase OLLAMA_NUM_PARALLEL from 1 to 4 requests
- Leverage DGX Spark's unified memory architecture
- Improve throughput for concurrent inference requests
Santosh Bhavani 2025-10-19 20:56:58 -07:00
parent ffb0688a63
commit 529debb633


@@ -74,7 +74,7 @@ services:
       - NVIDIA_DRIVER_CAPABILITIES=compute,utility # Required capabilities for CUDA
       - OLLAMA_FLASH_ATTENTION=1 # Enable flash attention for better performance
       - OLLAMA_KEEP_ALIVE=30m # Keep models loaded for 30 minutes
-      - OLLAMA_NUM_PARALLEL=1 # Process one request at a time for 70B models
+      - OLLAMA_NUM_PARALLEL=4 # Process 4 requests in parallel - DGX Spark has unified memory
       - OLLAMA_MAX_LOADED_MODELS=1 # Load only one model at a time to avoid VRAM contention
       - OLLAMA_KV_CACHE_TYPE=q8_0 # Reduce KV cache VRAM usage with minimal performance impact
   networks:
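To see the benefit of `OLLAMA_NUM_PARALLEL=4`, a client has to actually issue concurrent requests; a single sequential loop would still use one slot at a time. Below is a minimal sketch (not part of the commit) that fans out prompts to Ollama's `/api/generate` endpoint with a thread pool sized to match the server setting. The model name and the default port `11434` are assumptions; adjust them for your deployment.

```python
import json
import urllib.request
from concurrent.futures import ThreadPoolExecutor

# Assumed defaults: Ollama's standard port and an illustrative model tag.
OLLAMA_URL = "http://localhost:11434/api/generate"
MODEL = "llama3.1:70b"

def fan_out(items, worker, max_workers=4):
    """Apply worker to items with a bounded thread pool; results keep input order.

    max_workers=4 matches OLLAMA_NUM_PARALLEL=4, so each in-flight request
    maps onto one of the server's parallel slots.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        return list(pool.map(worker, items))

def ask_ollama(prompt):
    # One blocking, non-streaming generate call; the server interleaves up to
    # OLLAMA_NUM_PARALLEL of these at once.
    body = json.dumps({"model": MODEL, "prompt": prompt, "stream": False}).encode()
    req = urllib.request.Request(
        OLLAMA_URL, data=body, headers={"Content-Type": "application/json"}
    )
    with urllib.request.urlopen(req) as resp:
        return json.load(resp)["response"]

if __name__ == "__main__":
    prompts = [f"Summarize point {i} in one sentence." for i in range(4)]
    for answer in fan_out(prompts, ask_ollama):
        print(answer)
```

Note that all four requests contend for the same loaded model (per `OLLAMA_MAX_LOADED_MODELS=1`), so per-request latency can rise even as aggregate throughput improves.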