mirror of
https://github.com/NVIDIA/dgx-spark-playbooks.git
synced 2026-04-26 03:43:52 +00:00
feat(docker): add vector search services and GPU configuration
- Add optional Pinecone and sentence-transformers services for vector search
- Configure NVIDIA GPU support with proper environment variables
- Add new environment variables for embeddings and Pinecone
- Add docker compose profiles to optionally enable vector-search
- Improve CUDA configuration for Ollama service
- Add pinecone-net network for service communication
This commit is contained in:
parent
9dc734eee5
commit
8c1d2ae9f3
@ -1,3 +1,4 @@
|
|||||||
|
|
||||||
services:
|
services:
|
||||||
app:
|
app:
|
||||||
build:
|
build:
|
||||||
@ -8,7 +9,14 @@ services:
|
|||||||
environment:
|
environment:
|
||||||
- ARANGODB_URL=http://arangodb:8529
|
- ARANGODB_URL=http://arangodb:8529
|
||||||
- ARANGODB_DB=txt2kg
|
- ARANGODB_DB=txt2kg
|
||||||
|
- PINECONE_HOST=entity-embeddings
|
||||||
|
- PINECONE_PORT=5081
|
||||||
|
- PINECONE_API_KEY=pclocal
|
||||||
|
- PINECONE_ENVIRONMENT=local
|
||||||
- LANGCHAIN_TRACING_V2=true
|
- LANGCHAIN_TRACING_V2=true
|
||||||
|
- SENTENCE_TRANSFORMER_URL=http://sentence-transformers:80
|
||||||
|
- MODEL_NAME=all-MiniLM-L6-v2
|
||||||
|
- EMBEDDINGS_API_URL=http://sentence-transformers:80
|
||||||
- GRPC_SSL_CIPHER_SUITES=HIGH+ECDSA:HIGH+aRSA
|
- GRPC_SSL_CIPHER_SUITES=HIGH+ECDSA:HIGH+aRSA
|
||||||
- NODE_TLS_REJECT_UNAUTHORIZED=0
|
- NODE_TLS_REJECT_UNAUTHORIZED=0
|
||||||
- OLLAMA_BASE_URL=http://ollama:11434/v1
|
- OLLAMA_BASE_URL=http://ollama:11434/v1
|
||||||
@ -23,9 +31,12 @@ services:
|
|||||||
networks:
|
networks:
|
||||||
- default
|
- default
|
||||||
- txt2kg-network
|
- txt2kg-network
|
||||||
|
- pinecone-net
|
||||||
depends_on:
|
depends_on:
|
||||||
- arangodb
|
- arangodb
|
||||||
- ollama
|
- ollama
|
||||||
|
# Optional: sentence-transformers and entity-embeddings are only needed for vector search
|
||||||
|
# Traditional graph search works without these services
|
||||||
arangodb:
|
arangodb:
|
||||||
image: arangodb:latest
|
image: arangodb:latest
|
||||||
ports:
|
ports:
|
||||||
@ -59,16 +70,13 @@ services:
|
|||||||
volumes:
|
volumes:
|
||||||
- ollama_data:/root/.ollama
|
- ollama_data:/root/.ollama
|
||||||
environment:
|
environment:
|
||||||
|
- NVIDIA_VISIBLE_DEVICES=all # Make all GPUs visible to the container
|
||||||
|
- NVIDIA_DRIVER_CAPABILITIES=compute,utility # Required capabilities for CUDA
|
||||||
- OLLAMA_FLASH_ATTENTION=1 # Enable flash attention for better performance
|
- OLLAMA_FLASH_ATTENTION=1 # Enable flash attention for better performance
|
||||||
- OLLAMA_KEEP_ALIVE=30m # Keep models loaded for 30 minutes
|
- OLLAMA_KEEP_ALIVE=30m # Keep models loaded for 30 minutes
|
||||||
- OLLAMA_CUDA=1 # Enable CUDA acceleration
|
|
||||||
- OLLAMA_LLM_LIBRARY=cuda # Use CUDA library for LLM operations
|
|
||||||
- OLLAMA_NUM_PARALLEL=1 # Process one request at a time for 70B models
|
- OLLAMA_NUM_PARALLEL=1 # Process one request at a time for 70B models
|
||||||
- OLLAMA_MAX_LOADED_MODELS=1 # Load only one model at a time to avoid VRAM contention
|
- OLLAMA_MAX_LOADED_MODELS=1 # Load only one model at a time to avoid VRAM contention
|
||||||
- OLLAMA_KV_CACHE_TYPE=q8_0 # Reduce KV cache VRAM usage with minimal performance impact
|
- OLLAMA_KV_CACHE_TYPE=q8_0 # Reduce KV cache VRAM usage with minimal performance impact
|
||||||
- OLLAMA_GPU_LAYERS=999 # Use maximum GPU layers
|
|
||||||
- OLLAMA_GPU_MEMORY_FRACTION=0.9 # Use 90% of GPU memory
|
|
||||||
- CUDA_VISIBLE_DEVICES=0 # Use GPU 0 (change to 'all' for multi-GPU)
|
|
||||||
networks:
|
networks:
|
||||||
- default
|
- default
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
@ -86,6 +94,40 @@ services:
|
|||||||
retries: 3
|
retries: 3
|
||||||
start_period: 60s
|
start_period: 60s
|
||||||
|
|
||||||
|
# Optional services for vector search (NOT required for traditional graph search)
|
||||||
|
# Traditional graph search works with just: app, arangodb, and ollama
|
||||||
|
sentence-transformers:
|
||||||
|
build:
|
||||||
|
context: ../services/sentence-transformers
|
||||||
|
dockerfile: Dockerfile
|
||||||
|
ports:
|
||||||
|
- '8000:80'
|
||||||
|
environment:
|
||||||
|
- MODEL_NAME=all-MiniLM-L6-v2
|
||||||
|
networks:
|
||||||
|
- default
|
||||||
|
restart: unless-stopped
|
||||||
|
profiles:
|
||||||
|
- vector-search # Only start with: docker compose --profile vector-search up
|
||||||
|
entity-embeddings:
|
||||||
|
image: ghcr.io/pinecone-io/pinecone-index:latest
|
||||||
|
container_name: entity-embeddings
|
||||||
|
environment:
|
||||||
|
PORT: 5081
|
||||||
|
INDEX_TYPE: serverless
|
||||||
|
VECTOR_TYPE: dense
|
||||||
|
DIMENSION: 384
|
||||||
|
METRIC: cosine
|
||||||
|
INDEX_NAME: entity-embeddings
|
||||||
|
ports:
|
||||||
|
- "5081:5081"
|
||||||
|
platform: linux/amd64
|
||||||
|
networks:
|
||||||
|
- pinecone-net
|
||||||
|
restart: unless-stopped
|
||||||
|
profiles:
|
||||||
|
- vector-search # Only start with: docker compose --profile vector-search up
|
||||||
|
|
||||||
volumes:
|
volumes:
|
||||||
arangodb_data:
|
arangodb_data:
|
||||||
arangodb_apps_data:
|
arangodb_apps_data:
|
||||||
@ -96,3 +138,5 @@ networks:
|
|||||||
driver: bridge
|
driver: bridge
|
||||||
txt2kg-network:
|
txt2kg-network:
|
||||||
driver: bridge
|
driver: bridge
|
||||||
|
pinecone-net:
|
||||||
|
name: pinecone
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user