mirror of
https://github.com/NVIDIA/dgx-spark-playbooks.git
synced 2026-04-23 18:33:54 +00:00
feat(docker): add vector search services and GPU configuration
- Add optional Pinecone and sentence-transformers services for vector search
- Configure NVIDIA GPU support with proper environment variables
- Add new environment variables for embeddings and Pinecone
- Add docker compose profiles to optionally enable vector-search
- Improve CUDA configuration for Ollama service
- Add pinecone-net network for service communication
This commit is contained in:
parent
9dc734eee5
commit
8c1d2ae9f3
@@ -1,3 +1,4 @@
|
||||
|
||||
services:
|
||||
app:
|
||||
build:
|
||||
@@ -8,7 +9,14 @@ services:
|
||||
environment:
|
||||
- ARANGODB_URL=http://arangodb:8529
|
||||
- ARANGODB_DB=txt2kg
|
||||
- PINECONE_HOST=entity-embeddings
|
||||
- PINECONE_PORT=5081
|
||||
- PINECONE_API_KEY=pclocal
|
||||
- PINECONE_ENVIRONMENT=local
|
||||
- LANGCHAIN_TRACING_V2=true
|
||||
- SENTENCE_TRANSFORMER_URL=http://sentence-transformers:80
|
||||
- MODEL_NAME=all-MiniLM-L6-v2
|
||||
- EMBEDDINGS_API_URL=http://sentence-transformers:80
|
||||
- GRPC_SSL_CIPHER_SUITES=HIGH+ECDSA:HIGH+aRSA
|
||||
- NODE_TLS_REJECT_UNAUTHORIZED=0
|
||||
- OLLAMA_BASE_URL=http://ollama:11434/v1
|
||||
@@ -23,9 +31,12 @@ services:
|
||||
networks:
|
||||
- default
|
||||
- txt2kg-network
|
||||
- pinecone-net
|
||||
depends_on:
|
||||
- arangodb
|
||||
- ollama
|
||||
# Optional: sentence-transformers and entity-embeddings are only needed for vector search
|
||||
# Traditional graph search works without these services
|
||||
arangodb:
|
||||
image: arangodb:latest
|
||||
ports:
|
||||
@@ -59,16 +70,13 @@ services:
|
||||
volumes:
|
||||
- ollama_data:/root/.ollama
|
||||
environment:
|
||||
- NVIDIA_VISIBLE_DEVICES=all # Make all GPUs visible to the container
|
||||
- NVIDIA_DRIVER_CAPABILITIES=compute,utility # Required capabilities for CUDA
|
||||
- OLLAMA_FLASH_ATTENTION=1 # Enable flash attention for better performance
|
||||
- OLLAMA_KEEP_ALIVE=30m # Keep models loaded for 30 minutes
|
||||
- OLLAMA_CUDA=1 # Enable CUDA acceleration
|
||||
- OLLAMA_LLM_LIBRARY=cuda # Use CUDA library for LLM operations
|
||||
- OLLAMA_NUM_PARALLEL=1 # Process one request at a time for 70B models
|
||||
- OLLAMA_MAX_LOADED_MODELS=1 # Load only one model at a time to avoid VRAM contention
|
||||
- OLLAMA_KV_CACHE_TYPE=q8_0 # Reduce KV cache VRAM usage with minimal performance impact
|
||||
- OLLAMA_GPU_LAYERS=999 # Use maximum GPU layers
|
||||
- OLLAMA_GPU_MEMORY_FRACTION=0.9 # Use 90% of GPU memory
|
||||
- CUDA_VISIBLE_DEVICES=0 # Use GPU 0 (change to 'all' for multi-GPU)
|
||||
networks:
|
||||
- default
|
||||
restart: unless-stopped
|
||||
@@ -85,6 +93,40 @@ services:
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 60s
|
||||
|
||||
# Optional services for vector search (NOT required for traditional graph search)
|
||||
# Traditional graph search works with just: app, arangodb, and ollama
|
||||
sentence-transformers:
|
||||
build:
|
||||
context: ../services/sentence-transformers
|
||||
dockerfile: Dockerfile
|
||||
ports:
|
||||
- '8000:80'
|
||||
environment:
|
||||
- MODEL_NAME=all-MiniLM-L6-v2
|
||||
networks:
|
||||
- default
|
||||
restart: unless-stopped
|
||||
profiles:
|
||||
- vector-search # Only start with: docker compose --profile vector-search up
|
||||
entity-embeddings:
|
||||
image: ghcr.io/pinecone-io/pinecone-index:latest
|
||||
container_name: entity-embeddings
|
||||
environment:
|
||||
PORT: 5081
|
||||
INDEX_TYPE: serverless
|
||||
VECTOR_TYPE: dense
|
||||
DIMENSION: 384
|
||||
METRIC: cosine
|
||||
INDEX_NAME: entity-embeddings
|
||||
ports:
|
||||
- "5081:5081"
|
||||
platform: linux/amd64
|
||||
networks:
|
||||
- pinecone-net
|
||||
restart: unless-stopped
|
||||
profiles:
|
||||
- vector-search # Only start with: docker compose --profile vector-search up
|
||||
|
||||
volumes:
|
||||
arangodb_data:
|
||||
@@ -96,3 +138,5 @@ networks:
|
||||
driver: bridge
|
||||
txt2kg-network:
|
||||
driver: bridge
|
||||
pinecone-net:
|
||||
name: pinecone
|
||||
|
||||
Loading…
Reference in New Issue
Block a user