feat(docker): add vector search services and GPU configuration

- Add optional Pinecone and sentence-transformers services for vector search
- Configure NVIDIA GPU support with proper environment variables
- Add new environment variables for embeddings and Pinecone
- Add docker compose profiles to optionally enable vector-search
- Improve CUDA configuration for Ollama service
- Add pinecone-net network for service communication
This commit is contained in:
Santosh Bhavani 2025-10-19 19:56:55 -07:00
parent 9dc734eee5
commit 8c1d2ae9f3

View File

@@ -1,3 +1,4 @@
services:
app:
build:
@@ -8,7 +9,14 @@ services:
environment:
- ARANGODB_URL=http://arangodb:8529
- ARANGODB_DB=txt2kg
- PINECONE_HOST=entity-embeddings
- PINECONE_PORT=5081
- PINECONE_API_KEY=pclocal
- PINECONE_ENVIRONMENT=local
- LANGCHAIN_TRACING_V2=true
- SENTENCE_TRANSFORMER_URL=http://sentence-transformers:80
- MODEL_NAME=all-MiniLM-L6-v2
- EMBEDDINGS_API_URL=http://sentence-transformers:80
- GRPC_SSL_CIPHER_SUITES=HIGH+ECDSA:HIGH+aRSA
- NODE_TLS_REJECT_UNAUTHORIZED=0
- OLLAMA_BASE_URL=http://ollama:11434/v1
@@ -23,9 +31,12 @@ services:
networks:
- default
- txt2kg-network
- pinecone-net
depends_on:
- arangodb
- ollama
# Optional: sentence-transformers and entity-embeddings are only needed for vector search
# Traditional graph search works without these services
arangodb:
image: arangodb:latest
ports:
@@ -59,16 +70,13 @@ services:
volumes:
- ollama_data:/root/.ollama
environment:
- NVIDIA_VISIBLE_DEVICES=all # Make all GPUs visible to the container
- NVIDIA_DRIVER_CAPABILITIES=compute,utility # Required capabilities for CUDA
- OLLAMA_FLASH_ATTENTION=1 # Enable flash attention for better performance
- OLLAMA_KEEP_ALIVE=30m # Keep models loaded for 30 minutes
- OLLAMA_CUDA=1 # Enable CUDA acceleration
- OLLAMA_LLM_LIBRARY=cuda # Use CUDA library for LLM operations
- OLLAMA_NUM_PARALLEL=1 # Process one request at a time for 70B models
- OLLAMA_MAX_LOADED_MODELS=1 # Load only one model at a time to avoid VRAM contention
- OLLAMA_KV_CACHE_TYPE=q8_0 # Reduce KV cache VRAM usage with minimal performance impact
- OLLAMA_GPU_LAYERS=999 # Use maximum GPU layers
- OLLAMA_GPU_MEMORY_FRACTION=0.9 # Use 90% of GPU memory
- CUDA_VISIBLE_DEVICES=0 # Use GPU 0 (change to 'all' for multi-GPU)
networks:
- default
restart: unless-stopped
@@ -85,6 +93,40 @@ services:
timeout: 10s
retries: 3
start_period: 60s
# Optional services for vector search (NOT required for traditional graph search)
# Traditional graph search works with just: app, arangodb, and ollama
sentence-transformers:
build:
context: ../services/sentence-transformers
dockerfile: Dockerfile
ports:
- '8000:80'
environment:
- MODEL_NAME=all-MiniLM-L6-v2
networks:
- default
restart: unless-stopped
profiles:
- vector-search # Only start with: docker compose --profile vector-search up
entity-embeddings:
image: ghcr.io/pinecone-io/pinecone-index:latest
container_name: entity-embeddings
environment:
PORT: 5081
INDEX_TYPE: serverless
VECTOR_TYPE: dense
DIMENSION: 384
METRIC: cosine
INDEX_NAME: entity-embeddings
ports:
- "5081:5081"
platform: linux/amd64
networks:
- pinecone-net
restart: unless-stopped
profiles:
- vector-search # Only start with: docker compose --profile vector-search up
volumes:
arangodb_data:
@@ -96,3 +138,5 @@ networks:
driver: bridge
txt2kg-network:
driver: bridge
pinecone-net:
name: pinecone