feat(docker): add vector search services and GPU configuration

- Add optional Pinecone and sentence-transformers services for vector search
- Configure NVIDIA GPU support with proper environment variables
- Add new environment variables for embeddings and Pinecone
- Add docker compose profiles to optionally enable vector-search
- Improve CUDA configuration for Ollama service
- Add pinecone-net network for service communication
This commit is contained in:
Santosh Bhavani 2025-10-19 19:56:55 -07:00
parent 9dc734eee5
commit 8c1d2ae9f3

View File

@@ -1,3 +1,4 @@
 services:
   app:
     build:
@@ -8,7 +9,14 @@ services:
     environment:
       - ARANGODB_URL=http://arangodb:8529
       - ARANGODB_DB=txt2kg
+      - PINECONE_HOST=entity-embeddings
+      - PINECONE_PORT=5081
+      - PINECONE_API_KEY=pclocal
+      - PINECONE_ENVIRONMENT=local
       - LANGCHAIN_TRACING_V2=true
+      - SENTENCE_TRANSFORMER_URL=http://sentence-transformers:80
+      - MODEL_NAME=all-MiniLM-L6-v2
+      - EMBEDDINGS_API_URL=http://sentence-transformers:80
       - GRPC_SSL_CIPHER_SUITES=HIGH+ECDSA:HIGH+aRSA
       - NODE_TLS_REJECT_UNAUTHORIZED=0
       - OLLAMA_BASE_URL=http://ollama:11434/v1
@@ -23,9 +31,12 @@ services:
     networks:
       - default
       - txt2kg-network
+      - pinecone-net
     depends_on:
       - arangodb
       - ollama
+  # Optional: sentence-transformers and entity-embeddings are only needed for vector search
+  # Traditional graph search works without these services
   arangodb:
     image: arangodb:latest
     ports:
@@ -59,16 +70,13 @@ services:
     volumes:
      - ollama_data:/root/.ollama
     environment:
+      - NVIDIA_VISIBLE_DEVICES=all # Make all GPUs visible to the container
+      - NVIDIA_DRIVER_CAPABILITIES=compute,utility # Required capabilities for CUDA
       - OLLAMA_FLASH_ATTENTION=1 # Enable flash attention for better performance
       - OLLAMA_KEEP_ALIVE=30m # Keep models loaded for 30 minutes
-      - OLLAMA_CUDA=1 # Enable CUDA acceleration
-      - OLLAMA_LLM_LIBRARY=cuda # Use CUDA library for LLM operations
       - OLLAMA_NUM_PARALLEL=1 # Process one request at a time for 70B models
       - OLLAMA_MAX_LOADED_MODELS=1 # Load only one model at a time to avoid VRAM contention
       - OLLAMA_KV_CACHE_TYPE=q8_0 # Reduce KV cache VRAM usage with minimal performance impact
-      - OLLAMA_GPU_LAYERS=999 # Use maximum GPU layers
-      - OLLAMA_GPU_MEMORY_FRACTION=0.9 # Use 90% of GPU memory
-      - CUDA_VISIBLE_DEVICES=0 # Use GPU 0 (change to 'all' for multi-GPU)
     networks:
       - default
     restart: unless-stopped
@@ -86,6 +94,40 @@ services:
       retries: 3
       start_period: 60s
 
+  # Optional services for vector search (NOT required for traditional graph search)
+  # Traditional graph search works with just: app, arangodb, and ollama
+  sentence-transformers:
+    build:
+      context: ../services/sentence-transformers
+      dockerfile: Dockerfile
+    ports:
+      - '8000:80'
+    environment:
+      - MODEL_NAME=all-MiniLM-L6-v2
+    networks:
+      - default
+    restart: unless-stopped
+    profiles:
+      - vector-search # Only start with: docker compose --profile vector-search up
+
+  entity-embeddings:
+    image: ghcr.io/pinecone-io/pinecone-index:latest
+    container_name: entity-embeddings
+    environment:
+      PORT: 5081
+      INDEX_TYPE: serverless
+      VECTOR_TYPE: dense
+      DIMENSION: 384
+      METRIC: cosine
+      INDEX_NAME: entity-embeddings
+    ports:
+      - "5081:5081"
+    platform: linux/amd64
+    networks:
+      - pinecone-net
+    restart: unless-stopped
+    profiles:
+      - vector-search # Only start with: docker compose --profile vector-search up
 volumes:
   arangodb_data:
   arangodb_apps_data:
@@ -96,3 +138,5 @@ networks:
     driver: bridge
   txt2kg-network:
     driver: bridge
+  pinecone-net:
+    name: pinecone