From 8c1d2ae9f370632e6b78de288d10597488b74907 Mon Sep 17 00:00:00 2001
From: Santosh Bhavani
Date: Sun, 19 Oct 2025 19:56:55 -0700
Subject: [PATCH] feat(docker): add vector search services and GPU configuration

- Add optional Pinecone and sentence-transformers services for vector search
- Configure NVIDIA GPU support with proper environment variables
- Add new environment variables for embeddings and Pinecone
- Add docker compose profiles to optionally enable vector-search
- Improve CUDA configuration for Ollama service
- Add pinecone-net network for service communication
---
 .../assets/deploy/compose/docker-compose.yml | 54 +++++++++++++++++--
 1 file changed, 49 insertions(+), 5 deletions(-)

diff --git a/nvidia/txt2kg/assets/deploy/compose/docker-compose.yml b/nvidia/txt2kg/assets/deploy/compose/docker-compose.yml
index ea8a9ec..7919997 100644
--- a/nvidia/txt2kg/assets/deploy/compose/docker-compose.yml
+++ b/nvidia/txt2kg/assets/deploy/compose/docker-compose.yml
@@ -1,3 +1,4 @@
+
 services:
   app:
     build:
@@ -8,7 +9,14 @@ services:
     environment:
       - ARANGODB_URL=http://arangodb:8529
       - ARANGODB_DB=txt2kg
+      - PINECONE_HOST=entity-embeddings
+      - PINECONE_PORT=5081
+      - PINECONE_API_KEY=pclocal
+      - PINECONE_ENVIRONMENT=local
       - LANGCHAIN_TRACING_V2=true
+      - SENTENCE_TRANSFORMER_URL=http://sentence-transformers:80
+      - MODEL_NAME=all-MiniLM-L6-v2
+      - EMBEDDINGS_API_URL=http://sentence-transformers:80
       - GRPC_SSL_CIPHER_SUITES=HIGH+ECDSA:HIGH+aRSA
       - NODE_TLS_REJECT_UNAUTHORIZED=0
       - OLLAMA_BASE_URL=http://ollama:11434/v1
@@ -23,9 +31,12 @@
     networks:
       - default
       - txt2kg-network
+      - pinecone-net
     depends_on:
       - arangodb
      - ollama
+    # Optional: sentence-transformers and entity-embeddings are only needed for vector search
+    # Traditional graph search works without these services
   arangodb:
     image: arangodb:latest
     ports:
@@ -59,16 +70,13 @@
     volumes:
       - ollama_data:/root/.ollama
     environment:
+      - NVIDIA_VISIBLE_DEVICES=all # Make all GPUs visible to the container
+      - NVIDIA_DRIVER_CAPABILITIES=compute,utility # Required capabilities for CUDA
       - OLLAMA_FLASH_ATTENTION=1 # Enable flash attention for better performance
       - OLLAMA_KEEP_ALIVE=30m # Keep models loaded for 30 minutes
-      - OLLAMA_CUDA=1 # Enable CUDA acceleration
-      - OLLAMA_LLM_LIBRARY=cuda # Use CUDA library for LLM operations
       - OLLAMA_NUM_PARALLEL=1 # Process one request at a time for 70B models
       - OLLAMA_MAX_LOADED_MODELS=1 # Load only one model at a time to avoid VRAM contention
       - OLLAMA_KV_CACHE_TYPE=q8_0 # Reduce KV cache VRAM usage with minimal performance impact
-      - OLLAMA_GPU_LAYERS=999 # Use maximum GPU layers
-      - OLLAMA_GPU_MEMORY_FRACTION=0.9 # Use 90% of GPU memory
-      - CUDA_VISIBLE_DEVICES=0 # Use GPU 0 (change to 'all' for multi-GPU)
     networks:
       - default
     restart: unless-stopped
@@ -85,6 +93,40 @@
       timeout: 10s
       retries: 3
       start_period: 60s
+
+  # Optional services for vector search (NOT required for traditional graph search)
+  # Traditional graph search works with just: app, arangodb, and ollama
+  sentence-transformers:
+    build:
+      context: ../services/sentence-transformers
+      dockerfile: Dockerfile
+    ports:
+      - '8000:80'
+    environment:
+      - MODEL_NAME=all-MiniLM-L6-v2
+    networks:
+      - default
+    restart: unless-stopped
+    profiles:
+      - vector-search # Only start with: docker compose --profile vector-search up
+  entity-embeddings:
+    image: ghcr.io/pinecone-io/pinecone-index:latest
+    container_name: entity-embeddings
+    environment:
+      PORT: 5081
+      INDEX_TYPE: serverless
+      VECTOR_TYPE: dense
+      DIMENSION: 384
+      METRIC: cosine
+      INDEX_NAME: entity-embeddings
+    ports:
+      - "5081:5081"
+    platform: linux/amd64
+    networks:
+      - pinecone-net
+    restart: unless-stopped
+    profiles:
+      - vector-search # Only start with: docker compose --profile vector-search up
 
 volumes:
   arangodb_data:
@@ -96,3 +138,5 @@ networks:
     driver: bridge
   txt2kg-network:
     driver: bridge
+  pinecone-net:
+    name: pinecone