diff --git a/nvidia/txt2kg/assets/deploy/compose/docker-compose.yml b/nvidia/txt2kg/assets/deploy/compose/docker-compose.yml
index ea8a9ec..7919997 100644
--- a/nvidia/txt2kg/assets/deploy/compose/docker-compose.yml
+++ b/nvidia/txt2kg/assets/deploy/compose/docker-compose.yml
@@ -1,3 +1,4 @@
+
 services:
   app:
     build:
@@ -8,7 +9,14 @@ services:
     environment:
       - ARANGODB_URL=http://arangodb:8529
       - ARANGODB_DB=txt2kg
+      - PINECONE_HOST=entity-embeddings
+      - PINECONE_PORT=5081
+      - PINECONE_API_KEY=pclocal
+      - PINECONE_ENVIRONMENT=local
       - LANGCHAIN_TRACING_V2=true
+      - SENTENCE_TRANSFORMER_URL=http://sentence-transformers:80
+      - MODEL_NAME=all-MiniLM-L6-v2
+      - EMBEDDINGS_API_URL=http://sentence-transformers:80
       - GRPC_SSL_CIPHER_SUITES=HIGH+ECDSA:HIGH+aRSA
       - NODE_TLS_REJECT_UNAUTHORIZED=0
       - OLLAMA_BASE_URL=http://ollama:11434/v1
@@ -23,9 +31,12 @@ services:
     networks:
       - default
       - txt2kg-network
+      - pinecone-net
    depends_on:
       - arangodb
       - ollama
+    # Optional: sentence-transformers and entity-embeddings are only needed for vector search
+    # Traditional graph search works without these services
   arangodb:
     image: arangodb:latest
     ports:
@@ -59,16 +70,13 @@ services:
     volumes:
       - ollama_data:/root/.ollama
     environment:
+      - NVIDIA_VISIBLE_DEVICES=all  # Make all GPUs visible to the container
+      - NVIDIA_DRIVER_CAPABILITIES=compute,utility  # Required capabilities for CUDA
       - OLLAMA_FLASH_ATTENTION=1  # Enable flash attention for better performance
       - OLLAMA_KEEP_ALIVE=30m  # Keep models loaded for 30 minutes
-      - OLLAMA_CUDA=1  # Enable CUDA acceleration
-      - OLLAMA_LLM_LIBRARY=cuda  # Use CUDA library for LLM operations
       - OLLAMA_NUM_PARALLEL=1  # Process one request at a time for 70B models
       - OLLAMA_MAX_LOADED_MODELS=1  # Load only one model at a time to avoid VRAM contention
       - OLLAMA_KV_CACHE_TYPE=q8_0  # Reduce KV cache VRAM usage with minimal performance impact
-      - OLLAMA_GPU_LAYERS=999  # Use maximum GPU layers
-      - OLLAMA_GPU_MEMORY_FRACTION=0.9  # Use 90% of GPU memory
-      - CUDA_VISIBLE_DEVICES=0  # Use GPU 0 (change to 'all' for multi-GPU)
     networks:
       - default
     restart: unless-stopped
@@ -85,6 +93,40 @@ services:
       timeout: 10s
       retries: 3
       start_period: 60s
+
+  # Optional services for vector search (NOT required for traditional graph search)
+  # Traditional graph search works with just: app, arangodb, and ollama
+  sentence-transformers:
+    build:
+      context: ../services/sentence-transformers
+      dockerfile: Dockerfile
+    ports:
+      - '8000:80'
+    environment:
+      - MODEL_NAME=all-MiniLM-L6-v2
+    networks:
+      - default
+    restart: unless-stopped
+    profiles:
+      - vector-search  # Only start with: docker compose --profile vector-search up
+  entity-embeddings:
+    image: ghcr.io/pinecone-io/pinecone-index:latest
+    container_name: entity-embeddings
+    environment:
+      PORT: 5081
+      INDEX_TYPE: serverless
+      VECTOR_TYPE: dense
+      DIMENSION: 384
+      METRIC: cosine
+      INDEX_NAME: entity-embeddings
+    ports:
+      - "5081:5081"
+    platform: linux/amd64
+    networks:
+      - pinecone-net
+    restart: unless-stopped
+    profiles:
+      - vector-search  # Only start with: docker compose --profile vector-search up
 
 volumes:
   arangodb_data:
@@ -96,3 +138,5 @@ networks:
     driver: bridge
   txt2kg-network:
     driver: bridge
+  pinecone-net:
+    name: pinecone