diff --git a/nvidia/txt2kg/assets/deploy/compose/docker-compose.yml b/nvidia/txt2kg/assets/deploy/compose/docker-compose.yml
index ea8a9ec..7919997 100644
--- a/nvidia/txt2kg/assets/deploy/compose/docker-compose.yml
+++ b/nvidia/txt2kg/assets/deploy/compose/docker-compose.yml
@@ -1,3 +1,4 @@
+
 services:
   app:
     build:
@@ -8,7 +9,14 @@ services:
     environment:
       - ARANGODB_URL=http://arangodb:8529
       - ARANGODB_DB=txt2kg
+      - PINECONE_HOST=entity-embeddings
+      - PINECONE_PORT=5081
+      - PINECONE_API_KEY=pclocal
+      - PINECONE_ENVIRONMENT=local
       - LANGCHAIN_TRACING_V2=true
+      - SENTENCE_TRANSFORMER_URL=http://sentence-transformers:80
+      - MODEL_NAME=all-MiniLM-L6-v2
+      - EMBEDDINGS_API_URL=http://sentence-transformers:80
       - GRPC_SSL_CIPHER_SUITES=HIGH+ECDSA:HIGH+aRSA
       - NODE_TLS_REJECT_UNAUTHORIZED=0
       - OLLAMA_BASE_URL=http://ollama:11434/v1
@@ -23,9 +31,12 @@ services:
     networks:
       - default
       - txt2kg-network
+      - pinecone-net
    depends_on:
       - arangodb
       - ollama
+    # Optional: sentence-transformers and entity-embeddings are only needed for vector search
+    # Traditional graph search works without these services
   arangodb:
     image: arangodb:latest
     ports:
@@ -59,16 +70,13 @@ services:
     volumes:
       - ollama_data:/root/.ollama
     environment:
+      - NVIDIA_VISIBLE_DEVICES=all  # Make all GPUs visible to the container
+      - NVIDIA_DRIVER_CAPABILITIES=compute,utility  # Required capabilities for CUDA
       - OLLAMA_FLASH_ATTENTION=1  # Enable flash attention for better performance
       - OLLAMA_KEEP_ALIVE=30m  # Keep models loaded for 30 minutes
-      - OLLAMA_CUDA=1  # Enable CUDA acceleration
-      - OLLAMA_LLM_LIBRARY=cuda  # Use CUDA library for LLM operations
       - OLLAMA_NUM_PARALLEL=1  # Process one request at a time for 70B models
       - OLLAMA_MAX_LOADED_MODELS=1  # Load only one model at a time to avoid VRAM contention
       - OLLAMA_KV_CACHE_TYPE=q8_0  # Reduce KV cache VRAM usage with minimal performance impact
-      - OLLAMA_GPU_LAYERS=999  # Use maximum GPU layers
-      - OLLAMA_GPU_MEMORY_FRACTION=0.9  # Use 90% of GPU memory
-      - CUDA_VISIBLE_DEVICES=0  # Use GPU 0 (change to 'all' for multi-GPU)
     networks:
       - default
     restart: unless-stopped
@@ -85,6 +93,40 @@ services:
       timeout: 10s
       retries: 3
       start_period: 60s
+
+  # Optional services for vector search (NOT required for traditional graph search)
+  # Traditional graph search works with just: app, arangodb, and ollama
+  sentence-transformers:
+    build:
+      context: ../services/sentence-transformers
+      dockerfile: Dockerfile
+    ports:
+      - '8000:80'
+    environment:
+      - MODEL_NAME=all-MiniLM-L6-v2
+    networks:
+      - default
+    restart: unless-stopped
+    profiles:
+      - vector-search  # Only start with: docker compose --profile vector-search up
+  entity-embeddings:
+    image: ghcr.io/pinecone-io/pinecone-index:latest
+    container_name: entity-embeddings
+    environment:
+      PORT: 5081
+      INDEX_TYPE: serverless
+      VECTOR_TYPE: dense
+      DIMENSION: 384
+      METRIC: cosine
+      INDEX_NAME: entity-embeddings
+    ports:
+      - "5081:5081"
+    platform: linux/amd64
+    networks:
+      - pinecone-net
+    restart: unless-stopped
+    profiles:
+      - vector-search  # Only start with: docker compose --profile vector-search up
 
 volumes:
   arangodb_data:
@@ -96,3 +138,5 @@ networks:
     driver: bridge
   txt2kg-network:
     driver: bridge
+  pinecone-net:
+    name: pinecone