# dgx-spark-playbooks/nvidia/txt2kg/assets/deploy/compose/docker-compose.vllm.yml
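#
# Compose stack for txt2kg with a local vLLM inference backend: the txt2kg
# application, ArangoDB (graph store), a Pinecone Local index for entity
# embeddings, a sentence-transformers embedding service, and a GPU-backed
# vLLM server exposing an OpenAI-compatible API.
#
# A typical invocation (assumed; run from the directory two levels up, which
# the build contexts ../.. point at) might be:
#   docker compose -f deploy/compose/docker-compose.vllm.yml up --build -d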

services:
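  # Main txt2kg application container, exposed on host port 3001 (container
  # port 3000). The environment below wires it to ArangoDB, the local
  # Pinecone index, the sentence-transformers service, and vLLM; note that
  # NODE_TLS_REJECT_UNAUTHORIZED=0 disables TLS verification, which suits
  # local development rather than production.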
  app:
    build:
      context: ../..
      dockerfile: deploy/app/Dockerfile
    ports:
      - '3001:3000'
    environment:
      - ARANGODB_URL=http://arangodb:8529
      - ARANGODB_DB=txt2kg
      - PINECONE_HOST=entity-embeddings
      - PINECONE_PORT=5081
      - PINECONE_API_KEY=pclocal
      - PINECONE_ENVIRONMENT=local
      - LANGCHAIN_TRACING_V2=true
      - SENTENCE_TRANSFORMER_URL=http://sentence-transformers:80
      - MODEL_NAME=all-MiniLM-L6-v2
      - GRPC_SSL_CIPHER_SUITES=HIGH+ECDSA:HIGH+aRSA
      - NODE_TLS_REJECT_UNAUTHORIZED=0
      - OLLAMA_BASE_URL=http://ollama:11434/v1
      - OLLAMA_MODEL=qwen3:1.7b
      - VLLM_BASE_URL=http://vllm:8001/v1
      - VLLM_MODEL=meta-llama/Llama-3.2-3B-Instruct
      - REMOTE_WEBGPU_SERVICE_URL=http://txt2kg-remote-webgpu:8083
    networks:
      - pinecone-net
      - default
      - txt2kg-network
    depends_on:
      - arangodb
      - entity-embeddings
      - sentence-transformers
      - vllm
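  # ArangoDB graph store backing the knowledge graph. Authentication is
  # disabled (ARANGO_NO_AUTH=1); data and apps persist in the named volumes
  # declared at the bottom of the file.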
  arangodb:
    image: arangodb:latest
    ports:
      - '8529:8529'
    environment:
      - ARANGO_NO_AUTH=1
    volumes:
      - arangodb_data:/var/lib/arangodb3
      - arangodb_apps_data:/var/lib/arangodb3-apps
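  # One-shot init container: waits for ArangoDB, then creates the txt2kg
  # database, tolerating the duplicate-name error so re-runs are harmless.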
  arangodb-init:
    image: arangodb:latest
    depends_on:
      arangodb:
        condition: service_started
    restart: on-failure
    entrypoint: >
      sh -c "
      echo 'Waiting for ArangoDB to start...' &&
      sleep 10 &&
      echo 'Creating txt2kg database...' &&
      arangosh --server.endpoint tcp://arangodb:8529 --server.authentication false --javascript.execute-string 'try { db._createDatabase(\"txt2kg\"); console.log(\"Database txt2kg created successfully!\"); } catch(e) { if(e.message.includes(\"duplicate\")) { console.log(\"Database txt2kg already exists\"); } else { throw e; } }'
      "
  entity-embeddings:
    image: ghcr.io/pinecone-io/pinecone-index:latest
    container_name: entity-embeddings
    environment:
      PORT: 5081
      INDEX_TYPE: serverless
      VECTOR_TYPE: dense
      DIMENSION: 384
      METRIC: cosine
      INDEX_NAME: entity-embeddings
    ports:
      - "5081:5081"
    platform: linux/amd64
    networks:
      - pinecone-net
    restart: unless-stopped
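  # Embedding service for all-MiniLM-L6-v2, reachable inside the compose
  # network at http://sentence-transformers:80 and on the host at port 8000.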
  sentence-transformers:
    build:
      context: ../../deploy/services/sentence-transformers
      dockerfile: Dockerfile
    ports:
      - '8000:80'
    environment:
      - MODEL_NAME=all-MiniLM-L6-v2
    networks:
      - default
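  # Local vLLM server for meta-llama/Llama-3.2-3B-Instruct with an
  # OpenAI-compatible API on port 8001. Requires one NVIDIA GPU (reserved
  # under deploy.resources) and mounts the host Hugging Face cache so model
  # weights are not re-downloaded on every start.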
  vllm:
    build:
      context: ../../deploy/services/vllm
      dockerfile: Dockerfile
    container_name: vllm-service
    ports:
      - '8001:8001'
    environment:
      # Model configuration
      - VLLM_MODEL=meta-llama/Llama-3.2-3B-Instruct
      - VLLM_TENSOR_PARALLEL_SIZE=1
      - VLLM_MAX_MODEL_LEN=4096
      - VLLM_GPU_MEMORY_UTILIZATION=0.9
      # Quantization settings (FP8 weights and KV cache)
      - VLLM_QUANTIZATION=fp8
      - VLLM_KV_CACHE_DTYPE=fp8
      # Service configuration
      - VLLM_PORT=8001
      - VLLM_HOST=0.0.0.0
      # Performance tuning
      - CUDA_VISIBLE_DEVICES=0
      - NCCL_DEBUG=INFO
    volumes:
      - vllm_models:/app/models
      - /tmp:/tmp
      # Mount model cache for faster startup
      - ~/.cache/huggingface:/root/.cache/huggingface
    networks:
      - default
    restart: unless-stopped
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8001/v1/models"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 120s # Longer start period for model loading
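# Named volumes persisting ArangoDB data/apps and downloaded vLLM model files
# across container restarts.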
volumes:
  arangodb_data:
  arangodb_apps_data:
  vllm_models:
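# pinecone-net is given the explicit Docker network name "pinecone";
# txt2kg-network is presumably shared with the remote WebGPU service
# referenced by REMOTE_WEBGPU_SERVICE_URL, which is defined outside this file.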
networks:
  pinecone-net:
    name: pinecone
  default:
    driver: bridge
  txt2kg-network:
    driver: bridge