# Mirrored from https://github.com/NVIDIA/dgx-spark-playbooks.git
#
# txt2kg Docker Compose - Neo4j + vLLM (GPU-accelerated)
#
# Optional stack optimized for DGX Spark/GB300 with unified memory support
#
# Usage:
#   ./start.sh --vllm                   # Use this compose file
#   ./start.sh --vllm --vector-search   # Add Qdrant + Sentence Transformers
|
services:
  # Next.js application - talks to Neo4j (graph), vLLM (LLM inference) and,
  # under the vector-search profile, Qdrant + Sentence Transformers.
  app:
    build:
      context: ../..
      dockerfile: deploy/app/Dockerfile
    ports:
      - '3001:3000'
    environment:
      # Neo4j configuration
      - NEO4J_URI=bolt://neo4j:7687
      - NEO4J_USER=neo4j
      - NEO4J_PASSWORD=password123
      - GRAPH_DB_TYPE=neo4j
      # Disable ArangoDB (points at localhost so it is unreachable from the container)
      - ARANGODB_URL=http://localhost:8529
      - ARANGODB_DB=txt2kg
      # vLLM configuration (GPU-accelerated)
      - VLLM_BASE_URL=http://vllm:8001/v1
      - VLLM_MODEL=nvidia/Llama-3_3-Nemotron-Super-49B-v1_5-FP8
      # Disable Ollama
      - OLLAMA_BASE_URL=http://localhost:11434/v1
      - OLLAMA_MODEL=disabled
      # Vector DB configuration
      - QDRANT_URL=http://qdrant:6333
      - VECTOR_DB_TYPE=qdrant
      # Embeddings configuration
      - LANGCHAIN_TRACING_V2=true
      - SENTENCE_TRANSFORMER_URL=http://sentence-transformers:80
      - MODEL_NAME=all-MiniLM-L6-v2
      - EMBEDDINGS_API_URL=http://sentence-transformers:80
      # Other settings
      - GRPC_SSL_CIPHER_SUITES=HIGH+ECDSA:HIGH+aRSA
      - NODE_TLS_REJECT_UNAUTHORIZED=0
      - NVIDIA_API_KEY=${NVIDIA_API_KEY:-}
      - NODE_OPTIONS=--max-http-header-size=80000
      - UV_THREADPOOL_SIZE=128
      - HTTP_TIMEOUT=1800000
      - REQUEST_TIMEOUT=1800000
    networks:
      - default
      - txt2kg-network
      - qdrant-net
    depends_on:
      neo4j:
        condition: service_healthy
      vllm:
        # service_started (not service_healthy): vLLM's healthcheck allows up
        # to 30 min of model loading, and the app should not block on that.
        condition: service_started

  # Neo4j - Graph database
  neo4j:
    image: neo4j:5-community
    ports:
      - '7474:7474'
      - '7687:7687'
    environment:
      - NEO4J_AUTH=neo4j/password123
      - NEO4J_server_memory_heap_initial__size=512m
      - NEO4J_server_memory_heap_max__size=2G
    volumes:
      - neo4j_data:/data
      - neo4j_logs:/logs
    networks:
      - default
    restart: unless-stopped
    healthcheck:
      # HTTP interface on 7474 answers once the server is up; Bolt (7687)
      # follows shortly after.
      test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:7474 || exit 1"]
      interval: 15s
      timeout: 10s
      retries: 10
      start_period: 60s

  # vLLM - GPU-accelerated LLM with unified memory support
  vllm:
    build:
      context: ../services/vllm
      dockerfile: Dockerfile
    container_name: vllm-service
    ports:
      - '8001:8001'
    # Host IPC + large shm/memlock are required for NCCL and CUDA pinned memory.
    ipc: host
    ulimits:
      memlock: -1
      stack: 67108864
    shm_size: '16gb'
    environment:
      - VLLM_MODEL=nvidia/Llama-3_3-Nemotron-Super-49B-v1_5-FP8
      - VLLM_TENSOR_PARALLEL_SIZE=1
      - VLLM_MAX_MODEL_LEN=32768
      - VLLM_GPU_MEMORY_UTILIZATION=0.9
      - VLLM_MAX_NUM_SEQS=32
      - VLLM_MAX_NUM_BATCHED_TOKENS=32768
      - VLLM_KV_CACHE_DTYPE=auto
      - VLLM_PORT=8001
      - VLLM_HOST=0.0.0.0
      - CUDA_VISIBLE_DEVICES=0
      - NCCL_DEBUG=INFO
      # Unified-memory settings for DGX Spark/GB300
      - CUDA_MANAGED_FORCE_DEVICE_ALLOC=1
      - PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
      - VLLM_CPU_OFFLOAD_GB=0
    volumes:
      - vllm_models:/app/models
      - /tmp:/tmp
      # Reuse the host's HF cache so model weights are not re-downloaded.
      - ~/.cache/huggingface:/root/.cache/huggingface
    networks:
      - default
    restart: unless-stopped
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8001/health"]
      interval: 60s
      timeout: 30s
      retries: 30
      # Generous start period: loading a 49B FP8 model can take a long time.
      start_period: 1800s

  # Optional: Vector search services (enabled via --vector-search / the
  # vector-search profile)
  sentence-transformers:
    build:
      context: ../services/sentence-transformers
      dockerfile: Dockerfile
    ports:
      - '8000:80'
    environment:
      - MODEL_NAME=all-MiniLM-L6-v2
    networks:
      - default
    restart: unless-stopped
    profiles:
      - vector-search

  qdrant:
    image: qdrant/qdrant:latest
    container_name: qdrant
    ports:
      - "6333:6333"
      - "6334:6334"
    volumes:
      - qdrant_data:/qdrant/storage
    networks:
      - qdrant-net
    restart: unless-stopped
    profiles:
      - vector-search

  # One-shot job: creates the Qdrant collections used by the app.
  # Idempotent (|| true) so re-runs against existing collections don't fail.
  qdrant-init:
    image: curlimages/curl:latest
    depends_on:
      - qdrant
    restart: "no"
    entrypoint: /bin/sh
    command:
      - -c
      - |
        echo 'Waiting for Qdrant to start...'
        sleep 5
        curl -X PUT http://qdrant:6333/collections/entity-embeddings \
          -H 'Content-Type: application/json' \
          -d '{"vectors":{"size":384,"distance":"Cosine"}}' || true
        curl -X PUT http://qdrant:6333/collections/document-embeddings \
          -H 'Content-Type: application/json' \
          -d '{"vectors":{"size":384,"distance":"Cosine"}}' || true
        echo 'Collections created'
    networks:
      - qdrant-net
    profiles:
      - vector-search

volumes:
  neo4j_data:
  neo4j_logs:
  vllm_models:
  qdrant_data:

networks:
  default:
    driver: bridge
  txt2kg-network:
    driver: bridge
  qdrant-net:
    name: qdrant-network