# mirror of https://github.com/NVIDIA/dgx-spark-playbooks.git
# synced 2026-04-23 02:23:53 +00:00

# This is a legacy file - use --with-optional flag instead
# The vLLM service is now included in docker-compose.optional.yml
# This file is kept for backwards compatibility

services:
  app:
    build:
      context: ../..
      dockerfile: deploy/app/Dockerfile
    ports:
      - '3001:3000'
    environment:
      - ARANGODB_URL=http://arangodb:8529
      - ARANGODB_DB=txt2kg
      - PINECONE_HOST=entity-embeddings
      - PINECONE_PORT=5081
      - PINECONE_API_KEY=pclocal
      - PINECONE_ENVIRONMENT=local
      - LANGCHAIN_TRACING_V2=true
      - SENTENCE_TRANSFORMER_URL=http://sentence-transformers:80
      - MODEL_NAME=all-MiniLM-L6-v2
      - GRPC_SSL_CIPHER_SUITES=HIGH+ECDSA:HIGH+aRSA
      - NODE_TLS_REJECT_UNAUTHORIZED=0
      - OLLAMA_BASE_URL=http://ollama:11434/v1
      - OLLAMA_MODEL=qwen3:1.7b
      - VLLM_BASE_URL=http://vllm:8001/v1
      - VLLM_MODEL=meta-llama/Llama-3.2-3B-Instruct
      - REMOTE_WEBGPU_SERVICE_URL=http://txt2kg-remote-webgpu:8083
    networks:
      - pinecone-net
      - default
      - txt2kg-network
    depends_on:
      - arangodb
      - entity-embeddings
      - sentence-transformers
      - vllm

  arangodb:
    image: arangodb:latest
    ports:
      - '8529:8529'
    environment:
      - ARANGO_NO_AUTH=1
    volumes:
      - arangodb_data:/var/lib/arangodb3
      - arangodb_apps_data:/var/lib/arangodb3-apps

  # One-shot helper: waits for ArangoDB, then creates the txt2kg database
  # (idempotent — "duplicate" errors are treated as already-created).
  arangodb-init:
    image: arangodb:latest
    depends_on:
      arangodb:
        condition: service_started
    restart: on-failure
    # Folded scalar keeps backslash escapes literal, as the shell requires.
    entrypoint: >
      sh -c "
      echo 'Waiting for ArangoDB to start...' &&
      sleep 10 &&
      echo 'Creating txt2kg database...' &&
      arangosh --server.endpoint tcp://arangodb:8529 --server.authentication false --javascript.execute-string 'try { db._createDatabase(\"txt2kg\"); console.log(\"Database txt2kg created successfully!\"); } catch(e) { if(e.message.includes(\"duplicate\")) { console.log(\"Database txt2kg already exists\"); } else { throw e; } }'
      "

  entity-embeddings:
    image: ghcr.io/pinecone-io/pinecone-index:latest
    container_name: entity-embeddings
    environment:
      PORT: 5081
      INDEX_TYPE: serverless
      VECTOR_TYPE: dense
      DIMENSION: 384
      METRIC: cosine
      INDEX_NAME: entity-embeddings
    ports:
      - "5081:5081"
    platform: linux/amd64
    networks:
      - pinecone-net
    restart: unless-stopped

  sentence-transformers:
    build:
      context: ../../deploy/services/sentence-transformers
      dockerfile: Dockerfile
    ports:
      - '8000:80'
    environment:
      - MODEL_NAME=all-MiniLM-L6-v2
    networks:
      - default

  vllm:
    build:
      context: ../../deploy/services/vllm
      dockerfile: Dockerfile
    container_name: vllm-service
    ports:
      - '8001:8001'
    environment:
      # Model configuration
      - VLLM_MODEL=meta-llama/Llama-3.2-3B-Instruct
      - VLLM_TENSOR_PARALLEL_SIZE=1
      - VLLM_MAX_MODEL_LEN=4096
      - VLLM_GPU_MEMORY_UTILIZATION=0.9
      # NVfp4 quantization settings
      - VLLM_QUANTIZATION=fp8
      - VLLM_KV_CACHE_DTYPE=fp8
      # Service configuration
      - VLLM_PORT=8001
      - VLLM_HOST=0.0.0.0
      # Performance tuning
      - CUDA_VISIBLE_DEVICES=0
      - NCCL_DEBUG=INFO
    volumes:
      - vllm_models:/app/models
      - /tmp:/tmp
      # Mount model cache for faster startup
      - ~/.cache/huggingface:/root/.cache/huggingface
    networks:
      - default
    restart: unless-stopped
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8001/v1/models"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 120s  # Longer start period for model loading

volumes:
  arangodb_data:
  arangodb_apps_data:
  vllm_models:

networks:
  pinecone-net:
    name: pinecone
  default:
    driver: bridge
  txt2kg-network:
    driver: bridge