# This is a legacy file - use --with-optional flag instead
# The vLLM service is now included in docker-compose.optional.yml
# This file is kept for backwards compatibility
services:
  # Main application: talks to ArangoDB, the local Pinecone index,
  # the sentence-transformers embedder, and the vLLM inference service.
  app:
    build:
      context: ../..
      dockerfile: deploy/app/Dockerfile
    ports:
      - "3001:3000"
    environment:
      - ARANGODB_URL=http://arangodb:8529
      - ARANGODB_DB=txt2kg
      - PINECONE_HOST=entity-embeddings
      - PINECONE_PORT=5081
      - PINECONE_API_KEY=pclocal
      - PINECONE_ENVIRONMENT=local
      - LANGCHAIN_TRACING_V2=true
      - SENTENCE_TRANSFORMER_URL=http://sentence-transformers:80
      - MODEL_NAME=all-MiniLM-L6-v2
      - GRPC_SSL_CIPHER_SUITES=HIGH+ECDSA:HIGH+aRSA
      - NODE_TLS_REJECT_UNAUTHORIZED=0
      - OLLAMA_BASE_URL=http://ollama:11434/v1
      - OLLAMA_MODEL=qwen3:1.7b
      - VLLM_BASE_URL=http://vllm:8001/v1
      - VLLM_MODEL=meta-llama/Llama-3.2-3B-Instruct
      - REMOTE_WEBGPU_SERVICE_URL=http://txt2kg-remote-webgpu:8083
    networks:
      - pinecone-net
      - default
      - txt2kg-network
    depends_on:
      - arangodb
      - entity-embeddings
      - sentence-transformers
      - vllm

  # Graph database backing the knowledge graph (auth disabled for local dev).
  arangodb:
    image: arangodb:latest
    ports:
      - "8529:8529"
    environment:
      - ARANGO_NO_AUTH=1
    volumes:
      - arangodb_data:/var/lib/arangodb3
      - arangodb_apps_data:/var/lib/arangodb3-apps

  # One-shot init container: waits for ArangoDB, then idempotently creates
  # the txt2kg database (retried on failure until it succeeds).
  arangodb-init:
    image: arangodb:latest
    depends_on:
      arangodb:
        condition: service_started
    restart: on-failure
    # Folded scalar: line breaks below become single spaces in the command.
    entrypoint: >
      sh -c "
      echo 'Waiting for ArangoDB to start...' &&
      sleep 10 &&
      echo 'Creating txt2kg database...' &&
      arangosh --server.endpoint tcp://arangodb:8529
      --server.authentication false
      --javascript.execute-string
      'try { db._createDatabase(\"txt2kg\"); console.log(\"Database txt2kg created successfully!\"); } catch(e) { if(e.message.includes(\"duplicate\")) { console.log(\"Database txt2kg already exists\"); } else { throw e; } }'
      "

  # Local Pinecone-compatible vector index for entity embeddings.
  entity-embeddings:
    image: ghcr.io/pinecone-io/pinecone-index:latest
    container_name: entity-embeddings
    environment:
      # Env values quoted so YAML keeps them as strings, not integers.
      PORT: "5081"
      INDEX_TYPE: serverless
      VECTOR_TYPE: dense
      DIMENSION: "384"
      METRIC: cosine
      INDEX_NAME: entity-embeddings
    ports:
      - "5081:5081"
    platform: linux/amd64
    networks:
      - pinecone-net
    restart: unless-stopped

  # Embedding service exposing the all-MiniLM-L6-v2 model on container port 80.
  sentence-transformers:
    build:
      context: ../../deploy/services/sentence-transformers
      dockerfile: Dockerfile
    ports:
      - "8000:80"
    environment:
      - MODEL_NAME=all-MiniLM-L6-v2
    networks:
      - default

  # GPU-backed vLLM inference service (OpenAI-compatible API on port 8001).
  vllm:
    build:
      context: ../../deploy/services/vllm
      dockerfile: Dockerfile
    container_name: vllm-service
    ports:
      - "8001:8001"
    environment:
      # Model configuration
      - VLLM_MODEL=meta-llama/Llama-3.2-3B-Instruct
      - VLLM_TENSOR_PARALLEL_SIZE=1
      - VLLM_MAX_MODEL_LEN=4096
      - VLLM_GPU_MEMORY_UTILIZATION=0.9
      # FP8 quantization settings (weights and KV cache)
      - VLLM_QUANTIZATION=fp8
      - VLLM_KV_CACHE_DTYPE=fp8
      # Service configuration
      - VLLM_PORT=8001
      - VLLM_HOST=0.0.0.0
      # Performance tuning
      - CUDA_VISIBLE_DEVICES=0
      - NCCL_DEBUG=INFO
    volumes:
      - vllm_models:/app/models
      - /tmp:/tmp
      # Mount model cache for faster startup
      - ~/.cache/huggingface:/root/.cache/huggingface
    networks:
      - default
    restart: unless-stopped
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8001/v1/models"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 120s  # Longer start period for model loading

volumes:
  arangodb_data:
  arangodb_apps_data:
  vllm_models:

networks:
  pinecone-net:
    name: pinecone
  default:
    driver: bridge
  txt2kg-network:
    driver: bridge