# mirror of https://github.com/NVIDIA/dgx-spark-playbooks.git
# synced 2026-04-23 02:23:53 +00:00

# This is a legacy file - use --with-optional flag instead
# The vLLM service is now included in docker-compose.optional.yml
# This file is kept for backwards compatibility

services:
  app:
    build:
      context: ../..
      dockerfile: deploy/app/Dockerfile
    ports:
      - '3001:3000'
    environment:
      - ARANGODB_URL=http://arangodb:8529
      - ARANGODB_DB=txt2kg
      - PINECONE_HOST=entity-embeddings
      - PINECONE_PORT=5081
      - PINECONE_API_KEY=pclocal
      - PINECONE_ENVIRONMENT=local
      - LANGCHAIN_TRACING_V2=true
      - SENTENCE_TRANSFORMER_URL=http://sentence-transformers:80
      - MODEL_NAME=all-MiniLM-L6-v2
      - GRPC_SSL_CIPHER_SUITES=HIGH+ECDSA:HIGH+aRSA
      - NODE_TLS_REJECT_UNAUTHORIZED=0
      - OLLAMA_BASE_URL=http://ollama:11434/v1
      - OLLAMA_MODEL=qwen3:1.7b
      - VLLM_BASE_URL=http://vllm:8001/v1
      - VLLM_MODEL=meta-llama/Llama-3.2-3B-Instruct
      - REMOTE_WEBGPU_SERVICE_URL=http://txt2kg-remote-webgpu:8083
    networks:
      - pinecone-net
      - default
      - txt2kg-network
    depends_on:
      - arangodb
      - entity-embeddings
      - sentence-transformers
      - vllm

  arangodb:
    image: arangodb:latest
    ports:
      - '8529:8529'
    environment:
      - ARANGO_NO_AUTH=1
    volumes:
      - arangodb_data:/var/lib/arangodb3
      - arangodb_apps_data:/var/lib/arangodb3-apps

  # One-shot helper: waits for ArangoDB, then creates the txt2kg database
  # (idempotent — "duplicate" errors are treated as already-created).
  arangodb-init:
    image: arangodb:latest
    depends_on:
      arangodb:
        condition: service_started
    restart: on-failure
    # Folded scalar keeps backslash escapes literal, as the shell requires.
    entrypoint: >
      sh -c "
      echo 'Waiting for ArangoDB to start...' &&
      sleep 10 &&
      echo 'Creating txt2kg database...' &&
      arangosh --server.endpoint tcp://arangodb:8529 --server.authentication false --javascript.execute-string 'try { db._createDatabase(\"txt2kg\"); console.log(\"Database txt2kg created successfully!\"); } catch(e) { if(e.message.includes(\"duplicate\")) { console.log(\"Database txt2kg already exists\"); } else { throw e; } }'
      "

  entity-embeddings:
    image: ghcr.io/pinecone-io/pinecone-index:latest
    container_name: entity-embeddings
    environment:
      PORT: 5081
      INDEX_TYPE: serverless
      VECTOR_TYPE: dense
      DIMENSION: 384
      METRIC: cosine
      INDEX_NAME: entity-embeddings
    ports:
      - "5081:5081"
    platform: linux/amd64
    networks:
      - pinecone-net
    restart: unless-stopped

  sentence-transformers:
    build:
      context: ../../deploy/services/sentence-transformers
      dockerfile: Dockerfile
    ports:
      - '8000:80'
    environment:
      - MODEL_NAME=all-MiniLM-L6-v2
    networks:
      - default

  vllm:
    build:
      context: ../../deploy/services/vllm
      dockerfile: Dockerfile
    container_name: vllm-service
    ports:
      - '8001:8001'
    environment:
      # Model configuration
      - VLLM_MODEL=meta-llama/Llama-3.2-3B-Instruct
      - VLLM_TENSOR_PARALLEL_SIZE=1
      - VLLM_MAX_MODEL_LEN=4096
      - VLLM_GPU_MEMORY_UTILIZATION=0.9
      # NVfp4 quantization settings
      - VLLM_QUANTIZATION=fp8
      - VLLM_KV_CACHE_DTYPE=fp8
      # Service configuration
      - VLLM_PORT=8001
      - VLLM_HOST=0.0.0.0
      # Performance tuning
      - CUDA_VISIBLE_DEVICES=0
      - NCCL_DEBUG=INFO
    volumes:
      - vllm_models:/app/models
      - /tmp:/tmp
      # Mount model cache for faster startup
      - ~/.cache/huggingface:/root/.cache/huggingface
    networks:
      - default
    restart: unless-stopped
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8001/v1/models"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 120s  # Longer start period for model loading

volumes:
  arangodb_data:
  arangodb_apps_data:
  vllm_models:

networks:
  pinecone-net:
    name: pinecone
  default:
    driver: bridge
  txt2kg-network:
    driver: bridge