dgx-spark-playbooks/nvidia/txt2kg/assets/deploy/services/vllm/Dockerfile
2025-10-06 17:05:41 +00:00

28 lines
724 B
Docker

# syntax=docker/dockerfile:1

# Use NVIDIA Triton Inference Server with vLLM - optimized for latest NVIDIA hardware.
# Tag is pinned to a specific release (25.08); pin by digest for full reproducibility.
FROM nvcr.io/nvidia/tritonserver:25.08-vllm-python-py3

# Install curl for the HEALTHCHECK probe. update+install in one layer (avoids
# stale-cache bugs) and clean the apt lists in the same layer so they never
# persist in the image; --no-install-recommends keeps the layer minimal.
RUN apt-get update \
    && apt-get install -y --no-install-recommends curl \
    && rm -rf /var/lib/apt/lists/*

# Working directory for the service (created automatically if missing).
WORKDIR /app

# Copy the vLLM startup script, marking it executable at copy time
# (replaces a separate `RUN chmod +x` layer, which would duplicate the file).
COPY --chmod=755 launch_server.sh .

# Directory where model weights are expected at runtime.
RUN mkdir -p /app/models

# NOTE(review): no USER directive — container runs as root. Confirm whether
# the Triton base image / launch_server.sh can run as a non-root user.

# Documentation only (does not publish the port): the API listens on 8001.
EXPOSE 8001

# Probe the health endpoint; -f fails on HTTP errors, -sS stays quiet except
# on error. Generous start period allows for model loading.
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
  CMD curl -fsS http://localhost:8001/health || exit 1

# Start vLLM's built-in OpenAI API server directly (exec form, so the script
# is PID 1 and receives SIGTERM from `docker stop`; launch_server.sh should
# `exec` the server process for the same reason — verify in that script).
CMD ["./launch_server.sh"]