mirror of
https://github.com/NVIDIA/dgx-spark-playbooks.git
synced 2026-04-23 02:23:53 +00:00
28 lines
724 B
Docker
# syntax=docker/dockerfile:1

# Use NVIDIA Triton Inference Server with vLLM - optimized for latest NVIDIA hardware
FROM nvcr.io/nvidia/tritonserver:25.08-vllm-python-py3

# Install curl for the HEALTHCHECK probe below.
# --no-install-recommends keeps the layer minimal; clearing the apt lists in
# the SAME layer keeps the cache out of the image entirely.
RUN apt-get update \
    && apt-get install -y --no-install-recommends curl \
    && rm -rf /var/lib/apt/lists/*

# Set working directory (WORKDIR creates it if it does not exist)
WORKDIR /app

# Copy the vLLM startup script.
# --chmod=755 makes it executable at copy time, replacing the separate
# `RUN chmod +x` layer the image previously needed.
COPY --chmod=755 launch_server.sh .

# Create model directory (mounted/populated at run time)
RUN mkdir -p /app/models

# Expose the service port — documentation only; publish with `docker run -p`
EXPOSE 8001

# Health check against the server's /health endpoint.
# -f fails on HTTP errors, -sS is silent except for real errors; the long
# start-period accommodates model loading before probes count as failures.
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
  CMD curl -fsS http://localhost:8001/health || exit 1

# Start vLLM's built-in OpenAI API server directly.
# Exec (JSON) form so the script runs as PID 1 and receives SIGTERM on stop.
CMD ["./launch_server.sh"]