# Dockerfile for serving models via vLLM's OpenAI-compatible API server.
# Base image: NVIDIA Triton Inference Server with vLLM backend — optimized
# for latest NVIDIA hardware.
FROM nvcr.io/nvidia/tritonserver:25.08-vllm-python-py3

# Install curl (needed by the HEALTHCHECK below); --no-install-recommends
# and cleaning the apt lists keep the layer small.
RUN apt-get update \
    && apt-get install -y --no-install-recommends curl \
    && rm -rf /var/lib/apt/lists/*

# Set working directory for all subsequent COPY/RUN/CMD instructions
WORKDIR /app

# Copy the vLLM startup script into the image
COPY launch_server.sh .

# Make the startup script executable
RUN chmod +x launch_server.sh

# Create the model directory the server loads from
RUN mkdir -p /app/models

# Expose the service port (the server is expected to listen on 8001;
# the health check below probes the same port)
EXPOSE 8001

# Health check: generous 60s start period allows for model loading before
# the first probe counts against the retry budget.
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD curl -f http://localhost:8001/health || exit 1

# Start vLLM's built-in OpenAI API server directly via the launch script.
# Exec form (JSON array) so the script runs as PID 1 and receives signals.
CMD ["./launch_server.sh"]