dgx-spark-playbooks/nvidia/station-nanochat/assets/launch.sh

#!/bin/bash
#
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Lite training (default). Runs speedrun.sh, which the setup step copies from speedrun_lite.sh.

# Required credentials: the wandb API key and the Hugging Face token.
# Both must already be set in the caller's environment. They are (re-)exported
# so that child processes started by this script can read them.
export WANDB_API_KEY="${WANDB_API_KEY:-}"
export HF_TOKEN="${HF_TOKEN:-}"

# Report every missing credential (not just the first one) on stderr, then
# abort with status 1 so the caller can fix them all in one go.
missing_env=0
for required_var in WANDB_API_KEY HF_TOKEN; do
    # ${!name} is bash indirect expansion: the value of the variable whose
    # name is stored in required_var.
    if [[ -z "${!required_var}" ]]; then
        echo "${required_var} is not set" >&2
        missing_env=1
    fi
done
if (( missing_env )); then
    exit 1
fi
unset missing_env required_var

# wandb run name; falls back to "speedrun" when unset or empty.
export WANDB_RUN="${WANDB_RUN:-speedrun}"

#######################################
# Signal handler: stop every running container created from the "nanochat"
# image, report the interruption, and terminate the script.
# Arguments: $1 - exit status to terminate with (optional, default 130)
# Outputs:   progress messages on stdout
# Returns:   never returns; exits the shell with the given status
#######################################
cleanup() {
    local exit_status=${1:-130}
    local -a container_ids=()

    # Restore default handling so a second Ctrl-C during cleanup cannot
    # re-enter this function.
    trap - INT TERM

    echo
    echo "Stopping containers..."
    # Collect the IDs first: calling `docker stop` with no arguments is an error.
    mapfile -t container_ids < <(docker ps -q --filter ancestor=nanochat 2>/dev/null)
    if (( ${#container_ids[@]} > 0 )); then
        # Best effort: a container may already have exited by the time we stop it.
        docker stop "${container_ids[@]}" 2>/dev/null || true
    fi
    echo "Interrupted training!"
    # Non-zero status so wrappers do not mistake an interrupted run for success.
    exit "$exit_status"
}

# Install the handler; without this the function above is never invoked.
# 130 and 143 are the conventional 128+signal statuses for SIGINT and SIGTERM.
trap 'cleanup 130' INT
trap 'cleanup 143' TERM

workdir=$(pwd)
# DGX Station: use local cache dirs so no root paths are required
NANOCHAT_CACHE="${NANOCHAT_CACHE:-${workdir}/nanochat_cache}"
HF_CACHE="${HF_CACHE:-${workdir}/hf_cache}"
mkdir -p -- "$NANOCHAT_CACHE" "$HF_CACHE" || exit 1

# Build the docker invocation as an argument array rather than a string fed
# to `sh -c`, so paths containing whitespace survive intact and no value is
# re-parsed by a second shell.
docker_args=(
    --rm
    --runtime=nvidia
    --gpus all
    --ipc=host
    --net=host
    --ulimit memlock=-1
    --ulimit stack=268435456
    # `-e NAME` without `=value` forwards the variable from this script's
    # environment, keeping the secrets out of the docker command line.
    -e WANDB_API_KEY
    -e WANDB_RUN
    -e HF_TOKEN
    -v "${workdir}/nanochat:/workspace/nanochat"
    -v "${NANOCHAT_CACHE}:/root/.cache/nanochat"
    -v "${HF_CACHE}:/root/.cache/huggingface"
    -w /workspace/nanochat
    nanochat
    bash speedrun.sh
)

# Run in the background and `wait` on it: unlike a foreground command, `wait`
# returns as soon as a trapped signal arrives, so any installed signal handler
# runs immediately. Waiting on the PID also yields the real exit status
# instead of guessing completion from `docker ps` output.
docker run "${docker_args[@]}" &
train_pid=$!
wait "$train_pid"
train_status=$?

if (( train_status != 0 )); then
    echo >&2
    echo "Training failed (docker run exited with status ${train_status})" >&2
    exit "$train_status"
fi

echo
echo "Training complete!"
exit 0