dgx-spark-playbooks/nvidia/station-nanochat/assets/launch.sh

60 lines
1.4 KiB
Bash
Raw Normal View History

2026-05-26 18:25:53 +00:00
#!/bin/bash
#
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Get wandb API key
# The key has to come from the caller's environment; it is re-exported so that
# child processes started by this script inherit it.
export WANDB_API_KEY="${WANDB_API_KEY:-}"
if [[ -z "$WANDB_API_KEY" ]]; then
  # Diagnostics go to stderr so they are never mistaken for regular output.
  echo "WANDB_API_KEY is not set" >&2
  exit 1
fi
# WANDB_RUN falls back to "speedrun" when it is unset or empty.
export WANDB_RUN="${WANDB_RUN:-speedrun}"

# Get Hugging Face API key
# Same contract as above: required, non-empty, exported for child processes.
export HF_TOKEN="${HF_TOKEN:-}"
if [[ -z "$HF_TOKEN" ]]; then
  echo "HF_TOKEN is not set" >&2
  exit 1
fi
2026-05-29 15:56:45 +00:00
# Both caches live under the directory the script is started from, so no root
# paths are required.
workdir=$(pwd)
NANOCHAT_CACHE="${workdir}/nanochat_cache"
HF_CACHE="${workdir}/hf_cache"
2026-05-26 18:25:53 +00:00
# Signal handler: stop the running training container(s) and end the script.
#
# Looks up the running containers matched by the `ancestor=nanochat` filter
# and stops them, then exits the script with status 0.
# Arguments: none
# Outputs:   progress messages on stdout
# Returns:   does not return; always exits with status 0
cleanup() {
  local container_ids

  printf '\nStopping training container...\n'
  container_ids=$(docker ps -q --filter ancestor=nanochat)
  # Only call `docker stop` when there is something to stop; calling it with
  # no arguments is always an error.
  if [[ -n "$container_ids" ]]; then
    # Unquoted on purpose: each ID has to become a separate argument.
    # stderr is discarded because this is a best-effort shutdown.
    # shellcheck disable=SC2086
    docker stop $container_ids 2>/dev/null
  fi
  echo "Cleanup complete."
  exit 0
}
2026-05-29 15:56:45 +00:00
# Run cleanup (which stops the container and exits) on Ctrl-C or termination.
trap cleanup SIGINT SIGTERM

# Launch Nanochat training
# Create the host-side cache directories and make sure the current user has
# full access to them before they are bind-mounted into the container.
if ! mkdir -p -- "$NANOCHAT_CACHE" "$HF_CACHE"; then
  echo "Failed to create cache directories" >&2
  exit 1
fi
if ! chmod u+rwx -- "$NANOCHAT_CACHE" "$HF_CACHE"; then
  echo "Failed to set permissions on cache directories" >&2
  exit 1
fi

# The arguments are kept in an array so that paths containing whitespace
# survive intact (no string-built command, no extra `sh -c` layer).
docker_args=(
  --rm
  --runtime=nvidia
  --gpus all
  --ipc=host
  --net=host
  --ulimit memlock=-1
  --ulimit stack=268435456
  # Name-only form: docker copies the values from this script's exported
  # environment, which keeps the secrets off the process command line.
  -e WANDB_API_KEY
  -e WANDB_RUN
  -e HF_TOKEN
  -v "$(pwd)/nanochat:/workspace/nanochat"
  -v "${NANOCHAT_CACHE}:/root/.cache/nanochat"
  -v "${HF_CACHE}:/root/.cache/huggingface"
  -w /workspace/nanochat
)

# Run in the background and block in `wait`: a trapped signal interrupts
# `wait` right away, so the cleanup handler runs without waiting for the
# training to finish first.
docker run "${docker_args[@]}" nanochat bash runs/speedrun.sh &
training_pid=$!

wait "$training_pid"
training_status=$?
if [[ "$training_status" -ne 0 ]]; then
  # Report the failure instead of announcing success, and propagate the status.
  echo "Training failed with exit status ${training_status}" >&2
  exit "$training_status"
fi
echo -e "\nTraining complete!"