mirror of
https://github.com/NVIDIA/dgx-spark-playbooks.git
synced 2026-06-20 21:29:31 +00:00
60 lines
1.4 KiB
Bash
Executable File
60 lines
1.4 KiB
Bash
Executable File
#!/bin/bash
#
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Weights & Biases credentials.
# WANDB_API_KEY is required; abort early with a diagnostic on stderr when it
# is missing. WANDB_RUN falls back to "speedrun" when unset or empty.
export WANDB_API_KEY="${WANDB_API_KEY:-}"
if [ -z "$WANDB_API_KEY" ]; then
  echo "WANDB_API_KEY is not set" >&2
  exit 1
fi
export WANDB_RUN="${WANDB_RUN:-speedrun}"

# Hugging Face access token (required).
export HF_TOKEN="${HF_TOKEN:-}"
if [ -z "$HF_TOKEN" ]; then
  echo "HF_TOKEN is not set" >&2
  exit 1
fi
|
|
|
|
# Use local cache dirs so no root paths are required.
# Both caches live under the directory the script was started from; resolve
# that directory once and derive the paths from it.
workdir=$(pwd)
NANOCHAT_CACHE="$workdir/nanochat_cache"
HF_CACHE="$workdir/hf_cache"
|
|
|
|
#######################################
# Signal handler: stop the running training container(s) and exit.
# Stops every running container whose image ancestor is "nanochat".
# Arguments: none
# Outputs:   progress messages on stdout
# Returns:   does not return; exits the script with status 0
#######################################
cleanup() {
  local container_ids

  printf '\nStopping training container...\n'

  container_ids=$(docker ps -q --filter ancestor=nanochat)
  # Only call "docker stop" when something matched; with no arguments it
  # just fails with a usage error.
  if [ -n "$container_ids" ]; then
    # Word splitting is intended here: one container ID per line.
    # shellcheck disable=SC2086
    docker stop $container_ids 2>/dev/null
  fi

  echo "Cleanup complete."
  exit 0
}
|
|
|
|
# On Ctrl-C or termination, stop the training container before exiting.
trap cleanup SIGINT SIGTERM

# Launch Nanochat training

# Create the host-side cache directories that are bind-mounted below.
if ! mkdir -p -- "$NANOCHAT_CACHE" "$HF_CACHE" ||
  ! chmod u+rwx -- "$NANOCHAT_CACHE" "$HF_CACHE"; then
  echo "Failed to prepare cache directories" >&2
  exit 1
fi

# Build the command as an array so paths containing spaces stay intact.
# The -e flags name the variables only: docker then copies the values from
# this script's (exported) environment, which keeps the API keys out of the
# process argument list.
docker_cmd=(
  docker run
  --rm
  --runtime=nvidia
  --gpus all
  --ipc=host
  --net=host
  --ulimit memlock=-1
  --ulimit stack=268435456
  -e WANDB_API_KEY
  -e WANDB_RUN
  -e HF_TOKEN
  -v "$(pwd)/nanochat:/workspace/nanochat"
  -v "$NANOCHAT_CACHE:/root/.cache/nanochat"
  -v "$HF_CACHE:/root/.cache/huggingface"
  -w /workspace/nanochat
  nanochat
  bash runs/speedrun.sh
)

# Run in the background and block in "wait" so the signal trap can run
# while training is still in progress.
"${docker_cmd[@]}" &
train_pid=$!

wait "$train_pid"
train_status=$?
if [ "$train_status" -ne 0 ]; then
  # Do not report success when the container exited with an error.
  echo "Training failed with exit code $train_status" >&2
  exit "$train_status"
fi

echo -e "\nTraining complete!"
|