dgx-spark-playbooks/nvidia/station-rec-sys/assets/teardown.sh
2026-05-26 18:25:53 +00:00

291 lines
9.2 KiB
Bash
Executable File

#!/usr/bin/env bash
# teardown.sh — Stop processes and optionally remove all downloaded assets.
#
# Kills running demo processes (Ollama, vLLM, training, FastAPI,
# W&B). With --purge-downloads, also removes all downloaded data,
# base models, environments, and HLLM code — but preserves trained checkpoints.
#
# USAGE EXAMPLES:
#
# # Kill all running demo processes
# bash teardown.sh
#
# # Preview what would be killed (no action taken)
# bash teardown.sh --dry-run
#
# # Kill processes AND remove all downloaded assets (~60 GB)
# bash teardown.sh --purge-downloads
#
# # Same, but data lives on a custom mount
# bash teardown.sh --purge-downloads --data-dir /raid
#
# # Preview what --purge-downloads would remove
# bash teardown.sh --purge-downloads --dry-run
#
# # Use environment variable instead of CLI flag
# export PLAYBOOK_WORKSPACE=/raid
# bash teardown.sh --purge-downloads
#
# # Show all options
# bash teardown.sh --help
#
# WHAT --purge-downloads REMOVES:
# - Amazon raw + processed data (~46 GB)
# - TinyLlama-1.1B base model (~2 GB)
# - Ollama nemotron-mini model (~3 GB)
# - HLLM code, dataset, information (~1 GB)
# - Project .venv, wandb/tensorboard logs (~2+ GB)
#
# WHAT --purge-downloads PRESERVES:
# - Saved checkpoints ($WORKSPACE/checkpoints/) — your trained model weights
# - This repo's source code
# - uv and ollama binaries (system tools)
set -euo pipefail
DRY_RUN=false
PURGE=false
WORKSPACE="${PLAYBOOK_WORKSPACE:-$HOME}"
for arg in "$@"; do
case "$arg" in
--dry-run) DRY_RUN=true ;;
--purge-downloads) PURGE=true ;;
--workspace=*) WORKSPACE="${arg#*=}" ;;
--workspace) ;; # handled below with next arg
-h|--help)
cat <<'HELPEOF'
Usage: bash teardown.sh [OPTIONS]
Options:
(no flags) Kill running demo processes only
--dry-run Show what would happen without doing it
--purge-downloads Kill processes AND remove all downloaded assets:
- Amazon raw + processed data (~46 GB)
- Base models (TinyLlama) (~2 GB)
- Ollama nemotron-mini model (~3 GB)
- HLLM code + dataset + info (~1 GB)
- Project .venv (~2+ GB)
Preserves:
- Saved checkpoints ($WORKSPACE/checkpoints/)
- This repo's source code
- uv and ollama binaries
--workspace PATH Root workspace directory (default: $HOME)
Use when artifacts live on a different mount, e.g.:
bash teardown.sh --purge-downloads --workspace /raid/recsys-playbook
-h, --help Show this help
Environment:
PLAYBOOK_WORKSPACE Same as --workspace (CLI flag takes precedence)
HELPEOF
exit 0
;;
*)
# Handle --workspace as two separate args
if [[ "${prev_arg:-}" == "--workspace" ]]; then
WORKSPACE="$arg"
else
echo "Unknown option: $arg (run --help for usage)"
exit 1
fi
;;
esac
prev_arg="$arg"
done
REPO_DIR="$(cd "$(dirname "$0")/.." && pwd)"
HLLM_CODE_DIR="$WORKSPACE/hllm-code"
DATA_DIR="$WORKSPACE/data"
MODELS_DIR="$WORKSPACE/models"
CHECKPOINTS_DIR="$WORKSPACE/checkpoints"
echo "Workspace: $WORKSPACE"
echo ""
# ===================================================================
# Phase 1: Kill processes (always runs)
# ===================================================================
killed=0
kill_by_pattern() {
local label="$1"
local pattern="$2"
local pids
pids=$(pgrep -f "$pattern" 2>/dev/null || true)
if [ -n "$pids" ]; then
echo " $label"
for pid in $pids; do
if [ "$pid" = "$$" ] || [ "$pid" = "$PPID" ]; then
continue
fi
cmd=$(ps -p "$pid" -o args= 2>/dev/null || true)
cmd=${cmd:0:80}
echo " PID $pid: $cmd"
if ! $DRY_RUN; then
kill "$pid" 2>/dev/null && killed=$((killed + 1)) || true
else
killed=$((killed + 1))
fi
done
fi
}
echo "Stopping demo processes..."
echo ""
kill_by_pattern "Ollama server" "ollama serve"
kill_by_pattern "vLLM server" "vllm.entrypoints"
kill_by_pattern "HLLM training (torchrun)" "torchrun.*run.py"
kill_by_pattern "HLLM training (deepspeed)" "deepspeed.*run.py"
kill_by_pattern "HLLM embedding extraction" "extract_embeddings.py"
kill_by_pattern "FastAPI app (uvicorn)" "uvicorn.*app:app"
kill_by_pattern "W&B agent" "wandb-service"
echo ""
if [ "$killed" -gt 0 ]; then
if $DRY_RUN; then
echo "Would kill $killed process(es)."
else
echo "Killed $killed process(es)."
sleep 2
stragglers=$(pgrep -f "ollama serve|vllm.entrypoints|torchrun.*run.py|uvicorn.*app:app" 2>/dev/null || true)
if [ -n "$stragglers" ]; then
echo "Stragglers still running (sending SIGKILL):"
for pid in $stragglers; do
cmd=$(ps -p "$pid" -o args= 2>/dev/null || true)
cmd=${cmd:0:80}
echo " PID $pid: $cmd"
kill -9 "$pid" 2>/dev/null || true
done
fi
fi
else
echo "No demo processes found running."
fi
# ===================================================================
# Phase 2: Remove downloaded assets (only with --purge-downloads)
# ===================================================================
if ! $PURGE; then
exit 0
fi
echo ""
echo "============================================================"
echo " --purge-downloads: removing installed assets"
echo "============================================================"
echo ""
echo "Will remove:"
found=0
check_dir() {
local label="$1"
local path="$2"
if [ -d "$path" ]; then
local size
size=$(du -sh "$path" 2>/dev/null | cut -f1)
echo " $label: $path ($size)"
return 0
fi
return 1
}
check_dir "Amazon raw data" "$DATA_DIR/raw" && found=$((found + 1))
check_dir "Amazon processed data" "$DATA_DIR/processed" && found=$((found + 1))
check_dir "Base models" "$MODELS_DIR" && found=$((found + 1))
check_dir "HLLM code" "$HLLM_CODE_DIR" && found=$((found + 1))
check_dir "HLLM tensorboard logs" "$HLLM_CODE_DIR/log_tensorboard" && found=$((found + 1))
check_dir "HLLM wandb logs" "$HLLM_CODE_DIR/wandb" && found=$((found + 1))
check_dir "Project .venv" "$REPO_DIR/.venv" && found=$((found + 1))
check_dir "Project wandb logs" "$REPO_DIR/wandb" && found=$((found + 1))
check_dir "Project tensorboard" "$REPO_DIR/log_tensorboard" && found=$((found + 1))
if ollama list 2>/dev/null | grep -q "nemotron-mini"; then
echo " Ollama nemotron-mini model (~2.7 GB)"
found=$((found + 1))
fi
echo ""
echo "Will preserve:"
echo " Saved checkpoints: $CHECKPOINTS_DIR/ (trained model weights)"
echo " Repo source code: $REPO_DIR"
echo " uv binary: $(which uv 2>/dev/null || echo 'not installed')"
echo " ollama binary: $(which ollama 2>/dev/null || echo 'not installed')"
if [ "$found" -eq 0 ]; then
echo ""
echo "Nothing to remove."
exit 0
fi
# Require explicit confirmation unless dry-run
if ! $DRY_RUN; then
echo ""
read -r -p "Proceed? This will delete all of the above. [y/N] " confirm
if [[ ! "$confirm" =~ ^[Yy]$ ]]; then
echo "Aborted."
exit 0
fi
fi
remove_dir() {
local label="$1"
local path="$2"
if [ -d "$path" ]; then
if $DRY_RUN; then
echo " [dry-run] would remove $path"
else
echo " Removing $path ..."
rm -rf "$path"
fi
fi
}
echo ""
# --- Amazon data ---
remove_dir "Amazon raw data" "$DATA_DIR/raw"
remove_dir "Amazon processed data" "$DATA_DIR/processed"
if [ -d "$DATA_DIR" ] && [ -z "$(ls -A "$DATA_DIR" 2>/dev/null)" ]; then
remove_dir "Data dir" "$DATA_DIR"
fi
# --- Base models (NOT checkpoints) ---
remove_dir "Base models" "$MODELS_DIR"
# --- HLLM code (preserve $CHECKPOINTS_DIR) ---
remove_dir "HLLM code" "$HLLM_CODE_DIR"
# --- Ollama model ---
if ollama list 2>/dev/null | grep -q "nemotron-mini"; then
if $DRY_RUN; then
echo " [dry-run] would run: ollama rm nemotron-mini"
else
echo " Removing Ollama nemotron-mini model..."
ollama rm nemotron-mini 2>/dev/null || true
fi
fi
# --- Project environments and logs ---
remove_dir "Project .venv" "$REPO_DIR/.venv"
remove_dir "Project wandb logs" "$REPO_DIR/wandb"
remove_dir "Project tensorboard" "$REPO_DIR/log_tensorboard"
echo ""
if $DRY_RUN; then
echo "Dry run complete. Run without --dry-run to execute."
else
echo "Purge complete."
echo ""
echo "Preserved:"
if [ -d "$CHECKPOINTS_DIR" ]; then
echo " $CHECKPOINTS_DIR/ ($(du -sh "$CHECKPOINTS_DIR" 2>/dev/null | cut -f1))"
fi
echo " $REPO_DIR (repo source)"
echo ""
echo "To fully rebuild: bash $REPO_DIR/assets/setup.sh --workspace $WORKSPACE"
fi