dgx-spark-playbooks/nvidia/station-rec-sys/assets/setup.sh
2026-05-26 18:25:53 +00:00

504 lines
20 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
# setup.sh — Automated setup for the standalone recommender playbook.
#
# Installs all dependencies, downloads models and data, and prepares the
# environment to run the E2E HLLM recommendation pipeline.
# Idempotent: safe to re-run if interrupted — skips completed steps.
#
# USAGE EXAMPLES:
#
# # Default setup — stores data/models under $HOME
# bash setup.sh
#
# # Preview what would be done without executing
# bash setup.sh --dry-run
#
# # Verify prerequisites only
# bash setup.sh --check
#
# # Store large files on a different mount (SSD, NVMe, RAID array, etc.)
# bash setup.sh --workspace /raid/recsys-playbook
# bash setup.sh --workspace /mnt/nvme/recsys-playbook
# bash setup.sh --workspace /data/recsys-playbook
# bash setup.sh --data-dir /raid/recsys-playbook
#
# # Use environment variable instead of CLI flag
# export PLAYBOOK_WORKSPACE=/raid
# bash setup.sh
#
# # Show all options
# bash setup.sh --help
#
# WHAT IT INSTALLS under $PLAYBOOK_WORKSPACE (~80 GB total):
# - uv (Python project manager) — ~/.local/bin/
# - Ollama (LLM serving) — /usr/local/bin/ (requires sudo)
# - Nemotron Mini 4B (Ollama) — ~/.ollama/models/ (~3 GB)
# - Python deps (uv venv) — this repo's .venv/
# - HLLM code (ByteDance) — $WORKSPACE/hllm-code/ (~1 GB)
# - TinyLlama-1.1B model — $WORKSPACE/models/ (~2 GB, HF cache stays at default)
# - Amazon Clothing dataset — $WORKSPACE/data/ (~46 GB)
# - Training checkpoints — $WORKSPACE/checkpoints/ (~20-30 GB after training)
#
# REQUIRES: NVIDIA GPU with drivers installed, internet access.
# SUDO: Required only for Ollama install (can be pre-installed by an admin).
set -euo pipefail

# Defaults. CLI flags parsed by parse_args override them; the workspace falls
# back to $PLAYBOOK_WORKSPACE and then to $HOME.
DRY_RUN=false
CHECK_ONLY=false
WORKSPACE="${PLAYBOOK_WORKSPACE:-$HOME}"

#######################################
# Print the usage text to stdout.
# Arguments: none
#######################################
print_help() {
  cat <<'HELPEOF'
Usage: bash setup.sh [OPTIONS]
Options:
(no flags) Run full setup
--dry-run Print commands without executing
--check Validate prerequisites only
--workspace PATH Root directory for all playbook artifacts
(default: $HOME, or $PLAYBOOK_WORKSPACE if set)
Example: bash setup.sh --workspace /raid/recsys-playbook
--data-dir PATH Alias for --workspace
-h, --help Show this help
Environment:
PLAYBOOK_WORKSPACE Same as --workspace (CLI flag takes precedence)
What gets created under WORKSPACE (~80 GB total):
WORKSPACE/
├── station-rec-sys/ The playbook repo + uv venv
├── hllm-code/ ByteDance HLLM (~1 GB)
├── data/ Amazon dataset — raw + processed (~46 GB)
├── models/ TinyLlama-1.1B (~2 GB)
└── checkpoints/ HLLM training output (~20-30 GB after training)
HuggingFace and Ollama caches stay at their default locations (~/.cache/huggingface,
~/.ollama) so they can be shared with other projects on the machine.
HELPEOF
}

#######################################
# Parse the command line and normalise the workspace path.
# Globals:   DRY_RUN, CHECK_ONLY, WORKSPACE (written)
# Arguments: the script's positional parameters
# Outputs:   help text on stdout for -h/--help; error messages on stderr
# Exits:     0 after printing help; 1 on an unknown option, on
#            --workspace/--data-dir without a value, or on an empty workspace
#######################################
parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      --dry-run) DRY_RUN=true ;;
      --check) CHECK_ONLY=true ;;
      --workspace=* | --data-dir=*) WORKSPACE="${1#*=}" ;;
      --workspace | --data-dir)
        # The value is the next word. A missing value, or another flag in its
        # place, is an error instead of being silently ignored.
        if (( $# < 2 )) || [[ "$2" == -* ]]; then
          echo "Option $1 requires a PATH argument (run --help for usage)" >&2
          exit 1
        fi
        WORKSPACE="$2"
        shift
        ;;
      -h | --help)
        print_help
        exit 0
        ;;
      *)
        echo "Unknown option: $1 (run --help for usage)" >&2
        exit 1
        ;;
    esac
    shift
  done

  # An empty workspace would make every derived path root-level (/hllm-code ...).
  if [[ -z "$WORKSPACE" ]]; then
    echo "Workspace path must not be empty (run --help for usage)" >&2
    exit 1
  fi
  # The script changes directory later on, so a relative workspace must be
  # anchored to the directory the user invoked the script from.
  if [[ "$WORKSPACE" != /* ]]; then
    WORKSPACE="$PWD/$WORKSPACE"
  fi
  WORKSPACE="${WORKSPACE%/}" # strip trailing slash if any
}

parse_args "$@"

# Repo root is the parent of the directory holding this script.
REPO_DIR="$(cd "$(dirname "$0")/.." && pwd)"
HLLM_CODE_DIR="$WORKSPACE/hllm-code"
DATA_DIR="$WORKSPACE/data"
MODELS_DIR="$WORKSPACE/models"
CHECKPOINTS_DIR="$WORKSPACE/checkpoints"
#######################################
# Print the deepest existing directory at or above the given path.
# Used so free space can be measured for a workspace that is not created yet.
# Arguments: $1 - path (may not exist)
# Outputs:   the path itself if it is a directory, otherwise its nearest
#            existing ancestor ("/" or "." at the latest)
#######################################
nearest_existing_dir() {
  local candidate="${1:-/}"
  while [[ ! -d "$candidate" && "$candidate" != "/" && "$candidate" != "." ]]; do
    candidate="$(dirname "$candidate")"
  done
  printf '%s\n' "$candidate"
}

# --check: report prerequisites and exit without installing anything.
# Every probe is guarded so that a failing tool is reported as a result line
# instead of aborting the script through `set -e` / `pipefail`.
if [[ "${CHECK_ONLY:-false}" == true ]]; then
  echo "Checking pre-requisites..."
  echo ""
  GREEN='\033[0;32m'
  YELLOW='\033[0;33m'
  RED='\033[0;31m'
  NC='\033[0m'
  ok=0
  warn=0
  fail=0

  # GPU / driver. `sed -n 1p` reads all input, so nvidia-smi cannot be killed
  # by SIGPIPE the way it can with `head -1`.
  if ! command -v nvidia-smi &>/dev/null; then
    echo -e " ${RED}[FAIL]${NC} nvidia-smi not found — NVIDIA driver not installed"
    fail=$((fail + 1))
  elif gpu_name=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | sed -n '1p') \
    && [[ -n "$gpu_name" ]]; then
    gpu_mem_mib=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits 2>/dev/null | sed -n '1p') \
      || gpu_mem_mib=""
    gpu_mem_mib="${gpu_mem_mib//[[:space:]]/}"
    # The value may be non-numeric (e.g. "[N/A]"); only do arithmetic on digits.
    if [[ "$gpu_mem_mib" =~ ^[0-9]+$ ]]; then
      gpu_mem_desc="$(( gpu_mem_mib / 1024 )) GB"
    else
      gpu_mem_desc="memory size not reported"
    fi
    echo -e " ${GREEN}[OK]${NC} GPU: $gpu_name (${gpu_mem_desc})"
    ok=$((ok + 1))
  else
    echo -e " ${RED}[FAIL]${NC} nvidia-smi failed — NVIDIA driver not responding"
    fail=$((fail + 1))
  fi

  # CUDA toolkit (optional).
  if command -v nvcc &>/dev/null; then
    cuda_ver=$(nvcc --version | grep "release" | sed 's/.*release //' | sed 's/,.*//') \
      || cuda_ver="unknown"
    echo -e " ${GREEN}[OK]${NC} CUDA: $cuda_ver"
    ok=$((ok + 1))
  else
    echo -e " ${YELLOW}[WARN]${NC} nvcc not found — CUDA toolkit not on PATH (driver CUDA may still work)"
    warn=$((warn + 1))
  fi

  # Free disk space, measured on the nearest existing ancestor of the workspace.
  df_target="$(nearest_existing_dir "$WORKSPACE")"
  avail_gb=$(df -BG "$df_target" 2>/dev/null | tail -1 | awk '{print $4}' | tr -d 'G') \
    || avail_gb=""
  if [[ "$avail_gb" =~ ^[0-9]+$ ]] && (( avail_gb >= 80 )); then
    echo -e " ${GREEN}[OK]${NC} Disk: ${avail_gb} GB available at $WORKSPACE (need ~80 GB)"
    ok=$((ok + 1))
  else
    echo -e " ${YELLOW}[WARN]${NC} Disk: ${avail_gb:-?} GB available at $WORKSPACE (need ~80 GB)"
    echo " Set PLAYBOOK_WORKSPACE to a path with more space, or use --workspace"
    warn=$((warn + 1))
  fi

  # Passwordless sudo (only the Ollama installer needs it).
  if command -v sudo &>/dev/null && sudo -n true 2>/dev/null; then
    echo -e " ${GREEN}[OK]${NC} sudo: available (needed for Ollama install)"
    ok=$((ok + 1))
  else
    echo -e " ${YELLOW}[WARN]${NC} sudo: not available or requires password"
    echo " Ollama install needs sudo — ask an admin to pre-install if needed"
    warn=$((warn + 1))
  fi

  # Command-line tools the setup steps invoke.
  for tool in git wget curl; do
    if command -v "$tool" &>/dev/null; then
      echo -e " ${GREEN}[OK]${NC} $tool: $(command -v "$tool")"
      ok=$((ok + 1))
    else
      echo -e " ${RED}[FAIL]${NC} $tool: not found"
      fail=$((fail + 1))
    fi
  done

  echo ""
  echo -e "Result: ${GREEN}$ok passed${NC}, ${YELLOW}$warn warnings${NC}, ${RED}$fail failed${NC}"
  if [ "$fail" -gt 0 ]; then
    echo "Fix failures above before running setup."
  elif [ "$warn" -gt 0 ]; then
    echo "Review warnings above before running setup."
  else
    echo "All checks passed. Ready to run: bash assets/setup.sh"
  fi
  # The exit status is 0 whatever the result; callers must read the summary.
  exit 0
fi
#######################################
# Echo a command line, then execute it unless dry-run mode is active.
# Globals:   DRY_RUN (read) - "true" or "false", executed as a command
# Arguments: the command and its arguments
# Outputs:   " -> <command line>" on stdout, plus the command's own output
# Returns:   0 in dry-run mode, otherwise the command's exit status
#######################################
run() {
  printf ' -> %s\n' "$*"
  if ! $DRY_RUN; then
    "$@"
  fi
}
#######################################
# Print a banner announcing a setup step.
# Arguments: $1 - banner title
# Outputs:   a blank line, a rule, the title prefixed by one space, a rule
#######################################
section() {
  local rule="============================================================"
  printf '\n%s\n %s\n%s\n' "$rule" "$1" "$rule"
}
# Show the resolved locations before doing any work.
printf '%s\n' "Workspace: $WORKSPACE"
printf '%s\n' "Repo: $REPO_DIR"

# ---------------------------------------------------------------
section "Step 1: System tools (uv, Ollama)"
# ---------------------------------------------------------------
# uv: reuse an existing install, otherwise run the upstream installer and put
# ~/.local/bin on PATH for the rest of this script.
if command -v uv &>/dev/null; then
  echo "uv already installed: $(uv --version)"
else
  echo "Installing uv..."
  run bash -c 'curl -LsSf https://astral.sh/uv/install.sh | sh'
  export PATH="$HOME/.local/bin:$PATH"
fi

# Ollama: reuse an existing install, otherwise run the upstream installer.
if command -v ollama &>/dev/null; then
  echo "Ollama already installed: $(ollama --version)"
else
  echo "Installing Ollama..."
  run bash -c 'curl -fsSL https://ollama.com/install.sh | sh'
fi
# ---------------------------------------------------------------
section "Step 2: Python environment (uv)"
# ---------------------------------------------------------------
cd "$REPO_DIR"
if [ -d ".venv" ]; then
  echo "Project venv already exists. Syncing dependencies..."
else
  echo "Creating project venv and installing all dependencies..."
fi
run uv sync --inexact
echo ""

if $DRY_RUN; then
  # `uv run` creates/syncs the project environment before running the command,
  # and the W&B step can prompt; neither belongs in a preview.
  echo "Dry run: skipping package verification and W&B login ('uv run' would create or sync the venv)."
else
  # Import checks are informational: a failure prints a warning and setup continues.
  echo "Verifying key packages..."
  CUDA_VISIBLE_DEVICES=0 uv run python -c "import torch; print(f' PyTorch {torch.__version__}, CUDA: {torch.version.cuda}, GPU available: {torch.cuda.is_available()}')" || echo " WARNING: PyTorch import failed"
  uv run python -c "import faiss; print(f' FAISS installed')" || echo " WARNING: faiss not found"
  uv run python -c "import transformers; print(f' transformers {transformers.__version__}')" || echo " WARNING: transformers import failed"
  CUDA_VISIBLE_DEVICES=0 uv run python -c "import deepspeed; print(f' deepspeed {deepspeed.__version__}')" || echo " WARNING: deepspeed import failed"
  uv run python -c "import peft; print(f' peft {peft.__version__}')" || echo " WARNING: peft import failed"
  echo ""

  # W&B login is optional; only prompt when stdin is a terminal.
  echo "Optional W&B login..."
  if uv run wandb status 2>/dev/null | grep -q "Logged in"; then
    echo " W&B already logged in."
  elif [ -t 0 ]; then
    read -r -p " Log in to Weights & Biases now? [y/N] " wandb_confirm
    if [[ "$wandb_confirm" =~ ^[Yy]$ ]]; then
      run uv run wandb login
    else
      echo " Skipping W&B login. Run 'uv run wandb login' later to enable training dashboards."
    fi
  else
    echo " Non-interactive shell; skipping W&B login prompt."
    echo " Run 'uv run wandb login' later to enable training dashboards."
  fi
fi
# ---------------------------------------------------------------
section "Step 3: flash-attn (source build for target GPU)"
# ---------------------------------------------------------------
# flash-attn must be built from source to target specific GPU architectures.
# Pre-built wheels only cover older CUDA versions.
# Note: uv sync --inexact (Step 2) preserves the editable install, so
# re-runs skip this step entirely once flash-attn is built.
FLASH_ATTN_REPO="$HOME/dev/flash-attention"
FLASH_ATTN_PY="$REPO_DIR/.venv/bin/python"
FLASH_ATTN_EDITABLE_WHEEL="${FLASH_ATTN_EDITABLE_WHEEL:-}"

#######################################
# Convert compute capabilities to flash-attn family targets.
# Arguments: $1 - semicolon-separated compute caps, e.g. "9.0;10.3"
# Outputs:   semicolon-separated major*10 values, e.g. "100;90"; nothing when
#            $1 holds no numeric entry (so a caller's default can apply)
#######################################
cc_to_flash_attn_archs() {
  printf '%s\n' "$1" \
    | tr ';' '\n' \
    | awk -F. '$1 ~ /^[0-9]+$/ { print $1 * 10 }' \
    | sort -u \
    | tr '\n' ';' \
    | sed 's/;$//'
}

if uv run python -c "import flash_attn" 2>/dev/null; then
  echo "flash-attn already installed: $(uv run python -c 'import flash_attn; print(flash_attn.__version__)')"
else
  # Fast path: a compiled extension from an earlier build in $FLASH_ATTN_REPO.
  # The file name is specific to CPython 3.13 on aarch64.
  if [ -f "$FLASH_ATTN_REPO/flash_attn_2_cuda.cpython-313-aarch64-linux-gnu.so" ]; then
    echo "Found existing flash-attn source build at $FLASH_ATTN_REPO"
    if [ -z "$FLASH_ATTN_EDITABLE_WHEEL" ]; then
      FLASH_ATTN_EDITABLE_WHEEL=$(find "$HOME/.cache/uv/sdists-v9/editable" \
        -path "*/flash_attn-2.8.4-0.editable-cp313-cp313-linux_aarch64.whl" \
        -print -quit 2>/dev/null || true)
    fi
    if [ -n "$FLASH_ATTN_EDITABLE_WHEEL" ] && [ -f "$FLASH_ATTN_EDITABLE_WHEEL" ]; then
      echo " Installing editable flash-attn wheel without rebuilding:"
      echo " $FLASH_ATTN_EDITABLE_WHEEL"
      run uv pip install --python "$FLASH_ATTN_PY" "$FLASH_ATTN_EDITABLE_WHEEL"
    else
      echo " Existing compiled extension found, but no editable wheel was found in uv cache."
    fi
  fi
  if uv run python -c "import flash_attn, flash_attn_2_cuda; print(f' flash-attn {flash_attn.__version__} from {flash_attn.__file__}')" 2>/dev/null; then
    echo " Reused existing flash-attn source build."
  else
    # Full source compilation required (~20 min first time)
    # Detect CUDA toolkit — prefer 13.1, fall back to whatever /usr/local/cuda points to
    if [ -d "/usr/local/cuda-13.1" ]; then
      export CUDA_HOME="/usr/local/cuda-13.1"
    elif [ -d "/usr/local/cuda" ]; then
      export CUDA_HOME="/usr/local/cuda"
    fi
    echo " CUDA_HOME: ${CUDA_HOME:-not set}"
    # Target GPU architectures — auto-detect from nvidia-smi so we only compile
    # kernels for GPUs actually present. Two env vars matter:
    # - TORCH_CUDA_ARCH_LIST: dot-notation (e.g. "10.3"), used by torch's C++
    #   extension builder. Accepts the raw compute_cap string.
    # - FLASH_ATTN_CUDA_ARCHS: flash-attn's own var (setup.py:72). Semicolon-
    #   separated family integers (80;90;100;110;120). Ignores
    #   TORCH_CUDA_ARCH_LIST entirely, so it must be set separately.
    #   Mapping: "10.3" -> major=10 -> "100" (the sm_100f family, forward-
    #   compatible with sm_101/102/103).
    # A missing or failing nvidia-smi yields an empty list (instead of aborting
    # through pipefail), so the defaults below take effect.
    detected_cc=$(
      nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null \
        | awk '{ gsub(/[[:space:]]/, "") } /^[0-9]+\.[0-9]+$/' \
        | sort -u \
        | tr '\n' ';' \
        | sed 's/;$//'
    ) || detected_cc=""
    if [ -z "${TORCH_CUDA_ARCH_LIST:-}" ]; then
      export TORCH_CUDA_ARCH_LIST="${detected_cc:-10.3}"
    fi
    if [ -z "${FLASH_ATTN_CUDA_ARCHS:-}" ]; then
      fa_archs="$(cc_to_flash_attn_archs "$detected_cc")"
      export FLASH_ATTN_CUDA_ARCHS="${fa_archs:-100}"
    fi
    echo " Detected compute_cap: ${detected_cc:-unknown}"
    echo " TORCH_CUDA_ARCH_LIST: $TORCH_CUDA_ARCH_LIST (for torch extensions)"
    echo " FLASH_ATTN_CUDA_ARCHS: $FLASH_ATTN_CUDA_ARCHS (flash-attn family targets)"
    if [ -d "$FLASH_ATTN_REPO" ]; then
      echo " Building flash-attn from local repo: $FLASH_ATTN_REPO"
      echo " First-time build compiles CUDA kernels from source using all CPU cores."
      echo " This may take up to 30 minutes. Subsequent runs are near-instant."
      run git -C "$FLASH_ATTN_REPO" pull
    else
      echo " Cloning flash-attention repo and building from source..."
      echo " First-time build compiles CUDA kernels from source using all CPU cores."
      echo " This may take up to 30 minutes. Subsequent runs are near-instant."
      run mkdir -p "$HOME/dev"
      run git clone https://github.com/Dao-AILab/flash-attention.git "$FLASH_ATTN_REPO"
    fi
    # In dry-run mode the clone was only printed, so the checkout may not exist;
    # entering it unconditionally would abort the preview.
    if ! $DRY_RUN; then
      cd "$FLASH_ATTN_REPO"
    fi
    # Install into the project venv (--python points uv to the right env)
    run uv pip install --python "$FLASH_ATTN_PY" -e . --no-build-isolation
    cd "$REPO_DIR"
    # Verify (nothing was built in dry-run mode, so there is nothing to check)
    if ! $DRY_RUN; then
      if uv run python -c "import flash_attn; print(f' flash-attn {flash_attn.__version__}')" 2>/dev/null; then
        echo " flash-attn installed successfully."
      else
        echo " WARNING: flash-attn build failed. Training may still work without it (slower attention)."
      fi
    fi
  fi
fi
# ---------------------------------------------------------------
section "Step 4: Clone and patch HLLM"
# ---------------------------------------------------------------
if [ ! -d "$HLLM_CODE_DIR/REC" ]; then
  echo "Cloning HLLM from github.com/bytedance/HLLM..."
  run mkdir -p "$WORKSPACE"
  UPSTREAM_TMP="$WORKSPACE/.hllm-upstream"
  # A leftover temp clone from an interrupted run would make `git clone` fail.
  run rm -rf "${UPSTREAM_TMP:?}"
  run git clone https://github.com/bytedance/HLLM.git "$UPSTREAM_TMP"
  # Copy the *contents* of code/ so the result is the same whether or not
  # $HLLM_CODE_DIR already exists (a plain `cp -r code DEST` would nest it
  # as DEST/code when DEST exists).
  run mkdir -p "$HLLM_CODE_DIR"
  run cp -r "$UPSTREAM_TMP/code/." "$HLLM_CODE_DIR/"
  run rm -rf "${UPSTREAM_TMP:?}"
else
  echo "HLLM code already present at $HLLM_CODE_DIR/"
fi

# Apply LoRA patches (overwrites upstream files with patched versions)
PATCHES_DIR="$REPO_DIR/assets/patches/HLLM"
if [ -d "$PATCHES_DIR" ]; then
  echo " Applying LoRA patches from $PATCHES_DIR ..."
  # "<patch file>:<destination relative to $HLLM_CODE_DIR>"
  hllm_patch_map=(
    "hllm.py:REC/model/HLLM/hllm.py"
    "modeling_bert.py:REC/model/HLLM/modeling_bert.py"
    "trainer.py:REC/trainer/trainer.py"
    "wandblogger.py:REC/utils/wandblogger.py"
    "argument_list.py:REC/utils/argument_list.py"
    "utils.py:REC/data/utils.py"
    "dataload.py:REC/data/dataload.py"
  )
  for patch_entry in "${hllm_patch_map[@]}"; do
    run cp "$PATCHES_DIR/${patch_entry%%:*}" "$HLLM_CODE_DIR/${patch_entry#*:}"
  done
  echo " LoRA patches applied."
fi

# Dataset + information dirs live alongside the HLLM code (HLLM config reads from them)
run mkdir -p "$HLLM_CODE_DIR/dataset" "$HLLM_CODE_DIR/information" "$CHECKPOINTS_DIR"

# Copy training/extraction scripts from the repo into hllm-code/ for HLLM to find
if [ -f "$REPO_DIR/assets/train_retriever.sh" ]; then
  run cp "$REPO_DIR/assets/train_retriever.sh" "$HLLM_CODE_DIR/train_lora.sh"
  run chmod +x "$HLLM_CODE_DIR/train_lora.sh"
fi
if [ -f "$REPO_DIR/assets/extract_embeddings.py" ]; then
  run cp "$REPO_DIR/assets/extract_embeddings.py" "$HLLM_CODE_DIR/extract_embeddings.py"
fi

# Create extract_embeddings.sh wrapper inside hllm-code/.
# Unescaped variables are expanded now and baked into the wrapper as literal
# paths; \$-escaped ones are evaluated when the wrapper runs.
# Dry-run only announces the write: the target directory may not exist and
# nothing may be modified.
EXTRACT_WRAPPER="$HLLM_CODE_DIR/extract_embeddings.sh"
if $DRY_RUN; then
  echo " -> write $EXTRACT_WRAPPER"
else
  cat > "$EXTRACT_WRAPPER" << SCRIPT
#!/bin/bash
set -e
cd "$HLLM_CODE_DIR"
unset MPLBACKEND
CKPT_PATH="\${1:-$CHECKPOINTS_DIR/dresses_lora_r16/HLLM-0.pth/checkpoint/mp_rank_00_model_states.pt}"
OUTPUT_DIR="\${2:-$DATA_DIR/processed}"
MASTER_PORT="\${PLAYBOOK_EXTRACT_MASTER_PORT:-12399}"
[ "\$#" -ge 1 ] && shift
[ "\$#" -ge 1 ] && shift
exec uv run --project "$REPO_DIR" torchrun --master_port="\$MASTER_PORT" --nproc_per_node=1 --nnodes=1 extract_embeddings.py \\
--ckpt_path "\$CKPT_PATH" \\
--output_dir "\$OUTPUT_DIR" \\
"\$@"
SCRIPT
  chmod +x "$EXTRACT_WRAPPER"
fi
# ---------------------------------------------------------------
section "Step 5: Download models"
# ---------------------------------------------------------------
# TinyLlama-1.1B (backbone for HLLM retriever training)
# If the download fails with 401/403, log in first: uv run hf login
if [ -f "$MODELS_DIR/TinyLlama-1.1B/config.json" ]; then
  echo "TinyLlama-1.1B already present at $MODELS_DIR/TinyLlama-1.1B/"
else
  echo "Downloading TinyLlama-1.1B from HuggingFace (~2 GB)..."
  echo " If this fails with a 401/403 error, run: uv run hf login"
  run mkdir -p "$MODELS_DIR"
  run uv run hf download \
    TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
    --local-dir "$MODELS_DIR/TinyLlama-1.1B"
fi

# Nemotron Mini via Ollama
# Capture the listing first: piping straight into `grep -q` under pipefail can
# report "not found" when grep exits before `ollama list` finishes writing.
ollama_models="$(ollama list 2>/dev/null || true)"
if grep -q "nemotron-mini" <<<"$ollama_models"; then
  echo "nemotron-mini already pulled in Ollama."
else
  echo "Pulling nemotron-mini (2.7 GB)..."
  # `ollama pull` is a client call to the local server. If the API is not
  # answering yet (no service was started by the installer), start the server
  # and give it a few seconds to come up before pulling.
  if ! $DRY_RUN && ! curl -s http://localhost:11434/api/tags &>/dev/null; then
    echo " Ollama server is not running; starting it for the pull..."
    ollama serve >/dev/null 2>&1 &
    for (( ollama_wait = 0; ollama_wait < 15; ollama_wait++ )); do
      if curl -s http://localhost:11434/api/tags &>/dev/null; then
        break
      fi
      sleep 1
    done
  fi
  run ollama pull nemotron-mini
fi
# ---------------------------------------------------------------
section "Step 6: Download and process Amazon data"
# ---------------------------------------------------------------
REVIEWS_FILE="$DATA_DIR/raw/raw/review_categories/Clothing_Shoes_and_Jewelry.jsonl"
META_FILE="$DATA_DIR/raw/raw/meta_categories/meta_Clothing_Shoes_and_Jewelry.jsonl"
run mkdir -p "$DATA_DIR/raw/raw/review_categories" "$DATA_DIR/raw/raw/meta_categories" "$DATA_DIR/processed"
# Dataset moved from datarepo.eng.ucsd.edu to HuggingFace (plain JSONL, not gzipped)
HF_DATASET_BASE="https://huggingface.co/datasets/McAuley-Lab/Amazon-Reviews-2023/resolve/main"

#######################################
# Download a file so that the final path only ever holds a complete download.
# Data is fetched (resumably, via wget -c) into "<dest>.part" and renamed on
# success, so an interrupted run is resumed instead of being mistaken for a
# finished download by the "-f" checks below.
# Arguments: $1 - source URL
#            $2 - final destination path
#######################################
fetch_dataset_file() {
  local url=$1
  local dest=$2
  run wget -c "$url" -O "$dest.part"
  run mv "$dest.part" "$dest"
}

if [ -f "$REVIEWS_FILE" ]; then
  echo "Reviews data already present ($(du -h "$REVIEWS_FILE" | cut -f1))"
else
  echo "Downloading Amazon Reviews — Clothing, Shoes & Jewelry (~27.8 GB)..."
  fetch_dataset_file \
    "$HF_DATASET_BASE/raw/review_categories/Clothing_Shoes_and_Jewelry.jsonl" \
    "$REVIEWS_FILE"
fi
if [ -f "$META_FILE" ]; then
  echo "Metadata already present ($(du -h "$META_FILE" | cut -f1))"
else
  echo "Downloading Amazon Metadata — Clothing, Shoes & Jewelry (~18 GB)..."
  fetch_dataset_file \
    "$HF_DATASET_BASE/raw/meta_categories/meta_Clothing_Shoes_and_Jewelry.jsonl" \
    "$META_FILE"
fi

# Process into HLLM format
# NOTE(review): prepare_data.py is invoked without arguments; presumably it
# locates the data via PLAYBOOK_WORKSPACE or its own defaults — confirm that it
# honours a workspace given only through --workspace.
if [ -f "$DATA_DIR/processed/dress_metadata.parquet" ] && [ -f "$HLLM_CODE_DIR/dataset/amazon_dresses.csv" ]; then
  echo "Processed data already exists. Skipping processing."
else
  echo "Processing Amazon data into HLLM format..."
  cd "$REPO_DIR"
  run uv run python assets/prepare_data.py
fi
# ---------------------------------------------------------------
section "Step 7: Start Ollama"
# ---------------------------------------------------------------
# The server counts as running when its HTTP API answers on the default port.
if curl -s http://localhost:11434/api/tags &>/dev/null; then
  echo "Ollama is already running."
  ollama list
elif $DRY_RUN; then
  # Preview only: print the command, do not wait for a server that was never started.
  echo "Starting Ollama..."
  run ollama serve
else
  echo "Starting Ollama..."
  run ollama serve &
  # Poll for up to ~15 s instead of a single fixed sleep, so a slow start does
  # not produce a false warning and a fast start does not wait needlessly.
  ollama_started=false
  for (( ollama_try = 0; ollama_try < 15; ollama_try++ )); do
    if curl -s http://localhost:11434/api/tags &>/dev/null; then
      ollama_started=true
      break
    fi
    sleep 1
  done
  if $ollama_started; then
    echo "Ollama started successfully."
    ollama list
  else
    echo "WARNING: Ollama may not have started. Run 'ollama serve' manually."
  fi
fi
# ---------------------------------------------------------------
section "Setup complete!"
# ---------------------------------------------------------------
# Final summary: resolved locations and the commands to run next.
# The here-document expands $WORKSPACE and $REPO_DIR; everything else is literal.
cat <<SUMMARY

Workspace: $WORKSPACE
Repo: $REPO_DIR

Next steps:
 cd $REPO_DIR
 bash assets/train_retriever.sh
 uv run python assets/extract_embeddings.py

Expected runtime: ~20 min (with training at bs=512) or ~5 min (with pre-computed embeddings)

SUMMARY