dgx-spark-playbooks/nvidia/station-rec-sys/assets/setup.sh

#!/usr/bin/env bash
# setup.sh — Automated setup for the standalone recommender playbook.
#
# Installs all dependencies, downloads models and data, and prepares the
# environment to run the E2E HLLM recommendation pipeline.
# Idempotent: safe to re-run if interrupted — skips completed steps.
#
# USAGE EXAMPLES:
#
#   # Default setup — stores data/models under $HOME
#   bash setup.sh
#
#   # Preview what would be done without executing
#   bash setup.sh --dry-run
#
#   # Verify prerequisites only
#   bash setup.sh --check
#
#   # Store large files on a different mount (SSD, NVMe, RAID array, etc.)
#   bash setup.sh --workspace /raid/recsys-playbook
#   bash setup.sh --workspace /mnt/nvme/recsys-playbook
#   bash setup.sh --workspace /data/recsys-playbook
#   bash setup.sh --data-dir /raid/recsys-playbook
#
#   # Use environment variable instead of CLI flag
#   export PLAYBOOK_WORKSPACE=/raid
#   bash setup.sh
#
#   # Show all options
#   bash setup.sh --help
#
# WHAT IT INSTALLS (~80 GB total; $WORKSPACE below is the --workspace / $PLAYBOOK_WORKSPACE root):
#   - uv (Python project manager)        — ~/.local/bin/
#   - Ollama (LLM serving)               — /usr/local/bin/ (requires sudo)
#   - Nemotron Mini 4B (Ollama)          — ~/.ollama/models/ (~3 GB)
#   - Python deps (uv venv)              — this repo's .venv/
#   - HLLM code (ByteDance)              — $WORKSPACE/hllm-code/ (~1 GB)
#   - TinyLlama-1.1B model               — $WORKSPACE/models/ (~2 GB, HF cache stays at default)
#   - Amazon Clothing dataset            — $WORKSPACE/data/ (~46 GB)
#   - Training checkpoints               — $WORKSPACE/checkpoints/ (~20–30 GB after training)
#
# REQUIRES: NVIDIA GPU with drivers installed, internet access.
# SUDO: Required only for Ollama install (can be pre-installed by an admin).

set -euo pipefail

DRY_RUN=false
CHECK_ONLY=false
WORKSPACE="${PLAYBOOK_WORKSPACE:-$HOME}"

#######################################
# Print the --help text.
# Outputs: usage text on stdout
#######################################
print_usage() {
    cat <<'HELPEOF'
Usage: bash setup.sh [OPTIONS]

Options:
  (no flags)        Run full setup
  --dry-run         Print commands without executing
  --check           Validate prerequisites only
  --workspace PATH  Root directory for all playbook artifacts
                    (default: $HOME, or $PLAYBOOK_WORKSPACE if set)
                    Example: bash setup.sh --workspace /raid/recsys-playbook
  --data-dir PATH   Alias for --workspace
  -h, --help        Show this help

Environment:
  PLAYBOOK_WORKSPACE   Same as --workspace (CLI flag takes precedence)

What gets created under WORKSPACE (~80 GB total):
  WORKSPACE/
  ├── station-rec-sys/          The playbook repo + uv venv
  ├── hllm-code/                ByteDance HLLM (~1 GB)
  ├── data/                     Amazon dataset — raw + processed (~46 GB)
  ├── models/                   TinyLlama-1.1B (~2 GB)
  └── checkpoints/              HLLM training output (~20–30 GB after training)

HuggingFace and Ollama caches stay at their default locations (~/.cache/huggingface,
~/.ollama) so they can be shared with other projects on the machine.
HELPEOF
}

#######################################
# Turn a path into an absolute one without requiring it to exist.
# Arguments: $1 - path (may be empty, absolute, or relative)
# Outputs:   the path unchanged if empty/absolute, otherwise prefixed
#            with the current working directory
#######################################
absolute_path() {
    case "${1:-}" in
        "" | /*) printf '%s\n' "${1:-}" ;;
        *)       printf '%s\n' "$PWD/$1" ;;
    esac
}

#######################################
# Parse command-line options.
# Globals:   DRY_RUN, CHECK_ONLY, WORKSPACE (written)
# Arguments: the script's "$@"
# Outputs:   help text on stdout for -h/--help; error messages on stderr
# Returns:   0 on success. Exits 0 after --help and 1 on an unknown option
#            or on a --workspace/--data-dir flag that has no usable value.
#######################################
parse_args() {
    local opt value
    while (( $# > 0 )); do
        opt="$1"
        shift
        case "$opt" in
            --dry-run)
                DRY_RUN=true
                ;;
            --check)
                CHECK_ONLY=true
                ;;
            --workspace=* | --data-dir=*)
                value="${opt#*=}"
                if [[ -z "$value" ]]; then
                    echo "Option ${opt%%=*} requires a non-empty PATH (run --help for usage)" >&2
                    exit 1
                fi
                WORKSPACE="$value"
                ;;
            --workspace | --data-dir)
                # Consume the value here; a missing value (or another flag in
                # its place) is an error instead of being silently ignored.
                if (( $# == 0 )) || [[ -z "$1" || "$1" == -* ]]; then
                    echo "Option $opt requires a PATH argument (run --help for usage)" >&2
                    exit 1
                fi
                WORKSPACE="$1"
                shift
                ;;
            -h | --help)
                print_usage
                exit 0
                ;;
            *)
                echo "Unknown option: $opt (run --help for usage)" >&2
                exit 1
                ;;
        esac
    done
    return 0
}

parse_args "$@"

# Strip one trailing slash, then anchor relative paths to the invocation
# directory: later steps cd into other directories, so a relative workspace
# would otherwise point at a different location in each step.
WORKSPACE="$(absolute_path "${WORKSPACE%/}")"
REPO_DIR="$(cd "$(dirname "$0")/.." && pwd)"
HLLM_CODE_DIR="$WORKSPACE/hllm-code"
DATA_DIR="$WORKSPACE/data"
MODELS_DIR="$WORKSPACE/models"
CHECKPOINTS_DIR="$WORKSPACE/checkpoints"

#######################################
# Print the closest existing ancestor of a path (the path itself if it
# exists). df(1) fails on paths that do not exist, and the workspace is
# normally created only later by the setup steps.
# Arguments: $1 - path to probe (empty is treated as "/")
# Outputs:   an existing path on stdout ("/" or "." in the worst case)
#######################################
nearest_existing_dir() {
    local path="${1:-/}"
    while [[ ! -e "$path" && "$path" != "/" && "$path" != "." ]]; do
        case "$path" in
            */*)
                path="${path%/*}"
                [[ -n "$path" ]] || path="/"
                ;;
            *)
                path="."
                ;;
        esac
    done
    printf '%s\n' "$path"
}

# --check mode: report prerequisites and exit without installing anything.
# Every probe below is guarded so that a failing tool produces a FAIL/WARN
# line instead of aborting the script through `set -e` / `pipefail`.
if [[ "${CHECK_ONLY:-false}" == true ]]; then
    echo "Checking pre-requisites..."
    echo ""

    GREEN='\033[0;32m'
    YELLOW='\033[0;33m'
    RED='\033[0;31m'
    NC='\033[0m'

    ok=0
    warn=0
    fail=0

    # GPU / driver. The full output is captured and the first line is taken
    # with parameter expansion, so no `head` can close the pipe early.
    if ! command -v nvidia-smi &>/dev/null; then
        echo -e "  ${RED}[FAIL]${NC} nvidia-smi not found — NVIDIA driver not installed"
        fail=$((fail + 1))
    elif gpu_names=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null) && [[ -n "$gpu_names" ]]; then
        gpu_name="${gpu_names%%$'\n'*}"
        gpu_mem_mib=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits 2>/dev/null) || gpu_mem_mib=""
        gpu_mem_mib="${gpu_mem_mib%%$'\n'*}"
        gpu_mem_mib="${gpu_mem_mib//[[:space:]]/}"
        # nvidia-smi prints a placeholder such as "[N/A]" for unsupported
        # fields; only do arithmetic on a plain integer.
        if [[ "$gpu_mem_mib" =~ ^[0-9]+$ ]]; then
            gpu_mem_desc="$(( 10#$gpu_mem_mib / 1024 )) GB"
        else
            gpu_mem_desc="memory size not reported"
        fi
        echo -e "  ${GREEN}[OK]${NC} GPU: $gpu_name ($gpu_mem_desc)"
        ok=$((ok + 1))
    else
        echo -e "  ${RED}[FAIL]${NC} nvidia-smi found but could not query a GPU — check the NVIDIA driver"
        fail=$((fail + 1))
    fi

    # CUDA toolkit (optional).
    if command -v nvcc &>/dev/null; then
        cuda_ver=$(nvcc --version 2>/dev/null | sed -n 's/.*release \([^,]*\).*/\1/p') || cuda_ver=""
        echo -e "  ${GREEN}[OK]${NC} CUDA: ${cuda_ver:-unknown version}"
        ok=$((ok + 1))
    else
        echo -e "  ${YELLOW}[WARN]${NC} nvcc not found — CUDA toolkit not on PATH (driver CUDA may still work)"
        warn=$((warn + 1))
    fi

    # Free disk space. Probe the nearest existing ancestor because the
    # workspace itself may not have been created yet; -P keeps each
    # filesystem on one output line so column 4 is always "Available".
    probe_dir=$(nearest_existing_dir "$WORKSPACE")
    avail_gb=$(df -P -BG "$probe_dir" 2>/dev/null | awk 'NR == 2 { sub(/G$/, "", $4); print $4 }') || avail_gb=""
    if [[ "$avail_gb" =~ ^[0-9]+$ ]] && (( 10#$avail_gb >= 80 )); then
        echo -e "  ${GREEN}[OK]${NC} Disk: ${avail_gb} GB available at $WORKSPACE (need ~80 GB)"
        ok=$((ok + 1))
    else
        echo -e "  ${YELLOW}[WARN]${NC} Disk: ${avail_gb:-?} GB available at $WORKSPACE (need ~80 GB)"
        echo "         Set PLAYBOOK_WORKSPACE to a path with more space, or use --workspace"
        warn=$((warn + 1))
    fi

    # Passwordless sudo (only the Ollama installer needs it).
    if command -v sudo &>/dev/null && sudo -n true 2>/dev/null; then
        echo -e "  ${GREEN}[OK]${NC} sudo: available (needed for Ollama install)"
        ok=$((ok + 1))
    else
        echo -e "  ${YELLOW}[WARN]${NC} sudo: not available or requires password"
        echo "         Ollama install needs sudo — ask an admin to pre-install if needed"
        warn=$((warn + 1))
    fi

    # Command-line tools used by the setup steps.
    for tool in git wget curl; do
        if command -v "$tool" &>/dev/null; then
            echo -e "  ${GREEN}[OK]${NC} $tool: $(command -v "$tool")"
            ok=$((ok + 1))
        else
            echo -e "  ${RED}[FAIL]${NC} $tool: not found"
            fail=$((fail + 1))
        fi
    done

    echo ""
    echo -e "Result: ${GREEN}$ok passed${NC}, ${YELLOW}$warn warnings${NC}, ${RED}$fail failed${NC}"
    if [ "$fail" -gt 0 ]; then
        echo "Fix failures above before running setup."
    elif [ "$warn" -gt 0 ]; then
        echo "Review warnings above before running setup."
    else
        echo "All checks passed. Ready to run: bash assets/setup.sh"
    fi
    exit 0
fi

#######################################
# Echo a command line, then execute it unless dry-run mode is active.
# Globals:   DRY_RUN (read) — executed as a command, so "true"/"false"
# Arguments: the command and its arguments
# Outputs:   "  -> <command line>" on stdout, plus the command's own output
# Returns:   0 in dry-run mode, otherwise the command's exit status
#######################################
run() {
    printf '  -> %s\n' "$*"
    if ! $DRY_RUN; then
        "$@"
    fi
}

#######################################
# Print a banner that visually separates setup steps.
# Arguments: $1 - section title
# Outputs:   a blank line, a rule, the indented title, and a closing rule
#######################################
section() {
    local rule="============================================================"
    printf '\n%s\n  %s\n%s\n' "$rule" "$1" "$rule"
}

echo "Workspace:    $WORKSPACE"
echo "Repo:         $REPO_DIR"

# ---------------------------------------------------------------
section "Step 1: System tools (uv, Ollama)"
# ---------------------------------------------------------------

# Both installers are fetched with curl and piped into sh. The inner shell
# runs with `-o pipefail` so that a failed download makes the step fail;
# without it, sh would read an empty script and report success.

if ! command -v uv &>/dev/null; then
    echo "Installing uv..."
    run bash -o pipefail -c 'curl -LsSf https://astral.sh/uv/install.sh | sh'
    export PATH="$HOME/.local/bin:$PATH"
    # Fail here with a clear message rather than later with "uv: command not found".
    if ! $DRY_RUN && ! command -v uv &>/dev/null; then
        echo "ERROR: uv was installed but is not on PATH (expected in $HOME/.local/bin)." >&2
        exit 1
    fi
else
    echo "uv already installed: $(uv --version)"
fi

if ! command -v ollama &>/dev/null; then
    echo "Installing Ollama..."
    run bash -o pipefail -c 'curl -fsSL https://ollama.com/install.sh | sh'
else
    echo "Ollama already installed: $(ollama --version)"
fi

# ---------------------------------------------------------------
section "Step 2: Python environment (uv)"
# ---------------------------------------------------------------

cd "$REPO_DIR"

# Only the status message depends on whether a venv already exists;
# `uv sync --inexact` runs in both cases.
if [[ ! -d ".venv" ]]; then
    echo "Creating project venv and installing all dependencies..."
else
    echo "Project venv already exists. Syncing dependencies..."
fi
run uv sync --inexact

echo ""
echo "Verifying key packages..."
# Each import probe is best-effort: a failure prints a warning and the
# script carries on.
if ! CUDA_VISIBLE_DEVICES=0 uv run python -c "import torch; print(f'  PyTorch {torch.__version__}, CUDA: {torch.version.cuda}, GPU available: {torch.cuda.is_available()}')"; then
    echo "  WARNING: PyTorch import failed"
fi
if ! uv run python -c "import faiss; print(f'  FAISS installed')"; then
    echo "  WARNING: faiss not found"
fi
if ! uv run python -c "import transformers; print(f'  transformers {transformers.__version__}')"; then
    echo "  WARNING: transformers import failed"
fi
if ! CUDA_VISIBLE_DEVICES=0 uv run python -c "import deepspeed; print(f'  deepspeed {deepspeed.__version__}')"; then
    echo "  WARNING: deepspeed import failed"
fi
if ! uv run python -c "import peft; print(f'  peft {peft.__version__}')"; then
    echo "  WARNING: peft import failed"
fi

echo ""
echo "Optional W&B login..."
# Three cases: already logged in, interactive terminal (ask), or
# non-interactive stdin (skip the prompt).
if uv run wandb status 2>/dev/null | grep -q "Logged in"; then
    echo "  W&B already logged in."
elif [[ ! -t 0 ]]; then
    echo "  Non-interactive shell; skipping W&B login prompt."
    echo "  Run 'uv run wandb login' later to enable training dashboards."
else
    read -r -p "  Log in to Weights & Biases now? [y/N] " wandb_answer
    case "$wandb_answer" in
        [Yy])
            run uv run wandb login
            ;;
        *)
            echo "  Skipping W&B login. Run 'uv run wandb login' later to enable training dashboards."
            ;;
    esac
fi

# ---------------------------------------------------------------
section "Step 3: flash-attn (source build for target GPU)"
# ---------------------------------------------------------------

# flash-attn must be built from source to target specific GPU architectures.
# Pre-built wheels only cover older CUDA versions.
# Note: uv sync --inexact (Step 2) preserves the editable install, so
# re-runs skip this step entirely once flash-attn is built.
FLASH_ATTN_REPO="$HOME/dev/flash-attention"
FLASH_ATTN_PY="$REPO_DIR/.venv/bin/python"
FLASH_ATTN_EDITABLE_WHEEL="${FLASH_ATTN_EDITABLE_WHEEL:-}"

#######################################
# Map compute capabilities to flash-attn family integers.
# Arguments: $1 - ';'-separated compute capabilities, e.g. "10.3;12.1"
# Outputs:   ';'-separated, de-duplicated families (major * 10), e.g.
#            "100;120"; nothing at all for empty or non-numeric input, so
#            the caller's default can apply
#######################################
flash_attn_archs_from_cc() {
    printf '%s\n' "${1:-}" \
        | tr ';' '\n' \
        | awk -F. '$1 ~ /^[0-9]+$/ { print $1 * 10 }' \
        | sort -n -u \
        | tr '\n' ';' \
        | sed 's/;$//'
}

if uv run python -c "import flash_attn" 2>/dev/null; then
    echo "flash-attn already installed: $(uv run python -c 'import flash_attn; print(flash_attn.__version__)')"
else
    # Fast path: a previously compiled extension exists in the local checkout;
    # try to re-install its editable wheel from the uv cache instead of
    # recompiling.
    if [ -f "$FLASH_ATTN_REPO/flash_attn_2_cuda.cpython-313-aarch64-linux-gnu.so" ]; then
        echo "Found existing flash-attn source build at $FLASH_ATTN_REPO"
        if [ -z "$FLASH_ATTN_EDITABLE_WHEEL" ]; then
            FLASH_ATTN_EDITABLE_WHEEL=$(find "$HOME/.cache/uv/sdists-v9/editable" \
                -path "*/flash_attn-2.8.4-0.editable-cp313-cp313-linux_aarch64.whl" \
                -print -quit 2>/dev/null || true)
        fi
        if [ -n "$FLASH_ATTN_EDITABLE_WHEEL" ] && [ -f "$FLASH_ATTN_EDITABLE_WHEEL" ]; then
            echo "  Installing editable flash-attn wheel without rebuilding:"
            echo "  $FLASH_ATTN_EDITABLE_WHEEL"
            run uv pip install --python "$FLASH_ATTN_PY" "$FLASH_ATTN_EDITABLE_WHEEL"
        else
            echo "  Existing compiled extension found, but no editable wheel was found in uv cache."
        fi
    fi

    if uv run python -c "import flash_attn, flash_attn_2_cuda; print(f'  flash-attn {flash_attn.__version__} from {flash_attn.__file__}')" 2>/dev/null; then
        echo "  Reused existing flash-attn source build."
    else
        # Full source compilation required (~20 min first time)
        # Detect CUDA toolkit — prefer 13.1, fall back to whatever /usr/local/cuda points to
        if [ -d "/usr/local/cuda-13.1" ]; then
            export CUDA_HOME="/usr/local/cuda-13.1"
        elif [ -d "/usr/local/cuda" ]; then
            export CUDA_HOME="/usr/local/cuda"
        fi
        echo "  CUDA_HOME: ${CUDA_HOME:-not set}"

        # Target GPU architectures — auto-detect from nvidia-smi so we only compile
        # kernels for GPUs actually present. Two env vars matter:
        #   - TORCH_CUDA_ARCH_LIST: dot-notation (e.g. "10.3"), used by torch's C++
        #     extension builder. Accepts the raw compute_cap string.
        #   - FLASH_ATTN_CUDA_ARCHS: flash-attn's own var (setup.py:72). Semicolon-
        #     separated family integers (80;90;100;110;120). Ignores
        #     TORCH_CUDA_ARCH_LIST entirely, so it must be set separately.
        # Mapping: "10.3" -> major=10 -> "100" (the sm_100f family, forward-
        # compatible with sm_101/102/103).
        #
        # `|| detected_cc=""`: under `set -e` + `pipefail` a missing or failing
        # nvidia-smi would otherwise abort the script before the defaults
        # below could be applied.
        detected_cc=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | sort -u | tr '\n' ';' | sed 's/;$//') || detected_cc=""
        if [ -z "${TORCH_CUDA_ARCH_LIST:-}" ]; then
            export TORCH_CUDA_ARCH_LIST="${detected_cc:-10.3}"
        fi
        if [ -z "${FLASH_ATTN_CUDA_ARCHS:-}" ]; then
            fa_archs=$(flash_attn_archs_from_cc "$detected_cc") || fa_archs=""
            export FLASH_ATTN_CUDA_ARCHS="${fa_archs:-100}"
        fi
        echo "  Detected compute_cap:   ${detected_cc:-unknown}"
        echo "  TORCH_CUDA_ARCH_LIST:   $TORCH_CUDA_ARCH_LIST  (for torch extensions)"
        echo "  FLASH_ATTN_CUDA_ARCHS:  $FLASH_ATTN_CUDA_ARCHS  (flash-attn family targets)"

        if [ -d "$FLASH_ATTN_REPO" ]; then
            echo "  Building flash-attn from local repo: $FLASH_ATTN_REPO"
            echo "  First-time build compiles CUDA kernels from source using all CPU cores."
            echo "  This may take up to 30 minutes. Subsequent runs are near-instant."
            # Updating is best-effort: an offline machine or a locally
            # modified checkout can still be built as it is.
            run git -C "$FLASH_ATTN_REPO" pull \
                || echo "  WARNING: git pull failed; building the existing checkout as-is."
        else
            echo "  Cloning flash-attention repo and building from source..."
            echo "  First-time build compiles CUDA kernels from source using all CPU cores."
            echo "  This may take up to 30 minutes. Subsequent runs are near-instant."
            run mkdir -p "$HOME/dev"
            run git clone https://github.com/Dao-AILab/flash-attention.git "$FLASH_ATTN_REPO"
        fi

        # Install into the project venv (--python points uv to the right env).
        # The build runs in a subshell so the main script stays in its current
        # directory. In dry-run mode the checkout may not exist, so only the
        # command line is printed. A failed build is reported, not fatal: the
        # verification below prints the final verdict.
        if $DRY_RUN; then
            run uv pip install --python "$FLASH_ATTN_PY" -e . --no-build-isolation
        elif ! (cd "$FLASH_ATTN_REPO" && run uv pip install --python "$FLASH_ATTN_PY" -e . --no-build-isolation); then
            echo "  flash-attn build command exited with an error."
        fi

        # Verify
        if uv run python -c "import flash_attn; print(f'  flash-attn {flash_attn.__version__}')" 2>/dev/null; then
            echo "  flash-attn installed successfully."
        else
            echo "  WARNING: flash-attn build failed. Training may still work without it (slower attention)."
        fi
    fi
fi

# ---------------------------------------------------------------
section "Step 4: Clone and patch HLLM"
# ---------------------------------------------------------------

if [ ! -d "$HLLM_CODE_DIR/REC" ]; then
    echo "Cloning HLLM from github.com/bytedance/HLLM..."
    run mkdir -p "$WORKSPACE"
    UPSTREAM_TMP="$WORKSPACE/.hllm-upstream"
    # A checkout left behind by an interrupted run would make `git clone`
    # fail ("destination path already exists"), so clear it first.
    run rm -rf -- "$UPSTREAM_TMP"
    # Only the code/ subtree is kept, so history is not needed.
    run git clone --depth 1 https://github.com/bytedance/HLLM.git "$UPSTREAM_TMP"
    # Copy the *contents* of code/ so the result is the same whether or not
    # $HLLM_CODE_DIR already exists (a plain `cp -r src dst` would nest the
    # copy as dst/code/ when dst is already there).
    run mkdir -p "$HLLM_CODE_DIR"
    run cp -r "$UPSTREAM_TMP/code/." "$HLLM_CODE_DIR/"
    run rm -rf -- "$UPSTREAM_TMP"
else
    echo "HLLM code already present at $HLLM_CODE_DIR/"
fi

# Apply LoRA patches (overwrites upstream files with patched versions)
PATCHES_DIR="$REPO_DIR/assets/patches/HLLM"
if [ -d "$PATCHES_DIR" ]; then
    echo "  Applying LoRA patches from $PATCHES_DIR ..."
    # Each entry is "<patch file>:<destination relative to hllm-code/REC>".
    hllm_patch_map=(
        "hllm.py:model/HLLM/hllm.py"
        "modeling_bert.py:model/HLLM/modeling_bert.py"
        "trainer.py:trainer/trainer.py"
        "wandblogger.py:utils/wandblogger.py"
        "argument_list.py:utils/argument_list.py"
        "utils.py:data/utils.py"
        "dataload.py:data/dataload.py"
    )
    for patch_entry in "${hllm_patch_map[@]}"; do
        run cp "$PATCHES_DIR/${patch_entry%%:*}" "$HLLM_CODE_DIR/REC/${patch_entry#*:}"
    done
    echo "  LoRA patches applied."
fi

# Dataset + information dirs live alongside the HLLM code (HLLM config reads from them)
run mkdir -p "$HLLM_CODE_DIR/dataset" "$HLLM_CODE_DIR/information" "$CHECKPOINTS_DIR"

# Copy training/extraction scripts from the repo into hllm-code/ for HLLM to find
if [ -f "$REPO_DIR/assets/train_retriever.sh" ]; then
    run cp "$REPO_DIR/assets/train_retriever.sh" "$HLLM_CODE_DIR/train_lora.sh"
    run chmod +x "$HLLM_CODE_DIR/train_lora.sh"
fi
if [ -f "$REPO_DIR/assets/extract_embeddings.py" ]; then
    run cp "$REPO_DIR/assets/extract_embeddings.py" "$HLLM_CODE_DIR/extract_embeddings.py"
fi

# Create extract_embeddings.sh wrapper inside hllm-code/.
# The heredoc delimiter is unquoted on purpose: $HLLM_CODE_DIR, $CHECKPOINTS_DIR,
# $DATA_DIR and $REPO_DIR are baked in now, while the \$-escaped names are
# left for the generated script to expand when it runs.
# In dry-run mode nothing is written: the target directory may not exist,
# and a dry run must not touch the filesystem.
EXTRACT_WRAPPER="$HLLM_CODE_DIR/extract_embeddings.sh"
if $DRY_RUN; then
    echo "  -> write $EXTRACT_WRAPPER"
else
    cat > "$EXTRACT_WRAPPER" << SCRIPT
#!/bin/bash
set -e
cd "$HLLM_CODE_DIR"
unset MPLBACKEND
CKPT_PATH="\${1:-$CHECKPOINTS_DIR/dresses_lora_r16/HLLM-0.pth/checkpoint/mp_rank_00_model_states.pt}"
OUTPUT_DIR="\${2:-$DATA_DIR/processed}"
MASTER_PORT="\${PLAYBOOK_EXTRACT_MASTER_PORT:-12399}"
[ "\$#" -ge 1 ] && shift
[ "\$#" -ge 1 ] && shift
exec uv run --project "$REPO_DIR" torchrun --master_port="\$MASTER_PORT" --nproc_per_node=1 --nnodes=1 extract_embeddings.py \\
    --ckpt_path "\$CKPT_PATH" \\
    --output_dir "\$OUTPUT_DIR" \\
    "\$@"
SCRIPT
    chmod +x "$EXTRACT_WRAPPER"
fi

# ---------------------------------------------------------------
section "Step 5: Download models"
# ---------------------------------------------------------------

#######################################
# Decide whether a downloaded HuggingFace model directory looks complete.
# config.json alone is not enough: it is tiny and can be present after an
# interrupted download while the weight files are still missing.
# Arguments: $1 - model directory
# Returns:   0 if config.json and at least one *.safetensors or *.bin file
#            exist directly in the directory, 1 otherwise
#######################################
hf_model_present() {
    local dir="$1"
    [[ -f "$dir/config.json" ]] || return 1
    [[ -n "$(find "$dir" -maxdepth 1 \( -name '*.safetensors' -o -name '*.bin' \) -print -quit 2>/dev/null)" ]]
}

# TinyLlama-1.1B (backbone for HLLM retriever training)
# If the download fails with 401/403, log in first: uv run hf login
TINYLLAMA_DIR="$MODELS_DIR/TinyLlama-1.1B"
if hf_model_present "$TINYLLAMA_DIR"; then
    echo "TinyLlama-1.1B already present at $TINYLLAMA_DIR/"
else
    echo "Downloading TinyLlama-1.1B from HuggingFace (~2 GB)..."
    echo "  If this fails with a 401/403 error, run: uv run hf login"
    run mkdir -p "$MODELS_DIR"
    run uv run hf download \
        TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
        --local-dir "$TINYLLAMA_DIR"
fi

# Nemotron Mini via Ollama.
# `ollama list` and `ollama pull` talk to the Ollama server, so make sure it
# is reachable before using them; otherwise the pull fails and aborts setup.
OLLAMA_TAGS_URL="http://localhost:11434/api/tags"
if ! $DRY_RUN && command -v ollama &>/dev/null && ! curl -s "$OLLAMA_TAGS_URL" &>/dev/null; then
    echo "Ollama server is not reachable; starting it so the model can be pulled..."
    run ollama serve &
    # Wait up to ~30 s for the API to answer.
    for _ in {1..30}; do
        if curl -s "$OLLAMA_TAGS_URL" &>/dev/null; then
            break
        fi
        sleep 1
    done
fi

# Capture the list first instead of piping into `grep -q`: with `pipefail`,
# grep exiting early could make the pipeline look like a failure.
ollama_models=$(ollama list 2>/dev/null) || ollama_models=""
if grep -q "nemotron-mini" <<<"$ollama_models"; then
    echo "nemotron-mini already pulled in Ollama."
else
    echo "Pulling nemotron-mini (2.7 GB)..."
    run ollama pull nemotron-mini
fi

# ---------------------------------------------------------------
section "Step 6: Download and process Amazon data"
# ---------------------------------------------------------------

REVIEWS_FILE="$DATA_DIR/raw/raw/review_categories/Clothing_Shoes_and_Jewelry.jsonl"
META_FILE="$DATA_DIR/raw/raw/meta_categories/meta_Clothing_Shoes_and_Jewelry.jsonl"

run mkdir -p "$DATA_DIR/raw/raw/review_categories" "$DATA_DIR/raw/raw/meta_categories" "$DATA_DIR/processed"

# Dataset moved from datarepo.eng.ucsd.edu to HuggingFace (plain JSONL, not gzipped)
HF_DATASET_BASE="https://huggingface.co/datasets/McAuley-Lab/Amazon-Reviews-2023/resolve/main"

#######################################
# Download a URL to a destination so that the destination only ever exists
# as a complete file. Data goes to "<dest>.part" (resumed with `wget -c` on
# a re-run) and is renamed into place after wget succeeds. Writing straight
# to <dest> would leave a truncated file that the next run mistakes for a
# finished download.
# Arguments: $1 - source URL, $2 - destination file
# Returns:   0 on success, non-zero if the download or the rename failed
#######################################
download_resumable() {
    local url="$1" dest="$2"
    local partial="${dest}.part"
    run wget -c "$url" -O "$partial" || return 1
    run mv -f -- "$partial" "$dest" || return 1
}

#######################################
# Report an existing dataset file, or download it if it is missing.
# Arguments: $1 - label for the "already present" message
#            $2 - message printed before downloading
#            $3 - source URL
#            $4 - destination file
#######################################
fetch_if_missing() {
    local present_label="$1" download_msg="$2" url="$3" dest="$4"
    if [ -f "$dest" ]; then
        echo "$present_label already present ($(du -h "$dest" | cut -f1))"
    else
        echo "$download_msg"
        download_resumable "$url" "$dest"
    fi
}

fetch_if_missing "Reviews data" \
    "Downloading Amazon Reviews — Clothing, Shoes & Jewelry (~27.8 GB)..." \
    "$HF_DATASET_BASE/raw/review_categories/Clothing_Shoes_and_Jewelry.jsonl" \
    "$REVIEWS_FILE"

fetch_if_missing "Metadata" \
    "Downloading Amazon Metadata — Clothing, Shoes & Jewelry (~18 GB)..." \
    "$HF_DATASET_BASE/raw/meta_categories/meta_Clothing_Shoes_and_Jewelry.jsonl" \
    "$META_FILE"

# Process into HLLM format
if [ -f "$DATA_DIR/processed/dress_metadata.parquet" ] && [ -f "$HLLM_CODE_DIR/dataset/amazon_dresses.csv" ]; then
    echo "Processed data already exists. Skipping processing."
else
    echo "Processing Amazon data into HLLM format..."
    cd "$REPO_DIR"
    run uv run python assets/prepare_data.py
fi

# ---------------------------------------------------------------
section "Step 7: Start Ollama"
# ---------------------------------------------------------------

OLLAMA_TAGS_URL="http://localhost:11434/api/tags"
OLLAMA_START_TIMEOUT_S=30

if curl -s "$OLLAMA_TAGS_URL" &>/dev/null; then
    echo "Ollama is already running."
    ollama list
else
    echo "Starting Ollama..."
    run ollama serve &
    if $DRY_RUN; then
        # Nothing was started, so there is nothing to wait for or verify.
        echo "  (dry-run: not waiting for Ollama to come up)"
    else
        # Poll the API instead of sleeping a fixed time: start-up time varies,
        # and a fixed delay either wastes time or warns spuriously.
        ollama_up=false
        for (( waited = 0; waited < OLLAMA_START_TIMEOUT_S; waited++ )); do
            if curl -s "$OLLAMA_TAGS_URL" &>/dev/null; then
                ollama_up=true
                break
            fi
            sleep 1
        done
        if $ollama_up; then
            echo "Ollama started successfully."
            ollama list
        else
            echo "WARNING: Ollama may not have started. Run 'ollama serve' manually."
        fi
    fi
fi

# ---------------------------------------------------------------
section "Setup complete!"
# ---------------------------------------------------------------

# Closing summary: where things live and which commands to run next.
# The delimiter is unquoted so $WORKSPACE and $REPO_DIR are expanded.
cat <<SUMMARY

Workspace: $WORKSPACE
Repo:      $REPO_DIR

Next steps:
  cd $REPO_DIR
  bash assets/train_retriever.sh
  uv run python assets/extract_embeddings.py

Expected runtime: ~20 min (with training at bs=512) or ~5 min (with pre-computed embeddings)

SUMMARY