#!/usr/bin/env bash
# Mirror of https://github.com/NVIDIA/dgx-spark-playbooks.git
# (synced 2026-06-19 04:49:35 +00:00; 504 lines, 20 KiB, Bash, executable file)
#
# setup.sh — Automated setup for the standalone recommender playbook.
#
# Installs all dependencies, downloads models and data, and prepares the
# environment to run the E2E HLLM recommendation pipeline.
# Idempotent: safe to re-run if interrupted — skips completed steps.
#
# USAGE EXAMPLES:
#
#   # Default setup — stores data/models under $HOME
#   bash setup.sh
#
#   # Preview what would be done without executing
#   bash setup.sh --dry-run
#
#   # Verify prerequisites only
#   bash setup.sh --check
#
#   # Store large files on a different mount (SSD, NVMe, RAID array, etc.)
#   bash setup.sh --workspace /raid/recsys-playbook
#   bash setup.sh --workspace /mnt/nvme/recsys-playbook
#   bash setup.sh --workspace /data/recsys-playbook
#   bash setup.sh --data-dir /raid/recsys-playbook
#
#   # Use environment variable instead of CLI flag
#   export PLAYBOOK_WORKSPACE=/raid
#   bash setup.sh
#
#   # Show all options
#   bash setup.sh --help
#
# WHAT IT INSTALLS ($WORKSPACE below is the --workspace / $PLAYBOOK_WORKSPACE
# root; ~80 GB total):
#   - uv (Python project manager)  — ~/.local/bin/
#   - Ollama (LLM serving)         — /usr/local/bin/ (requires sudo)
#   - Nemotron Mini 4B (Ollama)    — ~/.ollama/models/ (~3 GB)
#   - Python deps (uv venv)        — this repo's .venv/
#   - HLLM code (ByteDance)        — $WORKSPACE/hllm-code/ (~1 GB)
#   - TinyLlama-1.1B model         — $WORKSPACE/models/ (~2 GB, HF cache stays at default)
#   - Amazon Clothing dataset      — $WORKSPACE/data/ (~46 GB)
#   - Training checkpoints         — $WORKSPACE/checkpoints/ (~20–30 GB after training)
#
# REQUIRES: NVIDIA GPU with drivers installed, internet access.
# SUDO: Required only for Ollama install (can be pre-installed by an admin).

set -euo pipefail

# Defaults; parse_args overrides them from the command line.
DRY_RUN=false
CHECK_ONLY=false
WORKSPACE="${PLAYBOOK_WORKSPACE:-$HOME}"

# Print the --help text to stdout.
usage() {
  cat <<'HELPEOF'
Usage: bash setup.sh [OPTIONS]

Options:
  (no flags)        Run full setup
  --dry-run         Print commands without executing
  --check           Validate prerequisites only
  --workspace PATH  Root directory for all playbook artifacts
                    (default: $HOME, or $PLAYBOOK_WORKSPACE if set)
                    Example: bash setup.sh --workspace /raid/recsys-playbook
  --data-dir PATH   Alias for --workspace
  -h, --help        Show this help

Environment:
  PLAYBOOK_WORKSPACE  Same as --workspace (CLI flag takes precedence)

What gets created under WORKSPACE (~80 GB total):
  WORKSPACE/
  ├── station-rec-sys/    The playbook repo + uv venv
  ├── hllm-code/          ByteDance HLLM (~1 GB)
  ├── data/               Amazon dataset — raw + processed (~46 GB)
  ├── models/             TinyLlama-1.1B (~2 GB)
  └── checkpoints/        HLLM training output (~20–30 GB after training)

HuggingFace and Ollama caches stay at their default locations (~/.cache/huggingface,
~/.ollama) so they can be shared with other projects on the machine.
HELPEOF
}

# parse_args ARGS...
# Sets DRY_RUN, CHECK_ONLY and WORKSPACE from the given command-line words.
# Accepts both "--workspace PATH" and "--workspace=PATH" (--data-dir is an
# alias). Exits 0 after printing help; exits 1 (message on stderr) for an
# unknown option or a PATH option that is missing its value.
parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      --dry-run)
        DRY_RUN=true
        ;;
      --check)
        CHECK_ONLY=true
        ;;
      --workspace=* | --data-dir=*)
        WORKSPACE="${1#*=}"
        ;;
      --workspace | --data-dir)
        # The value is the next word. Refuse to swallow another flag (or
        # nothing at all) so a forgotten PATH is reported instead of being
        # silently ignored.
        if (( $# < 2 )) || [[ "$2" == -* ]]; then
          echo "Option $1 requires a PATH argument (run --help for usage)" >&2
          exit 1
        fi
        WORKSPACE="$2"
        shift
        ;;
      -h | --help)
        usage
        exit 0
        ;;
      *)
        echo "Unknown option: $1 (run --help for usage)" >&2
        exit 1
        ;;
    esac
    shift
  done
}

parse_args "$@"

# normalize_workspace PATH
# Prints PATH with one trailing slash removed. A non-empty relative PATH is
# first anchored to the current directory: later steps `cd` elsewhere and the
# generated wrapper script embeds these paths, so a relative workspace would
# otherwise point at different places at different times. An empty PATH is
# printed unchanged.
normalize_workspace() {
  local path="$1"
  if [[ -n "$path" && "$path" != /* ]]; then
    path="$PWD/$path"
  fi
  printf '%s\n' "${path%/}"
}

WORKSPACE="$(normalize_workspace "$WORKSPACE")"

# The repo root is the parent of the directory that holds this script.
REPO_DIR="$(cd "$(dirname "$0")/.." && pwd)"

# Artifact locations, all rooted at the workspace.
HLLM_CODE_DIR="$WORKSPACE/hllm-code"
DATA_DIR="$WORKSPACE/data"
MODELS_DIR="$WORKSPACE/models"
CHECKPOINTS_DIR="$WORKSPACE/checkpoints"

# format_gpu_mem MIB
# Prints "<n> GB" for a numeric MiB count. nvidia-smi can report a
# non-numeric memory.total (for example "[N/A]"); feeding that to $(( ))
# would abort the script, so anything non-numeric yields a placeholder.
format_gpu_mem() {
  local mib="${1//[[:space:]]/}"
  if [[ "$mib" =~ ^[0-9]+$ ]]; then
    printf '%s GB\n' "$(( 10#$mib / 1024 ))"
  else
    printf 'memory size not reported\n'
  fi
}

# nearest_existing_dir PATH
# Prints PATH if it exists, otherwise its closest existing ancestor, so free
# space can be measured for a workspace that has not been created yet.
nearest_existing_dir() {
  local path="${1:-/}"
  local parent
  while [[ ! -e "$path" ]]; do
    parent="$(dirname -- "$path")"
    if [[ "$parent" == "$path" ]]; then
      break
    fi
    path="$parent"
  done
  printf '%s\n' "$path"
}

# run_prereq_check
# Reports GPU, CUDA toolkit, free disk space, sudo and required CLI tools,
# then prints a pass/warn/fail summary. Every probe is guarded so that a
# failing probe is reported instead of killing the script via
# `set -e` / `pipefail`. Reads WORKSPACE.
run_prereq_check() {
  local green='\033[0;32m'
  local yellow='\033[0;33m'
  local red='\033[0;31m'
  local nc='\033[0m'
  local ok=0 warn=0 fail=0
  local gpu_name gpu_mem_mib cuda_ver probe_dir avail_gb tool

  echo "Checking pre-requisites..."
  echo ""

  # --- GPU / driver ---
  if command -v nvidia-smi &>/dev/null; then
    gpu_name="$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | awk 'NR==1')" || gpu_name=""
    gpu_mem_mib="$(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits 2>/dev/null | awk 'NR==1')" || gpu_mem_mib=""
    if [[ -n "$gpu_name" ]]; then
      echo -e " ${green}[OK]${nc} GPU: $gpu_name ($(format_gpu_mem "$gpu_mem_mib"))"
      ok=$((ok + 1))
    else
      echo -e " ${red}[FAIL]${nc} nvidia-smi is installed but could not query a GPU"
      fail=$((fail + 1))
    fi
  else
    echo -e " ${red}[FAIL]${nc} nvidia-smi not found — NVIDIA driver not installed"
    fail=$((fail + 1))
  fi

  # --- CUDA toolkit (optional) ---
  if command -v nvcc &>/dev/null; then
    cuda_ver="$(nvcc --version 2>/dev/null | sed -n '/release/{s/.*release //;s/,.*//;p;}' | awk 'NR==1')" || cuda_ver=""
    echo -e " ${green}[OK]${nc} CUDA: ${cuda_ver:-unknown version}"
    ok=$((ok + 1))
  else
    echo -e " ${yellow}[WARN]${nc} nvcc not found — CUDA toolkit not on PATH (driver CUDA may still work)"
    warn=$((warn + 1))
  fi

  # --- Disk space, measured on the filesystem that will hold WORKSPACE ---
  probe_dir="$(nearest_existing_dir "$WORKSPACE")"
  avail_gb="$(df -P -BG -- "$probe_dir" 2>/dev/null | awk 'NR==2 {gsub(/[^0-9]/, "", $4); print $4}')" || avail_gb=""
  if [[ "$avail_gb" =~ ^[0-9]+$ ]] && (( 10#$avail_gb >= 80 )); then
    echo -e " ${green}[OK]${nc} Disk: ${avail_gb} GB available at $WORKSPACE (need ~80 GB)"
    ok=$((ok + 1))
  else
    echo -e " ${yellow}[WARN]${nc} Disk: ${avail_gb:-?} GB available at $WORKSPACE (need ~80 GB)"
    echo " Set PLAYBOOK_WORKSPACE to a path with more space, or use --workspace"
    warn=$((warn + 1))
  fi

  # --- sudo (only needed for the Ollama installer) ---
  if command -v sudo &>/dev/null && sudo -n true 2>/dev/null; then
    echo -e " ${green}[OK]${nc} sudo: available (needed for Ollama install)"
    ok=$((ok + 1))
  else
    echo -e " ${yellow}[WARN]${nc} sudo: not available or requires password"
    echo " Ollama install needs sudo — ask an admin to pre-install if needed"
    warn=$((warn + 1))
  fi

  # --- Required command-line tools ---
  for tool in git wget curl; do
    if command -v "$tool" &>/dev/null; then
      echo -e " ${green}[OK]${nc} $tool: $(command -v "$tool")"
      ok=$((ok + 1))
    else
      echo -e " ${red}[FAIL]${nc} $tool: not found"
      fail=$((fail + 1))
    fi
  done

  echo ""
  echo -e "Result: ${green}$ok passed${nc}, ${yellow}$warn warnings${nc}, ${red}$fail failed${nc}"
  if (( fail > 0 )); then
    echo "Fix failures above before running setup."
  elif (( warn > 0 )); then
    echo "Review warnings above before running setup."
  else
    echo "All checks passed. Ready to run: bash assets/setup.sh"
  fi
}

# --check: report prerequisites and stop before any setup step runs.
# The exit status is 0 regardless of the reported result.
if [[ "$CHECK_ONLY" == true ]]; then
  run_prereq_check
  exit 0
fi

# run CMD [ARGS...]
# Echoes the command line, then executes it unless DRY_RUN is "true".
# In dry-run mode only the echo happens and the status is 0; otherwise the
# status is that of CMD.
run() {
  echo " -> $*"
  if ! $DRY_RUN; then
    "$@"
  fi
}

# section TITLE
# Prints a blank line followed by TITLE framed between two rule lines.
section() {
  local rule="============================================================"
  printf '\n%s\n %s\n%s\n' "$rule" "$1" "$rule"
}

echo "Workspace: $WORKSPACE"
echo "Repo: $REPO_DIR"

# ---------------------------------------------------------------
section "Step 1: System tools (uv, Ollama)"
# ---------------------------------------------------------------

# This script puts ~/.local/bin on PATH after a fresh uv install. Adding it
# up front lets a re-run from a shell whose PATH lacks it find that uv
# instead of reinstalling.
case ":$PATH:" in
  *":$HOME/.local/bin:"*) ;;
  *) export PATH="$HOME/.local/bin:$PATH" ;;
esac

if ! command -v uv &>/dev/null; then
  echo "Installing uv..."
  # pipefail inside the child shell: without it a failed curl feeds an empty
  # script to sh and the step "succeeds" without installing anything.
  run bash -c 'set -o pipefail; curl -LsSf https://astral.sh/uv/install.sh | sh'
  if ! $DRY_RUN && ! command -v uv &>/dev/null; then
    echo "ERROR: uv was installed but is not on PATH (looked in $HOME/.local/bin)." >&2
    exit 1
  fi
else
  echo "uv already installed: $(uv --version)"
fi

if ! command -v ollama &>/dev/null; then
  echo "Installing Ollama..."
  run bash -c 'set -o pipefail; curl -fsSL https://ollama.com/install.sh | sh'
else
  echo "Ollama already installed: $(ollama --version)"
fi

# ---------------------------------------------------------------
section "Step 2: Python environment (uv)"
# ---------------------------------------------------------------

cd "$REPO_DIR"

if [ -d ".venv" ]; then
  echo "Project venv already exists. Syncing dependencies..."
else
  echo "Creating project venv and installing all dependencies..."
fi
run uv sync --inexact

echo ""
if $DRY_RUN; then
  # Nothing was installed in dry-run mode, so the import checks would only
  # report spurious failures and the login prompt would block a preview.
  echo "Dry run: skipping package verification and W&B login."
else
  echo "Verifying key packages..."
  # Each check is advisory: a failed import prints a warning and setup goes on.
  CUDA_VISIBLE_DEVICES=0 uv run python -c "import torch; print(f' PyTorch {torch.__version__}, CUDA: {torch.version.cuda}, GPU available: {torch.cuda.is_available()}')" || echo " WARNING: PyTorch import failed"
  uv run python -c "import faiss; print(f' FAISS installed')" || echo " WARNING: faiss not found"
  uv run python -c "import transformers; print(f' transformers {transformers.__version__}')" || echo " WARNING: transformers import failed"
  CUDA_VISIBLE_DEVICES=0 uv run python -c "import deepspeed; print(f' deepspeed {deepspeed.__version__}')" || echo " WARNING: deepspeed import failed"
  uv run python -c "import peft; print(f' peft {peft.__version__}')" || echo " WARNING: peft import failed"

  echo ""
  echo "Optional W&B login..."
  # Capture the status first: piping straight into `grep -q` under pipefail
  # can report failure when grep exits early and the producer gets SIGPIPE.
  wandb_status="$(uv run wandb status 2>/dev/null || true)"
  if grep -q "Logged in" <<<"$wandb_status"; then
    echo " W&B already logged in."
  elif [ -t 0 ]; then
    # EOF on the prompt counts as "no" rather than aborting setup.
    read -r -p " Log in to Weights & Biases now? [y/N] " wandb_confirm || wandb_confirm=""
    if [[ "$wandb_confirm" =~ ^[Yy]$ ]]; then
      run uv run wandb login
    else
      echo " Skipping W&B login. Run 'uv run wandb login' later to enable training dashboards."
    fi
  else
    echo " Non-interactive shell; skipping W&B login prompt."
    echo " Run 'uv run wandb login' later to enable training dashboards."
  fi
fi

# ---------------------------------------------------------------
section "Step 3: flash-attn (source build for target GPU)"
# ---------------------------------------------------------------

# flash-attn must be built from source to target specific GPU architectures.
# Pre-built wheels only cover older CUDA versions.
# Note: uv sync --inexact (Step 2) preserves the editable install, so
# re-runs skip this step entirely once flash-attn is built.
FLASH_ATTN_REPO="$HOME/dev/flash-attention"
FLASH_ATTN_PY="$REPO_DIR/.venv/bin/python"
FLASH_ATTN_EDITABLE_WHEEL="${FLASH_ATTN_EDITABLE_WHEEL:-}"

# compute_caps_to_flash_archs "12.1;10.3"  ->  "100;120"
# Maps a semicolon-separated list of compute capabilities to the family
# integers used for FLASH_ATTN_CUDA_ARCHS (major version * 10), de-duplicated
# and numerically sorted. Entries without a numeric major version are dropped;
# an empty list prints nothing, so the caller's default applies.
compute_caps_to_flash_archs() {
  local caps="$1"
  if [[ -z "$caps" ]]; then
    return 0
  fi
  printf '%s\n' "$caps" \
    | tr ';' '\n' \
    | awk -F. '$1 ~ /^[0-9]+$/ {print $1 * 10}' \
    | sort -un \
    | tr '\n' ';' \
    | sed 's/;$//'
}

if uv run python -c "import flash_attn" 2>/dev/null; then
  echo "flash-attn already installed: $(uv run python -c 'import flash_attn; print(flash_attn.__version__)')"
else
  # Fast path: a previous source build left its compiled extension behind.
  # The file names below are pinned to CPython 3.13 / aarch64 / flash-attn 2.8.4.
  if [ -f "$FLASH_ATTN_REPO/flash_attn_2_cuda.cpython-313-aarch64-linux-gnu.so" ]; then
    echo "Found existing flash-attn source build at $FLASH_ATTN_REPO"
    if [ -z "$FLASH_ATTN_EDITABLE_WHEEL" ]; then
      FLASH_ATTN_EDITABLE_WHEEL=$(find "$HOME/.cache/uv/sdists-v9/editable" \
        -path "*/flash_attn-2.8.4-0.editable-cp313-cp313-linux_aarch64.whl" \
        -print -quit 2>/dev/null || true)
    fi
    if [ -n "$FLASH_ATTN_EDITABLE_WHEEL" ] && [ -f "$FLASH_ATTN_EDITABLE_WHEEL" ]; then
      echo " Installing editable flash-attn wheel without rebuilding:"
      echo " $FLASH_ATTN_EDITABLE_WHEEL"
      run uv pip install --python "$FLASH_ATTN_PY" "$FLASH_ATTN_EDITABLE_WHEEL"
    else
      echo " Existing compiled extension found, but no editable wheel was found in uv cache."
    fi
  fi

  if uv run python -c "import flash_attn, flash_attn_2_cuda; print(f' flash-attn {flash_attn.__version__} from {flash_attn.__file__}')" 2>/dev/null; then
    echo " Reused existing flash-attn source build."
  else
    # Full source compilation required (~20 min first time)
    # Detect CUDA toolkit — prefer 13.1, fall back to whatever /usr/local/cuda points to
    if [ -d "/usr/local/cuda-13.1" ]; then
      export CUDA_HOME="/usr/local/cuda-13.1"
    elif [ -d "/usr/local/cuda" ]; then
      export CUDA_HOME="/usr/local/cuda"
    fi
    echo " CUDA_HOME: ${CUDA_HOME:-not set}"

    # Target GPU architectures — auto-detect from nvidia-smi so we only compile
    # kernels for GPUs actually present. Two env vars matter:
    #   - TORCH_CUDA_ARCH_LIST: dot-notation (e.g. "10.3"), used by torch's C++
    #     extension builder. Accepts the raw compute_cap string.
    #   - FLASH_ATTN_CUDA_ARCHS: flash-attn's own var (setup.py:72). Semicolon-
    #     separated family integers (80;90;100;110;120). Ignores
    #     TORCH_CUDA_ARCH_LIST entirely, so it must be set separately.
    #     Mapping: "10.3" -> major=10 -> "100" (the sm_100f family, forward-
    #     compatible with sm_101/102/103).
    # A missing or failing nvidia-smi leaves detected_cc empty so the defaults
    # below apply, rather than aborting under `set -e` / `pipefail`.
    detected_cc=""
    if command -v nvidia-smi &>/dev/null; then
      detected_cc="$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | sort -u | tr '\n' ';' | sed 's/;$//')" || detected_cc=""
    fi
    if [ -z "${TORCH_CUDA_ARCH_LIST:-}" ]; then
      export TORCH_CUDA_ARCH_LIST="${detected_cc:-10.3}"
    fi
    if [ -z "${FLASH_ATTN_CUDA_ARCHS:-}" ]; then
      fa_archs="$(compute_caps_to_flash_archs "$detected_cc")" || fa_archs=""
      export FLASH_ATTN_CUDA_ARCHS="${fa_archs:-100}"
    fi
    echo " Detected compute_cap: ${detected_cc:-unknown}"
    echo " TORCH_CUDA_ARCH_LIST: $TORCH_CUDA_ARCH_LIST (for torch extensions)"
    echo " FLASH_ATTN_CUDA_ARCHS: $FLASH_ATTN_CUDA_ARCHS (flash-attn family targets)"

    if [ -d "$FLASH_ATTN_REPO" ]; then
      echo " Building flash-attn from local repo: $FLASH_ATTN_REPO"
      echo " First-time build compiles CUDA kernels from source using all CPU cores."
      echo " This may take up to 30 minutes. Subsequent runs are near-instant."
      run git -C "$FLASH_ATTN_REPO" pull
    else
      echo " Cloning flash-attention repo and building from source..."
      echo " First-time build compiles CUDA kernels from source using all CPU cores."
      echo " This may take up to 30 minutes. Subsequent runs are near-instant."
      run mkdir -p "$HOME/dev"
      run git clone https://github.com/Dao-AILab/flash-attention.git "$FLASH_ATTN_REPO"
    fi

    # Install into the project venv (--python points uv to the right env).
    # A subshell keeps the caller's working directory untouched; in dry-run
    # mode the checkout may not exist, so the cd is skipped.
    (
      $DRY_RUN || cd "$FLASH_ATTN_REPO"
      run uv pip install --python "$FLASH_ATTN_PY" -e . --no-build-isolation
    )

    # Verify
    if uv run python -c "import flash_attn; print(f' flash-attn {flash_attn.__version__}')" 2>/dev/null; then
      echo " flash-attn installed successfully."
    else
      echo " WARNING: flash-attn build failed. Training may still work without it (slower attention)."
    fi
  fi
fi

# ---------------------------------------------------------------
section "Step 4: Clone and patch HLLM"
# ---------------------------------------------------------------

if [ ! -d "$HLLM_CODE_DIR/REC" ]; then
  echo "Cloning HLLM from github.com/bytedance/HLLM..."
  run mkdir -p "$WORKSPACE"
  UPSTREAM_TMP="$WORKSPACE/.hllm-upstream"
  # A checkout left behind by an interrupted run would make `git clone` fail.
  run rm -rf "$UPSTREAM_TMP"
  # Only the working tree is copied below, so history is not needed.
  run git clone --depth 1 https://github.com/bytedance/HLLM.git "$UPSTREAM_TMP"
  # Copy the *contents* of code/ so an already-existing hllm-code/ is filled
  # in place instead of receiving a nested hllm-code/code/ directory.
  run mkdir -p "$HLLM_CODE_DIR"
  run cp -r "$UPSTREAM_TMP/code/." "$HLLM_CODE_DIR/"
  run rm -rf "$UPSTREAM_TMP"
else
  echo "HLLM code already present at $HLLM_CODE_DIR/"
fi

# Apply LoRA patches (overwrites upstream files with patched versions)
PATCHES_DIR="$REPO_DIR/assets/patches/HLLM"
if [ -d "$PATCHES_DIR" ]; then
  echo " Applying LoRA patches from $PATCHES_DIR ..."
  # "<file in PATCHES_DIR>:<destination relative to HLLM_CODE_DIR>"
  hllm_patches=(
    "hllm.py:REC/model/HLLM/hllm.py"
    "modeling_bert.py:REC/model/HLLM/modeling_bert.py"
    "trainer.py:REC/trainer/trainer.py"
    "wandblogger.py:REC/utils/wandblogger.py"
    "argument_list.py:REC/utils/argument_list.py"
    "utils.py:REC/data/utils.py"
    "dataload.py:REC/data/dataload.py"
  )
  for hllm_patch in "${hllm_patches[@]}"; do
    run cp "$PATCHES_DIR/${hllm_patch%%:*}" "$HLLM_CODE_DIR/${hllm_patch#*:}"
  done
  echo " LoRA patches applied."
fi

# Dataset + information dirs live alongside the HLLM code (HLLM config reads from them)
run mkdir -p "$HLLM_CODE_DIR/dataset" "$HLLM_CODE_DIR/information" "$CHECKPOINTS_DIR"

# Copy training/extraction scripts from the repo into hllm-code/ for HLLM to find
if [ -f "$REPO_DIR/assets/train_retriever.sh" ]; then
  run cp "$REPO_DIR/assets/train_retriever.sh" "$HLLM_CODE_DIR/train_lora.sh"
  run chmod +x "$HLLM_CODE_DIR/train_lora.sh"
fi
if [ -f "$REPO_DIR/assets/extract_embeddings.py" ]; then
  run cp "$REPO_DIR/assets/extract_embeddings.py" "$HLLM_CODE_DIR/extract_embeddings.py"
fi

# Create extract_embeddings.sh wrapper inside hllm-code/.
# Unescaped $VARS are baked in now; \$-escaped ones are evaluated when the
# wrapper runs. Wrapper usage: extract_embeddings.sh [CKPT_PATH [OUTPUT_DIR [extra args...]]]
# Dry-run must not write files (and the target directory may not exist).
if $DRY_RUN; then
  echo " -> (dry-run) write $HLLM_CODE_DIR/extract_embeddings.sh"
else
  cat > "$HLLM_CODE_DIR/extract_embeddings.sh" << SCRIPT
#!/bin/bash
set -e
cd "$HLLM_CODE_DIR"
unset MPLBACKEND
CKPT_PATH="\${1:-$CHECKPOINTS_DIR/dresses_lora_r16/HLLM-0.pth/checkpoint/mp_rank_00_model_states.pt}"
OUTPUT_DIR="\${2:-$DATA_DIR/processed}"
MASTER_PORT="\${PLAYBOOK_EXTRACT_MASTER_PORT:-12399}"
[ "\$#" -ge 1 ] && shift
[ "\$#" -ge 1 ] && shift
exec uv run --project "$REPO_DIR" torchrun --master_port="\$MASTER_PORT" --nproc_per_node=1 --nnodes=1 extract_embeddings.py \\
--ckpt_path "\$CKPT_PATH" \\
--output_dir "\$OUTPUT_DIR" \\
"\$@"
SCRIPT
  chmod +x "$HLLM_CODE_DIR/extract_embeddings.sh"
fi

# ---------------------------------------------------------------
section "Step 5: Download models"
# ---------------------------------------------------------------

# TinyLlama-1.1B (backbone for HLLM retriever training)
# If the download fails with 401/403, log in first: uv run hf login
if [ -f "$MODELS_DIR/TinyLlama-1.1B/config.json" ]; then
  echo "TinyLlama-1.1B already present at $MODELS_DIR/TinyLlama-1.1B/"
else
  echo "Downloading TinyLlama-1.1B from HuggingFace (~2 GB)..."
  echo " If this fails with a 401/403 error, run: uv run hf login"
  run mkdir -p "$MODELS_DIR"
  run uv run hf download \
    TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
    --local-dir "$MODELS_DIR/TinyLlama-1.1B"
fi

# Nemotron Mini via Ollama
# Capture the listing first: `ollama list | grep -q` under pipefail can report
# "not found" when grep exits early and the producer is killed by SIGPIPE.
# A failing `ollama list` yields an empty listing and falls through to the pull.
ollama_models="$(ollama list 2>/dev/null || true)"
if grep -q "nemotron-mini" <<<"$ollama_models"; then
  echo "nemotron-mini already pulled in Ollama."
else
  echo "Pulling nemotron-mini (2.7 GB)..."
  run ollama pull nemotron-mini
fi

# ---------------------------------------------------------------
section "Step 6: Download and process Amazon data"
# ---------------------------------------------------------------

# fetch_dataset_file URL DEST
# Downloads URL to "DEST.part" (resuming a previous partial download) and
# renames it to DEST only after wget succeeds. DEST therefore exists only
# when the file is complete, so an interrupted download is resumed on the
# next run instead of being mistaken for a finished one.
# Returns non-zero (partial file kept) if the download fails.
fetch_dataset_file() {
  local url="$1"
  local dest="$2"
  local partial="${dest}.part"
  if ! run wget -c "$url" -O "$partial"; then
    echo " ERROR: download failed; partial data kept at $partial (re-run to resume)" >&2
    return 1
  fi
  run mv -- "$partial" "$dest"
}

REVIEWS_FILE="$DATA_DIR/raw/raw/review_categories/Clothing_Shoes_and_Jewelry.jsonl"
META_FILE="$DATA_DIR/raw/raw/meta_categories/meta_Clothing_Shoes_and_Jewelry.jsonl"

run mkdir -p "$DATA_DIR/raw/raw/review_categories" "$DATA_DIR/raw/raw/meta_categories" "$DATA_DIR/processed"

# Dataset moved from datarepo.eng.ucsd.edu to HuggingFace (plain JSONL, not gzipped)
HF_DATASET_BASE="https://huggingface.co/datasets/McAuley-Lab/Amazon-Reviews-2023/resolve/main"

if [ -f "$REVIEWS_FILE" ]; then
  echo "Reviews data already present ($(du -h "$REVIEWS_FILE" | cut -f1))"
else
  echo "Downloading Amazon Reviews — Clothing, Shoes & Jewelry (~27.8 GB)..."
  fetch_dataset_file \
    "$HF_DATASET_BASE/raw/review_categories/Clothing_Shoes_and_Jewelry.jsonl" \
    "$REVIEWS_FILE"
fi

if [ -f "$META_FILE" ]; then
  echo "Metadata already present ($(du -h "$META_FILE" | cut -f1))"
else
  echo "Downloading Amazon Metadata — Clothing, Shoes & Jewelry (~18 GB)..."
  fetch_dataset_file \
    "$HF_DATASET_BASE/raw/meta_categories/meta_Clothing_Shoes_and_Jewelry.jsonl" \
    "$META_FILE"
fi

# Process into HLLM format
# NOTE(review): prepare_data.py receives no paths from this script — confirm
# how it locates the workspace when --workspace is given on the command line
# rather than via PLAYBOOK_WORKSPACE.
if [ -f "$DATA_DIR/processed/dress_metadata.parquet" ] && [ -f "$HLLM_CODE_DIR/dataset/amazon_dresses.csv" ]; then
  echo "Processed data already exists. Skipping processing."
else
  echo "Processing Amazon data into HLLM format..."
  cd "$REPO_DIR"
  run uv run python assets/prepare_data.py
fi

# ---------------------------------------------------------------
section "Step 7: Start Ollama"
# ---------------------------------------------------------------

# Succeeds when curl can reach the local Ollama API endpoint.
ollama_ready() {
  curl -s http://localhost:11434/api/tags &>/dev/null
}

if $DRY_RUN; then
  # Preview only: no probing, no background process, no waiting.
  echo "Starting Ollama..."
  run ollama serve
elif ollama_ready; then
  echo "Ollama is already running."
  ollama list
else
  echo "Starting Ollama..."
  run ollama serve &
  # Poll for readiness (up to ~30 s) instead of a single fixed sleep, so a
  # slow start is not misreported and a fast one does not wait needlessly.
  ollama_up=false
  for ((ollama_try = 0; ollama_try < 30; ollama_try++)); do
    if ollama_ready; then
      ollama_up=true
      break
    fi
    sleep 1
  done
  if $ollama_up; then
    echo "Ollama started successfully."
    ollama list
  else
    echo "WARNING: Ollama may not have started. Run 'ollama serve' manually."
  fi
fi

# ---------------------------------------------------------------
section "Setup complete!"
# ---------------------------------------------------------------

# Closing summary: where things live and what to run next.
echo ""
echo "Workspace: $WORKSPACE"
echo "Repo: $REPO_DIR"
echo ""
echo "Next steps:"
echo " cd $REPO_DIR"
echo " bash assets/train_retriever.sh"
echo " uv run python assets/extract_embeddings.py"
echo ""
echo "Expected runtime: ~20 min (with training at bs=512) or ~5 min (with pre-computed embeddings)"
echo ""