#!/usr/bin/env bash
# train_retriever_1b.sh — Train HLLM retriever with LoRA on TinyLlama-1.1B.
#
# Uses the known-good TinyLlama recipe that produced R@10 = 0.0708 on the
# Amazon Dresses test set (see docs/4-19-metrics-analysis.md run 7sb4tigy).
# Wall-clock: ~20 min per epoch on a single GB300 at ~13.75 s/step (bs=512).
#
# USAGE:
#   bash assets/train_retriever.sh                          # Default: train from scratch (any existing ckpt is ignored)
#   bash assets/train_retriever.sh --resume                 # Resume from latest checkpoint in $CHECKPOINT_DIR
#   CUDA_VISIBLE_DEVICES=1 bash assets/train_retriever.sh   # Use GPU 1
#   PLAYBOOK_WORKSPACE=/raid bash assets/train_retriever.sh
#
# ENVIRONMENT (all optional):
#   PLAYBOOK_WORKSPACE       Root holding hllm-code/, data/, models/, checkpoints/ (default: $HOME)
#   PLAYBOOK_CHECKPOINT_DIR  Checkpoint directory (default: $PLAYBOOK_WORKSPACE/checkpoints/dresses_lora_r16)
#   PLAYBOOK_EPOCHS          Value for --epochs (default: 1)
#   PLAYBOOK_NUM_NEGATIVES   Value for --num_negatives (default: 4096)
#   PLAYBOOK_MAX_STEPS       Value for --max_steps; 0 = full epoch schedule (default: 0)
#   PLAYBOOK_SAVE_STEPS      Value for --save_steps (default: 200)
#   PLAYBOOK_TORCH_COMPILE   Value for --torch_compile (default: True)
#   PLAYBOOK_MASTER_PORT     torchrun rendezvous port; change it to run two jobs at once (default: 12350)
#   CUDA_VISIBLE_DEVICES     GPU to train on (default: 0)
#   WANDB_TOKEN              If non-empty, exported to the trainer as WANDB_API_KEY
#
# RECIPE NOTES:
#   - Model:                 TinyLlama-1.1B
#   - train_batch_size:      512   — peak GPU mem ~154 GB / 284 GB on GB300
#   - grad_accum_steps:      1     — no accumulation needed
#   - num_negatives:         4096  — paper-scale NCE signal
#   - MAX_ITEM_LIST_LENGTH:  20    — longer user histories
#   - learning_rate:         2e-4  — larger LR OK at this scale
#   - gradient_ckpt:         True  — cheap safety on a 1B LoRA run
#   - epochs:                1 (~20 min on GB300; set PLAYBOOK_EPOCHS=3-5 for production-quality embeddings)

set -euo pipefail

# Print an error message to stderr and abort with status 1.
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

# Print the comment header at the top of this script (everything after the
# shebang up to the first non-comment line). Reading the header dynamically
# keeps --help correct when the header grows or shrinks.
print_help() {
  local line
  while IFS= read -r line; do
    if [[ "$line" == '#!'* ]]; then
      continue
    fi
    if [[ "$line" != '#'* ]]; then
      break
    fi
    printf '%s\n' "$line"
  done <"$0"
}

# ---- CLI flags ----
# AUTO_RESUME is passed verbatim to --auto_resume, hence the Python-style
# True/False spelling.
AUTO_RESUME=False
for arg in "$@"; do
  case "$arg" in
    --resume)
      AUTO_RESUME=True
      ;;
    -h | --help)
      print_help
      exit 0
      ;;
    *)
      echo "Unknown argument: $arg" >&2
      exit 2
      ;;
  esac
done

# ---- Paths ----
WORKSPACE="${PLAYBOOK_WORKSPACE:-$HOME}"
SCRIPT_DIR="$(cd "$(dirname -- "$0")" && pwd)"
REPO_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
HLLM_CODE_DIR="$WORKSPACE/hllm-code"
# DATA_DIR is only shown in the banner; the training command below reads
# from $HLLM_CODE_DIR/dataset/ and $HLLM_CODE_DIR/information.
# NOTE(review): confirm those locations are expected to mirror $DATA_DIR.
DATA_DIR="$WORKSPACE/data"
MODELS_DIR="$WORKSPACE/models"
CHECKPOINTS_DIR="$WORKSPACE/checkpoints"
CHECKPOINT_DIR="${PLAYBOOK_CHECKPOINT_DIR:-$CHECKPOINTS_DIR/dresses_lora_r16}"

# ---- Tunables (overridable through the environment) ----
GPU_ID="${CUDA_VISIBLE_DEVICES:-0}"
EPOCHS="${PLAYBOOK_EPOCHS:-1}"
NUM_NEGATIVES="${PLAYBOOK_NUM_NEGATIVES:-4096}"
MAX_STEPS="${PLAYBOOK_MAX_STEPS:-0}"
SAVE_STEPS="${PLAYBOOK_SAVE_STEPS:-200}"
TORCH_COMPILE="${PLAYBOOK_TORCH_COMPILE:-True}"
# Not named MASTER_PORT on purpose: assigning to a variable that is already
# exported would silently change what child processes see.
TORCHRUN_PORT="${PLAYBOOK_MASTER_PORT:-12350}"

# Activate uv venv so torchrun uses the correct Python
VENV_ACTIVATE="$REPO_DIR/.venv/bin/activate"
[[ -f "$VENV_ACTIVATE" ]] \
  || die "virtualenv activate script not found: $VENV_ACTIVATE (create the uv venv in $REPO_DIR first)"
# shellcheck disable=SC1090 — path is computed at runtime
source "$VENV_ACTIVATE"

# ---- Banner ----
echo "============================================================"
echo " HLLM Retriever Training (LoRA, TinyLlama-1.1B)"
echo "============================================================"
echo ""
echo " Model: TinyLlama-1.1B + LoRA r16"
echo " Dataset: Amazon Dresses (293K interactions)"
echo " GPU: $GPU_ID"
echo " Checkpoints: $CHECKPOINT_DIR"
echo " Data dir: $DATA_DIR"
echo " Epochs: $EPOCHS"
echo " Negatives: $NUM_NEGATIVES"
echo " Max steps: $MAX_STEPS (0 = full epoch schedule)"
echo " Compile: $TORCH_COMPILE"
if [[ "$AUTO_RESUME" == "True" ]]; then
  echo " Resume: on (--resume: latest checkpoint in $CHECKPOINT_DIR will be loaded)"
else
  echo " Resume: off (training from scratch; pass --resume to continue from latest checkpoint)"
fi
echo ""

# Work out how W&B will authenticate. An API key supplied through the
# environment (WANDB_TOKEN, forwarded below, or a pre-exported WANDB_API_KEY)
# counts as logged in; otherwise fall back to checking netrc (what the Python
# client reads at training time). This avoids the wandb CLI, whose
# `wandb status` output doesn't expose a stable "logged in" string.
wandb_auth=""
if [[ -n "${WANDB_TOKEN:-}" || -n "${WANDB_API_KEY:-}" ]]; then
  wandb_auth="API key from environment"
elif [[ -f "$HOME/.netrc" ]] && grep -q "api.wandb.ai" "$HOME/.netrc" 2>/dev/null; then
  wandb_auth="netrc auth"
fi

if [[ -n "$wandb_auth" ]]; then
  echo " W&B: Enabled ($wandb_auth) — runs land at https://wandb.ai"
  echo " Project: enterprise-recsys"
  echo " Run URL appears in the training log after W&B initializes."
else
  echo " W&B: Not logged in (run 'uv run wandb login' to enable)"
fi
echo ""
echo " Estimated wall time: ~20 min per epoch (~13.75 s/step on GB300 at bs=512)"
echo " Monitor GPU: open another terminal and run 'watch nvidia-smi'"
echo ""
echo "============================================================"
echo ""

# ---- Launch training ----
cd "$HLLM_CODE_DIR" || die "HLLM code directory not found: $HLLM_CODE_DIR"

# Forward WANDB_TOKEN as WANDB_API_KEY only when it is actually set, so an
# already-exported WANDB_API_KEY is never overwritten with an empty string.
if [[ -n "${WANDB_TOKEN:-}" ]]; then
  export WANDB_API_KEY="$WANDB_TOKEN"
fi

# Arguments for run.py, grouped by concern. Kept in an array so every value
# stays a single word regardless of its content.
train_args=(
  --config_file overall/LLM_deepspeed.yaml HLLM/HLLM.yaml
  --loss nce

  # Schedule, batching and sequence limits
  --epochs "$EPOCHS"
  --train_batch_size 512
  --gradient_accumulation_steps 1
  --save_steps "$SAVE_STEPS"
  --num_negatives "$NUM_NEGATIVES"
  --max_steps "$MAX_STEPS"
  --MAX_TEXT_LENGTH 64
  --MAX_ITEM_LIST_LENGTH 20
  --gradient_checkpointing True
  --num_workers 32
  --fast_eval_interval 500

  # torch.compile
  --torch_compile "$TORCH_COMPILE"
  --torch_compile_mode default

  # Optimizer / scheduler
  --use_fused_adam True
  --optim_args.learning_rate 2e-4
  --optim_args.weight_decay 0.05
  --scheduler_args.warmup 0.05

  # LoRA
  --lora_r 16
  --lora_alpha 64
  --lora_dropout 0.05
  --lora_target_modules '["q_proj","k_proj","v_proj","o_proj"]'

  # Data, base model and checkpoints
  --dataset amazon_dresses
  --data_path "$HLLM_CODE_DIR/dataset/"
  --item_pretrain_dir "$MODELS_DIR/TinyLlama-1.1B"
  --user_pretrain_dir "$MODELS_DIR/TinyLlama-1.1B"
  --text_path "$HLLM_CODE_DIR/information"
  --text_keys '["title","description"]'
  --checkpoint_dir "$CHECKPOINT_DIR"

  # Logging, evaluation and resume
  --log_wandb True
  --wandb_project enterprise-recsys
  --wandb_log_interval 5
  --eval_step 1
  --stopping_step 2
  --auto_resume "$AUTO_RESUME"
)

CUDA_VISIBLE_DEVICES="$GPU_ID" \
  PYTHONUNBUFFERED=1 \
  PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
  torchrun --master_port="$TORCHRUN_PORT" --nproc_per_node=1 --nnodes=1 \
  run.py "${train_args[@]}"