dgx-spark-playbooks/nvidia/station-rec-sys/assets/benchmark_retrieval.py
2026-05-26 18:25:53 +00:00

567 lines
22 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Library-level benchmark for the trained retrieval engine.
Measures throughput and latency of the HLLM retriever and optionally the
trained LightGBM re-ranker, in-process on GPU. There is no HTTP server, no
FastAPI, no JSON serialization — these numbers reflect what the hardware
can achieve for the recommendation workload, independent of any particular
serving stack.
Retrieval runs as `torch.mm + topk` on the first available CUDA device
(preferring GB300). Mathematically equivalent to FAISS IndexFlatIP — exact
inner-product search, no quantization — just executed on the GPU.
USAGE
# Retrieval-only sweep
uv run python assets/benchmark_retrieval.py
# Retrieval + trained re-ranker
uv run python assets/benchmark_retrieval.py --with-reranker
# Custom sweep
uv run python assets/benchmark_retrieval.py --users 1 100 10000 1000000
# Save results to JSON
uv run python assets/benchmark_retrieval.py --save bench.json
"""
from __future__ import annotations
import argparse
import json
import os
import sys
import time
from pathlib import Path
import lightgbm as lgb
import numpy as np
import pandas as pd
import torch
from train_reranker_lightgbm import (
FEATURE_COLS, build_user_samples, compute_item_stats, compute_user_stats,
)
class LGBMReranker:
    """LightGBM re-ranker whose feature matrix is assembled with batched tensor ops.

    The constructor loads the booster, reads the interaction and metadata
    parquet files, and stages three lookup tables on ``device``: item-side
    features (one row per entry of ``item_id_map``), padded recent-history
    embeddings together with the full history index list (used for the
    repurchase flag), and per-user scalar features. ``_build_features_chunk``
    gathers from those tables for a batch of (user, candidate) pairs and
    returns a host numpy matrix whose columns follow ``FEATURE_COLS``, ready
    to hand to ``Booster.predict``.
    """

    # Number of most-recent history items kept per user for similarity features.
    HISTORY_RECENT_K = 10

    # Columns pulled from the item statistics frame, in item-table order.
    ITEM_FEATURE_COLS = [
        'item_total_purchases', 'item_unique_buyers',
        'item_pop_30d', 'item_pop_90d', 'item_pop_180d', 'item_trend',
        'item_recency_days', 'item_age_days',
        'log_price', 'title_length', 'desc_length', 'has_image',
    ]

    # Per-user scalar features, in user-table order.
    USER_SCALAR_COLS = [
        'user_total_purchases', 'user_unique_items',
        'user_avg_price', 'user_price_std', 'user_recency_days',
    ]

    def __init__(
        self,
        model_path: Path,
        processed_dir: Path,
        item_embeddings: np.ndarray,
        item_id_map: np.ndarray,
        device: torch.device,
    ) -> None:
        """Load the booster and precompute the item/user lookup tables.

        Args:
            model_path: LightGBM model file to load.
            processed_dir: Directory holding ``dress_interactions.parquet``
                and ``dress_metadata.parquet``.
            item_embeddings: Item embedding matrix; rows are indexed by the
                positions of ``item_id_map``.
            item_id_map: Item ids by row position; ``'[PAD]'`` entries are
                left out of the id -> index mapping.
            device: Device on which the lookup tables are staged.

        Raises:
            RuntimeError: If ``build_user_samples`` yields no samples.
        """
        self.device = device
        self.booster = lgb.Booster(model_file=str(model_path))

        interactions = pd.read_parquet(processed_dir / 'dress_interactions.parquet')
        metadata = pd.read_parquet(processed_dir / 'dress_metadata.parquet')

        # id -> row position, skipping the padding entry.
        item_to_idx = {}
        for position, raw_id in enumerate(item_id_map):
            if raw_id != '[PAD]':
                item_to_idx[str(raw_id)] = position
        item_idx_to_id = list(map(str, item_id_map))

        samples = build_user_samples(interactions, item_to_idx)
        if not samples:
            raise RuntimeError('No usable users for LightGBM reranker precompute.')
        self.n_users = len(samples)

        # Item-side table: one row per item_id_map entry; ids without stats get zeros.
        item_stats = compute_item_stats(interactions, metadata)
        aligned_stats = item_stats.reindex(item_idx_to_id).fillna(0)
        item_block = aligned_stats[self.ITEM_FEATURE_COLS].to_numpy(dtype=np.float32)
        self._item_feat_gpu = torch.from_numpy(item_block).to(device)
        self._log_price_idx_in_item_block = self.ITEM_FEATURE_COLS.index('log_price')

        # User-side tables: padded recent-history embeddings + validity mask,
        # the full history index list (padded with -1), and scalar features.
        user_stats = compute_user_stats(samples, metadata, item_idx_to_id)
        emb_dim = item_embeddings.shape[1]
        item_emb_dev = torch.from_numpy(item_embeddings).to(device)
        longest_history = max(len(sample[1]) for sample in samples)
        recent_k = self.HISTORY_RECENT_K

        self._hist_emb_padded = torch.zeros(
            self.n_users, recent_k, emb_dim, dtype=torch.float32, device=device,
        )
        self._hist_mask = torch.zeros(
            self.n_users, recent_k, dtype=torch.bool, device=device,
        )
        self._history_idx = torch.full(
            (self.n_users, longest_history), -1, dtype=torch.long, device=device,
        )
        scalars_host = np.zeros(
            (self.n_users, len(self.USER_SCALAR_COLS)), dtype=np.float32,
        )

        for row, (uid, history, _, _) in enumerate(samples):
            tail = history[-recent_k:]
            n_tail = len(tail)
            self._hist_emb_padded[row, :n_tail] = item_emb_dev[tail]
            self._hist_mask[row, :n_tail] = True
            self._history_idx[row, :len(history)] = torch.tensor(
                history, dtype=torch.long, device=device,
            )
            stats = user_stats[uid]
            for col_pos, col in enumerate(self.USER_SCALAR_COLS):
                scalars_host[row, col_pos] = stats[col]

        self._user_scalars_gpu = torch.from_numpy(scalars_host).to(device)
        self._user_avg_price_idx = self.USER_SCALAR_COLS.index('user_avg_price')

        # Output columns are written at the positions FEATURE_COLS dictates.
        self._feature_to_pos = {name: pos for pos, name in enumerate(FEATURE_COLS)}

    def _build_features_chunk(
        self,
        cand_idx: torch.Tensor,  # (n, K) int64 on device
        cand_scores: torch.Tensor,  # (n, K) float32 on device
        sample_user_idx: torch.Tensor,  # (n,) int64 on device
        item_emb_gpu: torch.Tensor,  # (n_items, dim)
    ) -> np.ndarray:
        """Build the feature rows for ``n`` users x ``K`` candidates each.

        Returns:
            A float32 numpy array of shape ``(n * K, len(FEATURE_COLS))`` on
            the host, with columns placed according to ``FEATURE_COLS``.
        """
        n_rows, n_cand = cand_idx.shape
        n_features = len(FEATURE_COLS)
        positions = self._feature_to_pos

        # Per-user context gathered for this chunk.
        recent_emb = self._hist_emb_padded[sample_user_idx]  # (n, hist_k, dim)
        recent_valid = self._hist_mask[sample_user_idx].unsqueeze(1)  # (n, 1, hist_k)
        user_scalars = self._user_scalars_gpu[sample_user_idx]  # (n, U)
        full_history = self._history_idx[sample_user_idx]  # (n, max_hist)

        # Per-candidate item features: (n, K, n_item_cols).
        item_feats = self._item_feat_gpu[cand_idx]

        # Candidate-vs-recent-history similarities: (n, K, hist_k).
        cand_emb = item_emb_gpu[cand_idx]
        sims = torch.bmm(cand_emb, recent_emb.transpose(1, 2))
        padded = ~recent_valid
        # Padded slots must not win the max, and must not count in the mean.
        max_sim, _ = sims.masked_fill(padded, float('-inf')).max(dim=2)
        n_valid = recent_valid.sum(dim=2).clamp(min=1).to(sims.dtype)
        avg_sim = sims.masked_fill(padded, 0.0).sum(dim=2) / n_valid

        # Repurchase flag: candidate appears anywhere in the user's history.
        matches = cand_idx.unsqueeze(2) == full_history.unsqueeze(1)  # (n, K, max_hist)
        is_repurchase = matches.any(dim=2).to(torch.float32)

        # Price cross features; the item table stores log_price, undone via expm1.
        cand_price = torch.expm1(item_feats[..., self._log_price_idx_in_item_block])
        avg_price = user_scalars[:, self._user_avg_price_idx].unsqueeze(1)
        price_ratio = cand_price / (avg_price + 1e-8)
        price_diff = cand_price - avg_price

        # Broadcast user scalars across the candidate axis: (n, K, U).
        user_scalars_per_cand = user_scalars.unsqueeze(1).expand(-1, n_cand, -1)

        # Collect every column by name, then scatter into FEATURE_COLS order.
        columns = {
            'hllm_dot_product': cand_scores,  # retrieval score doubles as the dot product
            'hllm_max_hist_sim': max_sim,
            'hllm_avg_hist_sim': avg_sim,
        }
        for col_pos, col in enumerate(self.ITEM_FEATURE_COLS):
            columns[col] = item_feats[..., col_pos]
        for col_pos, col in enumerate(self.USER_SCALAR_COLS):
            columns[col] = user_scalars_per_cand[..., col_pos]
        columns['price_ratio'] = price_ratio
        columns['price_diff'] = price_diff
        columns['is_repurchase'] = is_repurchase

        out = torch.empty(
            n_rows, n_cand, n_features, dtype=torch.float32, device=self.device,
        )
        for name, values in columns.items():
            out[..., positions[name]] = values
        return out.reshape(n_rows * n_cand, n_features).cpu().numpy()
def detect_device() -> torch.device:
    """Pick the CUDA device the benchmark should run on.

    Returns:
        The first device whose reported name contains ``'GB300'``; otherwise
        ``cuda:0``.

    Raises:
        RuntimeError: If ``torch.cuda.is_available()`` is false.
    """
    if torch.cuda.is_available():
        # Scan every visible device and take the first GB300 match.
        for ordinal in range(torch.cuda.device_count()):
            try:
                is_preferred = 'GB300' in torch.cuda.get_device_name(ordinal)
            except Exception:
                # Best-effort probe: a device that cannot report its name is skipped.
                continue
            if is_preferred:
                return torch.device(f'cuda:{ordinal}')
        return torch.device('cuda:0')

    raise RuntimeError(
        "CUDA GPU not available. This benchmark requires a CUDA-capable device "
        "(it characterizes the GPU's retrieval throughput on this hardware)."
    )
class GpuSearcher:
    """Exact inner-product top-k search implemented as ``torch.mm`` + ``torch.topk``.

    Item embeddings are moved to ``device`` once. Query batches larger than
    ``_chunk`` rows are processed in slices so the (queries x items) score
    matrix of any single matmul stays within ``_MAX_SCORES_BYTES``.
    """

    # Upper bound on the float32 score matrix of a single matmul (16 GiB).
    # Without it, a 1M-query batch over a 16K-item index would ask for ~64 GB.
    _MAX_SCORES_BYTES = 16 * 1024**3

    def __init__(self, item_embeddings: np.ndarray, device: torch.device):
        """Stage ``item_embeddings`` on ``device`` and derive the query chunk size."""
        self.device = device
        self.item_emb = torch.from_numpy(item_embeddings).to(device)
        self.name = f"torch.mm + topk on {device} ({torch.cuda.get_device_name(device)})"
        # One query row costs n_items scores at 4 bytes each.
        bytes_per_query = item_embeddings.shape[0] * 4
        self._chunk = max(1, self._MAX_SCORES_BYTES // bytes_per_query)

    def _spans(self, n_q: int):
        """Yield ``(start, end)`` row bounds covering ``n_q`` rows in ``_chunk``-sized steps."""
        step = self._chunk
        for lo in range(0, n_q, step):
            yield lo, min(lo + step, n_q)

    def _search_one_torch(self, queries_gpu: torch.Tensor, k: int):
        """Score one batch already resident on ``self.device``; returns the ``topk`` result."""
        return torch.topk(torch.mm(queries_gpu, self.item_emb.T), k, dim=1)

    def _search_one(self, queries_np: np.ndarray, k: int):
        """Copy a numpy query batch to ``self.device`` and score it there."""
        on_device = torch.from_numpy(queries_np).to(self.device, non_blocking=True)
        return self._search_one_torch(on_device, k)

    def search(self, queries: np.ndarray, k: int):
        """Numpy in, numpy out: returns ``(distances, indices)`` as host arrays."""
        total = queries.shape[0]
        if total > self._chunk:
            # Chunked path: fill preallocated host buffers slice by slice.
            dist_host = np.empty((total, k), dtype=np.float32)
            idx_host = np.empty((total, k), dtype=np.int64)
            for lo, hi in self._spans(total):
                part_d, part_i = self._search_one(queries[lo:hi], k)
                dist_host[lo:hi] = part_d.cpu().numpy()
                idx_host[lo:hi] = part_i.cpu().numpy()
            torch.cuda.synchronize(self.device)
            return dist_host, idx_host

        part_d, part_i = self._search_one(queries, k)
        torch.cuda.synchronize(self.device)
        return part_d.cpu().numpy(), part_i.cpu().numpy()

    def search_torch(self, queries: torch.Tensor, k: int):
        """Tensor in, tensor out: results stay on ``self.device``.

        Intended for pipelines whose downstream consumer (e.g. the reranker)
        reads the candidates on the device and needs no numpy round-trip.
        """
        if queries.device != self.device:
            queries = queries.to(self.device, non_blocking=True)

        total = queries.shape[0]
        if total > self._chunk:
            dist_dev = torch.empty((total, k), dtype=torch.float32, device=self.device)
            idx_dev = torch.empty((total, k), dtype=torch.int64, device=self.device)
            for lo, hi in self._spans(total):
                dist_dev[lo:hi], idx_dev[lo:hi] = self._search_one_torch(queries[lo:hi], k)
            torch.cuda.synchronize(self.device)
            return dist_dev, idx_dev

        part_d, part_i = self._search_one_torch(queries, k)
        torch.cuda.synchronize(self.device)
        return part_d, part_i
def load_engine(processed_dir: Path, models_dir: Path, with_reranker: bool, device: torch.device):
    """Load embeddings, derive per-user query vectors, and optionally the reranker.

    Each user's query vector is the mean of the embeddings of the items they
    interacted with (restricted to ids present in the item id map), scaled to
    unit length. Users with no known items are skipped.

    Args:
        processed_dir: Directory with ``hllm_item_embeddings.npy``,
            ``hllm_item_id_map.npy`` and ``dress_interactions.parquet``.
        models_dir: Root directory containing
            ``reranker_lightgbm/reranker_lightgbm.txt``.
        with_reranker: When true, also build the ``LGBMReranker`` and a device
            copy of the user matrix.
        device: Device for the reranker tables and the user matrix copy.

    Returns:
        ``(item_embeddings, user_emb_matrix, user_matrix_gpu, reranker)``; the
        last two are ``None`` unless ``with_reranker`` is set.

    Raises:
        FileNotFoundError: If the reranker is requested but its model file is absent.
    """
    item_embeddings = np.load(processed_dir / 'hllm_item_embeddings.npy').astype(np.float32)
    item_id_map = np.load(processed_dir / 'hllm_item_id_map.npy', allow_pickle=True)
    interactions = pd.read_parquet(processed_dir / 'dress_interactions.parquet')
    interactions = interactions.sort_values(['user_id', 'timestamp'])

    # id -> embedding row, skipping the padding entry.
    item_to_idx = {}
    for position, raw_id in enumerate(item_id_map):
        if raw_id != '[PAD]':
            item_to_idx[str(raw_id)] = position

    # Collect one vector per user so sweeps can sample rows of a dense matrix
    # with vectorized indexing rather than per-user Python lookups.
    per_user: list[np.ndarray] = []
    for _, user_rows in interactions.groupby('user_id'):
        looked_up = (item_to_idx.get(str(raw)) for raw in user_rows['item_id'])
        rows = [ix for ix in looked_up if ix is not None]
        if not rows:
            continue
        centroid = item_embeddings[rows].mean(axis=0)
        centroid = centroid / (np.linalg.norm(centroid) + 1e-8)
        per_user.append(centroid.astype(np.float32))
    user_emb_matrix = np.stack(per_user)

    if not with_reranker:
        return item_embeddings, user_emb_matrix, None, None

    lgbm_path = models_dir / 'reranker_lightgbm' / 'reranker_lightgbm.txt'
    if not lgbm_path.exists():
        raise FileNotFoundError(
            f"No re-ranker checkpoint found at {lgbm_path}\n"
            "Run Step 5 (`bash assets/train_reranker.sh`) first."
        )
    reranker = LGBMReranker(
        lgbm_path, processed_dir, item_embeddings, item_id_map, device,
    )
    user_matrix_gpu = torch.from_numpy(user_emb_matrix).to(device, non_blocking=True)
    return item_embeddings, user_emb_matrix, user_matrix_gpu, reranker
def score_candidates(
    reranker,
    item_emb_gpu: torch.Tensor,
    user_matrix_gpu: torch.Tensor,
    cand_idx_gpu: torch.Tensor,
    cand_scores_gpu: torch.Tensor | None = None,
    sample_user_idx_gpu: torch.Tensor | None = None,
    chunk_size: int = 2048,
) -> float:
    """Time the reranker over every (user, candidate) pair; return elapsed seconds.

    Users are processed ``chunk_size`` at a time: features are built through
    ``reranker._build_features_chunk`` and the resulting numpy matrix is
    passed to ``reranker.booster.predict``. Predictions are discarded; only
    the wall time between the two device synchronizations is returned.

    ``user_matrix_gpu`` is accepted for signature compatibility and is not
    read here.

    Raises:
        ValueError: If ``cand_scores_gpu`` or ``sample_user_idx_gpu`` is ``None``.
    """
    n_users, _k = cand_idx_gpu.shape
    if cand_scores_gpu is None or sample_user_idx_gpu is None:
        raise ValueError('LGBMReranker requires cand_scores_gpu and sample_user_idx_gpu.')
    if cand_idx_gpu.dtype != torch.int64:
        cand_idx_gpu = cand_idx_gpu.to(torch.int64)

    device = item_emb_gpu.device
    # Drain pending device work so it is not billed to the reranker.
    torch.cuda.synchronize(device)
    started = time.perf_counter()
    for lo in range(0, n_users, chunk_size):
        rows = slice(lo, min(lo + chunk_size, n_users))
        features_np = reranker._build_features_chunk(
            cand_idx_gpu[rows],
            cand_scores_gpu[rows],
            sample_user_idx_gpu[rows],
            item_emb_gpu,
        )
        reranker.booster.predict(features_np)
    torch.cuda.synchronize(device)
    return time.perf_counter() - started
def bench_at_n(
    n: int,
    user_emb_matrix: np.ndarray,
    user_matrix_gpu: torch.Tensor | None,
    searcher: GpuSearcher,
    reranker,
    top_k: int,
    seed: int,
) -> dict:
    """Benchmark one sweep point of ``n`` users and return its timing record.

    With ``n == 1`` the function runs in latency mode: 100 single-user
    iterations whose mean is reported. For larger ``n`` a single batched call
    is timed. When ``reranker`` is given, retrieval stays on the device via
    ``searcher.search_torch`` and the candidates are then scored with
    ``score_candidates``; otherwise only ``searcher.search`` is timed.

    If ``n`` exceeds the number of available user rows, rows are sampled with
    replacement using ``seed``; otherwise the first ``n`` rows are used.

    Args:
        n: Number of users in the batch; must be at least 1.
        user_emb_matrix: Host matrix of user query vectors, one per row.
        user_matrix_gpu: Device copy of ``user_emb_matrix``; required when
            ``reranker`` is given.
        searcher: Retrieval backend.
        reranker: Optional reranker, or ``None`` for retrieval-only.
        top_k: Number of candidates retrieved per user.
        seed: Seed for the sampling RNG.

    Returns:
        A dict with keys ``users``, ``retrieval_ms``, ``rerank_ms``,
        ``total_ms``, ``per_user_ms`` and ``throughput_rps``.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    # Zero would divide by zero in the per-user figure, and a negative value
    # would silently slice ``user_emb_matrix[:n]`` to the wrong batch.
    if n < 1:
        raise ValueError(f'n must be a positive number of users, got {n}.')

    rng = np.random.default_rng(seed)
    n_unique = user_emb_matrix.shape[0]
    has_rerank = reranker is not None
    device = searcher.device
    if has_rerank:
        # The same indices address both ``user_matrix_gpu`` and the reranker's
        # per-user tables, so stay inside the smaller of the two.
        # NOTE(review): this assumes row i means the same user in both tables;
        # confirm against build_user_samples.
        n_unique = min(n_unique, getattr(reranker, 'n_users', n_unique))

    # Build the sample. The reranker path keeps everything on the device and
    # also tracks which user row each query came from; the retrieval-only
    # path stays in numpy.
    sample_user_idx_gpu = None
    if has_rerank:
        if n <= n_unique:
            sample_user_idx_gpu = torch.arange(n, device=device, dtype=torch.long)
        else:
            sample_user_idx_gpu = torch.from_numpy(
                rng.integers(0, n_unique, size=n).astype(np.int64),
            ).to(device)
        sample_gpu = user_matrix_gpu[sample_user_idx_gpu]
    else:
        if n <= n_unique:
            sample = user_emb_matrix[:n]
        else:
            sample = user_emb_matrix[rng.integers(0, n_unique, size=n)]

    if n == 1:
        # Latency mode: 100 single-user iterations.
        if has_rerank:
            user_q_gpu = sample_gpu.reshape(1, -1)
            user_idx_q_gpu = sample_user_idx_gpu[:1]
            retr_lat, total_lat = [], []
            for _ in range(100):
                torch.cuda.synchronize(device)
                t0 = time.perf_counter()
                scores_gpu, idx_gpu = searcher.search_torch(user_q_gpu, top_k)
                torch.cuda.synchronize(device)
                retr_lat.append((time.perf_counter() - t0) * 1000)
                score_candidates(
                    reranker, searcher.item_emb, user_q_gpu, idx_gpu,
                    cand_scores_gpu=scores_gpu, sample_user_idx_gpu=user_idx_q_gpu,
                )
                # Same t0, so this is retrieval + rerank for the iteration.
                total_lat.append((time.perf_counter() - t0) * 1000)
        else:
            user_emb = sample.reshape(1, -1)
            retr_lat = []
            for _ in range(100):
                t0 = time.perf_counter()
                searcher.search(user_emb, top_k)
                retr_lat.append((time.perf_counter() - t0) * 1000)
            total_lat = retr_lat
        retr_ms = float(np.mean(retr_lat))
        total_ms = float(np.mean(total_lat))
        rerank_ms = total_ms - retr_ms
        return {
            'users': n,
            'retrieval_ms': retr_ms,
            'rerank_ms': rerank_ms,
            'total_ms': total_ms,
            'per_user_ms': total_ms,
            'throughput_rps': 1000.0 / total_ms,
        }

    # Batched mode: one shot.
    if has_rerank:
        torch.cuda.synchronize(device)
        t0 = time.perf_counter()
        scores_gpu, indices_gpu = searcher.search_torch(sample_gpu, top_k)
        torch.cuda.synchronize(device)
        retr_ms = (time.perf_counter() - t0) * 1000
        rerank_s = score_candidates(
            reranker, searcher.item_emb, sample_gpu, indices_gpu,
            cand_scores_gpu=scores_gpu, sample_user_idx_gpu=sample_user_idx_gpu,
        )
        rerank_ms = rerank_s * 1000
    else:
        t0 = time.perf_counter()
        searcher.search(sample, top_k)
        retr_ms = (time.perf_counter() - t0) * 1000
        rerank_ms = 0.0

    total_ms = retr_ms + rerank_ms
    per_user_ms = total_ms / n
    return {
        'users': n,
        'retrieval_ms': retr_ms,
        'rerank_ms': rerank_ms,
        'total_ms': total_ms,
        'per_user_ms': per_user_ms,
        'throughput_rps': n * 1000.0 / total_ms,
    }
def _format_header() -> tuple[str, str]:
    """Return the summary table's header line and a dash rule of equal length."""
    # (label, width) per column; every label is right-aligned in its width.
    columns = (('Users', 11), ('Per-user', 12), ('Throughput', 14))
    header = " | ".join(f"{label:>{width}}" for label, width in columns)
    rule = "-" * len(header)
    return header, rule
def _format_row(r: dict) -> str:
    """Render one result record as a row of the summary table.

    Reads the ``users``, ``per_user_ms`` and ``throughput_rps`` keys of ``r``.
    """
    users_cell = f"{r['users']:>11,}"
    latency_cell = f"{r['per_user_ms']:>10.3f}ms"
    throughput_cell = f"{r['throughput_rps']:>10,.0f} /s"
    return " | ".join((users_cell, latency_cell, throughput_cell))
def _fmt_duration(seconds: float) -> str:
    """Format a duration as whole milliseconds, tenths of seconds, or tenths of minutes.

    Values below one second render as ``ms``, values below one minute as
    ``s``, and everything else as ``min``.
    """
    # (exclusive upper bound, multiplier, number format, unit suffix)
    ladder = (
        (1.0, 1000, ".0f", "ms"),
        (60.0, 1, ".1f", "s"),
    )
    for upper, scale, spec, unit in ladder:
        if seconds < upper:
            return f"{format(seconds * scale, spec)}{unit}"
    return f"{format(seconds / 60, '.1f')}min"
def main() -> int:
    """Command-line entry point: parse options, run the sweep, print the summary.

    Default data and model directories live under the ``PLAYBOOK_WORKSPACE``
    environment variable, falling back to the home directory.

    Returns:
        ``0`` on success; ``1`` when no CUDA device is available or when
        ``--top-k`` exceeds the number of indexed items. Invalid ``--users``
        or ``--top-k`` values terminate through ``argparse`` (exit status 2).
    """
    workspace = Path(os.environ.get('PLAYBOOK_WORKSPACE', os.path.expanduser('~')))
    parser = argparse.ArgumentParser(description='In-process benchmark for the HLLM retrieval engine.')
    parser.add_argument('--processed-dir', default=str(workspace / 'data' / 'processed'))
    parser.add_argument('--models-dir', default=str(workspace / 'models'))
    parser.add_argument('--users', type=int, nargs='+',
                        default=[1, 1_000, 10_000, 100_000, 1_000_000],
                        help='User-batch sizes to sweep over (default: 1 1000 10000 100000 1000000).')
    parser.add_argument('--top-k', type=int, default=100,
                        help='Retrieval depth — candidate set size before re-ranking (default: 100).')
    parser.add_argument('--with-reranker', action='store_true',
                        help='Also benchmark the trained LightGBM re-ranker over the retrieved candidates.')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--save', help='Optional path to save full results as JSON.')
    args = parser.parse_args()

    # Fail fast on sizes the sweep cannot run, before the expensive engine
    # load: a zero batch divides by zero in the per-user figure and a negative
    # one would slice the user matrix to the wrong batch.
    bad_sizes = [n for n in args.users if n < 1]
    if bad_sizes:
        parser.error(f"--users values must be >= 1, got {bad_sizes}")
    if args.top_k < 1:
        parser.error(f"--top-k must be >= 1, got {args.top_k}")

    try:
        device = detect_device()
    except RuntimeError as e:
        print(f"ERROR: {e}", file=sys.stderr)
        return 1

    print("=" * 80)
    print("HLLM Retrieval Engine Benchmark")
    print("=" * 80)
    t = time.perf_counter()
    item_embeddings, user_emb_matrix, user_matrix_gpu, reranker = load_engine(
        Path(args.processed_dir), Path(args.models_dir), args.with_reranker, device,
    )

    # torch.topk cannot return more entries than the index holds; report that
    # as a usage error instead of a traceback from inside the search.
    n_items = item_embeddings.shape[0]
    if args.top_k > n_items:
        print(
            f"ERROR: --top-k ({args.top_k}) exceeds the number of indexed items ({n_items}).",
            file=sys.stderr,
        )
        return 1

    searcher = GpuSearcher(item_embeddings, device)
    print(f"Loaded: {item_embeddings.shape[0]:,} items × {item_embeddings.shape[1]} dims, "
          f"{user_emb_matrix.shape[0]:,} user embeddings. "
          f"({time.perf_counter()-t:.1f}s)")
    print(f"Search backend: {searcher.name}")
    if reranker is not None:
        n_trees = reranker.booster.num_trees()
        print(f"Reranker: LightGBM ({n_trees} trees, {len(FEATURE_COLS)} features)")
    print(f"top_k retrieval depth: {args.top_k}")
    print()

    # Warm up the GPU once before the sweep so cuBLAS algorithm picks and CUDA
    # kernel JITs don't get charged to the first row's wall time.
    if reranker is not None:
        warmup_q_gpu = user_matrix_gpu[:1]
        warmup_idx = torch.zeros(1, dtype=torch.long, device=device)
        scores_gpu, idx_gpu = searcher.search_torch(warmup_q_gpu, args.top_k)
        score_candidates(
            reranker, searcher.item_emb, warmup_q_gpu, idx_gpu,
            cand_scores_gpu=scores_gpu, sample_user_idx_gpu=warmup_idx,
        )
    else:
        searcher.search(user_emb_matrix[:1], args.top_k)

    # Live progress: each row prints "Running X users..." before the work and
    # "done (T)" once that batch completes, so the wall time is visible as the
    # pause between the two halves of the line.
    results = []
    for n in args.users:
        print(f"Running {n:>11,} users...", end=' ', flush=True)
        t0 = time.perf_counter()
        r = bench_at_n(n, user_emb_matrix, user_matrix_gpu, searcher, reranker,
                       args.top_k, args.seed)
        wall = time.perf_counter() - t0
        results.append(r)
        print(f"done ({_fmt_duration(wall)})", flush=True)

    # Final table — printed only once, after all benchmarks complete.
    print()
    print("=" * 80)
    print("Summary")
    print("=" * 80)
    header, sep = _format_header()
    print(header)
    print(sep)
    for r in results:
        print(_format_row(r))

    if args.save:
        out = {
            'config': {
                'users': args.users,
                'top_k': args.top_k,
                'with_reranker': args.with_reranker,
                'search_backend': searcher.name,
                'item_count': int(item_embeddings.shape[0]),
                'embedding_dim': int(item_embeddings.shape[1]),
            },
            'results': results,
        }
        Path(args.save).write_text(json.dumps(out, indent=2))
        print(f"\nSaved JSON results to {args.save}")
    return 0
# Script entry point: the process exit status is whatever main() returns.
if __name__ == '__main__':
    sys.exit(main())