mirror of
https://github.com/NVIDIA/dgx-spark-playbooks.git
synced 2026-06-19 12:59:34 +00:00
567 lines
22 KiB
Python
567 lines
22 KiB
Python
"""Library-level benchmark for the trained retrieval engine.

Measures throughput and latency of the HLLM retriever and optionally the
trained LightGBM re-ranker, in-process on GPU. There is no HTTP server, no
FastAPI, no JSON serialization — these numbers reflect what the hardware
can achieve for the recommendation workload, independent of any particular
serving stack.

Retrieval runs as `torch.mm + topk` on the first available CUDA device
(preferring GB300). Mathematically equivalent to FAISS IndexFlatIP — exact
inner-product search, no quantization — just executed on the GPU.

USAGE
    # Retrieval-only sweep
    uv run python assets/benchmark_retrieval.py

    # Retrieval + trained re-ranker
    uv run python assets/benchmark_retrieval.py --with-reranker

    # Custom sweep
    uv run python assets/benchmark_retrieval.py --users 1 100 10000 1000000

    # Save results to JSON
    uv run python assets/benchmark_retrieval.py --save bench.json
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import json
|
||
import os
|
||
import sys
|
||
import time
|
||
from pathlib import Path
|
||
|
||
import lightgbm as lgb
|
||
import numpy as np
|
||
import pandas as pd
|
||
import torch
|
||
|
||
from train_reranker_lightgbm import (
|
||
FEATURE_COLS, build_user_samples, compute_item_stats, compute_user_stats,
|
||
)
|
||
|
||
|
||
class LGBMReranker:
    """LightGBM lambdarank re-ranker with GPU feature precompute.

    Vectorizes the per-pair feature build that train_reranker_lightgbm.py
    does in a Python loop, so 100K-user batches stay tractable in benchmark
    sweeps. Item-side features and user-side context are staged on the
    target device once; each ``_build_features_chunk`` call gathers the
    candidates, computes HLLM history similarities, and returns one numpy
    feature matrix ready for ``Booster.predict``.
    """

    # Number of most-recent history items used for the history-similarity features.
    HISTORY_RECENT_K = 10
    ITEM_FEATURE_COLS = [
        'item_total_purchases', 'item_unique_buyers',
        'item_pop_30d', 'item_pop_90d', 'item_pop_180d', 'item_trend',
        'item_recency_days', 'item_age_days',
        'log_price', 'title_length', 'desc_length', 'has_image',
    ]
    USER_SCALAR_COLS = [
        'user_total_purchases', 'user_unique_items',
        'user_avg_price', 'user_price_std', 'user_recency_days',
    ]

    def __init__(
        self,
        model_path: Path,
        processed_dir: Path,
        item_embeddings: np.ndarray,
        item_id_map: np.ndarray,
        device: torch.device,
    ) -> None:
        """Load the booster and stage every static feature tensor on ``device``.

        Args:
            model_path: LightGBM model file passed to ``lgb.Booster``.
            processed_dir: Directory holding ``dress_interactions.parquet``
                and ``dress_metadata.parquet``.
            item_embeddings: Item embedding matrix; row ``i`` belongs to
                ``item_id_map[i]``.
            item_id_map: Item ids by embedding row; ``'[PAD]'`` entries are
                excluded from the id -> row lookup.
            device: Device on which the precomputed tensors are stored.

        Raises:
            RuntimeError: If ``build_user_samples`` yields no users.
        """
        self.device = device
        self.booster = lgb.Booster(model_file=str(model_path))

        interactions = pd.read_parquet(processed_dir / 'dress_interactions.parquet')
        metadata = pd.read_parquet(processed_dir / 'dress_metadata.parquet')
        item_to_idx = {str(iid): i for i, iid in enumerate(item_id_map) if iid != '[PAD]'}
        item_idx_to_id = [str(i) for i in item_id_map]

        samples = build_user_samples(interactions, item_to_idx)
        if not samples:
            raise RuntimeError('No usable users for LightGBM reranker precompute.')
        self.n_users = len(samples)

        # Item-side feature lookup (n_items, 12) on GPU. Items without stats
        # (reindex misses) are filled with zeros.
        item_stats = compute_item_stats(interactions, metadata)
        item_stats_arr = item_stats.reindex(item_idx_to_id).fillna(0)
        self._item_feat_gpu = torch.from_numpy(
            item_stats_arr[self.ITEM_FEATURE_COLS].to_numpy(dtype=np.float32),
        ).to(device)
        self._log_price_idx_in_item_block = self.ITEM_FEATURE_COLS.index('log_price')

        # User-side static info: padded recent-history embeddings + history idx
        # tensor (for is_repurchase) + scalar features.
        user_stats = compute_user_stats(samples, metadata, item_idx_to_id)
        dim = item_embeddings.shape[1]
        item_emb_t = torch.from_numpy(item_embeddings).to(device)

        max_hist = max(len(s[1]) for s in samples)
        self._hist_emb_padded = torch.zeros(
            self.n_users, self.HISTORY_RECENT_K, dim, dtype=torch.float32, device=device,
        )
        self._hist_mask = torch.zeros(
            self.n_users, self.HISTORY_RECENT_K, dtype=torch.bool, device=device,
        )
        # -1 pads short histories; it can never equal a real candidate row index.
        self._history_idx = torch.full(
            (self.n_users, max_hist), -1, dtype=torch.long, device=device,
        )
        scalar_buf = np.zeros((self.n_users, len(self.USER_SCALAR_COLS)), dtype=np.float32)

        for row, (uid, hist_idxs, _, _) in enumerate(samples):
            # Build the history tensor once and reuse it for both the recent
            # embedding gather and the full-history index table.
            hist_t = torch.tensor(hist_idxs, dtype=torch.long, device=device)
            recent = hist_t[-self.HISTORY_RECENT_K:]
            n_recent = recent.shape[0]
            self._hist_emb_padded[row, :n_recent] = item_emb_t[recent]
            self._hist_mask[row, :n_recent] = True
            self._history_idx[row, :hist_t.shape[0]] = hist_t
            stats = user_stats[uid]
            scalar_buf[row] = [stats[col] for col in self.USER_SCALAR_COLS]
        self._user_scalars_gpu = torch.from_numpy(scalar_buf).to(device)
        self._user_avg_price_idx = self.USER_SCALAR_COLS.index('user_avg_price')

        # Final feature column order must match training.
        self._feature_to_pos = {c: i for i, c in enumerate(FEATURE_COLS)}

    def _build_features_chunk(
        self,
        cand_idx: torch.Tensor,          # (n, K) int64 on device
        cand_scores: torch.Tensor,       # (n, K) float32 on device
        sample_user_idx: torch.Tensor,   # (n,) int64 on device
        item_emb_gpu: torch.Tensor,      # (n_items, dim)
    ) -> np.ndarray:
        """Build the feature matrix for ``n`` users x ``K`` candidates.

        Args:
            cand_idx: Candidate item row indices per user.
            cand_scores: Retrieval scores matching ``cand_idx``; used as the
                ``hllm_dot_product`` feature.
            sample_user_idx: Row of each query in the per-user tensors staged
                by ``__init__``.
            item_emb_gpu: Item embedding matrix used to embed the candidates.

        Returns:
            A float32 numpy array of shape ``(n * K, n_features)`` whose
            columns follow the order recorded in ``self._feature_to_pos``.
        """
        n, K = cand_idx.shape
        f2p = self._feature_to_pos
        # Width comes from the position map built in __init__, so this method
        # depends only on instance state.
        n_features = len(f2p)
        device = self.device

        # Item-side block: (n, K, 12)
        item_feats = self._item_feat_gpu[cand_idx]

        # HLLM sims vs. recent history.
        hist_emb = self._hist_emb_padded[sample_user_idx]        # (n, hist_k, dim)
        hist_mask = self._hist_mask[sample_user_idx]             # (n, hist_k)
        cand_emb = item_emb_gpu[cand_idx]                        # (n, K, dim)
        sims = torch.bmm(cand_emb, hist_emb.transpose(1, 2))     # (n, K, hist_k)
        mask = hist_mask.unsqueeze(1)                            # (n, 1, hist_k)

        max_sim = sims.masked_fill(~mask, -float('inf')).max(dim=2).values
        # A user with no valid history slot would otherwise get -inf here,
        # while avg_sim below already degrades to 0.0 for that case. Keep the
        # two consistent and the feature matrix finite.
        # NOTE(review): confirm the training pipeline uses the same 0.0 fallback.
        has_history = hist_mask.any(dim=1, keepdim=True)         # (n, 1)
        max_sim = torch.where(has_history, max_sim, torch.zeros_like(max_sim))

        valid = mask.sum(dim=2).clamp(min=1).to(sims.dtype)      # (n, 1)
        avg_sim = sims.masked_fill(~mask, 0.0).sum(dim=2) / valid

        # User scalar broadcast: (n, U) -> (n, K, U)
        user_block = self._user_scalars_gpu[sample_user_idx]
        user_block_kk = user_block.unsqueeze(1).expand(-1, K, -1)

        # is_repurchase: candidate appears anywhere in the user's full history.
        hist_idx = self._history_idx[sample_user_idx]            # (n, max_hist)
        eq = cand_idx.unsqueeze(2) == hist_idx.unsqueeze(1)      # (n, K, max_hist)
        is_rep = eq.any(dim=2).to(torch.float32)

        # Cross features: candidate price (inverse of log1p) vs. user's average.
        log_price = item_feats[..., self._log_price_idx_in_item_block]
        cand_price = torch.expm1(log_price)
        user_avg_price = user_block[:, self._user_avg_price_idx].unsqueeze(1)
        price_ratio = cand_price / (user_avg_price + 1e-8)
        price_diff = cand_price - user_avg_price

        # Assemble (n, K, n_features) in training column order.
        out = torch.empty(n, K, n_features, dtype=torch.float32, device=device)
        # HLLM dot product is the retrieval score we already have.
        out[..., f2p['hllm_dot_product']] = cand_scores
        out[..., f2p['hllm_max_hist_sim']] = max_sim
        out[..., f2p['hllm_avg_hist_sim']] = avg_sim
        for c, col in enumerate(self.ITEM_FEATURE_COLS):
            out[..., f2p[col]] = item_feats[..., c]
        for c, col in enumerate(self.USER_SCALAR_COLS):
            out[..., f2p[col]] = user_block_kk[..., c]
        out[..., f2p['price_ratio']] = price_ratio
        out[..., f2p['price_diff']] = price_diff
        out[..., f2p['is_repurchase']] = is_rep

        return out.reshape(n * K, n_features).cpu().numpy()
|
||
|
||
|
||
def detect_device() -> torch.device:
    """Pick the CUDA device the benchmark should run on.

    Returns the first device whose reported name contains ``'GB300'``, or
    ``cuda:0`` when no such device is found.

    Raises:
        RuntimeError: If CUDA is not available at all.
    """
    if not torch.cuda.is_available():
        raise RuntimeError(
            "CUDA GPU not available. This benchmark requires a CUDA-capable device "
            "(it characterizes the GPU's retrieval throughput on this hardware)."
        )
    # Prefer GB300 if multiple GPUs are present.
    for ordinal in range(torch.cuda.device_count()):
        try:
            gpu_name = torch.cuda.get_device_name(ordinal)
        except Exception:
            # Best effort: a device whose name cannot be queried is skipped.
            continue
        if 'GB300' in gpu_name:
            return torch.device(f'cuda:{ordinal}')
    return torch.device('cuda:0')
|
||
|
||
|
||
class GpuSearcher:
    """Exact inner-product top-k search implemented as ``torch.mm`` + ``torch.topk``.

    Mathematically equivalent to FAISS IndexFlatIP (exact inner-product search,
    no quantization), just executed on the GPU.

    Queries are processed in chunks so the (queries x items) scores tensor stays
    under a fixed VRAM budget, allowing very large sweeps (e.g. 1M users)
    without OOM.
    """

    # Cap the on-GPU scores tensor so a 1M-user batch over a 16K-item index doesn't
    # try to allocate 64 GB at once. ~16 GiB gives plenty of headroom on GB300.
    _MAX_SCORES_BYTES = 16 * 1024**3

    def __init__(self, item_embeddings: np.ndarray, device: torch.device):
        """Upload the item matrix to ``device`` and derive the query chunk size."""
        self.device = device
        self.item_emb = torch.from_numpy(item_embeddings).to(device)
        gpu_name = torch.cuda.get_device_name(device)
        self.name = f"torch.mm + topk on {device} ({gpu_name})"
        # One query row produces n_items scores at 4 bytes each.
        bytes_per_query_row = item_embeddings.shape[0] * 4
        self._chunk = max(1, self._MAX_SCORES_BYTES // bytes_per_query_row)

    def _search_one_torch(self, queries_gpu: torch.Tensor, k: int):
        """Internal: queries already on self.device. Returns GPU (distances, indices)."""
        return torch.topk(torch.mm(queries_gpu, self.item_emb.T), k, dim=1)

    def _search_one(self, queries_np: np.ndarray, k: int):
        """Internal: upload a numpy query block, then search it on the device."""
        queries_gpu = torch.from_numpy(queries_np).to(self.device, non_blocking=True)
        return self._search_one_torch(queries_gpu, k)

    def search(self, queries: np.ndarray, k: int):
        """Numpy in, numpy out: return ``(distances, indices)`` of the top ``k`` items."""
        total = queries.shape[0]
        step = self._chunk

        if total > step:
            # Too large for one scores tensor: fill host buffers chunk by chunk.
            dist_out = np.empty((total, k), dtype=np.float32)
            idx_out = np.empty((total, k), dtype=np.int64)
            lo = 0
            while lo < total:
                hi = min(lo + step, total)
                part_d, part_i = self._search_one(queries[lo:hi], k)
                dist_out[lo:hi] = part_d.cpu().numpy()
                idx_out[lo:hi] = part_i.cpu().numpy()
                lo = hi
            torch.cuda.synchronize(self.device)
            return dist_out, idx_out

        top_d, top_i = self._search_one(queries, k)
        torch.cuda.synchronize(self.device)
        return top_d.cpu().numpy(), top_i.cpu().numpy()

    def search_torch(self, queries: torch.Tensor, k: int):
        """All-GPU search: GPU torch in, GPU torch out. For end-to-end GPU pipelines
        where the reranker (or any downstream consumer) doesn't need a numpy round-trip."""
        if queries.device != self.device:
            queries = queries.to(self.device, non_blocking=True)
        total = queries.shape[0]
        step = self._chunk

        if total > step:
            # Results stay on the device; only the scores tensor is chunked.
            dist_out = torch.empty((total, k), dtype=torch.float32, device=self.device)
            idx_out = torch.empty((total, k), dtype=torch.int64, device=self.device)
            lo = 0
            while lo < total:
                hi = min(lo + step, total)
                part_d, part_i = self._search_one_torch(queries[lo:hi], k)
                dist_out[lo:hi] = part_d
                idx_out[lo:hi] = part_i
                lo = hi
            torch.cuda.synchronize(self.device)
            return dist_out, idx_out

        top_d, top_i = self._search_one_torch(queries, k)
        torch.cuda.synchronize(self.device)
        return top_d, top_i
|
||
|
||
|
||
def load_engine(processed_dir: Path, models_dir: Path, with_reranker: bool, device: torch.device):
    """Load embeddings, derive per-user query vectors, and optionally the re-ranker.

    Each user's query vector is the L2-normalized mean of the embeddings of
    the items they interacted with. Users with no item present in the id map
    are skipped.

    Args:
        processed_dir: Directory with ``hllm_item_embeddings.npy``,
            ``hllm_item_id_map.npy`` and ``dress_interactions.parquet``.
        models_dir: Directory containing ``reranker_lightgbm/reranker_lightgbm.txt``.
        with_reranker: When True, also build the ``LGBMReranker`` and a
            device copy of the user matrix.
        device: Device for the re-ranker tensors and the user matrix copy.

    Returns:
        ``(item_embeddings, user_emb_matrix, user_matrix_gpu, reranker)``;
        the last two are ``None`` unless ``with_reranker`` is set.

    Raises:
        FileNotFoundError: If the re-ranker is requested but its model file is missing.
        ValueError: If no user has any interaction with a known item.
    """
    item_embeddings = np.load(processed_dir / 'hllm_item_embeddings.npy').astype(np.float32)
    # NOTE(review): allow_pickle=True executes pickled payloads on load; only
    # point this at id maps produced by the trusted preprocessing step.
    item_id_map = np.load(processed_dir / 'hllm_item_id_map.npy', allow_pickle=True)
    interactions = pd.read_parquet(processed_dir / 'dress_interactions.parquet')
    interactions = interactions.sort_values(['user_id', 'timestamp'])

    item_to_idx = {str(iid): i for i, iid in enumerate(item_id_map) if iid != '[PAD]'}

    # Build a single (n_users, dim) matrix so we can sample with vectorized indexing
    # instead of a 1M-iteration Python lookup at very large sweep sizes.
    user_emb_list: list[np.ndarray] = []
    for _, group in interactions.groupby('user_id'):
        # Stringify each id once, then keep only items present in the index.
        idxs = [item_to_idx[key] for key in map(str, group['item_id']) if key in item_to_idx]
        if not idxs:
            continue
        emb = item_embeddings[idxs].mean(axis=0)
        emb = emb / (np.linalg.norm(emb) + 1e-8)
        user_emb_list.append(emb.astype(np.float32))
    if not user_emb_list:
        # np.stack([]) would fail with an unrelated-looking message.
        raise ValueError(
            'No user in dress_interactions.parquet has an item present in '
            'hllm_item_id_map.npy; cannot build user embeddings.'
        )
    user_emb_matrix = np.stack(user_emb_list)

    reranker = None
    user_matrix_gpu = None
    if with_reranker:
        lgbm_path = models_dir / 'reranker_lightgbm' / 'reranker_lightgbm.txt'
        if not lgbm_path.exists():
            raise FileNotFoundError(
                f"No re-ranker checkpoint found at {lgbm_path}\n"
                "Run Step 5 (`bash assets/train_reranker.sh`) first."
            )
        reranker = LGBMReranker(
            lgbm_path, processed_dir, item_embeddings, item_id_map, device,
        )
        user_matrix_gpu = torch.from_numpy(user_emb_matrix).to(device, non_blocking=True)

    return item_embeddings, user_emb_matrix, user_matrix_gpu, reranker
|
||
|
||
|
||
def score_candidates(
    reranker,
    item_emb_gpu: torch.Tensor,
    user_matrix_gpu: torch.Tensor,
    cand_idx_gpu: torch.Tensor,
    cand_scores_gpu: torch.Tensor | None = None,
    sample_user_idx_gpu: torch.Tensor | None = None,
    chunk_size: int = 2048,
) -> float:
    """Time the LightGBM re-ranker over (n_users x k_candidates) pairs.

    Features are built on the device chunk by chunk and each chunk is passed
    to ``Booster.predict``; the predictions themselves are discarded.
    ``user_matrix_gpu`` is accepted but not read.

    Returns:
        Elapsed wall-clock seconds, measured between two CUDA synchronizations.

    Raises:
        ValueError: If ``cand_scores_gpu`` or ``sample_user_idx_gpu`` is
            omitted; both are required despite their ``None`` defaults.
    """
    device = item_emb_gpu.device
    total_users, _ = cand_idx_gpu.shape
    # Candidate indices are used for tensor indexing, which needs int64.
    cand_idx_gpu = (
        cand_idx_gpu if cand_idx_gpu.dtype == torch.int64 else cand_idx_gpu.to(torch.int64)
    )

    inputs_missing = cand_scores_gpu is None or sample_user_idx_gpu is None
    if inputs_missing:
        raise ValueError('LGBMReranker requires cand_scores_gpu and sample_user_idx_gpu.')

    # Drain pending GPU work so the timer only covers the re-rank itself.
    torch.cuda.synchronize(device)
    started = time.perf_counter()
    for lo in range(0, total_users, chunk_size):
        hi = min(lo + chunk_size, total_users)
        chunk_features = reranker._build_features_chunk(
            cand_idx_gpu[lo:hi],
            cand_scores_gpu[lo:hi],
            sample_user_idx_gpu[lo:hi],
            item_emb_gpu,
        )
        reranker.booster.predict(chunk_features)
    torch.cuda.synchronize(device)
    return time.perf_counter() - started
|
||
|
||
|
||
def _bench_result(n: int, retr_ms: float, rerank_ms: float, total_ms: float) -> dict:
    """Package one sweep row; per-user latency and throughput derive from the totals."""
    return {
        'users': n,
        'retrieval_ms': retr_ms,
        'rerank_ms': rerank_ms,
        'total_ms': total_ms,
        'per_user_ms': total_ms / n,
        # Guard against a zero elapsed time from a coarse clock.
        'throughput_rps': n * 1000.0 / total_ms if total_ms > 0 else float('inf'),
    }


def bench_at_n(
    n: int,
    user_emb_matrix: np.ndarray,
    user_matrix_gpu: torch.Tensor | None,
    searcher: GpuSearcher,
    reranker,
    top_k: int,
    seed: int,
) -> dict:
    """Benchmark retrieval (and optional re-ranking) for a batch of ``n`` users.

    ``n == 1`` runs in latency mode: the mean over 100 single-user iterations.
    Any larger ``n`` is timed as one batched shot. When ``n`` exceeds the
    number of available users, users are sampled with replacement using
    ``seed``; otherwise the first ``n`` users are taken.

    Args:
        n: Number of user queries in the batch; must be >= 1.
        user_emb_matrix: Host-side user query matrix (retrieval-only path).
        user_matrix_gpu: Device copy of the user matrix (re-ranker path).
        searcher: Search backend providing ``search`` / ``search_torch``.
        reranker: Re-ranker instance, or ``None`` for retrieval only.
        top_k: Number of candidates retrieved per user.
        seed: Seed for the sampling RNG.

    Returns:
        Dict with ``users``, ``retrieval_ms``, ``rerank_ms``, ``total_ms``,
        ``per_user_ms`` and ``throughput_rps``.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f'n must be a positive number of users, got {n}')

    rng = np.random.default_rng(seed)
    n_unique = user_emb_matrix.shape[0]
    has_rerank = reranker is not None
    device = searcher.device

    # Build the sample. Reranker path stays on GPU; retrieval-only path stays in numpy.
    # For LGBMReranker we also need the real-user index per query (history sims,
    # user scalars, is_repurchase). Track sample_user_idx_gpu always when reranking.
    sample_user_idx_gpu = None
    if has_rerank:
        # The same index addresses both the user matrix and the re-ranker's
        # per-user tensors, so it must stay inside the smaller of the two.
        # NOTE(review): this assumes row i refers to the same user in both;
        # verify the ordering against build_user_samples.
        n_unique = min(n_unique, getattr(reranker, 'n_users', n_unique))
        if n <= n_unique:
            sample_user_idx_gpu = torch.arange(n, device=device, dtype=torch.long)
        else:
            sample_user_idx_gpu = torch.from_numpy(
                rng.integers(0, n_unique, size=n).astype(np.int64),
            ).to(device)
        sample_gpu = user_matrix_gpu[sample_user_idx_gpu]
    elif n <= n_unique:
        sample = user_emb_matrix[:n]
    else:
        sample = user_emb_matrix[rng.integers(0, n_unique, size=n)]

    if n == 1:
        # Latency mode: 100 single-user iterations.
        if has_rerank:
            user_q_gpu = sample_gpu.reshape(1, -1)
            user_idx_q_gpu = sample_user_idx_gpu[:1]
            retr_lat, total_lat = [], []
            for _ in range(100):
                torch.cuda.synchronize(device)
                t0 = time.perf_counter()
                scores_gpu, idx_gpu = searcher.search_torch(user_q_gpu, top_k)
                torch.cuda.synchronize(device)
                retr_lat.append((time.perf_counter() - t0) * 1000)
                score_candidates(
                    reranker, searcher.item_emb, user_q_gpu, idx_gpu,
                    cand_scores_gpu=scores_gpu, sample_user_idx_gpu=user_idx_q_gpu,
                )
                # Same t0: total covers retrieval plus re-ranking.
                total_lat.append((time.perf_counter() - t0) * 1000)
        else:
            user_emb = sample.reshape(1, -1)
            retr_lat = []
            for _ in range(100):
                t0 = time.perf_counter()
                searcher.search(user_emb, top_k)
                retr_lat.append((time.perf_counter() - t0) * 1000)
            total_lat = retr_lat

        retr_ms = float(np.mean(retr_lat))
        total_ms = float(np.mean(total_lat))
        return _bench_result(n, retr_ms, total_ms - retr_ms, total_ms)

    # Batched mode: one shot.
    if has_rerank:
        torch.cuda.synchronize(device)
        t0 = time.perf_counter()
        scores_gpu, indices_gpu = searcher.search_torch(sample_gpu, top_k)
        torch.cuda.synchronize(device)
        retr_ms = (time.perf_counter() - t0) * 1000
        rerank_s = score_candidates(
            reranker, searcher.item_emb, sample_gpu, indices_gpu,
            cand_scores_gpu=scores_gpu, sample_user_idx_gpu=sample_user_idx_gpu,
        )
        rerank_ms = rerank_s * 1000
    else:
        t0 = time.perf_counter()
        searcher.search(sample, top_k)
        retr_ms = (time.perf_counter() - t0) * 1000
        rerank_ms = 0.0

    return _bench_result(n, retr_ms, rerank_ms, retr_ms + rerank_ms)
|
||
|
||
|
||
def _format_header() -> tuple[str, str]:
|
||
header = f"{'Users':>11} | {'Per-user':>12} | {'Throughput':>14}"
|
||
return header, "-" * len(header)
|
||
|
||
|
||
def _format_row(r: dict) -> str:
|
||
return (
|
||
f"{r['users']:>11,} | "
|
||
f"{r['per_user_ms']:>10.3f}ms | "
|
||
f"{r['throughput_rps']:>10,.0f} /s"
|
||
)
|
||
|
||
|
||
def _fmt_duration(seconds: float) -> str:
|
||
if seconds < 1.0:
|
||
return f"{seconds * 1000:.0f}ms"
|
||
if seconds < 60.0:
|
||
return f"{seconds:.1f}s"
|
||
return f"{seconds / 60:.1f}min"
|
||
|
||
|
||
def main() -> int:
    """Command-line entry point: run the benchmark sweep and print a summary table.

    Default data and model directories live under ``$PLAYBOOK_WORKSPACE``
    (falling back to the home directory). Invalid ``--users`` / ``--top-k``
    values are rejected before any data is loaded or benchmarked.

    Returns:
        0 on success; 1 if no CUDA device is available or ``--top-k`` exceeds
        the number of indexed items.
    """
    workspace = Path(os.environ.get('PLAYBOOK_WORKSPACE', os.path.expanduser('~')))
    parser = argparse.ArgumentParser(description='In-process benchmark for the HLLM retrieval engine.')
    parser.add_argument('--processed-dir', default=str(workspace / 'data' / 'processed'))
    parser.add_argument('--models-dir', default=str(workspace / 'models'))
    parser.add_argument('--users', type=int, nargs='+',
                        default=[1, 1_000, 10_000, 100_000, 1_000_000],
                        help='User-batch sizes to sweep over (default: 1 1000 10000 100000 1000000).')
    parser.add_argument('--top-k', type=int, default=100,
                        help='Retrieval depth — candidate set size before re-ranking (default: 100).')
    parser.add_argument('--with-reranker', action='store_true',
                        help='Also benchmark the trained LightGBM re-ranker over the retrieved candidates.')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--save', help='Optional path to save full results as JSON.')
    args = parser.parse_args()

    # Reject values that would only fail later inside the sweep (division by
    # the batch size, torch.topk with a non-positive k).
    if any(n < 1 for n in args.users):
        parser.error('--users values must be positive integers')
    if args.top_k < 1:
        parser.error('--top-k must be a positive integer')

    try:
        device = detect_device()
    except RuntimeError as e:
        print(f"ERROR: {e}", file=sys.stderr)
        return 1

    print("=" * 80)
    print("HLLM Retrieval Engine Benchmark")
    print("=" * 80)
    t = time.perf_counter()
    item_embeddings, user_emb_matrix, user_matrix_gpu, reranker = load_engine(
        Path(args.processed_dir), Path(args.models_dir), args.with_reranker, device,
    )

    # torch.topk cannot return more entries than there are items to rank.
    n_items = int(item_embeddings.shape[0])
    if args.top_k > n_items:
        print(f"ERROR: --top-k ({args.top_k}) exceeds the number of indexed items ({n_items}).",
              file=sys.stderr)
        return 1

    searcher = GpuSearcher(item_embeddings, device)

    print(f"Loaded: {item_embeddings.shape[0]:,} items × {item_embeddings.shape[1]} dims, "
          f"{user_emb_matrix.shape[0]:,} user embeddings. "
          f"({time.perf_counter()-t:.1f}s)")
    print(f"Search backend: {searcher.name}")
    if reranker is not None:
        n_trees = reranker.booster.num_trees()
        print(f"Reranker: LightGBM ({n_trees} trees, {len(FEATURE_COLS)} features)")
    print(f"top_k retrieval depth: {args.top_k}")
    print()

    # Warm up the GPU once before the sweep so cuBLAS algorithm picks and CUDA
    # kernel JITs don't get charged to the first row's wall time.
    if reranker is not None:
        warmup_q_gpu = user_matrix_gpu[:1]
        warmup_idx = torch.zeros(1, dtype=torch.long, device=device)
        scores_gpu, idx_gpu = searcher.search_torch(warmup_q_gpu, args.top_k)
        score_candidates(
            reranker, searcher.item_emb, warmup_q_gpu, idx_gpu,
            cand_scores_gpu=scores_gpu, sample_user_idx_gpu=warmup_idx,
        )
    else:
        searcher.search(user_emb_matrix[:1], args.top_k)

    # Live progress: each row prints "Running X users..." before the work, then
    # "done (T)" once the batch completes. Users feel the wall time between the
    # two halves of the line.
    results = []
    for n in args.users:
        print(f"Running {n:>11,} users...", end=' ', flush=True)
        t0 = time.perf_counter()
        r = bench_at_n(n, user_emb_matrix, user_matrix_gpu, searcher, reranker,
                       args.top_k, args.seed)
        wall = time.perf_counter() - t0
        results.append(r)
        print(f"done ({_fmt_duration(wall)})", flush=True)

    # Final table — printed only once, after all benchmarks complete.
    print()
    print("=" * 80)
    print("Summary")
    print("=" * 80)
    header, sep = _format_header()
    print(header)
    print(sep)
    for r in results:
        print(_format_row(r))

    if args.save:
        out = {
            'config': {
                'users': args.users,
                'top_k': args.top_k,
                'with_reranker': args.with_reranker,
                'search_backend': searcher.name,
                'item_count': int(item_embeddings.shape[0]),
                'embedding_dim': int(item_embeddings.shape[1]),
            },
            'results': results,
        }
        Path(args.save).write_text(json.dumps(out, indent=2))
        print(f"\nSaved JSON results to {args.save}")
    return 0
|
||
|
||
|
||
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == '__main__':
    raise SystemExit(main())
|