dgx-spark-playbooks/nvidia/station-rec-sys/assets/prepare_data.py

"""Process Amazon Clothing → Dresses subset for HLLM training.

Filters to dress items only, then applies 5-core filtering.
Target: ~500K-800K interactions for 6-hour training on single GB300.

# [arXiv:2409.12740] HLLM data format: interactions CSV + item text CSV
"""

import json
import os
import re
import time
import sys
from pathlib import Path

import pandas as pd
import numpy as np

# A loose `'dress' in title` substring filter admits "dress shoes", "dress
# pants", "dressy belts", etc. This regex pair was chosen by inspecting the
# 244 false positives the loose filter let through (see findings.md
# 2026-05-10): require a word-boundary match against dress-family terms AND
# reject items whose titles also match non-dress garment keywords.
_DRESS_RE = re.compile(r'\b(dress(es)?|sundress(es)?|gowns?)\b', re.IGNORECASE)
# Words below are dropped only when no dress-family word appears in the title
# (see looks_non_dress). Words like "shirt", "tank", "short", "cap", "blouse"
# are intentionally EXCLUDED because they appear inside legit dress titles
# ("T-shirt dress", "Tank dress", "Short sleeve dress", "Cap sleeve dress").
_NON_DRESS_RE = re.compile(
    r'\b('
    r'shoes?|boots?|sandals?|heels?|sneakers?|loafers?|'
    r'socks?|tights?|leggings?|'
    r'handbags?|purses?|totes?|clutches?|wallets?|'
    r'necklaces?|bracelets?|earrings?|watches?|'
    r'pants?|jeans?|shorts?|skirts?|'
    r'shirts?|blouses?|tunics?|tops|tees?|t-shirts?|'
    r'sweaters?|cardigans?|hoodies?|'
    r'jackets?|coats?|blazers?|vests?|'
    r'jumpsuits?|rompers?|'
    r'swimsuits?|bikinis?|'
    r'pajamas?|nightgowns?|kimonos?|sleepwear|underwear|panties'
    r')\b',
    re.IGNORECASE,
)


def is_dress(title: str, categories: str = '') -> bool:
    """Strict dress filter — requires a dress-family word AND no obvious
    non-dress garment word.

    Use at data-prep time to build a clean dataset from scratch. At serve
    time after `prepare_data.py`'s loose filter has already run, use
    `looks_non_dress` instead — strict-positive matching is too aggressive
    against legit dresses whose titles say only "Summer Maxi Long Sleeve".
    """
    text = f"{title} {categories}"
    if _NON_DRESS_RE.search(text):
        return False
    return bool(_DRESS_RE.search(text))


def looks_non_dress(title: str) -> bool:
    """Negative-only check for serve-time filtering.

    Returns True if the title's *head noun* is a non-dress garment.
    Uses an "appears last wins" heuristic: in "T-Shirt Dress" the head is
    Dress (keep), in "Mary Jane Dress Shoes" the head is Shoes (drop).
    If only a non-dress noun appears, drop. If only a dress noun, keep.
    If neither, keep (trust upstream loose filter).
    """
    dress_hits = list(_DRESS_RE.finditer(title))
    nondress_hits = list(_NON_DRESS_RE.finditer(title))
    if not nondress_hits:
        return False
    if not dress_hits:
        return True
    last_dress = max(m.end() for m in dress_hits)
    last_nondress = max(m.end() for m in nondress_hits)
    return last_nondress > last_dress

WORKSPACE = Path(os.environ.get('PLAYBOOK_WORKSPACE', os.path.expanduser('~')))
DATA_DIR = WORKSPACE / "data"
RAW_DIR = DATA_DIR / "raw" / "raw"
HLLM_DATASET_DIR = WORKSPACE / "hllm-code" / "dataset"
HLLM_INFO_DIR = WORKSPACE / "hllm-code" / "information"
OUT_DIR = DATA_DIR / "processed"

REVIEWS_PATH = RAW_DIR / "review_categories" / "Clothing_Shoes_and_Jewelry.jsonl"
META_PATH = RAW_DIR / "meta_categories" / "meta_Clothing_Shoes_and_Jewelry.jsonl"


def find_dress_items(meta_path):
    """Identify all dress-related items from metadata."""
    print("Identifying dress items from metadata...")
    dress_items = {}
    t0 = time.time()
    with open(meta_path) as f:
        for i, line in enumerate(f):
            d = json.loads(line)
            cats = str(d.get('categories', []))
            title = str(d.get('title', ''))
            if is_dress(title, cats):
                pid = d['parent_asin']
                desc = d.get('description', '')
                if isinstance(desc, list):
                    desc = ' '.join(str(x) for x in desc)
                features = d.get('features', [])
                if isinstance(features, list):
                    features = ' '.join(str(x) for x in features)

                images = d.get('images', [])
                img_url = ''
                if images and isinstance(images, list):
                    first_img = images[0]
                    if isinstance(first_img, dict):
                        img_url = first_img.get('large', first_img.get('thumb', ''))

                price = d.get('price', None)
                try:
                    price = float(price) if price else None
                except (ValueError, TypeError):
                    price = None

                dress_items[pid] = {
                    'item_id': pid,
                    'title': str(d.get('title', ''))[:512],
                    'description': str(desc)[:512],
                    'price': price,
                    'image_url': str(img_url),
                }
            if (i + 1) % 1_000_000 == 0:
                print(f"  Scanned {i+1:,} items, found {len(dress_items):,} dresses")

    print(f"  Total: {len(dress_items):,} dress items in {time.time()-t0:.0f}s")
    return dress_items


def load_dress_reviews(reviews_path, dress_item_ids):
    """Load only reviews for dress items."""
    print(f"Loading dress reviews...")
    rows = []
    t0 = time.time()
    total = 0
    with open(reviews_path) as f:
        for line in f:
            total += 1
            d = json.loads(line)
            if d['parent_asin'] in dress_item_ids:
                rows.append({
                    'user_id': d['user_id'],
                    'item_id': d['parent_asin'],
                    'rating': d['rating'],
                    'timestamp': int(d['timestamp']) // 1000,
                })
            if total % 10_000_000 == 0:
                print(f"  Scanned {total:,}, found {len(rows):,} dress reviews")

    df = pd.DataFrame(rows)
    print(f"  Total: {len(df):,} dress reviews from {total:,} total ({len(df)/total:.1%})")
    return df


def five_core_filter(df, min_count=5):
    """Iteratively filter users and items with < min_count interactions."""
    print(f"5-core filtering...")
    print(f"  Before: {len(df):,} interactions, {df['user_id'].nunique():,} users, {df['item_id'].nunique():,} items")

    prev_len = 0
    while len(df) != prev_len:
        prev_len = len(df)
        item_counts = df['item_id'].value_counts()
        df = df[df['item_id'].isin(item_counts[item_counts >= min_count].index)]
        user_counts = df['user_id'].value_counts()
        df = df[df['user_id'].isin(user_counts[user_counts >= min_count].index)]

    print(f"  After: {len(df):,} interactions, {df['user_id'].nunique():,} users, {df['item_id'].nunique():,} items")
    print(f"  Avg/user: {len(df)/max(df['user_id'].nunique(), 1):.1f}")
    return df


def main():
    t_total = time.time()
    print("=" * 60)
    print("PROCESS AMAZON DRESSES FOR HLLM")
    print("=" * 60)

    # Find dress items
    dress_items = find_dress_items(META_PATH)

    # Load dress reviews
    reviews = load_dress_reviews(REVIEWS_PATH, set(dress_items.keys()))

    # 5-core filter
    filtered = five_core_filter(reviews)

    # Save HLLM format
    HLLM_DATASET_DIR.mkdir(parents=True, exist_ok=True)
    HLLM_INFO_DIR.mkdir(parents=True, exist_ok=True)
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    dataset_name = "amazon_dresses"

    # Interactions
    interactions = filtered[['item_id', 'user_id', 'timestamp']].sort_values(['user_id', 'timestamp'])
    interactions.to_csv(HLLM_DATASET_DIR / f"{dataset_name}.csv", index=False)
    print(f"\nSaved interactions: {HLLM_DATASET_DIR / dataset_name}.csv ({len(interactions):,} rows)")

    # Item info (only items in filtered set)
    valid_items = set(filtered['item_id'].unique())
    item_rows = [dress_items[pid] for pid in valid_items if pid in dress_items]
    item_df = pd.DataFrame(item_rows)

    # HLLM format: item_id, title, description
    item_df[['item_id', 'title', 'description']].to_csv(HLLM_INFO_DIR / f"{dataset_name}.csv", index=False)
    print(f"Saved item info: {HLLM_INFO_DIR / dataset_name}.csv ({len(item_df):,} items)")

    # Full metadata for UI
    item_df.to_parquet(OUT_DIR / "dress_metadata.parquet", index=False)
    interactions.to_parquet(OUT_DIR / "dress_interactions.parquet", index=False)

    # Stats
    print(f"\n{'='*60}")
    print(f"FINAL DRESSES DATASET:")
    print(f"  Interactions: {len(interactions):,}")
    print(f"  Users: {interactions['user_id'].nunique():,}")
    print(f"  Items: {interactions['item_id'].nunique():,}")
    print(f"  Avg/user: {len(interactions)/interactions['user_id'].nunique():.1f}")
    has_price = item_df['price'].notna().sum()
    has_image = (item_df['image_url'].str.len() > 0).sum()
    print(f"  Items with price: {has_price:,} ({has_price/len(item_df):.0%})")
    print(f"  Items with image: {has_image:,} ({has_image/len(item_df):.0%})")
    print(f"  Total time: {time.time()-t_total:.0f}s")
    print(f"{'='*60}")


if __name__ == "__main__":
    main()