# dgx-spark-playbooks/nvidia/station-rec-sys/assets/pricing_agent.py

"""Dynamic pricing agent for the rec-sys playbook.

Trains a PPO policy that picks per-item price multipliers each day,
evaluated against FixedPrice and AgeMarkdown baselines on a calibrated
inventory simulator. Subcommands:

    smoke   Run baselines on a tiny synthetic catalog and print KPIs.
    train   Train the PPO agent against the simulator.
    eval    Evaluate a trained PPO agent + baselines.
    all     train then eval.

The simulator, demand model, and baselines port code from
`enterprise-retail-demo/src/pricing/` into a single file. Bug fixes from
the prior DQN implementation are applied in the PPO section.
"""

from __future__ import annotations

import argparse
import json
import os
import sys
import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Tuple

import numpy as np
import pandas as pd

# torch is imported lazily inside the training/eval entry points so the
# `smoke` subcommand can run on environments without GPU drivers.


# ────────────────────────────────────────────────────────────────────
# Pricing config — embedded constants (was configs/pricing.yaml in the
# upstream repo). Inlined here so the playbook stays a single file.
# ────────────────────────────────────────────────────────────────────

PRICING_CONFIG: Dict[str, Any] = {
    "simulator": {
        "horizon_days": 14,
        "num_shoppers_per_day": 10_000,
        "replenish_interval": 7,
        "replenish_fraction": 0.5,
    },
    "inventory": {
        "min_initial": 50,
        "max_initial": 200,
        "cost_fraction_min": 0.40,
        "cost_fraction_max": 0.60,
        # Charged by the simulator as a fraction of base price per unit held
        # per day; 0.002 × 365 ≈ 73%/yr when annualized over calendar days.
        "holding_cost_per_unit_day": 0.002,
        "stockout_penalty": 5.0,
    },
    "demand_model": {
        # logit(p) = alpha + beta·rec + gamma·season − epsilon·Δp/p₀ − delta·stockout
        # Categories are derived from price tier on the real Amazon Dresses catalog
        # (luxury = top third by price, midrange = middle, budget = bottom).
        # Elasticity ranges follow Tellis (1988) "The Price Elasticity of Selective
        # Demand": apparel ~2.0, luxury/premium <1.0, budget/clearance 3–4.
        "seasonal_amplitude": 0.2,
        "rec_score_weight": 1.0,
        "stockout_delta": 10.0,
        "elasticity": {
            "luxury":   [0.5, 1.0],   # premium buyers less price-sensitive
            "midrange": [1.5, 2.5],   # standard apparel elasticity
            "budget":   [2.5, 4.0],   # budget shoppers highly price-sensitive
            "default":  [1.5, 2.5],
        },
        # alpha (pre-sigmoid) calibrated to ~6–15 units/day/item with 10K shoppers
        # (before the rec-score, seasonal and price terms are added to the logit).
        "base_demand": {
            "luxury":   -7.5,  # ~6 units/day — lower volume
            "midrange": -7.0,  # ~9 units/day
            "budget":   -6.5,  # ~15 units/day — volume play
            "default":  -7.0,
        },
    },
    # 9 multipliers covering -40% to +25%. Replaces the upstream [0.85, 1.05]
    # range that couldn't reach the deep markdowns aging stock needs.
    "price_grid": {
        "multipliers": [0.60, 0.70, 0.80, 0.90, 1.00, 1.05, 1.10, 1.15, 1.25],
    },
}

# Price-tier labels used as item categories. The tuple order also fixes the
# one-hot column order of the PPO state encoding (see CATEGORY_TO_IDX).
CATEGORIES: Tuple[str, ...] = ("luxury", "midrange", "budget")


# ────────────────────────────────────────────────────────────────────
# Demand model
# ────────────────────────────────────────────────────────────────────


def _sigmoid(x: np.ndarray) -> np.ndarray:
    """Numerically-stable element-wise logistic sigmoid.

    Args:
        x: Array (or scalar) of logits.

    Returns:
        Array of ``1 / (1 + exp(-x))`` with the same shape as ``x``.

    ``np.where`` evaluates both of its value arguments for every element,
    so branching on the sign alone still overflows in the branch that is
    discarded. Exponentiating ``-|x|`` keeps the argument non-positive, so
    the exponential lies in [0, 1] and can never overflow.
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))  # exp(-x) for x >= 0, exp(x) for x < 0
    return np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))


class DemandModel:
    """Vectorized logistic demand model.

    Purchase probability per item:

        sigmoid( alpha_cat + beta·rec − epsilon_cat·(p−p₀)/p₀
                 + gamma·season(day) − delta·I(inventory<=0) )

    Elasticity is set to the midpoint of each category's configured range
    (deterministic for reproducibility). Categories without an explicit
    entry fall back to the ``"default"`` values.
    """

    def __init__(self, config: Dict[str, Any]) -> None:
        """Read the ``demand_model`` section of the pricing config.

        Args:
            config: Mapping with optional keys ``elasticity``,
                ``base_demand``, ``seasonal_amplitude``,
                ``rec_score_weight`` and ``stockout_delta``; missing keys
                use the defaults below.
        """
        self.elasticity_ranges = config.get("elasticity", {})
        self.default_elasticity = self.elasticity_ranges.get("default", [1.5, 2.5])
        self.base_demand = config.get("base_demand", {})
        self.default_base_demand = self.base_demand.get("default", -7.0)
        self.seasonal_amplitude = config.get("seasonal_amplitude", 0.2)
        self.rec_score_weight = config.get("rec_score_weight", 1.0)
        self.stockout_delta = config.get("stockout_delta", 10.0)

    def _alpha(self, categories: np.ndarray) -> np.ndarray:
        """Return the per-item base-demand intercept (pre-sigmoid)."""
        # Coerce so that a plain list still compares element-wise; with a
        # list, ``categories == cat`` is a scalar False and nothing is set.
        categories = np.asarray(categories)
        out = np.full(len(categories), self.default_base_demand, dtype=np.float64)
        for cat, val in self.base_demand.items():
            if cat != "default":
                out[categories == cat] = val
        return out

    def _epsilon(self, categories: np.ndarray) -> np.ndarray:
        """Return the per-item price elasticity (midpoint of the range)."""
        categories = np.asarray(categories)
        default_mid = float(np.mean(self.default_elasticity))
        out = np.full(len(categories), default_mid, dtype=np.float64)
        for cat, (lo, hi) in self.elasticity_ranges.items():
            if cat != "default":
                out[categories == cat] = 0.5 * (lo + hi)
        return out

    @staticmethod
    def season_factor(day: int) -> float:
        """Sinusoidal weekly pattern ``sin(2π·((day % 7) − 2) / 7)``.

        With the phase shift of 2 the factor is zero at ``day % 7 == 2``,
        largest at ``day % 7 == 4`` (≈ +0.97) and smallest at
        ``day % 7 == 0`` (≈ −0.97). Which weekday that corresponds to
        depends on what calendar day the caller treats as day 0.
        """
        return float(np.sin(2.0 * np.pi * ((day % 7) - 2) / 7.0))

    def purchase_probability(
        self,
        categories: np.ndarray,
        inventories: np.ndarray,
        base_prices: np.ndarray,
        prices: np.ndarray,
        day: int,
        rec_scores: np.ndarray,
    ) -> np.ndarray:
        """Return the per-item, per-shopper purchase probability.

        Args:
            categories: Category label per item.
            inventories: Units on hand per item; ``<= 0`` triggers the
                stockout penalty.
            base_prices: Reference price p₀ per item.
            prices: Price charged today per item.
            day: Simulation day index, used for the weekly seasonal term.
            rec_scores: Recommendation score per item.
        """
        alpha = self._alpha(categories)
        epsilon = self._epsilon(categories)
        base_prices = np.asarray(base_prices)
        prices = np.asarray(prices)
        # np.where evaluates both branches, so substitute a harmless
        # denominator for non-positive base prices instead of dividing by
        # zero and discarding the result.
        has_base = base_prices > 0
        safe_base = np.where(has_base, base_prices, 1.0)
        price_ratio = np.where(has_base, (prices - base_prices) / safe_base, 0.0)
        stockout = (np.asarray(inventories) <= 0).astype(np.float64)
        logit = (
            alpha
            + self.rec_score_weight * rec_scores
            + self.seasonal_amplitude * self.season_factor(day)
            - epsilon * price_ratio
            - self.stockout_delta * stockout
        )
        return _sigmoid(logit)

    def expected_demand(
        self,
        categories: np.ndarray,
        inventories: np.ndarray,
        base_prices: np.ndarray,
        prices: np.ndarray,
        day: int,
        rec_scores: np.ndarray,
        num_shoppers: int,
    ) -> np.ndarray:
        """Return expected units demanded per item: shoppers × probability."""
        prob = self.purchase_probability(
            categories, inventories, base_prices, prices, day, rec_scores
        )
        return num_shoppers * prob


# ────────────────────────────────────────────────────────────────────
# Inventory state
# ────────────────────────────────────────────────────────────────────


class InventoryState:
    """Per-item inventory + pricing state. All arrays shape (n_items,).

    ``_initial_inventories`` records the stock level at construction time;
    the simulator uses it as the reference for replenishment and for the
    inventory-ratio feature, so it must survive ``copy()``.
    """

    def __init__(
        self,
        inventories: np.ndarray,
        base_prices: np.ndarray,
        unit_costs: np.ndarray,
        current_prices: np.ndarray,
        days_in_stock: np.ndarray,
        categories: np.ndarray,
        rec_scores: np.ndarray,
    ) -> None:
        """Store float64 copies of the numeric arrays.

        ``astype`` always copies, so the state never aliases the arrays
        passed in. ``categories`` is kept via ``np.asarray`` (no copy).
        """
        self.inventories = inventories.astype(np.float64)
        self.base_prices = base_prices.astype(np.float64)
        self.unit_costs = unit_costs.astype(np.float64)
        self.current_prices = current_prices.astype(np.float64)
        self.days_in_stock = days_in_stock.astype(np.float64)
        self.categories = np.asarray(categories)
        self.rec_scores = rec_scores.astype(np.float64)
        self._initial_inventories = self.inventories.copy()

    @property
    def n_items(self) -> int:
        """Number of items tracked by this state."""
        return len(self.inventories)

    def copy(self) -> "InventoryState":
        """Return an independent deep copy of this state.

        The constructor would reset ``_initial_inventories`` to the
        *current* stock, so it is restored explicitly afterwards; otherwise
        copying a partially sold-down state would silently lower its
        replenishment reference level.
        """
        clone = InventoryState(
            inventories=self.inventories.copy(),
            base_prices=self.base_prices.copy(),
            unit_costs=self.unit_costs.copy(),
            current_prices=self.current_prices.copy(),
            days_in_stock=self.days_in_stock.copy(),
            categories=self.categories.copy(),
            rec_scores=self.rec_scores.copy(),
        )
        clone._initial_inventories = self._initial_inventories.copy()
        return clone

    @classmethod
    def initialize(
        cls,
        item_features: pd.DataFrame,
        config: Dict[str, Any],
        seed: int = 42,
    ) -> "InventoryState":
        """Build a starting state from an item-features DataFrame.

        Expected columns: ``avg_price`` (float), optional ``category`` (str)
        and ``rec_score`` (float). Missing columns are synthesized:
        prices uniformly in [5, 100), categories round-robin over
        ``CATEGORIES``, rec scores uniformly in [0, 1).

        Args:
            item_features: One row per item.
            config: Full pricing config; only its ``inventory`` section is
                read here.
            seed: Seed for the random unit costs, inventories and any
                synthesized columns.

        Returns:
            A fresh state with current prices equal to base prices and
            zero days in stock.
        """
        inv_cfg = config.get("inventory", {})
        rng = np.random.default_rng(seed)
        n = len(item_features)

        if "avg_price" in item_features.columns:
            base_prices = item_features["avg_price"].to_numpy(dtype=np.float64)
            # NaN compares False against 0, so the mask also excludes NaNs.
            positive = base_prices > 0
            median = np.nanmedian(base_prices[positive]) if positive.any() else 50.0
            # Missing or non-positive prices are replaced by the median of
            # the valid ones (or 50.0 when there are none).
            base_prices = np.where(
                np.isnan(base_prices) | (base_prices <= 0), median, base_prices
            )
        else:
            base_prices = rng.uniform(5.0, 100.0, size=n)

        cf_lo = inv_cfg.get("cost_fraction_min", 0.40)
        cf_hi = inv_cfg.get("cost_fraction_max", 0.60)
        unit_costs = base_prices * rng.uniform(cf_lo, cf_hi, size=n)

        inv_lo = inv_cfg.get("min_initial", 50)
        inv_hi = inv_cfg.get("max_initial", 200)
        # rng.integers excludes the upper bound, hence the +1.
        inventories = rng.integers(inv_lo, inv_hi + 1, size=n).astype(np.float64)

        if "category" in item_features.columns:
            categories = item_features["category"].to_numpy(dtype=str)
        else:
            categories = np.array([CATEGORIES[i % len(CATEGORIES)] for i in range(n)])

        if "rec_score" in item_features.columns:
            rec_scores = item_features["rec_score"].to_numpy(dtype=np.float64)
        else:
            rec_scores = rng.uniform(0.0, 1.0, size=n)

        return cls(
            inventories=inventories,
            base_prices=base_prices,
            unit_costs=unit_costs,
            current_prices=base_prices.copy(),
            days_in_stock=np.zeros(n, dtype=np.float64),
            categories=categories,
            rec_scores=rec_scores,
        )


# ────────────────────────────────────────────────────────────────────
# Simulator + result tracking
# ────────────────────────────────────────────────────────────────────


@dataclass
class SimulationResult:
    """Per-day KPI series collected over one simulated episode.

    The ``daily_*`` lists hold one entry per simulated day; the properties
    aggregate them into the headline numbers shown in the KPI table.
    """

    daily_revenue: List[float] = field(default_factory=list)
    daily_margin: List[float] = field(default_factory=list)
    daily_units_sold: List[float] = field(default_factory=list)
    daily_stockout_count: List[int] = field(default_factory=list)
    n_items: int = 0
    horizon_days: int = 0
    initial_inventories: np.ndarray = field(default_factory=lambda: np.array([]))

    @property
    def total_revenue(self) -> float:
        """Revenue summed over all recorded days."""
        revenue_series = np.asarray(self.daily_revenue)
        return float(revenue_series.sum())

    @property
    def total_margin(self) -> float:
        """Margin summed over all recorded days."""
        margin_series = np.asarray(self.daily_margin)
        return float(margin_series.sum())

    @property
    def avg_stockout_rate(self) -> float:
        """Mean daily share of items out of stock (0.0 for an empty run)."""
        if 0 in (self.n_items, self.horizon_days):
            return 0.0
        mean_stockouts = float(np.asarray(self.daily_stockout_count).mean())
        return mean_stockouts / self.n_items

    @property
    def sell_through_rate(self) -> float:
        """Units sold divided by total initial inventory (0.0 if none)."""
        stocked = float(np.asarray(self.initial_inventories).sum())
        if stocked > 0:
            return float(np.asarray(self.daily_units_sold).sum()) / stocked
        return 0.0


class Simulator:
    """Day-by-day inventory + pricing simulator with weekly replenishment."""

    def __init__(self, demand_model: DemandModel, config: Dict[str, Any]) -> None:
        """Read simulator and inventory settings from the pricing config.

        Args:
            demand_model: Model providing ``expected_demand``.
            config: Full pricing config; the ``simulator`` and
                ``inventory`` sections are read, with defaults for
                missing keys.
        """
        self.demand_model = demand_model
        sim_cfg = config.get("simulator", {})
        inv_cfg = config.get("inventory", {})
        self.num_shoppers = sim_cfg.get("num_shoppers_per_day", 10_000)
        self.replenish_interval = sim_cfg.get("replenish_interval", 7)
        self.replenish_fraction = sim_cfg.get("replenish_fraction", 0.5)
        self.holding_cost = inv_cfg.get("holding_cost_per_unit_day", 0.002)

    @staticmethod
    def _policy_accepts_day(policy: Any) -> "bool | None":
        """Report whether ``policy.select_prices`` takes a ``day`` keyword.

        Returns ``None`` when the signature cannot be inspected, in which
        case the caller falls back to try-with-day-then-without.
        """
        import inspect

        try:
            params = inspect.signature(policy.select_prices).parameters.values()
        except (AttributeError, TypeError, ValueError):
            return None
        keyword_kinds = (
            inspect.Parameter.POSITIONAL_OR_KEYWORD,
            inspect.Parameter.KEYWORD_ONLY,
        )
        for param in params:
            if param.kind is inspect.Parameter.VAR_KEYWORD:
                return True
            if param.name == "day" and param.kind in keyword_kinds:
                return True
        return False

    def run(
        self,
        initial_state: InventoryState,
        policy: Any,
        horizon_days: int,
        seed: int = 123,
    ) -> SimulationResult:
        """Simulate ``horizon_days`` days of ``policy`` on a copy of the state.

        Args:
            initial_state: Starting state; it is copied, never mutated.
            policy: Object with ``select_prices(state, day=...)`` or the
                legacy ``select_prices(state)``.
            horizon_days: Number of days to simulate.
            seed: Seed for the Poisson demand draws.

        Returns:
            Daily revenue, margin, units sold and stockout counts.
        """
        state = initial_state.copy()
        result = SimulationResult(
            n_items=state.n_items,
            horizon_days=horizon_days,
            initial_inventories=state._initial_inventories.copy(),
        )
        rng = np.random.default_rng(seed)

        # Decide once which calling convention the policy supports. Catching
        # TypeError around every call would also swallow genuine TypeErrors
        # raised inside the policy and silently re-run it without `day`.
        accepts_day = self._policy_accepts_day(policy)

        for day in range(horizon_days):
            if accepts_day is None:
                # Signature not introspectable: keep the legacy fallback.
                try:
                    prices = policy.select_prices(state, day=day)
                except TypeError:
                    prices = policy.select_prices(state)
            elif accepts_day:
                prices = policy.select_prices(state, day=day)
            else:
                prices = policy.select_prices(state)
            state.current_prices = prices

            expected = self.demand_model.expected_demand(
                categories=state.categories,
                inventories=state.inventories,
                base_prices=state.base_prices,
                prices=prices,
                day=day,
                rec_scores=state.rec_scores,
                num_shoppers=self.num_shoppers,
            )
            realised = rng.poisson(lam=np.clip(expected, 0, None)).astype(np.float64)
            # Sales are capped by the stock on hand.
            sold = np.minimum(realised, state.inventories)

            day_revenue = float(np.sum(prices * sold))
            day_cogs = float(np.sum(state.unit_costs * sold))
            # Holding cost is charged on start-of-day stock, valued at base price.
            day_holding = float(
                self.holding_cost * np.sum(state.inventories * state.base_prices)
            )
            day_margin = day_revenue - day_cogs - day_holding

            state.inventories -= sold
            # Dwell time only advances for items that still have stock.
            state.days_in_stock = np.where(
                state.inventories > 0,
                state.days_in_stock + 1,
                state.days_in_stock,
            )

            result.daily_revenue.append(day_revenue)
            result.daily_margin.append(day_margin)
            result.daily_units_sold.append(float(np.sum(sold)))
            # Stockouts are counted after sales and before any replenishment.
            result.daily_stockout_count.append(int(np.sum(state.inventories <= 0)))

            if (
                self.replenish_interval > 0
                and (day + 1) % self.replenish_interval == 0
            ):
                # Top up to a fraction of the initial stock; never reduce stock.
                replenish_level = state._initial_inventories * self.replenish_fraction
                state.inventories = np.maximum(state.inventories, replenish_level)

        return result


# ────────────────────────────────────────────────────────────────────
# Baseline policies
# ────────────────────────────────────────────────────────────────────


class FixedPrice:
    """Control policy: every item is always sold at its base price."""

    name = "FixedPrice"

    def select_prices(self, state: InventoryState, day: int = 0) -> np.ndarray:
        """Return a fresh copy of the base prices; ``day`` is ignored."""
        unchanged = state.base_prices.copy()
        return unchanged


class AgeMarkdown:
    """Age-based markdown: ``weekly_discount`` off per full week of dwell.

    The cumulative discount is clamped to the range [0%, 50%].
    """

    name = "AgeMarkdown"

    def __init__(self, weekly_discount: float = 0.05) -> None:
        self.weekly_discount = weekly_discount

    def select_prices(self, state: InventoryState, day: int = 0) -> np.ndarray:
        """Return base prices reduced by the dwell-time markdown; ``day`` is ignored."""
        full_weeks = np.floor_divide(state.days_in_stock, 7)
        markdown = np.clip(full_weeks * self.weekly_discount, 0.0, 0.50)
        keep_fraction = 1.0 - markdown
        return state.base_prices * keep_fraction


# ────────────────────────────────────────────────────────────────────
# Catalog loaders
# ────────────────────────────────────────────────────────────────────


def _derive_price_tier(prices: np.ndarray) -> np.ndarray:
    """Label each item budget / midrange / luxury by price tercile.

    The tercile cut points are computed from strictly positive prices
    only. With fewer than three such prices every item is labelled
    ``"midrange"``. Returns an object-dtype array of tier names.
    """
    positive_prices = prices[prices > 0]
    if len(positive_prices) < 3:
        return np.full(len(prices), "midrange", dtype=object)

    lower_cut, upper_cut = np.percentile(positive_prices, [100 / 3, 200 / 3])
    # Start everything at the top tier, then demote in two passes. Because
    # lower_cut <= upper_cut the second pass only touches items already
    # demoted by the first.
    labels = np.full(prices.shape, "luxury", dtype=object)
    labels[prices < upper_cut] = "midrange"
    labels[prices < lower_cut] = "budget"
    return labels


def _derive_popularity_rec_score(item_ids: np.ndarray, interactions_path: Path) -> np.ndarray:
    """Popularity score in [0, 1] per item, from interaction counts.

    The score is ``sqrt(count / max_count)``; the square root flattens the
    distribution so a few very popular items do not dominate. When the
    interactions file is missing, or none of the items has any
    interaction, every item gets a neutral 0.5.
    """
    if interactions_path.exists():
        interactions = pd.read_parquet(interactions_path, columns=["item_id"])
        per_item = interactions.groupby("item_id").size()
        # Items absent from the interaction log count as zero.
        hits = pd.Series(item_ids).map(per_item).fillna(0).to_numpy(dtype=np.float64)
        peak = hits.max()
        if peak != 0:
            return np.sqrt(hits / peak)
    return np.full(len(item_ids), 0.5)


def load_amazon_dresses_catalog(n_items: int, seed: int) -> pd.DataFrame:
    """Load the prepared Amazon Dresses catalog as an item-features frame.

    Reads ``data/processed/dress_metadata.parquet`` under the workspace
    root, keeps rows with a positive price, optionally samples ``n_items``
    rows (``n_items <= 0`` or a value at least the catalog size keeps
    everything), then attaches price-tier categories and popularity-based
    rec scores.

    Returns:
        Columns ``item_id``, ``category``, ``avg_price``, ``rec_score``.

    Raises:
        FileNotFoundError: The metadata parquet file does not exist.
    """
    processed_dir = workspace_root() / "data" / "processed"
    meta_path = processed_dir / "dress_metadata.parquet"
    inter_path = processed_dir / "dress_interactions.parquet"
    if not meta_path.exists():
        raise FileNotFoundError(
            f"Amazon Dresses metadata not found at {meta_path}.\n"
            f"Run `bash assets/setup.sh` to download/prepare the dataset, "
            f"or pass `--synthetic` to use a generated catalog."
        )

    df = pd.read_parquet(meta_path)
    # Items without a usable price cannot be simulated, so drop them first.
    has_price = df["price"].notna() & (df["price"] > 0)
    df = df[has_price].reset_index(drop=True)
    if 0 < n_items < len(df):
        df = df.sample(n=n_items, random_state=seed).reset_index(drop=True)

    # Tiers are derived after sampling, so terciles reflect the sample.
    df["category"] = _derive_price_tier(df["price"].to_numpy())
    df["rec_score"] = _derive_popularity_rec_score(df["item_id"].to_numpy(), inter_path)
    renamed = df.rename(columns={"price": "avg_price"})
    return renamed[["item_id", "category", "avg_price", "rec_score"]]


def build_synthetic_catalog(n_items: int = 100, seed: int = 0) -> pd.DataFrame:
    """Generate a synthetic catalog with the real catalog's column layout.

    Used by the `smoke` subcommand and by the `--synthetic` option.
    Prices are lognormal (mean=3.5, sigma=0.6, i.e. a median near $33)
    rounded to cents, rec scores are Beta(2, 5), and categories use the
    same price-tercile tier scheme as the real catalog.
    """
    rng = np.random.default_rng(seed)
    # Draw order (prices, then rec scores) determines the seeded output.
    raw_prices = rng.lognormal(mean=3.5, sigma=0.6, size=n_items)
    prices = np.round(raw_prices, 2)
    rec_scores = rng.beta(2.0, 5.0, size=n_items)

    columns = {
        "item_id": np.arange(n_items),
        "category": _derive_price_tier(prices),
        "avg_price": prices,
        "rec_score": rec_scores,
    }
    return pd.DataFrame(columns)


def load_catalog(n_items: int, seed: int, synthetic: bool) -> pd.DataFrame:
    """Top-level catalog loader.

    With ``synthetic`` set, builds a generated catalog of ``n_items`` rows
    (1000 when ``n_items <= 0``). Otherwise loads the real Amazon Dresses
    catalog; the real loader raises ``FileNotFoundError`` when the dataset
    has not been prepared.
    """
    if not synthetic:
        return load_amazon_dresses_catalog(n_items=n_items, seed=seed)
    catalog_size = n_items if n_items > 0 else 1000
    return build_synthetic_catalog(n_items=catalog_size, seed=seed)


# ────────────────────────────────────────────────────────────────────
# Eval helpers
# ────────────────────────────────────────────────────────────────────


def run_policy(
    policy: Any,
    initial_state: InventoryState,
    simulator: Simulator,
    horizon_days: int,
    seed: int = 123,
) -> Tuple[str, SimulationResult, float]:
    """Run one policy through the simulator and time it.

    Returns:
        ``(label, result, seconds)`` where ``label`` is ``policy.name``
        when present and the policy's class name otherwise.
    """
    label = getattr(policy, "name", type(policy).__name__)
    started = time.perf_counter()
    outcome = simulator.run(initial_state, policy, horizon_days, seed=seed)
    elapsed = time.perf_counter() - started
    return label, outcome, elapsed


def format_kpi_row(name: str, result: SimulationResult, elapsed: float, baseline_rev: float | None) -> str:
    """Format one fixed-width KPI table row for a policy.

    The revenue lift is ``revenue / baseline_rev``; it is reported as
    1.00x when no positive baseline is given. ``elapsed`` is in seconds
    and is printed in milliseconds.
    """
    rev = result.total_revenue
    if baseline_rev and baseline_rev > 0:
        lift = rev / baseline_rev
    else:
        lift = 1.0
    row = (
        f"  {name:18s} | Rev: {rev:10.2f} ({lift:.2f}x) | "
        f"Margin: {result.total_margin:10.2f} | "
        f"Stockout: {result.avg_stockout_rate:5.1%} | "
        f"Sell-through: {result.sell_through_rate:5.1%} | "
        f"{elapsed*1000:5.0f}ms"
    )
    return row


def print_kpi_table(rows: List[Tuple[str, SimulationResult, float]]) -> None:
    """Print a KPI table, using the first row's revenue as the lift baseline.

    Prints nothing when ``rows`` is empty.
    """
    if rows:
        rule = "-" * 110
        reference_revenue = rows[0][1].total_revenue
        print(rule)
        for label, outcome, seconds in rows:
            print(format_kpi_row(label, outcome, seconds, reference_revenue))
        print(rule)


# ────────────────────────────────────────────────────────────────────
# PPO agent (torch) — discrete action over price multipliers,
# shared MLP backbone, vectorized simulator envs.
# ────────────────────────────────────────────────────────────────────


# Position of each tier label within CATEGORIES; used as the one-hot column
# index when encoding item state.
CATEGORY_TO_IDX: Dict[str, int] = {c: i for i, c in enumerate(CATEGORIES)}
N_CATEGORIES: int = len(CATEGORIES)
# Width of one encoded item row: 8 numeric features followed by the
# category one-hot block.
N_STATE_FEATURES: int = 8 + N_CATEGORIES  # 8 numeric + 3 one-hot


def encode_state_per_item(
    state: InventoryState,
    day: int,
    horizon: int,
    price_norm: float,
    inv_norm: float,
) -> np.ndarray:
    """Return a (n_items, N_STATE_FEATURES) float32 array.

    Features per item (in order):
        0  inventory_ratio (current / initial; 0 where initial is 0)
        1  days_in_stock / horizon
        2  day / horizon
        3  sin(2π · day_of_week / 7)
        4  cos(2π · day_of_week / 7)
        5  log1p(base_price) / log1p(price_norm)
        6  log1p(inventory)  / log1p(inv_norm)
        7  rec_score
        8..  category one-hot in CATEGORIES order (luxury, midrange, budget)

    Args:
        state: Current inventory state.
        day: Simulation day index.
        horizon: Episode length in days; values below 1 are treated as 1.
        price_norm: Price scale for feature 5; values below 1 are treated as 1.
        inv_norm: Inventory scale for feature 6; values below 1 are treated as 1.

    Category labels not present in ``CATEGORY_TO_IDX`` are encoded as
    index 0.
    """
    n = state.n_items
    horizon_safe = max(horizon, 1)

    initial = state._initial_inventories
    # np.where evaluates both branches, so divide by a harmless 1.0 where
    # the initial stock is zero rather than producing inf/nan and warnings.
    has_initial = initial > 0
    inv_ratio = np.where(
        has_initial, state.inventories / np.where(has_initial, initial, 1.0), 0.0
    )
    days_norm = state.days_in_stock / horizon_safe
    day_norm = np.full(n, day / horizon_safe)

    weekly_angle = 2.0 * np.pi * (day % 7) / 7.0
    dow_sin = np.full(n, np.sin(weekly_angle))
    dow_cos = np.full(n, np.cos(weekly_angle))

    bp = np.log1p(state.base_prices) / np.log1p(max(price_norm, 1.0))
    inv = np.log1p(state.inventories) / np.log1p(max(inv_norm, 1.0))
    rec = state.rec_scores

    # Vectorized one-hot: one mask per known category instead of a Python
    # loop over items. Unknown labels keep the initial index 0.
    labels = np.asarray(state.categories).astype(str)
    cat_idx = np.zeros(n, dtype=np.intp)
    for label, idx in CATEGORY_TO_IDX.items():
        cat_idx[labels == label] = idx
    cat_onehot = np.zeros((n, N_CATEGORIES), dtype=np.float64)
    cat_onehot[np.arange(n), cat_idx] = 1.0

    return np.column_stack(
        [inv_ratio, days_norm, day_norm, dow_sin, dow_cos, bp, inv, rec, cat_onehot]
    ).astype(np.float32)


class PricingEnv:
    """Gym-like wrapper that advances the pricing simulation one day per step.

    ``reset()`` returns the encoded per-item observation. ``step(actions)``
    applies one price multiplier per item, simulates a day, and returns
    ``(next_obs, per_item_reward, done, info)``.

    The per-item reward is revenue − COGS − holding cost, i.e. the same
    margin formula the simulator uses for its daily margin, cast to
    float32.
    """

    def __init__(
        self,
        initial_state: InventoryState,
        simulator: Simulator,
        multipliers: np.ndarray,
        horizon: int,
        seed: int,
    ) -> None:
        self.initial_state = initial_state
        self.simulator = simulator
        self.multipliers = multipliers
        self.horizon = horizon
        self.seed = seed
        # Normalizers are fixed from the starting state so observations
        # stay on the same scale for the whole lifetime of the env.
        self._price_norm = float(self.initial_state.base_prices.max())
        self._inv_norm = float(self.initial_state._initial_inventories.max())
        self.reset()

    def _observe(self) -> np.ndarray:
        """Encode the current state and day as the per-item observation."""
        return encode_state_per_item(
            self.state, self.day, self.horizon, self._price_norm, self._inv_norm
        )

    def reset(self) -> np.ndarray:
        """Restart the episode from a copy of the initial state.

        The demand RNG is re-created from the env's seed on every reset.
        """
        self.state = self.initial_state.copy()
        self.day = 0
        self.rng = np.random.default_rng(self.seed)
        return self._observe()

    def step(self, action_idx: np.ndarray) -> Tuple[np.ndarray, np.ndarray, bool, Dict[str, float]]:
        """Apply per-item multiplier indices and simulate one day.

        Args:
            action_idx: Index into ``multipliers`` for each item.

        Returns:
            ``(next_obs, reward_per_item, done, info)``; ``info`` carries
            the day's ``total_margin`` and ``total_revenue``.
        """
        sim = self.simulator
        st = self.state

        prices = st.base_prices * self.multipliers[action_idx]
        st.current_prices = prices

        expected = sim.demand_model.expected_demand(
            categories=st.categories,
            inventories=st.inventories,
            base_prices=st.base_prices,
            prices=prices,
            day=self.day,
            rec_scores=st.rec_scores,
            num_shoppers=sim.num_shoppers,
        )
        drawn = self.rng.poisson(lam=np.clip(expected, 0, None)).astype(np.float64)
        # Sales cannot exceed the stock on hand.
        sold = np.minimum(drawn, st.inventories)

        revenue = prices * sold
        cogs = st.unit_costs * sold
        # Holding cost applies to start-of-day stock, valued at base price.
        holding = sim.holding_cost * st.inventories * st.base_prices
        reward_per_item = (revenue - cogs - holding).astype(np.float32)

        st.inventories -= sold
        # Dwell time only advances for items that still have stock.
        still_stocked = st.inventories > 0
        st.days_in_stock = np.where(
            still_stocked, st.days_in_stock + 1, st.days_in_stock
        )

        self.day += 1
        interval = sim.replenish_interval
        if interval > 0 and self.day % interval == 0:
            # Top up to a fraction of initial stock; never reduce stock.
            floor_level = st._initial_inventories * sim.replenish_fraction
            st.inventories = np.maximum(st.inventories, floor_level)

        done = self.day >= self.horizon
        next_obs = self._observe()
        info = {
            "total_margin": float(reward_per_item.sum()),
            "total_revenue": float(revenue.sum()),
        }
        return next_obs, reward_per_item, done, info


# ────────────────────────────────────────────────────────────────────
# Network
# ────────────────────────────────────────────────────────────────────


def _build_actor_critic(n_actions: int, hidden: int = 256):
    """Return an ActorCritic torch module. Imported lazily.

    The module is a three-layer ReLU MLP backbone over N_STATE_FEATURES
    inputs, shared by a linear actor head (``n_actions`` logits) and a
    linear critic head (one value).

    Args:
        n_actions: Number of discrete actions (actor output width).
        hidden: Width of every backbone layer.

    torch is imported inside the function so that importing this module
    does not require torch.
    """
    import torch
    import torch.nn as nn

    class ActorCritic(nn.Module):
        def __init__(self) -> None:
            super().__init__()
            self.backbone = nn.Sequential(
                nn.Linear(N_STATE_FEATURES, hidden),
                nn.ReLU(),
                nn.Linear(hidden, hidden),
                nn.ReLU(),
                nn.Linear(hidden, hidden),
                nn.ReLU(),
            )
            self.actor = nn.Linear(hidden, n_actions)
            self.critic = nn.Linear(hidden, 1)
            # First pass: every Linear layer (backbone and both heads) gets
            # orthogonal weights with gain sqrt(2) and zero biases.
            for m in self.modules():
                if isinstance(m, nn.Linear):
                    nn.init.orthogonal_(m.weight, gain=np.sqrt(2))
                    nn.init.zeros_(m.bias)
            # Second pass: the heads are re-initialized with their own gains
            # (small for the actor, 1.0 for the critic). The order of these
            # calls affects random-number consumption, so keep it stable if
            # seeded reproducibility matters.
            nn.init.orthogonal_(self.actor.weight, gain=0.01)
            nn.init.orthogonal_(self.critic.weight, gain=1.0)

        def forward(self, x):
            # Returns (action logits, state value); the critic's trailing
            # singleton dimension is squeezed away.
            h = self.backbone(x)
            return self.actor(h), self.critic(h).squeeze(-1)

    return ActorCritic()


# ────────────────────────────────────────────────────────────────────
# PPO Trainer
# ────────────────────────────────────────────────────────────────────


@dataclass
class PPOConfig:
    """Hyperparameters for PPO training of the pricing agent.

    Comments marked "presumably" describe the conventional PPO meaning of
    a field whose use is not shown next to this definition; confirm them
    against the training loop.
    """

    n_iters: int = 200  # presumably the number of collect/update iterations — TODO confirm
    n_envs: int = 16  # number of parallel PricingEnv instances the trainer builds
    horizon: int = 14  # episode length in days, passed to each PricingEnv
    n_items: int = 1000  # presumably the catalog sample size — TODO confirm
    n_epochs: int = 4  # presumably optimization epochs per rollout — TODO confirm
    minibatch_size: int = 4096  # presumably samples per gradient step — TODO confirm
    lr: float = 3e-4  # learning rate of the trainer's Adam optimizer
    gamma: float = 0.99  # presumably the discount factor used for GAE — TODO confirm
    gae_lambda: float = 0.95  # presumably the GAE lambda — TODO confirm
    clip_eps: float = 0.2  # presumably the PPO policy-ratio clip range — TODO confirm
    value_coef: float = 0.5  # presumably the value-loss weight — TODO confirm
    entropy_coef_start: float = 0.05  # presumably the initial entropy-bonus weight — TODO confirm
    entropy_coef_end: float = 0.005  # presumably the final entropy-bonus weight — TODO confirm
    max_grad_norm: float = 0.5  # presumably the gradient-clipping norm — TODO confirm
    reward_scale: float = 100.0  # presumably a divisor/scale applied to rewards — TODO confirm
    value_clip_eps: float = 0.2  # presumably the value-function clip range — TODO confirm
    device: str = "auto"  # "auto" resolves to "cuda" when available, else "cpu"
    seed: int = 0  # base seed; env k is built with seed + k


def _compute_gae(
    rewards: np.ndarray,  # (T, K, N)
    values: np.ndarray,   # (T+1, K, N)
    gamma: float,
    lam: float,
) -> Tuple[np.ndarray, np.ndarray]:
    """Generalized Advantage Estimation over the leading (time) axis.

    Every trailing index (env, item) is treated as its own length-T
    trajectory; ``values`` carries one extra step used as the bootstrap
    value after the last reward.

    Returns:
        ``(advantages, returns)`` where ``returns = advantages + values[:-1]``.
    """
    n_steps = rewards.shape[0]
    decay = gamma * lam
    advantages = np.zeros_like(rewards, dtype=np.float32)
    running = np.zeros(rewards.shape[1:], dtype=np.float32)
    # Backward recursion: A_t = delta_t + gamma * lam * A_{t+1}.
    for t in range(n_steps - 1, -1, -1):
        td_error = rewards[t] + gamma * values[t + 1] - values[t]
        running = td_error + decay * running
        advantages[t] = running
    return advantages, advantages + values[:-1]


class PPOTrainer:
    """Vectorized PPO trainer for the pricing env.

    Owns ``cfg.n_envs`` independent ``PricingEnv`` instances (seeded with
    ``cfg.seed + k``) and one actor-critic network that is applied to the
    per-item feature vectors. Each training iteration collects one rollout
    of ``cfg.horizon`` steps from every env and runs ``cfg.n_epochs`` PPO
    epochs over the flattened (step, env, item) samples.
    """

    def __init__(self, cfg: PPOConfig, catalog: pd.DataFrame, multipliers: np.ndarray):
        """Build the environments, network and optimizer.

        Args:
            cfg: Training hyperparameters.
            catalog: Item catalog handed to ``InventoryState.initialize``.
            multipliers: Price multipliers; its length is the action count.
        """
        import torch

        self.cfg = cfg
        self.multipliers = multipliers
        self.n_actions = len(multipliers)
        self.device = self._resolve_device(cfg.device)

        # Build K parallel envs with different seeds for diversity.
        self.envs: List[PricingEnv] = []
        for k in range(cfg.n_envs):
            initial_state = InventoryState.initialize(catalog, PRICING_CONFIG, seed=cfg.seed + k)
            demand = DemandModel(PRICING_CONFIG["demand_model"])
            simulator = Simulator(demand, PRICING_CONFIG)
            self.envs.append(
                PricingEnv(initial_state, simulator, multipliers, cfg.horizon, seed=cfg.seed + k)
            )

        self.net = _build_actor_critic(self.n_actions).to(self.device)
        self.optim = torch.optim.Adam(self.net.parameters(), lr=cfg.lr)
        self.train_curve: List[float] = []  # mean total revenue per iteration

    @staticmethod
    def _resolve_device(s: str) -> str:
        """Map ``"auto"`` to ``"cuda"`` when available (else ``"cpu"``); pass other values through."""
        import torch
        if s == "auto":
            return "cuda" if torch.cuda.is_available() else "cpu"
        return s

    def _act(self, obs_kn: "Any", greedy: bool = False):
        """Pick one action per item and report its log-prob, value and entropy.

        Args:
            obs_kn: (K, N, F) torch tensor of per-item features.
            greedy: Take the arg-max action instead of sampling.

        Returns:
            ``(action, log_prob, value, entropy)``.
        """
        import torch
        logits, value = self.net(obs_kn)
        # One distribution object serves sampling, log-prob and entropy. Using
        # Categorical.log_prob here (rather than a separate log_softmax) keeps
        # the stored rollout log-probs numerically consistent with the ones
        # recomputed in update(), so the PPO ratio starts at exactly 1.
        dist = torch.distributions.Categorical(logits=logits)
        action = logits.argmax(dim=-1) if greedy else dist.sample()
        return action, dist.log_prob(action), value, dist.entropy()

    def collect_rollout(self):
        """Run the K parallel envs for ``horizon`` steps.

        Returns:
            Dict of numpy arrays: ``obs`` (T, K, N, F); ``actions``,
            ``log_probs``, ``values``, ``advantages`` and ``returns``
            (T, K, N); plus per-env ``episode_returns`` (summed
            ``total_margin``) and ``episode_revenues`` (summed
            ``total_revenue``), each of shape (K,).
        """
        import torch
        cfg = self.cfg
        n_items = self.envs[0].state.n_items
        obs_buf = np.zeros((cfg.horizon, cfg.n_envs, n_items, N_STATE_FEATURES), dtype=np.float32)
        act_buf = np.zeros((cfg.horizon, cfg.n_envs, n_items), dtype=np.int64)
        logp_buf = np.zeros((cfg.horizon, cfg.n_envs, n_items), dtype=np.float32)
        val_buf = np.zeros((cfg.horizon + 1, cfg.n_envs, n_items), dtype=np.float32)
        rew_buf = np.zeros((cfg.horizon, cfg.n_envs, n_items), dtype=np.float32)

        obs = np.stack([env.reset() for env in self.envs], axis=0)  # (K, N, F)
        episode_returns = np.zeros(cfg.n_envs, dtype=np.float64)
        episode_revenues = np.zeros(cfg.n_envs, dtype=np.float64)

        for t in range(cfg.horizon):
            obs_t = torch.from_numpy(obs).to(self.device)  # (K, N, F)
            with torch.no_grad():
                action, log_prob, value, _ = self._act(obs_t, greedy=False)
            action_np = action.cpu().numpy()

            obs_buf[t] = obs
            act_buf[t] = action_np
            logp_buf[t] = log_prob.cpu().numpy()
            val_buf[t] = value.cpu().numpy()

            next_obs_list = []
            for k, env in enumerate(self.envs):
                next_o, reward, _done, info = env.step(action_np[k])
                rew_buf[t, k] = reward / cfg.reward_scale  # scale rewards for stable critic
                episode_returns[k] += info["total_margin"]
                episode_revenues[k] += info["total_revenue"]
                next_obs_list.append(next_o)
            obs = np.stack(next_obs_list, axis=0)

        # Bootstrap value for the last state (used by GAE). Only the critic
        # output is needed, so call the network directly instead of sampling
        # (and discarding) a full set of actions.
        # NOTE(review): the env's done flag is ignored above, so the value of
        # the post-horizon observation is always bootstrapped — confirm this is
        # intended for fixed-length episodes.
        with torch.no_grad():
            obs_T = torch.from_numpy(obs).to(self.device)
            _, last_value = self.net(obs_T)
        val_buf[cfg.horizon] = last_value.cpu().numpy()

        advantages, returns = _compute_gae(rew_buf, val_buf, cfg.gamma, cfg.gae_lambda)

        return {
            "obs": obs_buf,
            "actions": act_buf,
            "log_probs": logp_buf,
            "values": val_buf[:-1],  # old values for clipped value loss
            "advantages": advantages,
            "returns": returns,
            "episode_returns": episode_returns,
            "episode_revenues": episode_revenues,
        }

    def update(self, rollout: Dict[str, np.ndarray], entropy_coef: float) -> Dict[str, float]:
        """Run PPO epochs on the collected rollout.

        Args:
            rollout: Output of :meth:`collect_rollout`.
            entropy_coef: Weight of the entropy bonus for this update.

        Returns:
            Mean ``policy_loss``, ``value_loss`` and ``entropy`` over all
            minibatches processed.
        """
        import torch

        cfg = self.cfg
        # Flatten (T, K, N, ...) into one sample axis: each (step, env, item)
        # triple is an independent training sample.
        obs = torch.from_numpy(rollout["obs"]).to(self.device).reshape(-1, N_STATE_FEATURES)
        actions = torch.from_numpy(rollout["actions"]).to(self.device).reshape(-1)
        old_log_probs = torch.from_numpy(rollout["log_probs"]).to(self.device).reshape(-1)
        old_values = torch.from_numpy(rollout["values"]).to(self.device).reshape(-1)
        advantages = torch.from_numpy(rollout["advantages"]).to(self.device).reshape(-1)
        returns = torch.from_numpy(rollout["returns"]).to(self.device).reshape(-1)
        # Normalize advantages for stable gradients
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

        N = obs.shape[0]

        policy_losses, value_losses, entropies = [], [], []
        for _epoch in range(cfg.n_epochs):
            # Fresh shuffle of all sample indices for every epoch.
            perm = torch.randperm(N, device=self.device)
            for start in range(0, N, cfg.minibatch_size):
                mb = perm[start : start + cfg.minibatch_size]
                logits, value = self.net(obs[mb])
                dist = torch.distributions.Categorical(logits=logits)
                new_log_prob = dist.log_prob(actions[mb])
                entropy = dist.entropy().mean()

                ratio = (new_log_prob - old_log_probs[mb]).exp()
                adv = advantages[mb]
                surrogate1 = ratio * adv
                surrogate2 = torch.clamp(ratio, 1 - cfg.clip_eps, 1 + cfg.clip_eps) * adv
                policy_loss = -torch.min(surrogate1, surrogate2).mean()

                # Clipped value loss: prevent value-function divergence after policy converges.
                v_clipped = old_values[mb] + torch.clamp(
                    value - old_values[mb], -cfg.value_clip_eps, cfg.value_clip_eps
                )
                v_loss_unclipped = (value - returns[mb]).pow(2)
                v_loss_clipped = (v_clipped - returns[mb]).pow(2)
                value_loss = 0.5 * torch.max(v_loss_unclipped, v_loss_clipped).mean()

                loss = policy_loss + cfg.value_coef * value_loss - entropy_coef * entropy

                self.optim.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.net.parameters(), cfg.max_grad_norm)
                self.optim.step()

                policy_losses.append(float(policy_loss.detach()))
                value_losses.append(float(value_loss.detach()))
                entropies.append(float(entropy.detach()))

        return {
            "policy_loss": float(np.mean(policy_losses)),
            "value_loss": float(np.mean(value_losses)),
            "entropy": float(np.mean(entropies)),
        }

    def train(self, verbose: bool = True) -> List[float]:
        """Alternate rollout collection and PPO updates for ``cfg.n_iters`` iterations.

        Args:
            verbose: Print a banner, periodic progress lines and total time.

        Returns:
            ``self.train_curve``: mean episode revenue per iteration.
        """
        cfg = self.cfg
        if verbose:
            # Report the size the envs actually hold: cfg.n_items is only the
            # requested sample size and is 0 when the full catalog is used.
            n_items = self.envs[0].state.n_items if self.envs else cfg.n_items
            print(
                f"Training PPO: {cfg.n_iters} iters × {cfg.n_envs} envs × "
                f"{cfg.horizon} days × {n_items} items on {self.device}"
            )
        t0 = time.perf_counter()
        for it in range(cfg.n_iters):
            # Linear anneal of entropy bonus from start → end across training.
            frac = it / max(1, cfg.n_iters - 1)
            entropy_coef = cfg.entropy_coef_start + (cfg.entropy_coef_end - cfg.entropy_coef_start) * frac
            rollout = self.collect_rollout()
            losses = self.update(rollout, entropy_coef=entropy_coef)
            mean_revenue = float(rollout["episode_revenues"].mean())
            mean_margin = float(rollout["episode_returns"].mean())
            self.train_curve.append(mean_revenue)
            # Roughly 20 progress lines regardless of n_iters.
            if verbose and (it + 1) % max(1, cfg.n_iters // 20) == 0:
                print(
                    f"  iter {it + 1:4d}/{cfg.n_iters} | "
                    f"rev/ep: {mean_revenue:10.0f} | margin/ep: {mean_margin:10.0f} | "
                    f"pi_loss: {losses['policy_loss']:+.3f} | "
                    f"v_loss: {losses['value_loss']:.3f} | "
                    f"H: {losses['entropy']:.3f} | "
                    f"ent_c: {entropy_coef:.3f}"
                )
        if verbose:
            print(f"Training complete in {time.perf_counter() - t0:.1f}s")
        return self.train_curve


class PPOPolicy:
    """Adapter that lets a trained actor-critic act as a pricing policy.

    Exposes ``name`` and ``select_prices(state, day)``, the same surface
    the baseline policies provide to the evaluation loop.
    """

    name = "PPO"

    def __init__(self, net, multipliers: np.ndarray, device: str, horizon: int,
                 price_norm: float, inv_norm: float, greedy: bool = True) -> None:
        """Store the network and encoding parameters; switch the net to eval mode.

        Args:
            net: Network returning ``(logits, value)`` for a feature tensor.
            multipliers: Price multiplier for each action index.
            device: Torch device the observations are moved to.
            horizon: Episode length passed to the state encoder.
            price_norm: Price normalizer passed to the state encoder.
            inv_norm: Inventory normalizer passed to the state encoder.
            greedy: Use arg-max actions when true, otherwise sample.
        """
        self.net = net
        self.multipliers = multipliers
        self.device = device
        self.horizon = horizon
        self.price_norm, self.inv_norm = price_norm, inv_norm
        self.greedy = greedy
        self.net.eval()

    def select_prices(self, state: InventoryState, day: int = 0) -> np.ndarray:
        """Return per-item prices: base price times the chosen multiplier."""
        import torch
        features = encode_state_per_item(
            state, day, self.horizon, self.price_norm, self.inv_norm
        )
        with torch.no_grad():
            logits, _value = self.net(torch.from_numpy(features).to(self.device))
            if not self.greedy:
                chosen = torch.distributions.Categorical(logits=logits).sample()
            else:
                chosen = logits.argmax(dim=-1)
            indices = chosen.cpu().numpy()
        return state.base_prices * self.multipliers[indices]


# ────────────────────────────────────────────────────────────────────
# Workspace + persistence helpers
# ────────────────────────────────────────────────────────────────────


def workspace_root() -> Path:
    """Return the workspace directory: ``$PLAYBOOK_WORKSPACE`` if set, else the user's home."""
    fallback = str(Path.home())
    configured = os.environ.get("PLAYBOOK_WORKSPACE", fallback)
    return Path(configured)


def model_dir() -> Path:
    """Return ``<workspace>/models/pricing_ppo``, where the PPO checkpoint lives."""
    return workspace_root().joinpath("models", "pricing_ppo")


def processed_dir() -> Path:
    """Return ``<workspace>/data/processed``, where training/eval artifacts are written."""
    return workspace_root().joinpath("data", "processed")


def save_checkpoint(
    net,
    cfg: PPOConfig,
    multipliers: np.ndarray,
    catalog_meta: Dict[str, Any],
    train_curve: List[float],
    path: Path,
) -> None:
    """Write the network weights and run metadata to ``path`` with ``torch.save``.

    The parent directory is created if needed. The payload holds the keys
    ``state_dict``, ``config`` (the config's attribute dict),
    ``multipliers`` (as a list), ``catalog_meta`` and ``train_curve``.
    """
    import torch

    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)

    payload = {
        "state_dict": net.state_dict(),
        "config": vars(cfg),
        "multipliers": multipliers.tolist(),
        "catalog_meta": catalog_meta,
        "train_curve": train_curve,
    }
    torch.save(payload, path)


def load_checkpoint(path: Path):
    """Load a checkpoint from ``path`` onto the CPU and return the stored object."""
    import torch

    # NOTE(review): weights_only=False performs full unpickling, which can run
    # arbitrary code from the file — only load checkpoints from trusted sources.
    checkpoint = torch.load(path, map_location="cpu", weights_only=False)
    return checkpoint


def save_training_curve_png(train_curve: List[float], path: Path) -> None:
    """Plot the training curve and save it as a PNG at ``path``.

    Best-effort: when matplotlib cannot be imported a note is printed to
    stderr and nothing is written. The parent directory is created if
    needed.

    Args:
        train_curve: One value per training iteration (mean episode revenue).
        path: Destination image file.
    """
    try:
        import matplotlib
        matplotlib.use("Agg")  # headless backend: no display required
        import matplotlib.pyplot as plt
    except ImportError:
        print(f"matplotlib not available, skipping {path}", file=sys.stderr)
        return
    path.parent.mkdir(parents=True, exist_ok=True)
    fig, ax = plt.subplots(figsize=(8, 4))
    try:
        ax.plot(train_curve, color="#76b900", linewidth=1.5)
        ax.set_xlabel("Iteration")
        ax.set_ylabel("Mean episode revenue")
        ax.set_title("PPO training curve")
        ax.grid(alpha=0.3)
        fig.tight_layout()
        fig.savefig(path, dpi=120)
    finally:
        # Always release the figure, even if plotting or saving fails;
        # pyplot keeps figures alive until they are explicitly closed.
        plt.close(fig)


# ────────────────────────────────────────────────────────────────────
# CLI
# ────────────────────────────────────────────────────────────────────


def cmd_smoke(args: argparse.Namespace) -> int:
    """Evaluate the baseline policies on a small synthetic catalog.

    Prints the category breakdown and a KPI table for FixedPrice and
    AgeMarkdown. Always returns 0.
    """
    print(f"Running pricing smoke test ({args.n_items} items, {args.horizon} days)…")
    catalog = build_synthetic_catalog(n_items=args.n_items, seed=args.seed)
    labels, counts = np.unique(catalog["category"], return_counts=True)
    print(
        f"Catalog: {len(catalog)} items, categories: "
        f"{dict(zip(labels, counts))}"
    )

    demand = DemandModel(PRICING_CONFIG["demand_model"])
    simulator = Simulator(demand, PRICING_CONFIG)
    initial_state = InventoryState.initialize(catalog, PRICING_CONFIG, seed=args.seed)

    # Every baseline starts from the same initial state and seed.
    rows = []
    for policy in (FixedPrice(), AgeMarkdown(weekly_discount=0.05)):
        rows.append(run_policy(policy, initial_state, simulator, args.horizon, seed=args.seed))
    print_kpi_table(rows)
    return 0


def _ppo_config_from_args(args: argparse.Namespace) -> PPOConfig:
    """Build a ``PPOConfig`` from the training CLI flags.

    Only the fields exposed on the command line are forwarded; every other
    hyperparameter keeps its dataclass default.
    """
    forwarded = ("n_iters", "n_envs", "horizon", "n_items", "lr", "device", "seed")
    overrides = {field_name: getattr(args, field_name) for field_name in forwarded}
    return PPOConfig(**overrides)


def _print_catalog_summary(catalog: pd.DataFrame, source: str) -> None:
    cat_counts = catalog["category"].value_counts().to_dict()
    price_med = float(catalog["avg_price"].median())
    print(
        f"Catalog [{source}]: {len(catalog)} items | "
        f"price median ${price_med:.2f} | tiers: {cat_counts}"
    )


def cmd_train(args: argparse.Namespace) -> int:
    """Train the PPO agent, then save the checkpoint and the training curve.

    The checkpoint goes to ``model_dir() / "policy.pt"`` and the curve to
    ``processed_dir() / "pricing_training_curve.png"``. Always returns 0.
    """
    cfg = _ppo_config_from_args(args)
    use_synthetic = args.synthetic
    catalog = load_catalog(n_items=cfg.n_items, seed=cfg.seed, synthetic=use_synthetic)
    source = "Amazon Dresses" if not use_synthetic else "synthetic"
    multipliers = np.asarray(PRICING_CONFIG["price_grid"]["multipliers"], dtype=np.float64)
    _print_catalog_summary(catalog, source)
    print(f"Multipliers: {multipliers.tolist()}, horizon: {cfg.horizon}d")

    trainer = PPOTrainer(cfg, catalog, multipliers)
    trainer.train(verbose=True)

    # Record enough about the catalog for `eval` to rebuild a matching one.
    catalog_meta = dict(
        n_items=len(catalog),
        seed=cfg.seed,
        synthetic=use_synthetic,
        source=source,
        price_norm=float(catalog["avg_price"].max()),
    )
    ckpt_path = model_dir() / "policy.pt"
    save_checkpoint(trainer.net, cfg, multipliers, catalog_meta, trainer.train_curve, ckpt_path)
    print(f"Saved checkpoint → {ckpt_path}")

    curve_path = processed_dir() / "pricing_training_curve.png"
    save_training_curve_png(trainer.train_curve, curve_path)
    print(f"Saved training curve → {curve_path}")
    return 0


def cmd_eval(args: argparse.Namespace) -> int:
    """Evaluate the saved PPO policy against the FixedPrice and AgeMarkdown baselines.

    Loads ``model_dir() / "policy.pt"``, rebuilds a catalog from the
    checkpoint metadata, runs every policy from the same initial state and
    seed, prints a KPI table and writes a JSON summary to
    ``processed_dir() / "pricing_eval.json"``.

    Returns:
        0 on success, 2 when no checkpoint exists.
    """
    import torch  # noqa: F401  (kept from the original: surfaces a missing torch install up front)

    ckpt_path = model_dir() / "policy.pt"
    if not ckpt_path.exists():
        print(f"No checkpoint at {ckpt_path}. Run `train` first.", file=sys.stderr)
        return 2

    ckpt = load_checkpoint(ckpt_path)
    cfg_dict = ckpt["config"]
    horizon = cfg_dict["horizon"]
    multipliers = np.asarray(ckpt["multipliers"], dtype=np.float64)
    catalog_meta = ckpt.get("catalog_meta", {})
    synthetic = bool(catalog_meta.get("synthetic", False))
    n_items = int(catalog_meta.get("n_items", cfg_dict.get("n_items", 1000)))
    catalog = load_catalog(n_items=n_items, seed=args.seed, synthetic=synthetic)
    # Report the size of the catalog actually evaluated. The configured
    # n_items is only the requested sample size and is 0 when training used
    # the full catalog, which made the banner and JSON say "0 items".
    n_eval_items = len(catalog)
    source = catalog_meta.get("source", "synthetic" if synthetic else "Amazon Dresses")
    _print_catalog_summary(catalog, source)

    demand = DemandModel(PRICING_CONFIG["demand_model"])
    simulator = Simulator(demand, PRICING_CONFIG)
    initial_state = InventoryState.initialize(catalog, PRICING_CONFIG, seed=args.seed)

    device = PPOTrainer._resolve_device(args.device)
    net = _build_actor_critic(len(multipliers)).to(device)
    net.load_state_dict(ckpt["state_dict"])
    # NOTE(review): the normalizers come from the evaluation state rather than
    # the `price_norm` stored in the checkpoint metadata — confirm they match
    # what training used when the eval seed differs from the training seed.
    ppo_policy = PPOPolicy(
        net=net,
        multipliers=multipliers,
        device=device,
        horizon=horizon,
        price_norm=float(initial_state.base_prices.max()),
        inv_norm=float(initial_state._initial_inventories.max()),
        greedy=True,
    )

    policies = [FixedPrice(), AgeMarkdown(weekly_discount=0.05), ppo_policy]
    print(f"\nEvaluating ({n_eval_items} items, {horizon} days, device={device}):")
    rows = [run_policy(p, initial_state, simulator, horizon, seed=args.seed) for p in policies]
    print_kpi_table(rows)

    # Save eval results
    eval_path = processed_dir() / "pricing_eval.json"
    eval_path.parent.mkdir(parents=True, exist_ok=True)
    # FixedPrice is first in `policies`, so its revenue is the lift baseline.
    baseline_rev = rows[0][1].total_revenue
    summary = {
        "n_items": n_eval_items,
        "horizon": horizon,
        "device": device,
        "seed": args.seed,
        "policies": [
            {
                "name": name,
                "total_revenue": res.total_revenue,
                "total_margin": res.total_margin,
                "avg_stockout_rate": res.avg_stockout_rate,
                "sell_through_rate": res.sell_through_rate,
                "revenue_lift_vs_fixed_price": (res.total_revenue / baseline_rev) if baseline_rev > 0 else 1.0,
                "elapsed_s": elapsed,
            }
            for name, res, elapsed in rows
        ],
    }
    with open(eval_path, "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)
    print(f"Saved eval summary → {eval_path}")
    return 0


def cmd_train_and_eval(args: argparse.Namespace) -> int:
    """Run training, then evaluation; a non-zero training status skips evaluation."""
    train_status = cmd_train(args)
    return train_status if train_status != 0 else cmd_eval(args)


def build_parser() -> argparse.ArgumentParser:
    """Create the CLI parser with the smoke, train, eval and train_and_eval subcommands.

    Each subparser stores its handler in ``func`` so the caller can
    dispatch with ``args.func(args)``.
    """

    def _register_training_options(target: argparse.ArgumentParser) -> None:
        # Shared by `train` and `train_and_eval`; registration order is kept
        # because it is the order shown in --help.
        simple_int_flags = (("--n-iters", 200), ("--n-envs", 16), ("--horizon", 14))
        for flag, default in simple_int_flags:
            target.add_argument(flag, type=int, default=default)
        target.add_argument(
            "--n-items",
            type=int,
            default=1000,
            help="Sample size from the Amazon Dresses catalog. Pass 0 for the full ~14k items.",
        )
        target.add_argument("--lr", type=float, default=3e-4)
        target.add_argument("--device", type=str, default="auto", choices=["auto", "cuda", "cpu"])
        target.add_argument("--seed", type=int, default=0)
        target.add_argument(
            "--synthetic",
            action="store_true",
            help="Use a generated synthetic catalog instead of the real Amazon Dresses data.",
        )

    parser = argparse.ArgumentParser(
        prog="pricing_agent",
        description="Dynamic pricing agent for the rec-sys playbook.",
    )
    subcommands = parser.add_subparsers(dest="cmd", required=False)

    smoke = subcommands.add_parser("smoke", help="Run baselines on a tiny synthetic catalog.")
    for flag, default in (("--n-items", 100), ("--horizon", 14), ("--seed", 0)):
        smoke.add_argument(flag, type=int, default=default)
    smoke.set_defaults(func=cmd_smoke)

    train = subcommands.add_parser("train", help="Train the PPO agent against the simulator.")
    _register_training_options(train)
    train.set_defaults(func=cmd_train)

    evaluate = subcommands.add_parser("eval", help="Evaluate trained PPO + baselines.")
    evaluate.add_argument("--device", type=str, default="auto", choices=["auto", "cuda", "cpu"])
    evaluate.add_argument("--seed", type=int, default=0)
    evaluate.set_defaults(func=cmd_eval)

    train_and_eval = subcommands.add_parser(
        "train_and_eval",
        help="Train the PPO agent, then evaluate vs. baselines. Default if no subcommand is given.",
    )
    _register_training_options(train_and_eval)
    train_and_eval.set_defaults(func=cmd_train_and_eval)

    return parser


_VALID_SUBCOMMANDS = {"smoke", "train", "eval", "train_and_eval"}

# The module docstring documents an `all` subcommand ("train then eval").
# Map it onto the parser's real name so the documented spelling works
# instead of being rewritten to `train_and_eval all` and rejected by argparse.
_SUBCOMMAND_ALIASES = {"all": "train_and_eval"}


def main(argv: List[str] | None = None) -> int:
    """CLI entry point: parse ``argv`` and dispatch to the chosen subcommand.

    Args:
        argv: Argument list without the program name; defaults to
            ``sys.argv[1:]``.

    Returns:
        The exit status returned by the subcommand handler.
    """
    if argv is None:
        argv = sys.argv[1:]
    # Work on a copy so the caller's list is never modified.
    argv = list(argv)
    if argv and argv[0] in _SUBCOMMAND_ALIASES:
        argv[0] = _SUBCOMMAND_ALIASES[argv[0]]
    # Default to `train_and_eval` if the user didn't pick a subcommand. Letting
    # `--help` through gives them the top-level help; anything else (flags, no
    # args) implies they want the full train→eval flow.
    if not argv or (argv[0] not in _VALID_SUBCOMMANDS and argv[0] not in {"-h", "--help"}):
        argv = ["train_and_eval", *argv]
    parser = build_parser()
    args = parser.parse_args(argv)
    return args.func(args)


# Script entry point: exit the process with the status returned by main().
if __name__ == "__main__":
    sys.exit(main())