lerobot/examples/dataset/visualization_tools/action_consistency.py

"""
Action consistency analysis for imitation learning datasets.

Two parallel analyses per dataset:
  1. State-based: KNN in joint-state space → action chunk variance
  2. Image-based: KNN in SigLIP embedding space → action chunk variance

Comparing them reveals whether visual similarity and proprioceptive similarity
agree on where the data is inconsistent — and images are what the policy
primarily sees.
"""

import json
from pathlib import Path

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from huggingface_hub import snapshot_download
from matplotlib.colors import LinearSegmentedColormap
from PIL import Image
from scipy.spatial import cKDTree
from transformers import AutoModel, AutoProcessor

DATASETS = [
    {"repo_id": "lerobot-data-collection/level2_final_quality3", "label": "HQ curated"},
    {"repo_id": "lerobot-data-collection/level12_rac_2_2026-02-08_1", "label": "Full collection"},
]
OUTPUT_DIR = Path(__file__).resolve().parent / "outputs"
OUTPUT_DIR.mkdir(exist_ok=True)

MAX_FRAMES = 100_000
K_NEIGHBORS = 50
ACTION_CHUNK_SIZE = 30
CAMERA_KEY = "observation.images.base"
ENCODER_MODEL = "google/siglip-base-patch16-224"
ENCODE_BATCH_SIZE = 128
SEED = 42
DPI = 150

CONSISTENCY_CMAP = LinearSegmentedColormap.from_list(
    "consistency", ["#0a2e0a", "#1a8e1a", "#88cc22", "#ffaa22", "#ff2222"]
)

# FK chains from OpenArm bimanual URDF (same as workspace_density.py).
LEFT_CHAIN = [
    ((-np.pi / 2, 0, 0), (0, 0.031, 0.698), None),
    ((0, 0, 0), (0, 0, 0.0625), (0, 0, 1)),
    ((-np.pi / 2, 0, 0), (-0.0301, 0, 0.06), (-1, 0, 0)),
    ((0, 0, 0), (0.0301, 0, 0.06625), (0, 0, 1)),
    ((0, 0, 0), (0, 0.0315, 0.15375), (0, 1, 0)),
    ((0, 0, 0), (0, -0.0315, 0.0955), (0, 0, 1)),
    ((0, 0, 0), (0.0375, 0, 0.1205), (1, 0, 0)),
    ((0, 0, 0), (-0.0375, 0, 0), (0, -1, 0)),
    ((0, 0, 0), (0, 0, 0.1001), None),
    ((0, 0, 0), (0, 0, 0.08), None),
]
RIGHT_CHAIN = [
    ((np.pi / 2, 0, 0), (0, -0.031, 0.698), None),
    ((0, 0, 0), (0, 0, 0.0625), (0, 0, 1)),
    ((np.pi / 2, 0, 0), (-0.0301, 0, 0.06), (-1, 0, 0)),
    ((0, 0, 0), (0.0301, 0, 0.06625), (0, 0, 1)),
    ((0, 0, 0), (0, 0.0315, 0.15375), (0, 1, 0)),
    ((0, 0, 0), (0, -0.0315, 0.0955), (0, 0, 1)),
    ((0, 0, 0), (0.0375, 0, 0.1205), (1, 0, 0)),
    ((0, 0, 0), (-0.0375, 0, 0), (0, 1, 0)),
    ((0, 0, 0), (0, 0, 0.1001), None),
    ((0, 0, 0), (0, 0, 0.08), None),
]


# ── FK math ─────────────────────────────────────────────


def _rot_x(a: float) -> np.ndarray:
    c, s = np.cos(a), np.sin(a)
    return np.array([[1, 0, 0], [0, c, -s], [0, s, c]])


def _rot_y(a: float) -> np.ndarray:
    c, s = np.cos(a), np.sin(a)
    return np.array([[c, 0, s], [0, 1, 0], [-s, 0, c]])


def _rot_z(a: float) -> np.ndarray:
    c, s = np.cos(a), np.sin(a)
    return np.array([[c, -s, 0], [s, c, 0], [0, 0, 1]])


def _tf(rpy: tuple, xyz: tuple) -> np.ndarray:
    r, p, y = rpy
    mat = np.eye(4)
    mat[:3, :3] = _rot_z(y) @ _rot_y(p) @ _rot_x(r)
    mat[:3, 3] = xyz
    return mat


def _batch_axis_rot(axis: tuple, angles: np.ndarray) -> np.ndarray:
    n = len(angles)
    ax = np.asarray(axis, dtype=np.float64)
    ax = ax / np.linalg.norm(ax)
    x, y, z = ax
    c = np.cos(angles)
    s = np.sin(angles)
    t = 1 - c
    rot = np.zeros((n, 4, 4))
    rot[:, 0, 0] = t * x * x + c
    rot[:, 0, 1] = t * x * y - s * z
    rot[:, 0, 2] = t * x * z + s * y
    rot[:, 1, 0] = t * x * y + s * z
    rot[:, 1, 1] = t * y * y + c
    rot[:, 1, 2] = t * y * z - s * x
    rot[:, 2, 0] = t * x * z - s * y
    rot[:, 2, 1] = t * y * z + s * x
    rot[:, 2, 2] = t * z * z + c
    rot[:, 3, 3] = 1.0
    return rot


def batch_fk(chain: list, joint_angles: np.ndarray) -> np.ndarray:
    n = joint_angles.shape[0]
    tf_batch = np.tile(np.eye(4), (n, 1, 1))
    qi = 0
    for rpy, xyz, axis in chain:
        tf_batch = tf_batch @ _tf(rpy, xyz)
        if axis is not None:
            rot = _batch_axis_rot(axis, joint_angles[:, qi])
            tf_batch = np.einsum("nij,njk->nik", tf_batch, rot)
            qi += 1
    return tf_batch[:, :3, 3]


# ── Data helpers ────────────────────────────────────────


def _flatten_names(obj: object) -> list[str]:
    if isinstance(obj, dict):
        out: list[str] = []
        for v in obj.values():
            out.extend(_flatten_names(v))
        return out
    if isinstance(obj, (list, tuple)):
        out = []
        for item in obj:
            if isinstance(item, (list, tuple, dict)):
                out.extend(_flatten_names(item))
            else:
                out.append(str(item))
        return out
    return [str(obj)]


def _detect_and_convert(vals: np.ndarray) -> np.ndarray:
    mx = np.max(np.abs(vals))
    if mx > 360:
        print(f"    Unit detection: servo ticks (max={mx:.0f})")
        return (vals - 2048) / 2048 * np.pi
    if mx > 6.3:
        print(f"    Unit detection: degrees (max={mx:.1f})")
        return np.deg2rad(vals)
    print(f"    Unit detection: radians (max={mx:.3f})")
    return vals.astype(np.float64)


def _find_joint_indices(features: dict, state_col: str, n_dim: int) -> tuple[list[int], list[int]]:
    feat = features.get("observation.state", features.get(state_col, {}))
    names = _flatten_names(feat.get("names", []))
    left_idx: list[int] = []
    right_idx: list[int] = []
    if names and len(names) == n_dim:
        names_l = [n.lower() for n in names]
        print(f"  Feature names: {names[:4]}…{names[-4:]}")
        for j in range(1, 8):
            for i, nm in enumerate(names_l):
                if f"left_joint_{j}" in nm and i not in left_idx:
                    left_idx.append(i)
                    break
            for i, nm in enumerate(names_l):
                if f"right_joint_{j}" in nm and i not in right_idx:
                    right_idx.append(i)
                    break
    if len(left_idx) == 7 and len(right_idx) == 7:
        print(f"  Matched by name: left={left_idx} right={right_idx}")
        return left_idx, right_idx
    if n_dim >= 16:
        print("  Falling back to positional: [0:7]=left, [8:15]=right")
        return list(range(7)), list(range(8, 15))
    if n_dim >= 14:
        print("  Falling back to positional: [0:7]=left, [7:14]=right")
        return list(range(7)), list(range(7, 14))
    raise RuntimeError(f"State dim {n_dim} too small for bimanual 7-DOF robot")


def download_data(repo_id: str, camera_key: str) -> Path:
    print(f"  Downloading {repo_id} (parquet + {camera_key} videos) …")
    return Path(
        snapshot_download(
            repo_id=repo_id,
            repo_type="dataset",
            allow_patterns=[
                "meta/**",
                "data/**",
                f"videos/{camera_key}/**",
            ],
        )
    )


# ── Data loading ────────────────────────────────────────


def _build_action_chunks(
    actions: np.ndarray, episode_ids: np.ndarray, chunk_size: int
) -> tuple[np.ndarray, np.ndarray]:
    """
    For each frame, concatenate the next chunk_size actions from the same episode.
    Returns (action_chunks, valid_mask).
    """
    n = len(actions)
    act_dim = actions.shape[1]
    chunks = np.zeros((n, chunk_size * act_dim), dtype=np.float64)
    valid = np.zeros(n, dtype=bool)

    for i in range(n):
        end = i + chunk_size
        if end > n:
            continue
        if episode_ids[i] != episode_ids[end - 1]:
            continue
        chunks[i] = actions[i:end].ravel()
        valid[i] = True

    return chunks, valid


def load_state_action_data(local: Path, max_frames: int, chunk_size: int, rng: np.random.Generator) -> dict:
    """
    Load observation.state and action, build action chunks, subsample, normalize.
    Also returns the original row indices (`chosen_idx`) for video frame mapping.
    """
    info = json.loads((local / "meta" / "info.json").read_text())
    features = info.get("features", {})

    dfs = [pd.read_parquet(pq) for pq in sorted((local / "data").glob("**/*.parquet"))]
    df = pd.concat(dfs, ignore_index=True)
    n_total = len(df)
    print(f"  Total frames: {n_total:,}")

    state_col = next((c for c in df.columns if "observation.state" in c), None)
    action_col = next((c for c in df.columns if c == "action"), None)
    if state_col is None:
        raise RuntimeError(f"No observation.state column. Available: {list(df.columns)}")
    if action_col is None:
        raise RuntimeError(f"No action column. Available: {list(df.columns)}")

    ep_col = next((c for c in df.columns if c == "episode_index"), None)
    if ep_col is None:
        raise RuntimeError(f"No episode_index column. Available: {list(df.columns)}")

    state_all = np.stack(df[state_col].values).astype(np.float64)
    action_all = np.stack(df[action_col].values).astype(np.float64)
    episode_all = df[ep_col].values.astype(np.int64)

    n_dim = state_all.shape[1]
    act_dim = action_all.shape[1]
    print(f"  State dim: {n_dim}  Action dim: {act_dim}  Chunk size: {chunk_size}")
    print(f"  Action chunk dim: {chunk_size * act_dim}")

    left_idx, right_idx = _find_joint_indices(features, state_col, n_dim)

    print("  Building action chunks …")
    action_chunks, valid = _build_action_chunks(action_all, episode_all, chunk_size)
    valid_idx = np.where(valid)[0]
    print(f"  Valid frames (with full action chunk): {len(valid_idx):,} / {n_total:,}")

    if len(valid_idx) > max_frames:
        chosen = np.sort(rng.choice(valid_idx, max_frames, replace=False))
    else:
        chosen = valid_idx
    print(f"  Using {len(chosen):,} frames")

    state_raw = state_all[chosen]
    action_raw = action_chunks[chosen]
    episode_ids = episode_all[chosen]

    state_mean = state_raw.mean(axis=0)
    state_std = state_raw.std(axis=0)
    state_std[state_std < 1e-8] = 1.0
    state_norm = (state_raw - state_mean) / state_std

    action_mean = action_raw.mean(axis=0)
    action_std = action_raw.std(axis=0)
    action_std[action_std < 1e-8] = 1.0
    action_norm = (action_raw - action_mean) / action_std

    return {
        "state_raw": state_raw,
        "state_norm": state_norm,
        "action_raw": action_raw,
        "action_norm": action_norm,
        "episode_ids": episode_ids,
        "episode_all": episode_all,
        "left_joint_idx": left_idx,
        "right_joint_idx": right_idx,
        "n_total": n_total,
        "chosen_idx": chosen,
        "df": df,
    }


# ── Video → frame extraction ──────────────────────────────


def build_video_lookup(local: Path, camera_key: str) -> dict:
    """
    Build a mapping from episode_index → {video_path, fps, from_ts}.
    """
    info = json.loads((local / "meta" / "info.json").read_text())
    fps = info["fps"]
    video_template = info.get(
        "video_path",
        "videos/{video_key}/chunk-{chunk_index:03d}/file-{file_index:03d}.mp4",
    )

    ep_rows = []
    for pq in sorted((local / "meta" / "episodes").glob("**/*.parquet")):
        ep_rows.append(pd.read_parquet(pq))
    ep_df = pd.concat(ep_rows, ignore_index=True)

    chunk_col = f"videos/{camera_key}/chunk_index"
    file_col = f"videos/{camera_key}/file_index"
    ts_from = f"videos/{camera_key}/from_timestamp"
    if chunk_col not in ep_df.columns:
        chunk_col = f"{camera_key}/chunk_index"
        file_col = f"{camera_key}/file_index"
        ts_from = f"{camera_key}/from_timestamp"

    lookup: dict[int, dict] = {}
    for _, row in ep_df.iterrows():
        ci = int(row[chunk_col])
        fi = int(row[file_col])
        video_rel = video_template.format(video_key=camera_key, chunk_index=ci, file_index=fi)
        lookup[int(row["episode_index"])] = {
            "video_path": local / video_rel,
            "from_ts": float(row[ts_from]),
            "fps": fps,
        }
    return lookup


def extract_frames(
    chosen_idx: np.ndarray,
    episode_all: np.ndarray,
    video_lookup: dict,
) -> list[np.ndarray | None]:
    """
    Extract BGR frames for each chosen global index.
    Uses episode boundaries + fps to compute the seek timestamp.
    Returns list of (H, W, 3) BGR arrays (or None on failure).
    """
    # Build per-episode local frame index: for each row in the dataset,
    # its position within its episode
    unique_eps = np.unique(episode_all)
    ep_start: dict[int, int] = {}
    for ep in unique_eps:
        ep_start[int(ep)] = int(np.where(episode_all == ep)[0][0])

    frames: list[np.ndarray | None] = []
    # Group by video file for efficient sequential access
    jobs: list[tuple[int, int, str, float]] = []
    for out_i, global_i in enumerate(chosen_idx):
        ep = int(episode_all[global_i])
        info = video_lookup.get(ep)
        if info is None:
            jobs.append((out_i, -1, "", 0.0))
            continue
        local_frame = global_i - ep_start[ep]
        seek_ts = info["from_ts"] + local_frame / info["fps"]
        jobs.append((out_i, global_i, str(info["video_path"]), seek_ts))

    jobs.sort(key=lambda x: (x[2], x[3]))

    frames = [None] * len(chosen_idx)
    current_cap = None
    current_path = ""
    extracted = 0
    for out_i, _global_i, vpath, seek_ts in jobs:
        if not vpath:
            continue
        if vpath != current_path:
            if current_cap is not None:
                current_cap.release()
            current_cap = cv2.VideoCapture(vpath)
            current_path = vpath
        if current_cap is None or not current_cap.isOpened():
            continue
        current_cap.set(cv2.CAP_PROP_POS_MSEC, seek_ts * 1000.0)
        ret, frame = current_cap.read()
        if ret:
            frames[out_i] = frame
            extracted += 1
    if current_cap is not None:
        current_cap.release()

    print(f"  Extracted {extracted:,} / {len(chosen_idx):,} frames from video")
    return frames


# ── SigLIP encoding ─────────────────────────────────────


def encode_frames_siglip(
    frames: list[np.ndarray | None],
    model_name: str,
    batch_size: int,
    device: torch.device,
) -> np.ndarray:
    """
    Encode BGR frames through SigLIP vision encoder.
    Returns (N, embed_dim) float32 array. Frames that are None get a zero vector.
    """
    print(f"  Loading SigLIP model: {model_name} …")
    processor = AutoProcessor.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device).eval()
    embed_dim = model.config.vision_config.hidden_size

    n = len(frames)
    embeddings = np.zeros((n, embed_dim), dtype=np.float32)

    # Collect valid frame indices
    valid_indices = [i for i, f in enumerate(frames) if f is not None]
    print(f"  Encoding {len(valid_indices):,} valid frames in batches of {batch_size} …")

    for batch_start in range(0, len(valid_indices), batch_size):
        batch_idx = valid_indices[batch_start : batch_start + batch_size]
        pil_images = []
        for i in batch_idx:
            bgr = frames[i]
            rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
            pil_images.append(Image.fromarray(rgb))

        inputs = processor(images=pil_images, return_tensors="pt").to(device)
        with torch.no_grad():
            image_features = model.get_image_features(**inputs)
        # L2-normalize embeddings for cosine-like KNN
        image_features = torch.nn.functional.normalize(image_features, dim=-1)
        embeddings[batch_idx] = image_features.cpu().numpy()

        done = min(batch_start + batch_size, len(valid_indices))
        if done % (batch_size * 10) == 0 or done == len(valid_indices):
            print(f"    {done:,} / {len(valid_indices):,} encoded")

    del model, processor
    torch.cuda.empty_cache()
    return embeddings


# ── KNN consistency ─────────────────────────────────────


def compute_consistency(
    features: np.ndarray,
    action_norm: np.ndarray,
    episode_ids: np.ndarray,
    k: int,
    label: str = "",
) -> np.ndarray:
    """
    For each frame, find K nearest neighbors in feature space from other episodes.
    Return per-frame action variance (mean across action dims).
    """
    n = len(features)
    print(f"  Building KD-tree on {n:,} vectors ({label}) …")
    tree = cKDTree(features)

    k_query = min(k * 3, n - 1)
    print(f"  Querying {k_query} neighbors per frame …")
    _dists, indices = tree.query(features, k=k_query + 1)
    indices = indices[:, 1:]

    print(f"  Computing cross-episode action variance ({label}) …")
    variance = np.zeros(n)
    for i in range(n):
        ep_i = episode_ids[i]
        neighbors = indices[i]
        cross_ep = neighbors[episode_ids[neighbors] != ep_i][:k]
        if len(cross_ep) < 2:
            variance[i] = 0.0
            continue
        neighbor_actions = action_norm[cross_ep]
        variance[i] = np.mean(np.var(neighbor_actions, axis=0))

    return variance


# ── Visualization ───────────────────────────────────────


def _style_ax(ax: plt.Axes) -> None:
    ax.set_facecolor("#0d1117")
    ax.tick_params(colors="#555", labelsize=8)
    for spine in ax.spines.values():
        spine.set_color("#333")


def _plot_histogram(ax: plt.Axes, variance: np.ndarray, title: str, color: str) -> None:
    _style_ax(ax)
    median_var = np.median(variance)
    mean_var = np.mean(variance)
    nonzero = variance[variance > 0]
    if len(nonzero) > 0:
        bins = np.logspace(np.log10(nonzero.min().clip(1e-6)), np.log10(nonzero.max()), 60)
        ax.hist(nonzero, bins=bins, color=color, alpha=0.8, edgecolor="#222")
    ax.set_xscale("log")
    ax.axvline(median_var, color="#ff6600", linewidth=2, label=f"median={median_var:.3f}")
    ax.axvline(mean_var, color="#ff2222", linewidth=2, linestyle="--", label=f"mean={mean_var:.3f}")
    ax.set_xlabel("Action variance (log scale)", color="#888", fontsize=10)
    ax.set_ylabel("Frame count", color="#888", fontsize=10)
    ax.set_title(title, color="white", fontsize=11, pad=10)
    ax.legend(fontsize=8, facecolor="#1a1a2e", edgecolor="#333", labelcolor="white")


def _plot_episode_curves(
    ax: plt.Axes,
    var_state: np.ndarray,
    var_image: np.ndarray,
    episode_ids: np.ndarray,
    title: str,
) -> None:
    _style_ax(ax)
    unique_eps = np.unique(episode_ids)

    ep_means_s = np.array([var_state[episode_ids == ep].mean() for ep in unique_eps])
    ep_means_i = np.array([var_image[episode_ids == ep].mean() for ep in unique_eps])

    sorted_s = np.sort(ep_means_s)[::-1]
    sorted_i = np.sort(ep_means_i)[::-1]
    ep_x = np.arange(len(unique_eps))

    ax.fill_between(ep_x, sorted_s, alpha=0.2, color="#4363d8")
    ax.plot(ep_x, sorted_s, color="#4363d8", linewidth=1.2, label=f"State (med={np.median(ep_means_s):.3f})")
    ax.fill_between(ep_x, sorted_i, alpha=0.2, color="#e6194b")
    ax.plot(ep_x, sorted_i, color="#e6194b", linewidth=1.2, label=f"Image (med={np.median(ep_means_i):.3f})")

    ax.set_xlabel("Episode rank (worst → best)", color="#888", fontsize=10)
    ax.set_ylabel("Mean action variance", color="#888", fontsize=10)
    ax.set_title(title, color="white", fontsize=11, pad=10)
    ax.legend(fontsize=8, facecolor="#1a1a2e", edgecolor="#333", labelcolor="white")


def _plot_heatmap(
    ax: plt.Axes, fig: plt.Figure, tcp_xz: np.ndarray, variance: np.ndarray, title: str
) -> None:
    _style_ax(ax)
    order = np.argsort(variance)
    pts = tcp_xz[order]
    var_sorted = variance[order]
    vmin = np.percentile(variance[variance > 0], 5) if np.any(variance > 0) else 0
    vmax = np.percentile(variance[variance > 0], 95) if np.any(variance > 0) else 1
    sc = ax.scatter(
        pts[:, 0],
        pts[:, 1],
        c=var_sorted,
        cmap=CONSISTENCY_CMAP,
        s=0.5,
        alpha=0.6,
        vmin=vmin,
        vmax=vmax,
        rasterized=True,
    )
    ax.set_xlabel("X (m)", color="#888", fontsize=10)
    ax.set_ylabel("Z (m)", color="#888", fontsize=10)
    ax.set_title(title, color="white", fontsize=11, pad=10)
    ax.set_aspect("equal")
    cbar = fig.colorbar(sc, ax=ax, shrink=0.8, pad=0.02)
    cbar.set_label("Action variance", color="white", fontsize=9)
    cbar.ax.tick_params(colors="#aaa", labelsize=7)


def render(results: list[dict], out_path: Path) -> None:
    """
    4-row x N-column figure:
      Row 0: State-based variance histogram
      Row 1: Image-based variance histogram
      Row 2: Per-episode curves (both overlaid)
      Row 3: Spatial heatmap (image-based variance)
    """
    n_ds = len(results)
    fig, axes = plt.subplots(4, n_ds, figsize=(9 * n_ds, 24), facecolor="#0d1117")
    if n_ds == 1:
        axes = axes[:, np.newaxis]

    headline_parts = []
    for col, r in enumerate(results):
        label = r["label"]
        var_s = r["var_state"]
        var_i = r["var_image"]
        tcp_xz = r["tcp_xz"]
        episode_ids = r["episode_ids"]

        med_s = np.median(var_s)
        med_i = np.median(var_i)
        headline_parts.append(f"{label}: state={med_s:.3f}, image={med_i:.3f}")

        _plot_histogram(axes[0, col], var_s, f"{label}\nState-based variance (K={K_NEIGHBORS})", "#4363d8")
        _plot_histogram(
            axes[1, col], var_i, f"{label}\nImage-based variance (SigLIP, K={K_NEIGHBORS})", "#e6194b"
        )
        _plot_episode_curves(
            axes[2, col],
            var_s,
            var_i,
            episode_ids,
            f"{label}\nPer-episode inconsistency ({len(np.unique(episode_ids)):,} episodes)",
        )
        _plot_heatmap(
            axes[3, col],
            fig,
            tcp_xz,
            var_i,
            f"{label}\nImage-based variance by TCP position (XZ)",
        )

    fig.suptitle(
        f"Action Consistency: State vs Image  (chunk={ACTION_CHUNK_SIZE}, K={K_NEIGHBORS})\n"
        + "  |  ".join(headline_parts),
        color="white",
        fontsize=15,
        y=0.99,
    )
    plt.tight_layout(rect=[0, 0, 1, 0.96])
    plt.savefig(out_path, dpi=DPI, bbox_inches="tight", facecolor=fig.get_facecolor())
    plt.close()
    print(f"\n✓ Saved: {out_path}")


# ── Main ────────────────────────────────────────────────


def main() -> None:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")
    rng = np.random.default_rng(SEED)
    results = []

    for ds in DATASETS:
        repo_id, label = ds["repo_id"], ds["label"]
        print(f"\n{'=' * 60}")
        print(f"  {label}: {repo_id}")
        print(f"{'=' * 60}")

        local = download_data(repo_id, CAMERA_KEY)
        data = load_state_action_data(local, MAX_FRAMES, ACTION_CHUNK_SIZE, rng)

        # --- State-based KNN ---
        var_state = compute_consistency(
            data["state_norm"], data["action_norm"], data["episode_ids"], K_NEIGHBORS, "state"
        )
        print(
            f"  State variance: median={np.median(var_state):.4f}  "
            f"mean={np.mean(var_state):.4f}  p90={np.percentile(var_state, 90):.4f}"
        )

        # --- Image-based KNN ---
        print("\n  Preparing image embeddings …")
        video_lookup = build_video_lookup(local, CAMERA_KEY)
        frames = extract_frames(data["chosen_idx"], data["episode_all"], video_lookup)
        embeddings = encode_frames_siglip(frames, ENCODER_MODEL, ENCODE_BATCH_SIZE, device)
        del frames  # free memory

        var_image = compute_consistency(
            embeddings, data["action_norm"], data["episode_ids"], K_NEIGHBORS, "image"
        )
        print(
            f"  Image variance: median={np.median(var_image):.4f}  "
            f"mean={np.mean(var_image):.4f}  p90={np.percentile(var_image, 90):.4f}"
        )

        # FK for spatial heatmap
        print("  Computing FK for spatial heatmap …")
        left_raw = data["state_raw"][:, data["left_joint_idx"]]
        left_rad = _detect_and_convert(left_raw)
        left_tcp = batch_fk(LEFT_CHAIN, left_rad)
        tcp_xz = left_tcp[:, [0, 2]]

        results.append(
            {
                "label": label,
                "var_state": var_state,
                "var_image": var_image,
                "episode_ids": data["episode_ids"],
                "tcp_xz": tcp_xz,
                "n_total": data["n_total"],
            }
        )

    out = OUTPUT_DIR / "action_consistency_comparison.jpg"
    render(results, out)

    # Save worst-episodes summary (image-based, since that's the stronger signal)
    worst_summary = {}
    for r in results:
        unique_eps = np.unique(r["episode_ids"])
        ep_means = {int(ep): float(r["var_image"][r["episode_ids"] == ep].mean()) for ep in unique_eps}
        ranked = sorted(ep_means.items(), key=lambda x: x[1], reverse=True)[:50]
        worst_summary[r["label"]] = [{"episode": ep, "mean_variance": v} for ep, v in ranked]
    worst_path = OUTPUT_DIR / "action_consistency_worst_episodes.json"
    worst_path.write_text(json.dumps(worst_summary, indent=2))
    print(f"✓ Saved worst episodes: {worst_path}")


if __name__ == "__main__":
    main()