Add Robometer reward model

2026-07-10 11:31:57 +00:00 · 2026-05-17 14:59:23 +02:00
parent 9db9c35cb4
commit f6a13b1338
19 changed files with 2701 additions and 10 deletions
@@ -0,0 +1,164 @@
+#!/usr/bin/env python
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+"""Pinpoint exactly which rows of ``embed_tokens`` / ``lm_head`` differ.
+
+Useful follow-up to ``scripts/verify_robometer_export.py`` when the verifier
+reports a small tail of differing keys but you want to know whether the
+diff is:
+
+1. Concentrated in the 5 special-token rows added by ``resize_token_embeddings``
+   (expected non-determinism: mean-resize sampling differs between runs).
+2. Spread across the full vocabulary (would point to a real loading bug).
+
+Also confirms whether ``apply_upstream_checkpoint`` actually overwrites the
+embed/lm-head tensors when loading the upstream state dict (vs. silently
+skipping them due to a key mismatch).
+"""
+
+from __future__ import annotations
+
+import argparse
+import sys
+
+import torch
+from safetensors.torch import load_file
+
+from lerobot.configs.rewards import RewardModelConfig
+from lerobot.rewards.robometer import RobometerConfig, RobometerRewardModel
+from lerobot.rewards.robometer._upstream_loader import (
+    _download_robometer_snapshot,
+    _remap_state_dict_keys,
+    _resolve_checkpoint_safetensors_files,
+    apply_upstream_checkpoint,
+)
+
+# State-dict keys of the two tensors this script compares row by row
+# (input token embeddings and the LM output head).
+EMBED_KEY = "model.model.language_model.embed_tokens.weight"
+LMHEAD_KEY = "model.lm_head.weight"
+
+
+def _load_upstream(path: str) -> RobometerRewardModel:
+    """Build a CPU Robometer from a fresh config and load the upstream checkpoint into it.
+
+    Args:
+        path: Upstream checkpoint location; used both as the config's
+            ``pretrained_path`` and as the source for ``apply_upstream_checkpoint``.
+
+    Returns:
+        The model after ``eval()`` has been called on it.
+    """
+    reward_model = RobometerRewardModel(RobometerConfig(pretrained_path=path, device="cpu"))
+    apply_upstream_checkpoint(reward_model, path)
+    reward_model.eval()
+    return reward_model
+
+
+def _load_lerobot(path: str) -> RobometerRewardModel:
+    """Load a LeRobot-format Robometer on CPU using the config saved at ``path``.
+
+    Args:
+        path: Location passed to both ``RewardModelConfig.from_pretrained`` and
+            ``RobometerRewardModel.from_pretrained``.
+
+    Returns:
+        The model returned by ``RobometerRewardModel.from_pretrained``.
+
+    Raises:
+        TypeError: If the saved config is not a ``RobometerConfig``.
+    """
+    saved_config = RewardModelConfig.from_pretrained(path)
+    if isinstance(saved_config, RobometerConfig):
+        # Point the config at the same location and force CPU before loading.
+        saved_config.pretrained_path = path
+        saved_config.device = "cpu"
+        return RobometerRewardModel.from_pretrained(path, config=saved_config)
+    raise TypeError(f"Expected RobometerConfig, got {type(saved_config)}")
+
+
+def _inspect_upstream_state_dict(upstream_path: str, model: RobometerRewardModel) -> None:
+    """Dump the upstream state-dict view of the embed/lm-head tensors.
+
+    Loads the raw upstream safetensors (pre-remap), runs the remapper, and
+    reports whether the embed/lm-head keys survive into the merged dict that
+    eventually hits ``model.load_state_dict``.
+
+    Args:
+        upstream_path: Upstream checkpoint location handed to the snapshot downloader.
+        model: Model whose ``state_dict()`` keys define what the remapped
+            checkpoint is expected to contain.
+    """
+    snapshot_dir = _download_robometer_snapshot(upstream_path)
+    merged: dict[str, torch.Tensor] = {}
+    for shard_path in _resolve_checkpoint_safetensors_files(snapshot_dir):
+        # dict.update: a key repeated in a later shard replaces the earlier value.
+        merged.update(load_file(str(shard_path)))
+    remapped = _remap_state_dict_keys(merged, model)
+
+    # Build the model's key set once instead of calling state_dict() twice.
+    expected = set(model.state_dict())
+    remapped_keys = set(remapped)
+
+    print(f"\n=== Upstream state-dict inspection (snapshot at {snapshot_dir}) ===")
+    print(f"raw keys (before remap)  : {len(merged)}")
+    print(f"keys after remap         : {len(remapped)}")
+    print(f"model expects (state_dict): {len(expected)}")
+
+    present_after_remap = remapped_keys & expected
+    print(f"keys present after remap : {len(present_after_remap)}")
+
+    missing_keys = expected - remapped_keys
+    print(f"keys missing from remap  : {len(missing_keys)}")
+    if missing_keys:
+        # Sorted so the sample is the same on every run (set order is arbitrary).
+        sample = sorted(missing_keys)[:10]
+        print(f"  sample missing keys    : {sample}")
+
+    unexpected_keys = remapped_keys - expected
+    print(f"keys unexpected by model : {len(unexpected_keys)}")
+    if unexpected_keys:
+        sample = sorted(unexpected_keys)[:10]
+        print(f"  sample unexpected keys : {sample}")
+
+    for key in (EMBED_KEY, LMHEAD_KEY):
+        present = key in remapped
+        shape = tuple(remapped[key].shape) if present else None
+        print(f"  {key:60s}  present={present}, shape={shape}")
+
+
+def _diff_embed(name: str, a: torch.Tensor, b: torch.Tensor, special_token_count: int) -> None:
+    """Print a row-level diff report for two equally shaped weight matrices.
+
+    Both tensors are compared in float32. The report gives the global max
+    absolute difference, the number of rows that differ at all, and how those
+    rows split between the trailing ``special_token_count`` rows and the rows
+    before them, followed by per-row details for the trailing rows.
+
+    Args:
+        name: Label printed in the report header.
+        a: Upstream tensor (printed as ``upstream_norm``); reduced along
+            ``dim=1``, so it must be at least 2-D.
+        b: LeRobot-format tensor (printed as ``lerobot_norm``).
+        special_token_count: Number of trailing rows treated as special-token
+            rows. Values outside ``[0, a.shape[0]]`` are clamped into that range.
+    """
+    a = a.float()
+    b = b.float()
+    if a.shape != b.shape:
+        print(f"❌ {name} shape mismatch: {tuple(a.shape)} vs {tuple(b.shape)}")
+        return
+
+    abs_diff = (a - b).abs()
+    per_row_max = abs_diff.max(dim=1).values
+    nz_rows = (per_row_max > 0).nonzero(as_tuple=True)[0].tolist()
+    print(f"\n=== {name} (shape {tuple(a.shape)}) ===")
+    print(f"global max|Δ|         = {abs_diff.max().item():.3e}")
+    print(f"rows with any diff    = {len(nz_rows)}")
+    if not nz_rows:
+        return
+
+    print(f"  first nonzero rows  = {nz_rows[:10]}")
+    print(f"  last nonzero rows   = {nz_rows[-10:]}")
+    vocab_size = a.shape[0]
+    # Clamp so a negative or oversized count cannot produce negative row
+    # indices, which would wrap around or raise IndexError below.
+    special_token_count = max(0, min(special_token_count, vocab_size))
+    base_vocab = vocab_size - special_token_count
+    # nz_rows holds indices in [0, vocab_size), so a threshold test is enough.
+    in_special = sum(1 for r in nz_rows if r >= base_vocab)
+    out_special = len(nz_rows) - in_special
+    print(
+        f"  diffs in special-token rows ({base_vocab}..{vocab_size - 1}): {in_special}/{special_token_count}"
+    )
+    print(f"  diffs in base-vocab rows  (0..{base_vocab - 1})           : {out_special}")
+    for r in range(base_vocab, vocab_size):
+        print(
+            f"    row {r}: max|Δ|={per_row_max[r].item():.3e}, "
+            f"upstream_norm={a[r].norm().item():.3e}, lerobot_norm={b[r].norm().item():.3e}"
+        )
+
+
+def main() -> int:
+    """Load both models, inspect the upstream state dict, and diff the embed/lm-head rows.
+
+    Returns:
+        Always ``0``; missing keys are reported on stdout but do not change the exit code.
+    """
+    cli = argparse.ArgumentParser(
+        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+    cli.add_argument("--upstream", required=True)
+    cli.add_argument("--lerobot", required=True)
+    cli.add_argument(
+        "--special-token-count",
+        type=int,
+        default=5,
+        help="Number of special tokens Robometer adds. Defaults to len(ROBOMETER_SPECIAL_TOKENS)=5.",
+    )
+    opts = cli.parse_args()
+
+    print(f"Loading upstream:        {opts.upstream}")
+    upstream_model = _load_upstream(opts.upstream)
+    print(f"Loading LeRobot-format:  {opts.lerobot}")
+    lerobot_model = _load_lerobot(opts.lerobot)
+
+    _inspect_upstream_state_dict(opts.upstream, upstream_model)
+
+    upstream_sd = upstream_model.state_dict()
+    lerobot_sd = lerobot_model.state_dict()
+
+    for key in (EMBED_KEY, LMHEAD_KEY):
+        in_upstream = key in upstream_sd
+        in_lerobot = key in lerobot_sd
+        if in_upstream and in_lerobot:
+            _diff_embed(key, upstream_sd[key], lerobot_sd[key], opts.special_token_count)
+        else:
+            # Report the gap and move on to the next tensor.
+            print(f"❌ key missing: {key} (upstream={in_upstream}, lerobot={in_lerobot})")
+
+    return 0
+
+
+# Script entry point: main()'s return value becomes the process exit status.
+if __name__ == "__main__":
+    sys.exit(main())
@@ -0,0 +1,168 @@
+#!/usr/bin/env python
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+"""Extract one LIBERO episode for Robometer parity testing.
+
+Loads a LeRobot LIBERO (or any video-bearing LeRobot) dataset, picks one
+episode, samples ``--num-frames`` frames uniformly across its duration
+(matching upstream Robometer's default of 8 frames), and saves them to
+``frames.npz`` plus a sidecar ``task.txt`` file and a ``frame_indices.npy``
+file recording which frame offsets of the episode were sampled.
+
+The ``.npz`` layout (``frames`` key, ``(T, H, W, C) uint8``) is what upstream
+``example_inference_local.py`` consumes, so the same file feeds both pipelines
+and frame sampling cannot drift.
+
+Workflow:
+
+1. Run this script (LeRobot env) to produce ``frames.npz`` + ``task.txt``.
+2. Pass them to upstream ``scripts/example_inference_local.py``
+   (upstream env) to produce reference progress / success outputs.
+3. Pass the same ``frames.npz`` to ``scripts/parity_robometer.py``
+   (LeRobot env) to compare both sides.
+
+Example:
+
+    uv run python scripts/extract_libero_episode_for_parity.py \\
+        --repo-id lerobot/libero_10_image \\
+        --episode 0 \\
+        --num-frames 8 \\
+        --out-dir /tmp/libero_ep0
+"""
+
+from __future__ import annotations
+
+import argparse
+import sys
+from pathlib import Path
+
+import numpy as np
+import torch
+
+from lerobot.configs.types import FeatureType
+from lerobot.datasets.lerobot_dataset import LeRobotDataset
+
+
+def _pick_visual_feature(features: dict, requested: str | None) -> str:
+    """Return a visual feature key, preferring ``requested`` when given.
+
+    A feature counts as visual when it is a dict whose ``"dtype"`` is
+    ``"video"`` or ``"image"``, or a non-dict object whose ``type`` attribute
+    equals ``FeatureType.VISUAL``.
+
+    Args:
+        features: Mapping of feature key to feature description.
+        requested: Camera key to use, or ``None`` to take the first visual key.
+
+    Returns:
+        ``requested`` if it names a visual feature, otherwise the first visual
+        key in ``features`` iteration order.
+
+    Raises:
+        ValueError: If there is no visual feature, or ``requested`` is not one.
+    """
+    visual_keys = []
+    for key, ft in features.items():
+        if isinstance(ft, dict):
+            # Image-backed datasets use dtype "image", so accepting only
+            # "video" would reject them.
+            is_visual = ft.get("dtype", "") in ("video", "image")
+        else:
+            # Feature objects have no .get(); only look at their ``type``.
+            is_visual = getattr(ft, "type", None) == FeatureType.VISUAL
+        if is_visual:
+            visual_keys.append(key)
+
+    if not visual_keys:
+        raise ValueError(f"Dataset has no visual feature; available: {list(features)}")
+    if requested is None:
+        return visual_keys[0]
+    if requested not in visual_keys:
+        raise ValueError(f"Camera key {requested!r} not in dataset visual features {visual_keys}")
+    return requested
+
+
+def _frame_uint8_hwc(tensor: torch.Tensor) -> np.ndarray:
+    """Convert a LeRobotDataset video frame to ``uint8`` ``(H, W, C)`` RGB.
+
+    Args:
+        tensor: A single frame. A 3-D input whose first axis has size 1 or 3 is
+            treated as channel-first and transposed to channel-last.
+
+    Returns:
+        A ``uint8`` array. Non-``uint8`` input whose maximum is at most
+        ``1.0 + 1e-3`` is scaled by 255; floating values are rounded to the
+        nearest integer and clipped to ``[0, 255]``. A single-channel result is
+        repeated to three channels.
+    """
+    arr = tensor.detach().cpu().numpy()
+    if arr.ndim == 3 and arr.shape[0] in (1, 3):
+        arr = arr.transpose(1, 2, 0)
+    if arr.dtype != np.uint8:
+        # Heuristic: a maximum of ~1 means the frame is normalised to [0, 1].
+        # ``arr.size`` guards the reduction against an empty array.
+        scaled = arr * 255.0 if (arr.size and arr.max() <= 1.0 + 1e-3) else arr
+        if np.issubdtype(scaled.dtype, np.floating):
+            # Round to nearest: a bare astype() truncates toward zero, which
+            # would turn e.g. 254.7 into 254.
+            scaled = np.rint(scaled)
+        arr = np.clip(scaled, 0, 255).astype(np.uint8)
+    if arr.shape[-1] == 1:
+        arr = np.repeat(arr, 3, axis=-1)
+    return arr
+
+
+def main() -> int:
+    """Sample frames from one episode and write them with the task string to disk.
+
+    Writes ``frames.npz``, ``task.txt`` and ``frame_indices.npy`` into
+    ``--out-dir`` and prints the follow-up commands for the upstream and
+    LeRobot sides of the parity check.
+
+    Returns:
+        ``0`` on success, ``1`` when the episode has no frames or no task
+        string. An invalid ``--num-frames`` exits through ``parser.error``.
+    """
+    # Local import: only needed to shell-quote the task in the printed commands.
+    import shlex
+
+    parser = argparse.ArgumentParser(
+        description=__doc__,
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    parser.add_argument(
+        "--repo-id",
+        default="lerobot/libero_10_image",
+        help="LeRobot LIBERO (or other) dataset repo id (default: lerobot/libero_10_image).",
+    )
+    parser.add_argument("--episode", type=int, default=0, help="Episode index.")
+    parser.add_argument(
+        "--camera-key",
+        default=None,
+        help="Visual feature key (e.g. observation.images.image). Auto-selects first if omitted.",
+    )
+    parser.add_argument(
+        "--num-frames",
+        type=int,
+        default=8,
+        help="Number of frames to sample uniformly (default: 8 — Robometer's training-time default).",
+    )
+    parser.add_argument(
+        "--out-dir",
+        type=Path,
+        default=Path("outputs/robometer_parity/libero"),
+        help="Directory to write frames.npz / task.txt / frame_indices.npy.",
+    )
+    args = parser.parse_args()
+
+    # Reject this up front: zero frames would otherwise fall through to the
+    # unrelated "no task description" error, and a negative count fails in linspace.
+    if args.num_frames < 1:
+        parser.error("--num-frames must be >= 1")
+
+    print(f"Loading {args.repo_id} (episode {args.episode})...")
+    dataset = LeRobotDataset(args.repo_id, episodes=[args.episode])
+
+    camera_key = _pick_visual_feature(dataset.features, args.camera_key)
+    print(f"Using camera key: {camera_key}")
+
+    # Index 0 because exactly one episode was requested above.
+    # NOTE(review): assumes episode_data_index is expressed in the loaded
+    # dataset's own indexing — confirm against the installed LeRobot version.
+    ep_from = int(dataset.episode_data_index["from"][0].item())
+    ep_to = int(dataset.episode_data_index["to"][0].item())
+    total_frames = ep_to - ep_from
+    if total_frames <= 0:
+        print(f"ERROR: episode {args.episode} has no frames.", file=sys.stderr)
+        return 1
+    print(f"Episode has {total_frames} frames; sampling {args.num_frames} uniformly.")
+
+    # Never request more samples than the episode has frames.
+    indices = np.linspace(0, total_frames - 1, num=min(args.num_frames, total_frames), dtype=int)
+    frames: list[np.ndarray] = []
+    task: str = ""
+    for offset in indices:
+        sample = dataset[ep_from + int(offset)]
+        frame_tensor = sample[camera_key]
+        frames.append(_frame_uint8_hwc(frame_tensor))
+        if not task:
+            # Keep the first non-empty task string seen among the sampled frames.
+            task = sample.get("task", "") or ""
+
+    if not task:
+        print("ERROR: episode has no task description in metadata.", file=sys.stderr)
+        return 1
+
+    frames_array = np.stack(frames)
+
+    args.out_dir.mkdir(parents=True, exist_ok=True)
+    frames_path = args.out_dir / "frames.npz"
+    task_path = args.out_dir / "task.txt"
+    indices_path = args.out_dir / "frame_indices.npy"
+
+    np.savez(frames_path, frames=frames_array)
+    task_path.write_text(task + "\n", encoding="utf-8")
+    np.save(indices_path, indices)
+
+    # Quote the task so the printed commands stay copy-pasteable even when it
+    # contains quotes, ``$`` or backticks.
+    quoted_task = shlex.quote(task)
+
+    print()
+    print(f"Wrote {frames_path} (shape={frames_array.shape}, dtype={frames_array.dtype})")
+    print(f"Wrote {task_path}   (task={task!r})")
+    print(f"Wrote {indices_path} (frame_indices={indices.tolist()})")
+    print()
+    print("Next steps:")
+    print("  # in upstream env (where `robometer` is importable):")
+    print(
+        f"  python third_party/robometer/scripts/example_inference_local.py \\\n"
+        f"      --model-path robometer/Robometer-4B \\\n"
+        f"      --video {frames_path} \\\n"
+        f"      --task {quoted_task} \\\n"
+        f"      --out {args.out_dir / 'upstream.npy'}"
+    )
+    print()
+    print("  # back in LeRobot env:")
+    print(
+        f"  uv run python scripts/parity_robometer.py \\\n"
+        f"      --frames {frames_path} \\\n"
+        f"      --task {quoted_task} \\\n"
+        f"      --upstream-progress {args.out_dir / 'upstream.npy'} \\\n"
+        f"      --upstream-success  {args.out_dir / 'upstream_success_probs.npy'}"
+    )
+    return 0
+
+
+# Script entry point: main()'s return value becomes the process exit status.
+if __name__ == "__main__":
+    sys.exit(main())
@@ -0,0 +1,232 @@
+#!/usr/bin/env python
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+"""Functional parity check: LeRobot Robometer vs. upstream Robometer.
+
+Runs the in-tree :class:`RobometerRewardModel` on the same frames + task that
+upstream Robometer was run on, and compares per-frame progress / success
+predictions against reference outputs saved by upstream's
+``scripts/example_inference_local.py``.
+
+Workflow:
+
+1. In the upstream Robometer environment (where ``robometer`` is importable),
+   run::
+
+       python third_party/robometer/scripts/example_inference_local.py \\
+           --model-path robometer/Robometer-4B \\
+           --video /path/to/episode.mp4 \\
+           --task "Open the drawer" \\
+           --fps 1.0 \\
+           --out /tmp/robometer_upstream.npy
+
+   This produces:
+   - ``/tmp/robometer_upstream.npy``               (progress predictions)
+   - ``/tmp/robometer_upstream_success_probs.npy`` (success probabilities)
+
+2. Extract the exact same frames the upstream script used, save as ``.npz``::
+
+       # quick helper: extract frames at the same fps and save as .npz
+       python -c "
+       from third_party.robometer.scripts.example_inference_local import load_frames_input
+       import numpy as np
+       frames = load_frames_input('/path/to/episode.mp4', fps=1.0, max_frames=512)
+       np.savez('/tmp/robometer_frames.npz', frames=frames)
+       "
+
+3. In this LeRobot env, run this script::
+
+       uv run python scripts/parity_robometer.py \\
+           --frames /tmp/robometer_frames.npz \\
+           --task "Open the drawer" \\
+           --upstream-progress /tmp/robometer_upstream.npy \\
+           --upstream-success  /tmp/robometer_upstream_success_probs.npy \\
+           --lerobot-model     lilkm/robometer-4b
+"""
+
+from __future__ import annotations
+
+import argparse
+import sys
+
+import numpy as np
+import torch
+
+from lerobot.configs.rewards import RewardModelConfig
+from lerobot.rewards.robometer import RobometerConfig, RobometerRewardModel
+from lerobot.rewards.robometer.modeling_robometer import decode_progress_outputs
+from lerobot.rewards.robometer.processor_robometer import RobometerEncoderProcessorStep
+
+
+def _load_frames(path: str) -> np.ndarray:
+    """Load frames from .npy/.npz and return them as ``(T, H, W, C)`` ``uint8``.
+
+    Args:
+        path: File ending in ``.npy`` or ``.npz``. For ``.npz`` the ``frames``
+            entry is used when present, otherwise the first array in the archive.
+
+    Returns:
+        A 4-D ``uint8`` array whose last axis has size 1 or 3. Floating input
+        whose maximum is at most ``1.0 + 1e-3`` is scaled by 255; floating
+        values are rounded to nearest and clipped to ``[0, 255]``. Input with
+        the channel axis at position 1 is transposed to channel-last.
+
+    Raises:
+        ValueError: For an unsupported extension, an empty ``.npz`` archive,
+            non-4-D data, or an unrecognisable channel layout.
+    """
+    if path.endswith(".npy"):
+        frames = np.load(path)
+    elif path.endswith(".npz"):
+        with np.load(path, allow_pickle=False) as npz:
+            if "frames" in npz:
+                frames = npz["frames"].copy()
+            elif npz.files:
+                frames = npz[npz.files[0]].copy()
+            else:
+                # Explicit error instead of a bare StopIteration from next().
+                raise ValueError(f"No arrays found in {path!r}.")
+    else:
+        raise ValueError(f"Frames must be .npy or .npz (got {path!r}).")
+
+    if frames.dtype != np.uint8:
+        if np.issubdtype(frames.dtype, np.floating):
+            # Frames normalised to [0, 1] must be scaled first; clipping and
+            # casting them directly would collapse the whole video to 0/1.
+            if frames.size and float(frames.max()) <= 1.0 + 1e-3:
+                frames = frames * 255.0
+            # Round to nearest; astype() alone truncates toward zero.
+            frames = np.rint(frames)
+        frames = np.clip(frames, 0, 255).astype(np.uint8)
+    if frames.ndim != 4:
+        raise ValueError(f"Frames must be 4D (T,H,W,C); got shape {frames.shape}.")
+    if frames.shape[-1] not in (1, 3):
+        # Probably (T,C,H,W) — transpose
+        if frames.shape[1] in (1, 3):
+            frames = frames.transpose(0, 2, 3, 1)
+        else:
+            raise ValueError(f"Cannot interpret frame channel layout: {frames.shape}.")
+    return frames
+
+
+def _run_lerobot(
+    frames: np.ndarray,
+    task: str,
+    model_path: str,
+    device: str,
+) -> tuple[np.ndarray, np.ndarray]:
+    """Run LeRobot's Robometer on the given frames; return (progress, success).
+
+    Args:
+        frames: Frame stack passed to the encoder together with ``task``.
+        task: Task instruction string.
+        model_path: LeRobot-format Robometer location (config and weights).
+        device: Device stored in the config before the model is loaded.
+
+    Returns:
+        ``(progress, success)`` as ``float32`` arrays for the single encoded
+        sample; ``success`` is empty when the decoder returns no success
+        probabilities.
+
+    Raises:
+        TypeError: If the config saved at ``model_path`` is not a ``RobometerConfig``.
+    """
+    # Start from the config saved with the export, as the verification scripts
+    # do, so non-default fields of the export are honoured instead of being
+    # replaced by RobometerConfig defaults.
+    cfg = RewardModelConfig.from_pretrained(model_path)
+    if not isinstance(cfg, RobometerConfig):
+        raise TypeError(f"Expected RobometerConfig, got {type(cfg)}")
+    cfg.pretrained_path = model_path
+    cfg.device = device
+    # Use every supplied frame, matching the encoder's max_frames=None below.
+    cfg.max_frames = None
+    model = RobometerRewardModel.from_pretrained(model_path, config=cfg)
+
+    encoder = RobometerEncoderProcessorStep(
+        base_model_id=model.config.base_model_id,
+        use_multi_image=model.config.use_multi_image,
+        use_per_frame_progress_token=model.config.use_per_frame_progress_token,
+        max_frames=None,
+    )
+    batch = encoder.encode_samples([(frames, task)])
+
+    # Move every entry that supports ``.to`` onto the model's device; leave the rest as-is.
+    model_device = next(model.model.parameters()).device
+    inputs = {key: value.to(model_device) if hasattr(value, "to") else value for key, value in batch.items()}
+
+    model.eval()
+    with torch.no_grad():
+        progress_logits, success_logits = model._compute_rbm_logits(inputs)
+
+    decoded = decode_progress_outputs(
+        progress_logits,
+        success_logits,
+        is_discrete_mode=model.config.use_discrete_progress,
+    )
+    # Index 0: a single sample was encoded.
+    progress = np.asarray(decoded["progress_pred"][0], dtype=np.float32)
+    success_probs = decoded["success_probs"]
+    # Explicit None/length test: plain truthiness is ambiguous for array-like values.
+    if success_probs is not None and len(success_probs) > 0:
+        success = np.asarray(success_probs[0], dtype=np.float32)
+    else:
+        success = np.array([], dtype=np.float32)
+    return progress, success
+
+
+def _compare(name: str, lerobot: np.ndarray, upstream: np.ndarray, atol: float, rtol: float) -> bool:
+    """Print difference statistics for two arrays and report whether they agree.
+
+    Args:
+        name: Section label printed in the header.
+        lerobot: Values produced by the LeRobot pipeline.
+        upstream: Reference values to compare against.
+        atol: Absolute tolerance forwarded to ``np.allclose``.
+        rtol: Relative tolerance forwarded to ``np.allclose``.
+
+    Returns:
+        ``False`` on a shape mismatch, otherwise the ``np.allclose`` verdict.
+    """
+    print(f"\n=== {name} ===")
+    shapes_agree = lerobot.shape == upstream.shape
+    if not shapes_agree:
+        print(f"shape mismatch: lerobot={lerobot.shape}  upstream={upstream.shape}")
+        return False
+
+    delta = np.abs(lerobot - upstream)
+    # The small epsilon keeps the ratio finite where the reference is exactly zero.
+    relative = delta / (np.abs(upstream) + 1e-12)
+    print(f"shape        : {lerobot.shape}")
+    print(f"max |Δ|      : {delta.max():.3e}")
+    print(f"mean |Δ|     : {delta.mean():.3e}")
+    print(f"max rel |Δ|  : {relative.max():.3e}")
+    print(f"lerobot[:5]  : {lerobot[:5]}")
+    print(f"upstream[:5] : {upstream[:5]}")
+
+    verdict = bool(np.allclose(lerobot, upstream, atol=atol, rtol=rtol))
+    print(f"allclose(atol={atol}, rtol={rtol}) -> {verdict}")
+    return verdict
+
+
+def main() -> int:
+    """Run the LeRobot Robometer on saved frames and compare against upstream outputs.
+
+    Returns:
+        ``0`` when every performed comparison is within tolerance, ``1`` when a
+        comparison fails, ``2`` when ``--lerobot-model`` does not resolve to a
+        ``RobometerConfig``.
+    """
+    parser = argparse.ArgumentParser(
+        description=__doc__,
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    parser.add_argument(
+        "--frames",
+        required=True,
+        help=".npy / .npz file with the exact frames upstream was run on (T,H,W,C uint8).",
+    )
+    parser.add_argument("--task", required=True, help="Task instruction string.")
+    parser.add_argument(
+        "--upstream-progress",
+        required=True,
+        help="Reference progress .npy saved by upstream example_inference_local.py.",
+    )
+    parser.add_argument(
+        "--upstream-success",
+        default=None,
+        help="Optional reference success_probs .npy. If omitted, success comparison is skipped.",
+    )
+    parser.add_argument(
+        "--lerobot-model",
+        default="lilkm/robometer-4b",
+        help="LeRobot-format Robometer Hub repo id or local path.",
+    )
+    parser.add_argument(
+        "--device",
+        default="cuda" if torch.cuda.is_available() else "cpu",
+        help="Device for the LeRobot model (default: cuda if available).",
+    )
+    parser.add_argument(
+        "--atol",
+        type=float,
+        default=1e-3,
+        help="Absolute tolerance for allclose (default: 1e-3; bf16 round-trip headroom).",
+    )
+    parser.add_argument(
+        "--rtol",
+        type=float,
+        default=1e-2,
+        help="Relative tolerance for allclose (default: 1e-2).",
+    )
+    parser.add_argument(
+        "--out-prefix",
+        default="lerobot_robometer_outputs",
+        help="Save the LeRobot outputs as <prefix>_progress.npy / <prefix>_success.npy.",
+    )
+    args = parser.parse_args()
+
+    # 0. Sanity: confirm the LeRobot config is a RobometerConfig.
+    cfg = RewardModelConfig.from_pretrained(args.lerobot_model)
+    if not isinstance(cfg, RobometerConfig):
+        print(f"ERROR: {args.lerobot_model!r} does not resolve to a RobometerConfig.", file=sys.stderr)
+        return 2
+
+    # 1. Load frames + task + upstream reference outputs.
+    frames = _load_frames(args.frames)
+    upstream_progress = np.load(args.upstream_progress).astype(np.float32)
+    upstream_success = (
+        np.load(args.upstream_success).astype(np.float32) if args.upstream_success is not None else None
+    )
+
+    print(f"Loaded {frames.shape[0]} frames at {frames.shape[1:]}, task={args.task!r}")
+    print(f"LeRobot model: {args.lerobot_model}  device: {args.device}")
+
+    # 2. Run LeRobot pipeline.
+    progress, success = _run_lerobot(frames, args.task, args.lerobot_model, args.device)
+    progress_path = f"{args.out_prefix}_progress.npy"
+    np.save(progress_path, progress)
+    saved_paths = [progress_path]
+    if success.size > 0:
+        success_path = f"{args.out_prefix}_success.npy"
+        np.save(success_path, success)
+        saved_paths.append(success_path)
+    # Name only the files that were actually written.
+    print(f"Saved LeRobot outputs to {' / '.join(saved_paths)}")
+
+    # 3. Compare to upstream references.
+    progress_ok = _compare("progress", progress, upstream_progress, args.atol, args.rtol)
+    if upstream_success is None:
+        success_ok = True
+        print("\n(skipping success comparison — upstream success file not provided)")
+    elif success.size == 0:
+        # Same outcome as before (treated as passing), but with the real reason.
+        success_ok = True
+        print("\n(skipping success comparison — LeRobot model produced no success predictions)")
+    else:
+        success_ok = _compare("success_probs", success, upstream_success, args.atol, args.rtol)
+
+    print()
+    if progress_ok and success_ok:
+        print("Parity check passed.")
+        return 0
+    print("Parity check FAILED.")
+    return 1
+
+
+# Script entry point: main()'s return value becomes the process exit status.
+if __name__ == "__main__":
+    sys.exit(main())
@@ -0,0 +1,149 @@
+#!/usr/bin/env python
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+
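+"""Verify that a LeRobot-format Robometer matches its upstream source.
+
+"Matches" means the compared config fields are equal and every state-dict
+tensor agrees within a 1e-3 absolute tolerance (not strict byte equality).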
+
+Run this once after publishing a LeRobot-format Robometer to the Hub, before
+flipping the default `RobometerConfig.pretrained_path` to it. It loads both
+the upstream snapshot and the re-exported copy, compares state dicts, and
+prints a clear pass/fail summary.
+
+Example:
+
+    python scripts/verify_robometer_export.py \\
+        --upstream robometer/Robometer-4B \\
+        --lerobot  lerobot/robometer-4b
+
+    python scripts/verify_robometer_export.py \\
+        --upstream robometer/Robometer-4B \\
+        --lerobot  ./robometer-4b-lerobot   # local folder also works
+"""
+
+from __future__ import annotations
+
+import argparse
+import sys
+
+from lerobot.configs.rewards import RewardModelConfig
+from lerobot.rewards.robometer import RobometerConfig, RobometerRewardModel
+from lerobot.rewards.robometer._upstream_loader import apply_upstream_checkpoint
+
+
+def _load_upstream(path: str) -> RobometerRewardModel:
+    """Build a CPU Robometer from a fresh config and load the upstream checkpoint into it.
+
+    A fresh ``RobometerConfig`` (``vlm_config=None``) triggers
+    ``RobometerRewardModel.__init__``'s upstream-matching path: download base
+    Qwen, resize for ROBOMETER_SPECIAL_TOKENS. ``apply_upstream_checkpoint``
+    then resizes again if the checkpoint's vocab differs (e.g. upstream was
+    trained against an older Qwen).
+
+    Args:
+        path: Upstream Robometer repo id or local path.
+
+    Returns:
+        The model after ``eval()`` has been called on it.
+    """
+    upstream_model = RobometerRewardModel(RobometerConfig(pretrained_path=path, device="cpu"))
+    apply_upstream_checkpoint(upstream_model, path)
+    upstream_model.eval()
+    return upstream_model
+
+
+def _load_lerobot(path: str) -> RobometerRewardModel:
+    """Load the re-exported (LeRobot-format) Robometer on CPU with its saved config.
+
+    Args:
+        path: LeRobot-format Robometer repo id or local path.
+
+    Returns:
+        The model returned by ``RobometerRewardModel.from_pretrained``.
+
+    Raises:
+        TypeError: If the saved config is not a ``RobometerConfig``.
+    """
+    export_config = RewardModelConfig.from_pretrained(path)
+    if isinstance(export_config, RobometerConfig):
+        # Point the config at the same location and force CPU before loading.
+        export_config.pretrained_path = path
+        export_config.device = "cpu"
+        return RobometerRewardModel.from_pretrained(path, config=export_config)
+    raise TypeError(f"Expected RobometerConfig in LeRobot export, got {type(export_config)}")
+
+
+def compare_state_dicts(a: RobometerRewardModel, b: RobometerRewardModel) -> bool:
+    """Compare two models' state dicts and print a pass/fail summary.
+
+    Args:
+        a: Reference (upstream) model.
+        b: LeRobot-format model checked against ``a``.
+
+    Returns:
+        ``True`` when both state dicts have the same keys and shapes and the
+        largest absolute element difference is below ``1e-3``. ``False`` on a
+        key or shape mismatch, a larger difference, or a NaN difference.
+    """
+    sd_a, sd_b = a.state_dict(), b.state_dict()
+    keys_a, keys_b = set(sd_a), set(sd_b)
+
+    missing = keys_a - keys_b
+    extra = keys_b - keys_a
+    # Samples are sorted so the report is identical from run to run.
+    if missing:
+        print(f"❌ {len(missing)} keys missing in LeRobot-format model (sample: {sorted(missing)[:5]})")
+    if extra:
+        print(f"❌ {len(extra)} extra keys in LeRobot-format model (sample: {sorted(extra)[:5]})")
+    if missing or extra:
+        return False
+
+    diff_summary: list[tuple[str, float]] = []
+    for key in sorted(keys_a):
+        ta, tb = sd_a[key], sd_b[key]
+        if ta.shape != tb.shape:
+            print(f"❌ shape mismatch at {key}: {tuple(ta.shape)} vs {tuple(tb.shape)}")
+            return False
+        if ta.numel() == 0:
+            # Nothing to compare, and max() of an empty tensor would raise.
+            continue
+        # Compare in float to avoid bfloat16 equality quirks.
+        max_abs = (ta.float() - tb.float()).abs().max().item()
+        if max_abs != max_abs:
+            # NaN never satisfies ``> 0``; without this a NaN mismatch would be
+            # reported as identical. Treat it as an unbounded difference.
+            max_abs = float("inf")
+        if max_abs > 0:
+            diff_summary.append((key, max_abs))
+
+    if not diff_summary:
+        print(f"✅ All {len(keys_a)} parameters identical")
+        return True
+
+    # Some keys differ; show worst offenders.
+    diff_summary.sort(key=lambda kv: kv[1], reverse=True)
+    print(f"⚠️  {len(diff_summary)} keys differ. Top 10 by max abs diff:")
+    for key, value in diff_summary[:10]:
+        print(f"    {key:60s}  max|Δ| = {value:.3e}")
+
+    # Tolerance: bf16 round-trips can introduce ULP-level noise but no real
+    # change. Allow up to 1e-3 absolute difference; anything larger is a real
+    # divergence.
+    worst = diff_summary[0][1]
+    if worst < 1e-3:
+        print(f"✅ Worst diff {worst:.3e} is within bf16 round-trip tolerance")
+        return True
+    print(f"❌ Worst diff {worst:.3e} exceeds tolerance (1e-3)")
+    return False
+
+
+def main() -> int:
+    """Load both models, compare selected config fields and the state dicts.
+
+    Returns:
+        ``0`` when the config fields and the state dicts both pass, ``1`` otherwise.
+    """
+    cli = argparse.ArgumentParser(
+        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+    cli.add_argument("--upstream", required=True, help="Upstream Robometer repo id or local path.")
+    cli.add_argument("--lerobot", required=True, help="LeRobot-format Robometer repo id or local path.")
+    opts = cli.parse_args()
+
+    print(f"Loading upstream:        {opts.upstream}")
+    upstream = _load_upstream(opts.upstream)
+    print(f"Loading LeRobot-format:  {opts.lerobot}")
+    lerobot = _load_lerobot(opts.lerobot)
+
+    # Config attributes that must be equal on both sides.
+    compared_fields = (
+        "base_model_id",
+        "torch_dtype",
+        "use_multi_image",
+        "use_per_frame_progress_token",
+        "average_temporal_patches",
+        "frame_pooling",
+        "frame_pooling_attn_temperature",
+        "progress_loss_type",
+        "progress_discrete_bins",
+    )
+
+    print("\n=== Config comparison ===")
+    config_ok = True
+    for field in compared_fields:
+        upstream_value = getattr(upstream.config, field)
+        lerobot_value = getattr(lerobot.config, field)
+        matches = upstream_value == lerobot_value
+        config_ok = config_ok and matches
+        marker = "✅" if matches else "❌"
+        print(f"  {marker} {field}: upstream={upstream_value!r}, lerobot={lerobot_value!r}")
+
+    print("\n=== State-dict comparison ===")
+    state_dict_ok = compare_state_dicts(upstream, lerobot)
+
+    print()
+    if not (config_ok and state_dict_ok):
+        print("⛔ Verification failed — DO NOT flip the default.")
+        return 1
+    print("🎉 Verification passed — safe to flip the default.")
+    return 0
+
+
+# Script entry point: main()'s return value becomes the process exit status.
+if __name__ == "__main__":
+    sys.exit(main())