Frame count is now derived from the upstream .npy length

fix decord
scripts: add Robometer parity checks (upstream example videos + LIBERO)
2026-05-20 02:59:50 +00:00 · 2026-05-18 10:57:16 +02:00 · 2026-05-18 10:39:51 +02:00 · 2026-05-17 15:41:31 +02:00 · 2026-05-17 14:59:23 +02:00
20 changed files with 3063 additions and 10 deletions
@@ -0,0 +1,244 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Create videos with a Robometer progress overlay for one LeRobot dataset episode.
+
+This is a lightweight smoke-test utility for Robometer checkpoints. It downloads
+one episode video, samples a small number of frames, runs Robometer on those
+frames, and reuses the progress overlay renderer from
+``examples/dataset/create_progress_videos.py``.
+
+Example:
+
+    uv run python examples/dataset/create_robometer_progress_videos.py \\
+        --repo-id lerobot/aloha_mobile_cabinet \\
+        --episode 0 \\
+        --reward-model-path lilkm/robometer-4b \\
+        --device cuda
+"""
+
+from __future__ import annotations
+
+import argparse
+import logging
+from pathlib import Path
+
+import cv2
+import numpy as np
+import torch
+
+from examples.dataset.create_progress_videos import (
+    composite_progress_video,
+    convert_mp4_to_gif,
+    download_episode_metadata,
+    download_video_file,
+    load_episode_meta,
+)
+from lerobot.rewards.robometer import RobometerConfig, RobometerRewardModel
+from lerobot.rewards.robometer.modeling_robometer import decode_progress_outputs
+from lerobot.rewards.robometer.processor_robometer import RobometerEncoderProcessorStep
+from lerobot.utils.utils import init_logging
+
+
+def _default_device() -> str:
+    return "cuda" if torch.cuda.is_available() else "cpu"
+
+
+def sample_episode_frames(
+    video_path: Path,
+    *,
+    from_timestamp: float,
+    to_timestamp: float,
+    fps: float,
+    num_frames: int,
+) -> tuple[np.ndarray, np.ndarray]:
+    """Sample RGB frames uniformly from an episode video segment.
+
+    Returns:
+        ``(frames, frame_indices)`` where ``frames`` is ``(T,H,W,C)`` uint8 RGB
+        and ``frame_indices`` are local episode frame indices used for overlay.
+    """
+    if num_frames <= 0:
+        raise ValueError(f"num_frames must be positive, got {num_frames}")
+
+    duration_seconds = to_timestamp - from_timestamp
+    total_frames = max(int(round(duration_seconds * fps)), 1)
+    frame_indices = np.linspace(0, total_frames - 1, num=min(num_frames, total_frames), dtype=int)
+
+    capture = cv2.VideoCapture(str(video_path))
+    frames: list[np.ndarray] = []
+    try:
+        for frame_idx in frame_indices:
+            timestamp = from_timestamp + frame_idx / fps
+            capture.set(cv2.CAP_PROP_POS_MSEC, timestamp * 1000)
+            ret, frame_bgr = capture.read()
+            if not ret:
+                logging.warning("Could not read frame %d at %.3fs", frame_idx, timestamp)
+                continue
+            frames.append(cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB))
+    finally:
+        capture.release()
+
+    if not frames:
+        raise RuntimeError(f"No frames could be sampled from {video_path}")
+
+    return np.stack(frames), frame_indices[: len(frames)]
+
+
+def predict_robometer_progress(
+    frames: np.ndarray,
+    *,
+    task: str,
+    reward_model_path: str,
+    device: str,
+) -> list[float]:
+    """Run Robometer and return per-sampled-frame progress predictions."""
+    config = RobometerConfig(pretrained_path=reward_model_path, device=device, max_frames=None)
+    model = RobometerRewardModel.from_pretrained(reward_model_path, config=config)
+
+    encoder = RobometerEncoderProcessorStep(
+        base_model_id=model.config.base_model_id,
+        use_multi_image=model.config.use_multi_image,
+        use_per_frame_progress_token=model.config.use_per_frame_progress_token,
+        max_frames=None,
+    )
+    batch = encoder.encode_samples([(frames, task)])
+
+    model_device = next(model.model.parameters()).device
+    inputs = {key: value.to(model_device) if hasattr(value, "to") else value for key, value in batch.items()}
+
+    model.eval()
+    with torch.no_grad():
+        progress_logits, success_logits = model._compute_rbm_logits(inputs)
+
+    decoded = decode_progress_outputs(
+        progress_logits,
+        success_logits,
+        is_discrete_mode=model.config.use_discrete_progress,
+    )
+    return decoded["progress_pred"][0]
+
+
+def process_dataset(
+    repo_id: str,
+    episode: int,
+    reward_model_path: str,
+    device: str,
+    camera_key: str | None,
+    output_dir: Path,
+    num_frames: int,
+    task: str | None = None,
+    create_gif: bool = False,
+) -> Path:
+    safe_name = repo_id.replace("/", "_")
+    logging.info("Processing %s episode %d with Robometer %s", repo_id, episode, reward_model_path)
+
+    local_path = download_episode_metadata(repo_id, episode)
+    episode_meta = load_episode_meta(local_path, episode, camera_key)
+    video_path = download_video_file(repo_id, local_path, episode_meta["video_rel"])
+
+    task_name = task or episode_meta.get("task_name", "")
+    if not task_name:
+        raise ValueError("No task found in dataset metadata. Pass --task explicitly.")
+
+    frames, frame_indices = sample_episode_frames(
+        video_path,
+        from_timestamp=episode_meta["from_ts"],
+        to_timestamp=episode_meta["to_ts"],
+        fps=episode_meta["fps"],
+        num_frames=num_frames,
+    )
+    logging.info("Sampled %d frames for Robometer inference", len(frames))
+
+    progress = predict_robometer_progress(
+        frames,
+        task=task_name,
+        reward_model_path=reward_model_path,
+        device=device,
+    )
+    progress_data = np.stack([frame_indices, np.asarray(progress, dtype=np.float32)], axis=1)
+    logging.info("Progress predictions: %s", [round(float(value), 3) for value in progress])
+
+    output_path = output_dir / f"{safe_name}_ep{episode}_robometer_progress.mp4"
+    final_path = composite_progress_video(
+        video_path=video_path,
+        from_timestamp=episode_meta["from_ts"],
+        to_timestamp=episode_meta["to_ts"],
+        progress_data=progress_data,
+        output_path=output_path,
+        fps=episode_meta["fps"],
+        task_name=task_name,
+    )
+
+    if create_gif:
+        final_path = convert_mp4_to_gif(final_path)
+    return final_path
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Create MP4/GIF videos with Robometer progress overlay for dataset episodes."
+    )
+    parser.add_argument("--repo-id", required=True, help="Hugging Face LeRobot dataset repo id.")
+    parser.add_argument("--episode", type=int, required=True, help="Episode index to visualize.")
+    parser.add_argument(
+        "--reward-model-path",
+        default="lilkm/robometer-4b",
+        help="Robometer checkpoint path or Hub repo id (e.g. lilkm/robometer-4b).",
+    )
+    parser.add_argument("--device", default=_default_device(), help="Torch device for Robometer inference.")
+    parser.add_argument(
+        "--camera-key",
+        default=None,
+        help="Camera observation key (e.g. observation.images.top). Auto-selects first camera if omitted.",
+    )
+    parser.add_argument(
+        "--task", default=None, help="Task description override if dataset metadata lacks one."
+    )
+    parser.add_argument(
+        "--num-frames",
+        type=int,
+        default=8,
+        help="Number of episode frames to sample for Robometer inference.",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=Path,
+        default=Path("progress_videos"),
+        help="Directory to write output files.",
+    )
+    parser.add_argument("--gif", action="store_true", help="Also generate a GIF from the MP4 output.")
+    args = parser.parse_args()
+
+    init_logging()
+    args.output_dir.mkdir(parents=True, exist_ok=True)
+
+    result = process_dataset(
+        repo_id=args.repo_id,
+        episode=args.episode,
+        reward_model_path=args.reward_model_path,
+        device=args.device,
+        camera_key=args.camera_key,
+        output_dir=args.output_dir,
+        num_frames=args.num_frames,
+        task=args.task,
+        create_gif=args.gif,
+    )
+    logging.info("Output: %s", result)
+
+
+if __name__ == "__main__":
+    main()
@@ -204,6 +204,7 @@ groot = [
    "flash-attn>=2.5.9,<3.0.0 ; sys_platform != 'darwin'"
 ]
 sarm = ["lerobot[transformers-dep]", "pydantic>=2.0.0,<3.0.0", "faker>=33.0.0,<35.0.0", "lerobot[matplotlib-dep]", "lerobot[qwen-vl-utils-dep]"]
+robometer = ["lerobot[transformers-dep]", "lerobot[qwen-vl-utils-dep]", "lerobot[peft-dep]"]
 xvla = ["lerobot[transformers-dep]"]
 eo1 = ["lerobot[transformers-dep]", "lerobot[qwen-vl-utils-dep]"]
 hilserl = ["lerobot[transformers-dep]", "lerobot[dataset]", "gym-hil>=0.1.13,<0.2.0", "lerobot[grpcio-dep]", "lerobot[placo-dep]"]
@@ -302,6 +303,7 @@ lerobot-imgtransform-viz="lerobot.scripts.lerobot_imgtransform_viz:main"
 lerobot-edit-dataset="lerobot.scripts.lerobot_edit_dataset:main"
 lerobot-setup-can="lerobot.scripts.lerobot_setup_can:main"
 lerobot-rollout="lerobot.scripts.lerobot_rollout:main"
+lerobot-export-robometer="lerobot.scripts.lerobot_export_robometer:main"

 # ---------------- Tool Configurations ----------------

@@ -0,0 +1,164 @@
+#!/usr/bin/env python
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+"""Pinpoint exactly which rows of ``embed_tokens`` / ``lm_head`` differ.
+
+Useful follow-up to ``scripts/verify_robometer_export.py`` when the verifier
+reports a small tail of differing keys but you want to know whether the
+diff is:
+
+1. Concentrated in the 5 special-token rows added by ``resize_token_embeddings``
+   (expected non-determinism: mean-resize sampling differs between runs).
+2. Spread across the full vocabulary (would point to a real loading bug).
+
+Also confirms whether ``apply_upstream_checkpoint`` actually overwrites the
+embed/lm-head tensors when loading the upstream state dict (vs. silently
+skipping them due to a key mismatch).
+"""
+
+from __future__ import annotations
+
+import argparse
+import sys
+
+import torch
+from safetensors.torch import load_file
+
+from lerobot.configs.rewards import RewardModelConfig
+from lerobot.rewards.robometer import RobometerConfig, RobometerRewardModel
+from lerobot.rewards.robometer._upstream_loader import (
+    _download_robometer_snapshot,
+    _remap_state_dict_keys,
+    _resolve_checkpoint_safetensors_files,
+    apply_upstream_checkpoint,
+)
+
+EMBED_KEY = "model.model.language_model.embed_tokens.weight"
+LMHEAD_KEY = "model.lm_head.weight"
+
+
+def _load_upstream(path: str) -> RobometerRewardModel:
+    cfg = RobometerConfig(pretrained_path=path, device="cpu")
+    model = RobometerRewardModel(cfg)
+    apply_upstream_checkpoint(model, path)
+    model.eval()
+    return model
+
+
+def _load_lerobot(path: str) -> RobometerRewardModel:
+    cfg = RewardModelConfig.from_pretrained(path)
+    if not isinstance(cfg, RobometerConfig):
+        raise TypeError(f"Expected RobometerConfig, got {type(cfg)}")
+    cfg.pretrained_path = path
+    cfg.device = "cpu"
+    return RobometerRewardModel.from_pretrained(path, config=cfg)
+
+
+def _inspect_upstream_state_dict(upstream_path: str, model: RobometerRewardModel) -> None:
+    """Dump the upstream state-dict view of the embed/lm-head tensors.
+
+    Loads the raw upstream safetensors (pre-remap), runs the remapper, and
+    reports whether the embed/lm-head keys survive into the merged dict that
+    eventually hits ``model.load_state_dict``.
+    """
+    snapshot_dir = _download_robometer_snapshot(upstream_path)
+    files = _resolve_checkpoint_safetensors_files(snapshot_dir)
+    merged: dict[str, torch.Tensor] = {}
+    for path in files:
+        merged.update(load_file(str(path)))
+    remapped = _remap_state_dict_keys(merged, model)
+
+    print(f"\n=== Upstream state-dict inspection (snapshot at {snapshot_dir}) ===")
+    print(f"raw keys (before remap)  : {len(merged)}")
+    print(f"keys after remap         : {len(remapped)}")
+    print(f"model expects (state_dict): {len(model.state_dict())}")
+
+    expected = set(model.state_dict())
+    present_after_remap = set(remapped) & expected
+    print(f"keys present after remap : {len(present_after_remap)}")
+
+    missing_keys = expected - set(remapped)
+    print(f"keys missing from remap  : {len(missing_keys)}")
+    if missing_keys:
+        sample = list(missing_keys)[:10]
+        print(f"  sample missing keys    : {sample}")
+
+    unexpected_keys = set(remapped) - expected
+    print(f"keys unexpected by model : {len(unexpected_keys)}")
+    if unexpected_keys:
+        sample = list(unexpected_keys)[:10]
+        print(f"  sample unexpected keys : {sample}")
+
+    for key in (EMBED_KEY, LMHEAD_KEY):
+        present = key in remapped
+        shape = tuple(remapped[key].shape) if present else None
+        print(f"  {key:60s}  present={present}, shape={shape}")
+
+
+def _diff_embed(name: str, a: torch.Tensor, b: torch.Tensor, special_token_count: int) -> None:
+    a = a.float()
+    b = b.float()
+    if a.shape != b.shape:
+        print(f"❌ {name} shape mismatch: {tuple(a.shape)} vs {tuple(b.shape)}")
+        return
+
+    abs_diff = (a - b).abs()
+    per_row_max = abs_diff.max(dim=1).values
+    nz_rows = (per_row_max > 0).nonzero(as_tuple=True)[0].tolist()
+    print(f"\n=== {name} (shape {tuple(a.shape)}) ===")
+    print(f"global max|Δ|         = {abs_diff.max().item():.3e}")
+    print(f"rows with any diff    = {len(nz_rows)}")
+    if nz_rows:
+        first = nz_rows[:10]
+        last = nz_rows[-10:]
+        print(f"  first nonzero rows  = {first}")
+        print(f"  last nonzero rows   = {last}")
+        vocab_size = a.shape[0]
+        base_vocab = vocab_size - special_token_count
+        special_rows = list(range(base_vocab, vocab_size))
+        in_special = [r for r in nz_rows if r in special_rows]
+        out_special = [r for r in nz_rows if r not in special_rows]
+        print(
+            f"  diffs in special-token rows ({base_vocab}..{vocab_size - 1}): {len(in_special)}/{special_token_count}"
+        )
+        print(f"  diffs in base-vocab rows  (0..{base_vocab - 1})           : {len(out_special)}")
+        for r in special_rows:
+            print(
+                f"    row {r}: max|Δ|={per_row_max[r].item():.3e}, "
+                f"upstream_norm={a[r].norm().item():.3e}, lerobot_norm={b[r].norm().item():.3e}"
+            )
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(
+        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+    parser.add_argument("--upstream", required=True)
+    parser.add_argument("--lerobot", required=True)
+    parser.add_argument(
+        "--special-token-count",
+        type=int,
+        default=5,
+        help="Number of special tokens Robometer adds. Defaults to len(ROBOMETER_SPECIAL_TOKENS)=5.",
+    )
+    args = parser.parse_args()
+
+    print(f"Loading upstream:        {args.upstream}")
+    upstream = _load_upstream(args.upstream)
+    print(f"Loading LeRobot-format:  {args.lerobot}")
+    lerobot = _load_lerobot(args.lerobot)
+
+    _inspect_upstream_state_dict(args.upstream, upstream)
+
+    sd_u, sd_l = upstream.state_dict(), lerobot.state_dict()
+
+    for key in (EMBED_KEY, LMHEAD_KEY):
+        if key not in sd_u or key not in sd_l:
+            print(f"❌ key missing: {key} (upstream={key in sd_u}, lerobot={key in sd_l})")
+            continue
+        _diff_embed(key, sd_u[key], sd_l[key], args.special_token_count)
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
@@ -0,0 +1,168 @@
+#!/usr/bin/env python
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+"""Extract one LIBERO episode for Robometer parity testing.
+
+Loads a LeRobot LIBERO (or any video-bearing LeRobot) dataset, picks one
+episode, samples ``--num-frames`` frames uniformly across its duration
+(matching upstream Robometer's default of 8 frames), and saves them to
+``.npz`` plus a sidecar ``.txt`` task file.
+
+The ``.npz`` layout (``frames`` key, ``(T, H, W, C) uint8``) is what upstream
+``example_inference_local.py`` consumes, so the same file feeds both pipelines
+and frame sampling cannot drift.
+
+Workflow:
+
+1. Run this script (LeRobot env) to produce ``frames.npz`` + ``task.txt``.
+2. Pass them to upstream ``scripts/example_inference_local.py``
+   (upstream env) to produce reference progress / success outputs.
+3. Pass the same ``frames.npz`` to ``scripts/parity_robometer.py``
+   (LeRobot env) to compare both sides.
+
+Example:
+
+    uv run python scripts/extract_libero_episode_for_parity.py \\
+        --repo-id lerobot/libero_10_image \\
+        --episode 0 \\
+        --num-frames 8 \\
+        --out-dir /tmp/libero_ep0
+"""
+
+from __future__ import annotations
+
+import argparse
+import sys
+from pathlib import Path
+
+import numpy as np
+import torch
+
+from lerobot.configs.types import FeatureType
+from lerobot.datasets.lerobot_dataset import LeRobotDataset
+
+
+def _pick_visual_feature(features: dict, requested: str | None) -> str:
+    """Return a visual feature key, preferring ``requested`` when given."""
+    visual_keys = [
+        key
+        for key, ft in features.items()
+        if getattr(ft, "type", None) == FeatureType.VISUAL or ft.get("dtype", "") == "video"
+    ]
+    if not visual_keys:
+        raise ValueError(f"Dataset has no visual feature; available: {list(features)}")
+    if requested is not None:
+        if requested not in visual_keys:
+            raise ValueError(f"Camera key {requested!r} not in dataset visual features {visual_keys}")
+        return requested
+    return visual_keys[0]
+
+
+def _frame_uint8_hwc(tensor: torch.Tensor) -> np.ndarray:
+    """Convert a LeRobotDataset video frame to ``uint8`` ``(H, W, C)`` RGB."""
+    arr = tensor.detach().cpu().numpy()
+    if arr.ndim == 3 and arr.shape[0] in (1, 3):
+        arr = arr.transpose(1, 2, 0)
+    if arr.dtype != np.uint8:
+        arr = np.clip(arr * 255.0 if arr.max() <= 1.0 + 1e-3 else arr, 0, 255).astype(np.uint8)
+    if arr.shape[-1] == 1:
+        arr = np.repeat(arr, 3, axis=-1)
+    return arr
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(
+        description=__doc__,
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    parser.add_argument(
+        "--repo-id",
+        default="lerobot/libero_10_image",
+        help="LeRobot LIBERO (or other) dataset repo id (default: lerobot/libero_10_image).",
+    )
+    parser.add_argument("--episode", type=int, default=0, help="Episode index.")
+    parser.add_argument(
+        "--camera-key",
+        default=None,
+        help="Visual feature key (e.g. observation.images.image). Auto-selects first if omitted.",
+    )
+    parser.add_argument(
+        "--num-frames",
+        type=int,
+        default=8,
+        help="Number of frames to sample uniformly (default: 8 — Robometer's training-time default).",
+    )
+    parser.add_argument(
+        "--out-dir",
+        type=Path,
+        default=Path("outputs/robometer_parity/libero"),
+        help="Directory to write frames.npz / task.txt / frame_indices.npy.",
+    )
+    args = parser.parse_args()
+
+    print(f"Loading {args.repo_id} (episode {args.episode})...")
+    dataset = LeRobotDataset(args.repo_id, episodes=[args.episode])
+
+    camera_key = _pick_visual_feature(dataset.features, args.camera_key)
+    print(f"Using camera key: {camera_key}")
+
+    ep_from = int(dataset.episode_data_index["from"][0].item())
+    ep_to = int(dataset.episode_data_index["to"][0].item())
+    total_frames = ep_to - ep_from
+    if total_frames <= 0:
+        print(f"ERROR: episode {args.episode} has no frames.", file=sys.stderr)
+        return 1
+    print(f"Episode has {total_frames} frames; sampling {args.num_frames} uniformly.")
+
+    indices = np.linspace(0, total_frames - 1, num=min(args.num_frames, total_frames), dtype=int)
+    frames: list[np.ndarray] = []
+    task: str = ""
+    for offset in indices:
+        sample = dataset[ep_from + int(offset)]
+        frame_tensor = sample[camera_key]
+        frames.append(_frame_uint8_hwc(frame_tensor))
+        if not task:
+            task = sample.get("task", "") or ""
+
+    if not task:
+        print("ERROR: episode has no task description in metadata.", file=sys.stderr)
+        return 1
+
+    frames_array = np.stack(frames)
+
+    args.out_dir.mkdir(parents=True, exist_ok=True)
+    frames_path = args.out_dir / "frames.npz"
+    task_path = args.out_dir / "task.txt"
+    indices_path = args.out_dir / "frame_indices.npy"
+
+    np.savez(frames_path, frames=frames_array)
+    task_path.write_text(task + "\n", encoding="utf-8")
+    np.save(indices_path, indices)
+
+    print()
+    print(f"Wrote {frames_path} (shape={frames_array.shape}, dtype={frames_array.dtype})")
+    print(f"Wrote {task_path}   (task={task!r})")
+    print(f"Wrote {indices_path} (frame_indices={indices.tolist()})")
+    print()
+    print("Next steps:")
+    print("  # in upstream env (where `robometer` is importable):")
+    print(
+        f"  python third_party/robometer/scripts/example_inference_local.py \\\n"
+        f"      --model-path robometer/Robometer-4B \\\n"
+        f"      --video {frames_path} \\\n"
+        f'      --task "{task}" \\\n'
+        f"      --out {args.out_dir / 'upstream.npy'}"
+    )
+    print()
+    print("  # back in LeRobot env:")
+    print(
+        f"  uv run python scripts/parity_robometer.py \\\n"
+        f"      --frames {frames_path} \\\n"
+        f'      --task "{task}" \\\n'
+        f"      --upstream-progress {args.out_dir / 'upstream.npy'} \\\n"
+        f"      --upstream-success  {args.out_dir / 'upstream_success_probs.npy'}"
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
@@ -0,0 +1,232 @@
+#!/usr/bin/env python
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+"""Functional parity check: LeRobot Robometer vs. upstream Robometer.
+
+Runs the in-tree :class:`RobometerRewardModel` on the same frames + task that
+upstream Robometer was run on, and compares per-frame progress / success
+predictions against reference outputs saved by upstream's
+``scripts/example_inference_local.py``.
+
+Workflow:
+
+1. In the upstream Robometer environment (where ``robometer`` is importable),
+   run::
+
+       python third_party/robometer/scripts/example_inference_local.py \\
+           --model-path robometer/Robometer-4B \\
+           --video /path/to/episode.mp4 \\
+           --task "Open the drawer" \\
+           --fps 1.0 \\
+           --out /tmp/robometer_upstream.npy
+
+   This produces:
+   - ``/tmp/robometer_upstream.npy``               (progress predictions)
+   - ``/tmp/robometer_upstream_success_probs.npy`` (success probabilities)
+
+2. Extract the exact same frames the upstream script used, save as ``.npz``::
+
+       # quick helper: extract frames at the same fps and save as .npz
+       python -c "
+       from third_party.robometer.scripts.example_inference_local import load_frames_input
+       import numpy as np
+       frames = load_frames_input('/path/to/episode.mp4', fps=1.0, max_frames=512)
+       np.savez('/tmp/robometer_frames.npz', frames=frames)
+       "
+
+3. In this LeRobot env, run this script::
+
+       uv run python scripts/parity_robometer.py \\
+           --frames /tmp/robometer_frames.npz \\
+           --task "Open the drawer" \\
+           --upstream-progress /tmp/robometer_upstream.npy \\
+           --upstream-success  /tmp/robometer_upstream_success_probs.npy \\
+           --lerobot-model     lilkm/robometer-4b
+"""
+
+from __future__ import annotations
+
+import argparse
+import sys
+
+import numpy as np
+import torch
+
+from lerobot.configs.rewards import RewardModelConfig
+from lerobot.rewards.robometer import RobometerConfig, RobometerRewardModel
+from lerobot.rewards.robometer.modeling_robometer import decode_progress_outputs
+from lerobot.rewards.robometer.processor_robometer import RobometerEncoderProcessorStep
+
+
+def _load_frames(path: str) -> np.ndarray:
+    """Load frames from .npy/.npz. Expects (T, H, W, C) uint8."""
+    if path.endswith(".npy"):
+        frames = np.load(path)
+    elif path.endswith(".npz"):
+        with np.load(path, allow_pickle=False) as npz:
+            frames = npz["frames"].copy() if "frames" in npz else next(iter(npz.values())).copy()
+    else:
+        raise ValueError(f"Frames must be .npy or .npz (got {path!r}).")
+
+    if frames.dtype != np.uint8:
+        frames = np.clip(frames, 0, 255).astype(np.uint8)
+    if frames.ndim != 4:
+        raise ValueError(f"Frames must be 4D (T,H,W,C); got shape {frames.shape}.")
+    if frames.shape[-1] not in (1, 3):
+        # Probably (T,C,H,W) — transpose
+        if frames.shape[1] in (1, 3):
+            frames = frames.transpose(0, 2, 3, 1)
+        else:
+            raise ValueError(f"Cannot interpret frame channel layout: {frames.shape}.")
+    return frames
+
+
+def _run_lerobot(
+    frames: np.ndarray,
+    task: str,
+    model_path: str,
+    device: str,
+) -> tuple[np.ndarray, np.ndarray]:
+    """Run LeRobot's Robometer on the given frames; return (progress, success)."""
+    cfg = RobometerConfig(pretrained_path=model_path, device=device, max_frames=None)
+    model = RobometerRewardModel.from_pretrained(model_path, config=cfg)
+
+    encoder = RobometerEncoderProcessorStep(
+        base_model_id=model.config.base_model_id,
+        use_multi_image=model.config.use_multi_image,
+        use_per_frame_progress_token=model.config.use_per_frame_progress_token,
+        max_frames=None,
+    )
+    batch = encoder.encode_samples([(frames, task)])
+
+    model_device = next(model.model.parameters()).device
+    inputs = {key: value.to(model_device) if hasattr(value, "to") else value for key, value in batch.items()}
+
+    model.eval()
+    with torch.no_grad():
+        progress_logits, success_logits = model._compute_rbm_logits(inputs)
+
+    decoded = decode_progress_outputs(
+        progress_logits,
+        success_logits,
+        is_discrete_mode=model.config.use_discrete_progress,
+    )
+    progress = np.asarray(decoded["progress_pred"][0], dtype=np.float32)
+    success = (
+        np.asarray(decoded["success_probs"][0], dtype=np.float32)
+        if decoded["success_probs"]
+        else np.array([], dtype=np.float32)
+    )
+    return progress, success
+
+
+def _compare(name: str, lerobot: np.ndarray, upstream: np.ndarray, atol: float, rtol: float) -> bool:
+    print(f"\n=== {name} ===")
+    if lerobot.shape != upstream.shape:
+        print(f"shape mismatch: lerobot={lerobot.shape}  upstream={upstream.shape}")
+        return False
+
+    abs_diff = np.abs(lerobot - upstream)
+    rel_diff = abs_diff / (np.abs(upstream) + 1e-12)
+    print(f"shape        : {lerobot.shape}")
+    print(f"max |Δ|      : {abs_diff.max():.3e}")
+    print(f"mean |Δ|     : {abs_diff.mean():.3e}")
+    print(f"max rel |Δ|  : {rel_diff.max():.3e}")
+    print(f"lerobot[:5]  : {lerobot[:5]}")
+    print(f"upstream[:5] : {upstream[:5]}")
+
+    within_tol = bool(np.allclose(lerobot, upstream, atol=atol, rtol=rtol))
+    print(f"allclose(atol={atol}, rtol={rtol}) -> {within_tol}")
+    return within_tol
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(
+        description=__doc__,
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    parser.add_argument(
+        "--frames",
+        required=True,
+        help=".npy / .npz file with the exact frames upstream was run on (T,H,W,C uint8).",
+    )
+    parser.add_argument("--task", required=True, help="Task instruction string.")
+    parser.add_argument(
+        "--upstream-progress",
+        required=True,
+        help="Reference progress .npy saved by upstream example_inference_local.py.",
+    )
+    parser.add_argument(
+        "--upstream-success",
+        default=None,
+        help="Optional reference success_probs .npy. If omitted, success comparison is skipped.",
+    )
+    parser.add_argument(
+        "--lerobot-model",
+        default="lilkm/robometer-4b",
+        help="LeRobot-format Robometer Hub repo id or local path.",
+    )
+    parser.add_argument(
+        "--device",
+        default="cuda" if torch.cuda.is_available() else "cpu",
+        help="Device for the LeRobot model (default: cuda if available).",
+    )
+    parser.add_argument(
+        "--atol",
+        type=float,
+        default=1e-3,
+        help="Absolute tolerance for allclose (default: 1e-3; bf16 round-trip headroom).",
+    )
+    parser.add_argument(
+        "--rtol",
+        type=float,
+        default=1e-2,
+        help="Relative tolerance for allclose (default: 1e-2).",
+    )
+    parser.add_argument(
+        "--out-prefix",
+        default="lerobot_robometer_outputs",
+        help="Save the LeRobot outputs as <prefix>_progress.npy / <prefix>_success.npy.",
+    )
+    args = parser.parse_args()
+
+    # 0. Sanity: confirm the LeRobot config is a RobometerConfig.
+    cfg = RewardModelConfig.from_pretrained(args.lerobot_model)
+    if not isinstance(cfg, RobometerConfig):
+        print(f"ERROR: {args.lerobot_model!r} does not resolve to a RobometerConfig.", file=sys.stderr)
+        return 2
+
+    # 1. Load frames + task + upstream reference outputs.
+    frames = _load_frames(args.frames)
+    upstream_progress = np.load(args.upstream_progress).astype(np.float32)
+    upstream_success = (
+        np.load(args.upstream_success).astype(np.float32) if args.upstream_success is not None else None
+    )
+
+    print(f"Loaded {frames.shape[0]} frames at {frames.shape[1:]}, task={args.task!r}")
+    print(f"LeRobot model: {args.lerobot_model}  device: {args.device}")
+
+    # 2. Run LeRobot pipeline.
+    progress, success = _run_lerobot(frames, args.task, args.lerobot_model, args.device)
+    np.save(f"{args.out_prefix}_progress.npy", progress)
+    if success.size > 0:
+        np.save(f"{args.out_prefix}_success.npy", success)
+    print(f"Saved LeRobot outputs to {args.out_prefix}_progress.npy / _success.npy")
+
+    # 3. Compare to upstream references.
+    progress_ok = _compare("progress", progress, upstream_progress, args.atol, args.rtol)
+    if upstream_success is not None and success.size > 0:
+        success_ok = _compare("success_probs", success, upstream_success, args.atol, args.rtol)
+    else:
+        success_ok = True
+        print("\n(skipping success comparison — upstream success file not provided)")
+
+    print()
+    if progress_ok and success_ok:
+        print("Parity check passed.")
+        return 0
+    print("Parity check FAILED.")
+    return 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
@@ -0,0 +1,362 @@
+#!/usr/bin/env python
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+"""Run LeRobot Robometer parity against upstream Robometer's bundled examples.
+
+Upstream Robometer ships three reference videos with their pre-computed
+progress / success outputs at
+``third_party/robometer/scripts/example_videos/``::
+
+    soar_put_green_stick_in_brown_bowl.mp4
+        + soar_put_green_stick_in_brown_bowl_rewards.npy            (progress)
+        + soar_put_green_stick_in_brown_bowl_rewards_success_probs.npy (success)
+    berkeley_rpt_stack_cup.mp4
+        + berkeley_rpt_stack_cup_rewards.npy
+        + berkeley_rpt_stack_cup_rewards_success_probs.npy
+    jaco_play_pick_up_green_cup.mp4
+        + pick_up_green_cup_rewards.npy
+        + pick_up_green_cup_rewards_success_probs.npy
+
+This script:
+1. Decodes each video at upstream's sampling fps using ``av`` (PyAV), with the
+   same linspace-over-total-frames logic as upstream's ``extract_frames``.
+2. Runs the LeRobot ``RobometerRewardModel`` on those frames + the task from
+   upstream's README.
+3. Compares per-frame progress / success to the pre-saved upstream outputs.
+
+This means you do **not** need to install upstream Robometer to confirm parity.
+
+Run::
+
+    uv run python scripts/parity_robometer_upstream_examples.py \\
+        --lerobot-model lilkm/robometer-4b \\
+        --device cuda \\
+        --decoder decord
+
+The number of frames sampled per video is derived from the length of each
+upstream ``.npy`` reference, so the script does not need a ``--fps`` argument
+(the README documents ``fps=3`` for SOAR / Berkeley, but the Jaco Play
+reference was generated with a different fps).
+"""
+
+from __future__ import annotations
+
+import argparse
+import sys
+from pathlib import Path
+
+import numpy as np
+import torch
+
+from lerobot.configs.rewards import RewardModelConfig
+from lerobot.rewards.robometer import RobometerConfig, RobometerRewardModel
+from lerobot.rewards.robometer.modeling_robometer import decode_progress_outputs
+from lerobot.rewards.robometer.processor_robometer import RobometerEncoderProcessorStep
+
+try:
+    import decord  # type: ignore
+
+    _HAS_DECORD = True
+except ImportError:
+    decord = None  # type: ignore
+    _HAS_DECORD = False
+
+try:
+    import av
+
+    _HAS_AV = True
+except ImportError:
+    av = None  # type: ignore
+    _HAS_AV = False
+
+EXAMPLES = [
+    {
+        "name": "soar_put_green_stick_in_brown_bowl",
+        "video": "soar_put_green_stick_in_brown_bowl.mp4",
+        "task": "Put green stick in brown bowl",
+        "progress_npy": "soar_put_green_stick_in_brown_bowl_rewards.npy",
+        "success_npy": "soar_put_green_stick_in_brown_bowl_rewards_success_probs.npy",
+    },
+    {
+        "name": "berkeley_rpt_stack_cup",
+        "video": "berkeley_rpt_stack_cup.mp4",
+        "task": "Pick up the yellow cup and stack it on the other cup",
+        "progress_npy": "berkeley_rpt_stack_cup_rewards.npy",
+        "success_npy": "berkeley_rpt_stack_cup_rewards_success_probs.npy",
+    },
+    {
+        "name": "jaco_play_pick_up_green_cup",
+        "video": "jaco_play_pick_up_green_cup.mp4",
+        "task": "Pick up the green cup",
+        "progress_npy": "pick_up_green_cup_rewards.npy",
+        "success_npy": "pick_up_green_cup_rewards_success_probs.npy",
+    },
+]
+
+
+def _extract_frames_decord(video_path: Path, num_frames: int) -> tuple[np.ndarray, str]:
+    """Sample ``num_frames`` indices uniformly from the video using decord.
+
+    Mirrors upstream's ``extract_frames`` indexing
+    (``third_party/robometer/scripts/example_inference.py``): a
+    ``np.linspace(0, total_frames-1, num_frames)`` lookup over decord's
+    ``VideoReader``. We pass ``num_frames`` explicitly (derived from the
+    upstream reference output length) so we don't have to guess what ``fps``
+    upstream actually used when generating each saved ``.npy`` — the file
+    length is the ground truth.
+    """
+    vr = decord.VideoReader(str(video_path), num_threads=1)
+    total_frames = len(vr)
+    if total_frames == 0:
+        raise RuntimeError(f"No decodable frames in {video_path}.")
+    desired_frames = max(1, min(int(num_frames), total_frames))
+    indices = np.linspace(0, total_frames - 1, desired_frames, dtype=int).tolist()
+    frames = vr.get_batch(indices).asnumpy()
+    native_fps = float(vr.get_avg_fps()) or 1.0
+    return frames, f"decord total={total_frames} native_fps={native_fps:.3f}"
+
+
+def _extract_frames_av(video_path: Path, num_frames: int) -> tuple[np.ndarray, str]:
+    """PyAV fallback for environments without decord.
+
+    PyAV and decord can disagree on ``total_frames`` for the same container,
+    so the sampled frame indices can drift. Install ``decord`` for a real
+    parity check; this fallback is for smoke tests only.
+    """
+    container = av.open(str(video_path))
+    stream = container.streams.video[0]
+    native_fps = float(stream.average_rate) if stream.average_rate else float(stream.guessed_rate or 30.0)
+    rgb_frames: list[np.ndarray] = []
+    for frame in container.decode(stream):
+        rgb_frames.append(frame.to_ndarray(format="rgb24"))
+    container.close()
+    total_frames = len(rgb_frames)
+    if total_frames == 0:
+        raise RuntimeError(f"No decodable frames in {video_path}.")
+    desired_frames = max(1, min(int(num_frames), total_frames))
+    indices = np.linspace(0, total_frames - 1, desired_frames, dtype=int)
+    frames = np.stack([rgb_frames[i] for i in indices])
+    return frames, f"av total={total_frames} native_fps={native_fps:.3f}"
+
+
+def _extract_frames(video_path: Path, num_frames: int, prefer: str) -> tuple[np.ndarray, str]:
+    """Decoder dispatch. ``prefer`` is ``"decord"`` | ``"av"`` | ``"auto"``."""
+    if prefer == "decord":
+        if not _HAS_DECORD:
+            raise RuntimeError("decord requested but not installed (`uv pip install decord`).")
+        return _extract_frames_decord(video_path, num_frames)
+    if prefer == "av":
+        if not _HAS_AV:
+            raise RuntimeError("av requested but not installed.")
+        return _extract_frames_av(video_path, num_frames)
+    # auto
+    if _HAS_DECORD:
+        return _extract_frames_decord(video_path, num_frames)
+    if _HAS_AV:
+        return _extract_frames_av(video_path, num_frames)
+    raise RuntimeError("No video decoder available (install `decord` or `av`).")
+
+
+def _pearson(a: np.ndarray, b: np.ndarray) -> float:
+    """Pearson correlation; returns 1.0 for constant inputs (no signal to align)."""
+    a = a.astype(np.float64)
+    b = b.astype(np.float64)
+    if a.size < 2:
+        return 1.0
+    da = a - a.mean()
+    db = b - b.mean()
+    denom = float(np.sqrt((da * da).sum()) * np.sqrt((db * db).sum()))
+    if denom == 0:
+        return 1.0
+    return float((da * db).sum() / denom)
+
+
+def _run_lerobot(
+    model: RobometerRewardModel,
+    encoder: RobometerEncoderProcessorStep,
+    frames: np.ndarray,
+    task: str,
+) -> tuple[np.ndarray, np.ndarray]:
+    batch = encoder.encode_samples([(frames, task)])
+    device = next(model.model.parameters()).device
+    inputs = {key: value.to(device) if hasattr(value, "to") else value for key, value in batch.items()}
+    model.eval()
+    with torch.no_grad():
+        progress_logits, success_logits = model._compute_rbm_logits(inputs)
+    decoded = decode_progress_outputs(
+        progress_logits, success_logits, is_discrete_mode=model.config.use_discrete_progress
+    )
+    progress = np.asarray(decoded["progress_pred"][0], dtype=np.float32)
+    success = (
+        np.asarray(decoded["success_probs"][0], dtype=np.float32)
+        if decoded["success_probs"]
+        else np.array([], dtype=np.float32)
+    )
+    return progress, success
+
+
+def _compare(
+    name: str,
+    lerobot: np.ndarray,
+    upstream: np.ndarray,
+    *,
+    atol: float,
+    pearson_min: float,
+) -> bool:
+    if lerobot.shape != upstream.shape:
+        print(f"  {name:8s}  SHAPE MISMATCH lerobot={lerobot.shape} upstream={upstream.shape}")
+        return False
+    abs_diff = np.abs(lerobot - upstream)
+    pearson = _pearson(lerobot, upstream)
+    abs_ok = bool(abs_diff.max() <= atol)
+    pearson_ok = bool(pearson >= pearson_min)
+    verdict = "PASS" if (abs_ok or pearson_ok) else "FAIL"
+    print(
+        f"  {name:8s}  shape={lerobot.shape}  max|Δ|={abs_diff.max():.3e}  "
+        f"mean|Δ|={abs_diff.mean():.3e}  pearson={pearson:.4f}  "
+        f"(atol={atol:.0e} pearson_min={pearson_min:.3f}) -> {verdict}"
+    )
+    return abs_ok or pearson_ok
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(
+        description=__doc__,
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    parser.add_argument(
+        "--examples-dir",
+        type=Path,
+        default=Path("third_party/robometer/scripts/example_videos"),
+        help="Directory containing the upstream Robometer example mp4s + .npy outputs.",
+    )
+    parser.add_argument(
+        "--lerobot-model",
+        default="lilkm/robometer-4b",
+        help="LeRobot-format Robometer Hub repo id or local path.",
+    )
+    parser.add_argument(
+        "--device",
+        default="cuda" if torch.cuda.is_available() else "cpu",
+        help="Device for the LeRobot model.",
+    )
+    parser.add_argument(
+        "--decoder",
+        choices=("auto", "decord", "av"),
+        default="auto",
+        help=(
+            "Video decoder. ``auto`` prefers decord (matches upstream) and falls back to av. "
+            "Force ``decord`` for a clean parity check."
+        ),
+    )
+    parser.add_argument(
+        "--progress-atol",
+        type=float,
+        default=1e-2,
+        help="Absolute tolerance for the progress array. Default 1e-2 covers CUDA bf16 noise.",
+    )
+    parser.add_argument(
+        "--success-atol",
+        type=float,
+        default=1e-1,
+        help=(
+            "Absolute tolerance for the success array. Looser than progress because "
+            "``sigmoid`` amplifies logit-space noise near 0.5."
+        ),
+    )
+    parser.add_argument(
+        "--pearson-min",
+        type=float,
+        default=0.99,
+        help="Minimum Pearson correlation for a PASS verdict (per array).",
+    )
+    args = parser.parse_args()
+
+    if args.decoder == "av" or (args.decoder == "auto" and not _HAS_DECORD):
+        print(
+            "WARNING: using PyAV decoder. PyAV's total-frame count can differ from decord's, "
+            "which propagates into different sampled-frame indices. Install `decord` and "
+            "re-run for a clean parity check.",
+            file=sys.stderr,
+        )
+
+    examples_dir = args.examples_dir.resolve()
+    if not examples_dir.is_dir():
+        print(f"ERROR: examples dir {examples_dir} does not exist.", file=sys.stderr)
+        return 2
+
+    # Sanity-check the LeRobot config is a RobometerConfig before loading weights.
+    cfg = RewardModelConfig.from_pretrained(args.lerobot_model)
+    if not isinstance(cfg, RobometerConfig):
+        print(f"ERROR: {args.lerobot_model!r} did not resolve to a RobometerConfig.", file=sys.stderr)
+        return 2
+
+    print(f"Loading LeRobot Robometer from {args.lerobot_model} on {args.device}...")
+    cfg.pretrained_path = args.lerobot_model
+    cfg.device = args.device
+    model = RobometerRewardModel.from_pretrained(args.lerobot_model, config=cfg)
+    encoder = RobometerEncoderProcessorStep(
+        base_model_id=model.config.base_model_id,
+        use_multi_image=model.config.use_multi_image,
+        use_per_frame_progress_token=model.config.use_per_frame_progress_token,
+        max_frames=None,
+    )
+
+    all_ok = True
+    for ex in EXAMPLES:
+        video_path = examples_dir / ex["video"]
+        upstream_progress_path = examples_dir / ex["progress_npy"]
+        upstream_success_path = examples_dir / ex["success_npy"]
+
+        missing = [p for p in (video_path, upstream_progress_path, upstream_success_path) if not p.exists()]
+        if missing:
+            print(f"[skip] {ex['name']}: missing {[str(m) for m in missing]}")
+            all_ok = False
+            continue
+
+        print(f"\n=== {ex['name']} ===")
+        print(f"  task: {ex['task']!r}")
+
+        # Trust the upstream reference array as the source of truth for how
+        # many frames to sample. The README documents fps=3 for SOAR/Berkeley
+        # but Jaco Play was generated with a different fps, so any hardcoded
+        # ``--fps`` mismatches at least one example. The npy length always
+        # tells us what upstream actually used.
+        upstream_progress = np.load(upstream_progress_path).astype(np.float32)
+        upstream_success = np.load(upstream_success_path).astype(np.float32)
+        target_num_frames = int(upstream_progress.shape[0])
+        frames, decoder_info = _extract_frames(video_path, target_num_frames, prefer=args.decoder)
+        print(
+            f"  decoded {frames.shape[0]} frames (matches upstream npy length); "
+            f"shape={frames.shape}  [{decoder_info}]"
+        )
+
+        progress, success = _run_lerobot(model, encoder, frames, ex["task"])
+
+        progress_ok = _compare(
+            "progress",
+            progress,
+            upstream_progress,
+            atol=args.progress_atol,
+            pearson_min=args.pearson_min,
+        )
+        success_ok = _compare(
+            "success",
+            success,
+            upstream_success,
+            atol=args.success_atol,
+            pearson_min=args.pearson_min,
+        )
+        verdict = "PASS" if (progress_ok and success_ok) else "FAIL"
+        print(f"  -> {verdict}")
+        all_ok = all_ok and progress_ok and success_ok
+
+    print()
+    if all_ok:
+        print("All upstream example parity checks passed.")
+        return 0
+    print("Some upstream example parity checks FAILED.")
+    return 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
@@ -0,0 +1,149 @@
+#!/usr/bin/env python
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+"""Verify that a LeRobot-format Robometer is byte-equivalent to its upstream source.
+
+Run this once after publishing a LeRobot-format Robometer to the Hub, before
+flipping the default `RobometerConfig.pretrained_path` to it. It loads both
+the upstream snapshot and the re-exported copy, compares state dicts, and
+prints a clear pass/fail summary.
+
+Example:
+
+    python scripts/verify_robometer_export.py \\
+        --upstream robometer/Robometer-4B \\
+        --lerobot  lerobot/robometer-4b
+
+    python scripts/verify_robometer_export.py \\
+        --upstream robometer/Robometer-4B \\
+        --lerobot  ./robometer-4b-lerobot   # local folder also works
+"""
+
+from __future__ import annotations
+
+import argparse
+import sys
+
+from lerobot.configs.rewards import RewardModelConfig
+from lerobot.rewards.robometer import RobometerConfig, RobometerRewardModel
+from lerobot.rewards.robometer._upstream_loader import apply_upstream_checkpoint
+
+
+def _load_upstream(path: str) -> RobometerRewardModel:
+    # Fresh ``RobometerConfig`` (``vlm_config=None``) triggers
+    # ``RobometerRewardModel.__init__``'s upstream-matching path: download
+    # base Qwen, resize for ROBOMETER_SPECIAL_TOKENS. The subsequent
+    # ``apply_upstream_checkpoint`` call resizes again if the checkpoint's
+    # vocab differs (e.g. upstream was trained against an older Qwen).
+    cfg = RobometerConfig(pretrained_path=path, device="cpu")
+    model = RobometerRewardModel(cfg)
+    apply_upstream_checkpoint(model, path)
+    model.eval()
+    return model
+
+
+def _load_lerobot(path: str) -> RobometerRewardModel:
+    cfg = RewardModelConfig.from_pretrained(path)
+    if not isinstance(cfg, RobometerConfig):
+        raise TypeError(f"Expected RobometerConfig in LeRobot export, got {type(cfg)}")
+    cfg.pretrained_path = path
+    cfg.device = "cpu"
+    return RobometerRewardModel.from_pretrained(path, config=cfg)
+
+
+def compare_state_dicts(a: RobometerRewardModel, b: RobometerRewardModel) -> bool:
+    sd_a, sd_b = a.state_dict(), b.state_dict()
+    keys_a, keys_b = set(sd_a), set(sd_b)
+
+    missing = keys_a - keys_b
+    extra = keys_b - keys_a
+    if missing:
+        print(f"❌ {len(missing)} keys missing in LeRobot-format model (sample: {list(missing)[:5]})")
+    if extra:
+        print(f"❌ {len(extra)} extra keys in LeRobot-format model (sample: {list(extra)[:5]})")
+    if missing or extra:
+        return False
+
+    diff_summary: list[tuple[str, float]] = []
+    for key in sorted(keys_a):
+        ta, tb = sd_a[key], sd_b[key]
+        if ta.shape != tb.shape:
+            print(f"❌ shape mismatch at {key}: {tuple(ta.shape)} vs {tuple(tb.shape)}")
+            return False
+        # Compare in float to avoid bfloat16 equality quirks.
+        max_abs = (ta.float() - tb.float()).abs().max().item()
+        if max_abs > 0:
+            diff_summary.append((key, max_abs))
+
+    if not diff_summary:
+        print(f"✅ All {len(keys_a)} parameters identical")
+        return True
+
+    # Some keys differ; show worst offenders.
+    diff_summary.sort(key=lambda kv: kv[1], reverse=True)
+    print(f"⚠️  {len(diff_summary)} keys differ. Top 10 by max abs diff:")
+    for key, value in diff_summary[:10]:
+        print(f"    {key:60s}  max|Δ| = {value:.3e}")
+
+    # Tolerance: bf16 round-trips can introduce ULP-level noise but no real
+    # change. Allow up to 1e-3 absolute difference; anything larger is a real
+    # divergence.
+    worst = diff_summary[0][1]
+    if worst < 1e-3:
+        print(f"✅ Worst diff {worst:.3e} is within bf16 round-trip tolerance")
+        return True
+    print(f"❌ Worst diff {worst:.3e} exceeds tolerance (1e-3)")
+    return False
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(
+        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+    parser.add_argument("--upstream", required=True, help="Upstream Robometer repo id or local path.")
+    parser.add_argument("--lerobot", required=True, help="LeRobot-format Robometer repo id or local path.")
+    args = parser.parse_args()
+
+    print(f"Loading upstream:        {args.upstream}")
+    upstream = _load_upstream(args.upstream)
+    print(f"Loading LeRobot-format:  {args.lerobot}")
+    lerobot = _load_lerobot(args.lerobot)
+
+    print("\n=== Config comparison ===")
+    config_ok = True
+    for field in [
+        "base_model_id",
+        "torch_dtype",
+        "use_multi_image",
+        "use_per_frame_progress_token",
+        "average_temporal_patches",
+        "frame_pooling",
+        "frame_pooling_attn_temperature",
+        "progress_loss_type",
+        "progress_discrete_bins",
+    ]:
+        a, b = getattr(upstream.config, field), getattr(lerobot.config, field)
+        field_ok = a == b
+        config_ok = config_ok and field_ok
+        ok = "✅" if field_ok else "❌"
+        print(f"  {ok} {field}: upstream={a!r}, lerobot={b!r}")
+
+    print("\n=== State-dict comparison ===")
+    state_dict_ok = compare_state_dicts(upstream, lerobot)
+
+    print()
+    if config_ok and state_dict_ok:
+        print("🎉 Verification passed — safe to flip the default.")
+        return 0
+    print("⛔ Verification failed — DO NOT flip the default.")
+    return 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
@@ -89,9 +89,16 @@ class RewardModelConfig(draccus.ChoiceRegistry, HubMixin, abc.ABC):
    def reward_delta_indices(self) -> list | None:  # type: ignore[type-arg]
        return None

-    @abc.abstractmethod
-    def get_optimizer_preset(self) -> OptimizerConfig:
-        raise NotImplementedError
+    def get_optimizer_preset(self) -> OptimizerConfig | None:
+        """Default optimizer for this reward model, or ``None`` for zero-shot models.
+
+        Trainable reward models (e.g. SARM, Classifier) must override this with a
+        concrete optimizer config. Zero-shot reward models (e.g. Robometer) leave
+        the default ``None`` — they error out earlier via the
+        :attr:`~lerobot.rewards.pretrained.PreTrainedRewardModel.is_trainable`
+        check in ``lerobot-train``.
+        """
+        return None

    def get_scheduler_preset(self) -> LRSchedulerConfig | None:
        return None
@@ -20,11 +20,13 @@ from .factory import (
    make_reward_pre_post_processors as make_reward_pre_post_processors,
 )
 from .pretrained import PreTrainedRewardModel as PreTrainedRewardModel
+from .robometer.configuration_robometer import RobometerConfig as RobometerConfig
 from .sarm.configuration_sarm import SARMConfig as SARMConfig

 __all__ = [
    # Configuration classes
    "RewardClassifierConfig",
+    "RobometerConfig",
    "SARMConfig",
    # Base class
    "PreTrainedRewardModel",
@@ -24,6 +24,7 @@ from lerobot.configs.rewards import RewardModelConfig
 from lerobot.processor import PolicyAction, PolicyProcessorPipeline
 from lerobot.rewards.classifier.configuration_classifier import RewardClassifierConfig
 from lerobot.rewards.pretrained import PreTrainedRewardModel
+from lerobot.rewards.robometer.configuration_robometer import RobometerConfig
 from lerobot.rewards.sarm.configuration_sarm import SARMConfig


@@ -36,7 +37,7 @@ def get_reward_model_class(name: str) -> type[PreTrainedRewardModel]:

    Args:
        name: The name of the reward model. Supported names are "reward_classifier",
-              "sarm".
+              "sarm", "robometer".

    Returns:
        The reward model class corresponding to the given name.
@@ -52,6 +53,10 @@ def get_reward_model_class(name: str) -> type[PreTrainedRewardModel]:
        from lerobot.rewards.sarm.modeling_sarm import SARMRewardModel

        return SARMRewardModel
+    elif name == "robometer":
+        from lerobot.rewards.robometer.modeling_robometer import RobometerRewardModel
+
+        return RobometerRewardModel
    else:
        try:
            return _get_reward_model_cls_from_name(name=name)
@@ -68,7 +73,7 @@ def make_reward_model_config(reward_type: str, **kwargs) -> RewardModelConfig:

    Args:
        reward_type: The type of the reward model. Supported types include
-                     "reward_classifier", "sarm".
+                     "reward_classifier", "sarm", "robometer".
        **kwargs: Keyword arguments to be passed to the configuration class constructor.

    Returns:
@@ -81,6 +86,8 @@ def make_reward_model_config(reward_type: str, **kwargs) -> RewardModelConfig:
        return RewardClassifierConfig(**kwargs)
    elif reward_type == "sarm":
        return SARMConfig(**kwargs)
+    elif reward_type == "robometer":
+        return RobometerConfig(**kwargs)
    else:
        try:
            config_cls = RewardModelConfig.get_choice_class(reward_type)
@@ -160,6 +167,13 @@ def make_reward_pre_post_processors(
            dataset_stats=kwargs.get("dataset_stats"),
            dataset_meta=kwargs.get("dataset_meta"),
        )
+    elif isinstance(reward_cfg, RobometerConfig):
+        from lerobot.rewards.robometer.processor_robometer import make_robometer_pre_post_processors
+
+        return make_robometer_pre_post_processors(
+            config=reward_cfg,
+            dataset_stats=kwargs.get("dataset_stats"),
+        )

    else:
        try:
@@ -0,0 +1,19 @@
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .configuration_robometer import RobometerConfig
+from .modeling_robometer import RobometerRewardModel
+from .processor_robometer import make_robometer_pre_post_processors
+
+__all__ = ["RobometerConfig", "RobometerRewardModel", "make_robometer_pre_post_processors"]
@@ -0,0 +1,229 @@
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Upstream/legacy Robometer checkpoint loader.
+
+This module is **only** used by the one-time conversion tooling
+(:mod:`lerobot.scripts.lerobot_export_robometer` and
+``scripts/verify_robometer_export.py``). It supports:
+
+- Sharded upstream checkpoints (``model-0000X-of-Y.safetensors`` + index).
+- PEFT/LoRA adapter checkpoints (``adapter_config.json`` + adapter weights).
+- Local snapshot directories or Hugging Face Hub repo ids.
+
+Once :class:`~lerobot.rewards.robometer.RobometerRewardModel` is loaded
+through this module, calling ``save_pretrained`` writes the canonical
+LeRobot-native layout (single ``model.safetensors`` + ``config.json``) that
+the base loader understands.
+
+The runtime path
+(:meth:`~lerobot.rewards.pretrained.PreTrainedRewardModel.from_pretrained`)
+does **not** import this file. It is safe to delete once you no longer need
+the conversion tooling.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+from pathlib import Path
+from typing import Any
+
+from huggingface_hub import snapshot_download
+from safetensors.torch import load_file
+from torch import Tensor, nn
+
+from lerobot.utils.import_utils import require_package
+
+logger = logging.getLogger(__name__)
+
+
+def _download_robometer_snapshot(
+    pretrained_path: str,
+    *,
+    hub_token: str | None = None,
+) -> Path:
+    """Resolve a Robometer snapshot directory.
+
+    - If ``pretrained_path`` is an existing local directory, return it directly.
+    - Otherwise treat ``pretrained_path`` as a Hugging Face repo id (optionally
+      with ``@revision``) and download it via ``snapshot_download``.
+    """
+    local_candidate = Path(pretrained_path)
+    if local_candidate.is_dir():
+        return local_candidate
+
+    if "@" in pretrained_path:
+        repo_id, revision = pretrained_path.split("@", 1)
+    else:
+        repo_id, revision = pretrained_path, None
+
+    return Path(
+        snapshot_download(
+            repo_id=repo_id,
+            revision=revision,
+            token=hub_token,
+            allow_patterns=[
+                "*.json",
+                "*.safetensors",
+                "*.bin",
+                "*.txt",
+                "*.model",
+                "tokenizer*",
+                "special_tokens_map.json",
+            ],
+        )
+    )
+
+
+def _maybe_apply_peft(base_model: Any, snapshot_dir: Path) -> Any:
+    adapter_config = snapshot_dir / "adapter_config.json"
+    if not adapter_config.exists():
+        return base_model
+
+    require_package("peft", extra="peft-dep")
+    from peft import PeftModel
+
+    return PeftModel.from_pretrained(base_model, str(snapshot_dir))
+
+
+def _remap_state_dict_keys(state_dict: dict[str, Tensor], model: nn.Module) -> dict[str, Tensor]:
+    """Try a few common prefix swaps so PEFT-wrapped checkpoints load cleanly."""
+    model_keys = set(model.state_dict().keys())
+    remapped: dict[str, Tensor] = {}
+
+    for key, value in state_dict.items():
+        if key in model_keys:
+            remapped[key] = value
+            continue
+
+        candidates: list[str] = []
+        if key.startswith("model.model."):
+            candidates.append(key.replace("model.model.", "model.base_model.model.model.", 1))
+            candidates.append(key.replace("model.model.", "model.", 1))
+        if key.startswith("model."):
+            candidates.append(f"model.{key}")
+            candidates.append(key.replace("model.", "", 1))
+        else:
+            candidates.append(f"model.{key}")
+        if key.startswith("model.") and not key.startswith("model.base_model."):
+            parts = key.split(".", 1)
+            if len(parts) == 2:
+                candidates.append(f"model.base_model.{parts[1]}")
+
+        for candidate in candidates:
+            if candidate in model_keys:
+                remapped[candidate] = value
+                break
+        else:
+            remapped[key] = value
+
+    return remapped
+
+
+def _resolve_checkpoint_safetensors_files(snapshot_dir: Path) -> list[Path]:
+    """Pick the safetensors files that hold the full model weights.
+
+    When ``model.safetensors.index.json`` is present, only the files it lists are
+    loaded. Otherwise any ``model*.safetensors`` shards are preferred over
+    sidecar files. Falls back to every ``*.safetensors`` in the snapshot.
+    """
+    index_path = snapshot_dir / "model.safetensors.index.json"
+    if index_path.exists():
+        with index_path.open() as f:
+            weight_map = json.load(f).get("weight_map", {})
+        indexed = sorted(
+            {snapshot_dir / name for name in weight_map.values() if (snapshot_dir / name).exists()}
+        )
+        if indexed:
+            return indexed
+
+    model_shards = sorted(snapshot_dir.glob("model*.safetensors"))
+    if model_shards:
+        return model_shards
+
+    return sorted(snapshot_dir.glob("*.safetensors"))
+
+
+def apply_upstream_checkpoint(
+    model: nn.Module,
+    pretrained_path: str,
+    *,
+    hub_token: str | None = None,
+) -> None:
+    """Load an upstream (sharded / PEFT) Robometer checkpoint into ``model``.
+
+    Downloads the snapshot, optionally applies PEFT wrapping, merges sharded
+    ``.safetensors`` files in memory, remaps PEFT-prefixed keys, and loads them
+    into ``model`` non-strictly. ``model`` must already be constructed with the
+    matching Robometer architecture (e.g. via
+    :class:`~lerobot.rewards.robometer.RobometerRewardModel` ``__init__``).
+    """
+    snapshot_dir = _download_robometer_snapshot(pretrained_path, hub_token=hub_token)
+
+    # PEFT adapter checkpoints wrap the base model before weight loading so the
+    # remapper can place adapter tensors at the right prefix.
+    base_model = getattr(model, "model", None)
+    if base_model is not None:
+        wrapped = _maybe_apply_peft(base_model, snapshot_dir)
+        if wrapped is not base_model:
+            model.model = wrapped
+
+    files = _resolve_checkpoint_safetensors_files(snapshot_dir)
+    if not files:
+        logger.warning("No *.safetensors files in %s; using freshly initialised heads", snapshot_dir)
+        return
+
+    merged: dict[str, Tensor] = {}
+    for path in files:
+        merged.update(load_file(str(path)))
+
+    remapped = _remap_state_dict_keys(merged, model)
+
+    # Defensive vocab-match. With the corrected resize logic
+    # (``_resize_embeddings_for_robometer`` uses ``len(tokenizer) + 5``),
+    # a freshly built ``RobometerRewardModel`` should already share the same
+    # vocabulary as the upstream checkpoint (e.g. 151,674 for
+    # ``robometer/Robometer-4B``). This block stays in place as a safety net
+    # in case a future upstream variant uses a different vocab — we never
+    # want ``load_state_dict`` to trip on a silent shape mismatch.
+    base_model = getattr(model, "model", None)
+    if base_model is not None and hasattr(base_model, "get_input_embeddings"):
+        for key in (
+            "model.model.language_model.embed_tokens.weight",
+            "model.language_model.embed_tokens.weight",
+            "model.embed_tokens.weight",
+        ):
+            tensor = remapped.get(key)
+            if tensor is None:
+                continue
+            ckpt_vocab = int(tensor.shape[0])
+            current_vocab = int(base_model.get_input_embeddings().num_embeddings)
+            if ckpt_vocab != current_vocab:
+                logger.info(
+                    "Resizing model embed table %d -> %d to match upstream checkpoint vocab "
+                    "(upstream was trained against a different Qwen revision).",
+                    current_vocab,
+                    ckpt_vocab,
+                )
+                base_model.resize_token_embeddings(ckpt_vocab)
+            break
+
+    missing, unexpected = model.load_state_dict(remapped, strict=False)
+    if missing:
+        logger.debug("Robometer checkpoint missing %d keys (sample: %s)", len(missing), missing[:5])
+    if unexpected:
+        logger.debug(
+            "Robometer checkpoint had %d unexpected keys (sample: %s)", len(unexpected), unexpected[:5]
+        )
@@ -0,0 +1,162 @@
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from copy import deepcopy
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Any
+
+from lerobot.configs import FeatureType, NormalizationMode, PolicyFeature
+from lerobot.configs.rewards import RewardModelConfig
+from lerobot.utils.constants import OBS_IMAGES
+from lerobot.utils.import_utils import _transformers_available, require_package
+
+if TYPE_CHECKING or _transformers_available:
+    from transformers import AutoConfig, AutoTokenizer
+else:
+    AutoConfig = None  # type: ignore[assignment]
+    AutoTokenizer = None  # type: ignore[assignment]
+
+
+@RewardModelConfig.register_subclass("robometer")
+@dataclass
+class RobometerConfig(RewardModelConfig):
+    """Configuration for the Robometer reward model."""
+
+    pretrained_path: str | None = "lilkm/Robometer-4B"
+    image_key: str = OBS_IMAGES + ".top"
+    task_key: str = "task"
+    default_task: str | None = None
+
+    max_frames: int | None = 8
+    reward_output: str = "progress"  # "progress" or "success"
+    success_threshold: float = 0.5
+
+    license: str | None = "apache-2.0"
+    tags: list[str] | None = field(
+        default_factory=lambda: ["reward-model", "vision-language", "qwen3-vl", "zero-shot"]
+    )
+
+    base_model_id: str = "Qwen/Qwen3-VL-4B-Instruct"
+    torch_dtype: str = "bfloat16"
+    use_multi_image: bool = True
+    use_per_frame_progress_token: bool = True
+    average_temporal_patches: bool = True
+    frame_pooling: str = "mean"  # "mean" | "boundary" | "attention"
+    frame_pooling_attn_temperature: float = 1.0
+    progress_loss_type: str = "discrete"  # "l1" | "l2" | "discrete"
+    progress_discrete_bins: int = 10
+
+    # Serialised Qwen backbone config (post-resize). Always populated by
+    # ``__post_init__`` from ``base_model_id`` + ``len(tokenizer) + 5``, so it
+    # is never ``None`` after construction (EO-1 style). Saved into
+    # ``config.json`` automatically by the base ``_save_pretrained``.
+    vlm_config: dict[str, Any] | None = None
+
+    input_features: dict[str, PolicyFeature] = field(default_factory=dict)
+    output_features: dict[str, PolicyFeature] = field(default_factory=dict)
+    normalization_mapping: dict[str, NormalizationMode] = field(
+        default_factory=lambda: {
+            "VISUAL": NormalizationMode.IDENTITY,
+            "REWARD": NormalizationMode.IDENTITY,
+        }
+    )
+
+    def __post_init__(self) -> None:
+        super().__post_init__()
+        if self.reward_output not in {"progress", "success"}:
+            raise ValueError(f"reward_output must be 'progress' or 'success', got {self.reward_output!r}")
+        if self.max_frames is not None and self.max_frames < 1:
+            raise ValueError(f"max_frames must be >= 1, got {self.max_frames}")
+        if self.frame_pooling not in {"mean", "boundary", "attention"}:
+            raise ValueError(f"frame_pooling must be mean/boundary/attention; got {self.frame_pooling!r}")
+        if self.frame_pooling_attn_temperature <= 0:
+            raise ValueError("frame_pooling_attn_temperature must be > 0")
+        if self.progress_loss_type not in {"l1", "l2", "discrete"}:
+            raise ValueError(f"progress_loss_type must be l1/l2/discrete; got {self.progress_loss_type!r}")
+        if self.use_per_frame_progress_token and not self.use_multi_image:
+            raise ValueError("use_per_frame_progress_token=True requires use_multi_image=True")
+
+        if self.image_key not in self.input_features:
+            self.input_features[self.image_key] = PolicyFeature(shape=(3, 224, 224), type=FeatureType.VISUAL)
+        self.output_features.setdefault("progress", PolicyFeature(shape=(1,), type=FeatureType.REWARD))
+        self.output_features.setdefault("success", PolicyFeature(shape=(1,), type=FeatureType.REWARD))
+
+        # Deterministically populate ``vlm_config`` so it is never ``None``
+        # after construction (mirrors EO-1's ``__post_init__`` snapshot).
+        # The target vocab matches upstream Robometer's runtime resize
+        # ``base_model.resize_token_embeddings(len(processor.tokenizer))`` —
+        # see ``third_party/robometer/.../setup_utils.py`` —
+        # i.e. ``len(tokenizer) + len(ROBOMETER_SPECIAL_TOKENS)``.
+        #
+        # For ``Qwen/Qwen3-VL-4B-Instruct`` this gives 151,669 + 5 = 151,674,
+        # which is exactly the published ``robometer/Robometer-4B`` checkpoint
+        # vocab. NB: ``text_config.vocab_size`` in the raw Qwen config is the
+        # padded embedding-table size (151,936), not the tokenizer length —
+        # we override it with the tokenizer-driven value to stay consistent
+        # with upstream.
+        if self.vlm_config is None:
+            require_package("transformers", extra="robometer")
+            # Local import avoids a top-level cycle (modeling_robometer imports
+            # this module). ``ROBOMETER_SPECIAL_TOKENS`` is the single source
+            # of truth for the resize delta.
+            from lerobot.rewards.robometer.modeling_robometer import ROBOMETER_SPECIAL_TOKENS
+
+            vlm = AutoConfig.from_pretrained(self.base_model_id).to_dict()
+            tokenizer = AutoTokenizer.from_pretrained(self.base_model_id)
+            text_config = vlm.get("text_config")
+            if not isinstance(text_config, dict):
+                raise ValueError(
+                    f"Backbone config for {self.base_model_id!r} has no nested `text_config`; "
+                    "Robometer expects a Qwen-VL-style config."
+                )
+            text_config["vocab_size"] = len(tokenizer) + len(ROBOMETER_SPECIAL_TOKENS)
+            self.vlm_config = vlm
+
+    @property
+    def use_discrete_progress(self) -> bool:
+        """Whether the progress head outputs distribution logits over bins."""
+        return self.progress_loss_type.lower() == "discrete"
+
+    @property
+    def vlm_backbone_config(self):
+        """Reconstruct the Qwen backbone config from :attr:`vlm_config`.
+
+        ``vlm_config`` is always populated after :meth:`__post_init__`
+        (either fresh, computed from the tokenizer, or loaded from a saved
+        ``config.json`` via draccus).
+        """
+        require_package("transformers", extra="robometer")
+        config_dict = deepcopy(self.vlm_config)
+        model_type = config_dict.pop("model_type", None)
+        if model_type is None:
+            raise ValueError("vlm_config must include `model_type` to reconstruct the backbone config")
+        return AutoConfig.for_model(model_type, **config_dict)
+
+    @property
+    def observation_delta_indices(self) -> list[int] | None:
+        return None
+
+    @property
+    def action_delta_indices(self) -> None:
+        return None
+
+    @property
+    def reward_delta_indices(self) -> None:
+        return None
+
+    def validate_features(self) -> None:
+        if self.image_key not in self.input_features:
+            raise ValueError(f"Robometer requires image input feature {self.image_key!r}")
@@ -0,0 +1,493 @@
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Robometer reward model.
+
+- Qwen3-VL backbone (default: ``Qwen/Qwen3-VL-4B-Instruct``).
+- Progress + success heads at inference; the preference head is preserved in the
+  state dict but not queried.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING, Any
+
+import torch
+from torch import Tensor, nn
+
+from lerobot.rewards.pretrained import PreTrainedRewardModel
+from lerobot.rewards.robometer.configuration_robometer import RobometerConfig
+from lerobot.utils.constants import OBS_PREFIX
+from lerobot.utils.import_utils import _transformers_available, require_package
+
+if TYPE_CHECKING or _transformers_available:
+    from transformers import AutoModelForImageTextToText
+else:
+    AutoModelForImageTextToText = None  # type: ignore[assignment]
+
+logger = logging.getLogger(__name__)
+
+# Namespace for Robometer's pre-encoded Qwen-VL observation tensors. The
+# processor writes both Qwen-VL tensors and Robometer-specific token ids /
+# metadata here; the model reads them at inference (no tokenizer needed in
+# the model — EO1-style separation).
+ROBOMETER_FEATURE_PREFIX = f"{OBS_PREFIX}robometer."
+ROBOMETER_QWEN_INPUT_KEYS = (
+    "input_ids",
+    "attention_mask",
+    "pixel_values",
+    "pixel_values_videos",
+    "image_grid_thw",
+    "video_grid_thw",
+    "second_per_grid_ts",
+)
+ROBOMETER_METADATA_KEYS = (
+    "prog_token_id",
+    "vision_start_token_id",
+    "vision_end_token_id",
+    "video_merge_size",
+)
+ROBOMETER_INPUT_KEYS = ROBOMETER_QWEN_INPUT_KEYS + ROBOMETER_METADATA_KEYS
+
+# Order matters: the released checkpoint resized `embed_tokens` after adding
+# these tokens in this order, so changing the set or order would silently
+# misalign the saved embedding rows with their token ids. `<|reward_token|>`
+# and `<|sim_token|>` are vestigial (never read by any head) but still occupy
+# rows the checkpoint expects.
+ROBOMETER_SPECIAL_TOKENS = (
+    "<|split_token|>",
+    "<|reward_token|>",
+    "<|pref_token|>",
+    "<|sim_token|>",
+    "<|prog_token|>",
+)
+
+
+def convert_bins_to_continuous(bin_logits: Tensor) -> Tensor:
+    """Collapse per-bin logits into a single value in ``[0, 1]``.
+
+    The discrete progress head outputs ``num_bins`` logits per frame. Bins are
+    evenly spaced centers in ``[0, 1]``; the continuous prediction is the
+    softmax-weighted mean of those centers.
+    """
+    bin_probs = torch.softmax(bin_logits, dim=-1)
+    num_bins = bin_logits.shape[-1]
+    bin_centers = torch.linspace(0.0, 1.0, num_bins, device=bin_logits.device, dtype=bin_logits.dtype)
+    return (bin_probs * bin_centers).sum(dim=-1)
+
+
+def squeeze_last_safe(x: Tensor) -> Tensor:
+    """Drop a trailing singleton dim only when present.
+
+    Matches the upstream helper of the same name in
+    ``robometer.models.rbm`` (kept module-level and non-underscored to mirror
+    upstream).
+    """
+    return x.squeeze(-1) if x.ndim > 1 and x.shape[-1] == 1 else x
+
+
+def _torch_dtype(name: str) -> torch.dtype:
+    dtype = getattr(torch, name, None)
+    if isinstance(dtype, torch.dtype):
+        return dtype
+    raise ValueError(f"Unknown torch dtype: {name!r}")
+
+
+class RobometerPredictionHead(nn.Sequential):
+    """Small MLP head used for Robometer's progress / success / preference outputs.
+
+    Subclasses ``nn.Sequential`` (not ``nn.Module``) so the ``state_dict`` keys
+    stay flat (``progress_head.0.weight``, ``progress_head.1.weight``, ...) and
+    remain byte-compatible with the published ``lilkm/robometer-4b`` checkpoint.
+    """
+
+    def __init__(self, hidden_dim: int, output_size: int, *, dropout: float, with_sigmoid: bool) -> None:
+        layers: list[nn.Module] = [
+            nn.Linear(hidden_dim, hidden_dim // 2),
+            nn.LayerNorm(hidden_dim // 2),
+            nn.GELU(),
+            nn.Dropout(dropout),
+            nn.Linear(hidden_dim // 2, output_size),
+        ]
+        if with_sigmoid:
+            layers.append(nn.Sigmoid())
+        super().__init__(*layers)
+
+
+def decode_progress_outputs(
+    progress_logits: Tensor | None,
+    success_logits: Tensor | None,
+    *,
+    is_discrete_mode: bool,
+) -> dict[str, list[list[float]]]:
+    """Decode RBM head outputs into per-frame floats.
+
+    Args:
+        progress_logits: ``(B, T)`` (continuous) or ``(B, T, num_bins)`` (discrete).
+        success_logits: ``(B, T)`` raw logits, ``sigmoid``-ed to probabilities.
+        is_discrete_mode: if True the progress logits get a softmax over bins
+            and are projected onto bin centers via :func:`convert_bins_to_continuous`.
+
+    Returns:
+        Dict with ``progress_pred`` and ``success_probs``, each a list of
+        length ``B`` of per-frame float lists.
+    """
+    progress_pred: list[list[float]] = []
+    success_probs: list[list[float]] = []
+
+    if progress_logits is not None:
+        for sample_logits in progress_logits:
+            if is_discrete_mode:
+                continuous = convert_bins_to_continuous(sample_logits.detach().float().cpu())
+                progress_pred.append(continuous.flatten().tolist())
+            else:
+                progress_pred.append(sample_logits.detach().float().cpu().flatten().tolist())
+
+    if success_logits is not None:
+        for sample_logits in success_logits:
+            success_probs.append(torch.sigmoid(sample_logits.detach().float().cpu()).flatten().tolist())
+
+    return {"progress_pred": progress_pred, "success_probs": success_probs}
+
+
+class RobometerRewardModel(PreTrainedRewardModel):
+    """Robometer reward model: Qwen3-VL backbone + progress/success heads."""
+
+    name = "robometer"
+    config_class = RobometerConfig
+
+    def __init__(self, config: RobometerConfig, *, dropout: float = 0.1) -> None:
+        require_package("transformers", extra="robometer")
+        super().__init__(config)
+        self.config = config
+
+        # Two backbone-build paths (EO-1 style, branched on ``pretrained_path``):
+        #
+        #   - Fresh training (``pretrained_path is None``): download the base
+        #     Qwen weights and resize the embed table to match
+        #     ``vlm_config.text_config.vocab_size`` — populated deterministically
+        #     in ``RobometerConfig.__post_init__`` as
+        #     ``len(tokenizer) + len(ROBOMETER_SPECIAL_TOKENS)``, mirroring
+        #     upstream Robometer's ``_add_special_tokens_and_resize`` in
+        #     ``third_party/robometer/.../setup_utils.py``.
+        #
+        #   - Loading a saved checkpoint (``pretrained_path`` is set): rebuild
+        #     the empty architecture from ``vlm_config`` via
+        #     ``AutoModelForImageTextToText.from_config`` so the subsequent
+        #     ``model.safetensors`` load is a direct fill of the right shape —
+        #     no redundant Qwen weight download.
+        torch_dtype = _torch_dtype(config.torch_dtype)
+        if config.pretrained_path is None:
+            self.model = AutoModelForImageTextToText.from_pretrained(
+                config.base_model_id,
+                dtype=torch_dtype,
+                trust_remote_code=True,
+            )
+            target_vocab = config.vlm_config["text_config"]["vocab_size"]
+            self.model.resize_token_embeddings(target_vocab)
+        else:
+            self.model = AutoModelForImageTextToText.from_config(
+                config.vlm_backbone_config,
+                dtype=torch_dtype,
+                trust_remote_code=True,
+            )
+
+        # All Qwen-VL backbones Robometer supports expose `text_config.hidden_size`.
+        # Falls back to the top-level `hidden_size` so future non-multimodal
+        # variants would still resolve.
+        backbone_config = self.model.config
+        text_config = getattr(backbone_config, "text_config", None)
+        hidden_size = getattr(text_config, "hidden_size", None) if text_config is not None else None
+        if hidden_size is None:
+            hidden_size = getattr(backbone_config, "hidden_size", None)
+        if hidden_size is None:
+            raise AttributeError(
+                f"Could not infer hidden_size from backbone config of {config.base_model_id}"
+            )
+        hidden_dim = int(hidden_size)
+
+        # Robometer's three prediction heads + frame-pool attention. The
+        # preference head is preserved to match the published state-dict layout
+        # even though only progress + success are consumed at inference, and
+        # `frame_pool_attn` is always allocated so checkpoints trained with
+        # `frame_pooling="attention"` load without remapping.
+        progress_output = config.progress_discrete_bins if config.use_discrete_progress else 1
+        self.progress_head = RobometerPredictionHead(
+            hidden_dim,
+            progress_output,
+            dropout=dropout,
+            with_sigmoid=not config.use_discrete_progress,
+        )
+        self.preference_head = RobometerPredictionHead(hidden_dim, 1, dropout=dropout, with_sigmoid=False)
+        self.success_head = RobometerPredictionHead(hidden_dim, 1, dropout=dropout, with_sigmoid=False)
+        self.frame_pool_attn = nn.Linear(hidden_dim, 1, bias=False)
+
+        # Match the dtype of the loaded base model so weight loading is a no-op cast.
+        model_dtype = next(self.model.parameters()).dtype
+        self.progress_head.to(dtype=model_dtype)
+        self.preference_head.to(dtype=model_dtype)
+        self.success_head.to(dtype=model_dtype)
+        self.frame_pool_attn.to(dtype=model_dtype)
+
+    def compute_reward(self, batch: dict[str, Tensor]) -> Tensor:
+        inputs = {
+            key: batch[f"{ROBOMETER_FEATURE_PREFIX}{key}"]
+            for key in ROBOMETER_INPUT_KEYS
+            if f"{ROBOMETER_FEATURE_PREFIX}{key}" in batch
+        }
+        if "input_ids" not in inputs:
+            raise KeyError(
+                f"Robometer batch missing pre-encoded inputs (expected "
+                f"`{ROBOMETER_FEATURE_PREFIX}input_ids`). Make sure the "
+                "RobometerEncoderProcessorStep ran before `compute_reward`."
+            )
+
+        device = next(self.model.parameters()).device
+        inputs = {key: value.to(device) if hasattr(value, "to") else value for key, value in inputs.items()}
+
+        self.eval()
+        with torch.no_grad():
+            progress_logits, success_logits = self._compute_rbm_logits(inputs)
+
+        decoded = decode_progress_outputs(
+            progress_logits,
+            success_logits,
+            is_discrete_mode=self.config.use_discrete_progress,
+        )
+        values = (
+            decoded["success_probs"] if self.config.reward_output == "success" else decoded["progress_pred"]
+        )
+
+        rewards = torch.stack([torch.as_tensor(seq, dtype=torch.float32)[-1] for seq in values])
+        if self.config.reward_output == "success":
+            rewards = (rewards > self.config.success_threshold).float()
+        return rewards.to(self.config.device or "cpu")
+
+    def _compute_rbm_logits(
+        self,
+        inputs: dict[str, Any],
+    ) -> tuple[Tensor, Tensor]:
+        """Run the Qwen3-VL backbone and apply Robometer's heads.
+
+        ``inputs`` is the encoded batch produced by
+        :class:`RobometerEncoderProcessorStep`. It carries Qwen tensors as well
+        as Robometer-specific metadata (``prog_token_id``,
+        ``vision_start_token_id``, ``vision_end_token_id``, ``video_merge_size``)
+        — the metadata is popped here so the rest can be forwarded straight to
+        the Qwen model.
+
+        Returns ``(progress_logits, success_logits)``. Shapes:
+
+        - ``progress_logits``: ``(B, T)`` (continuous) or ``(B, T, num_bins)`` (discrete).
+        - ``success_logits``: ``(B, T)`` raw logits (sigmoid happens at decode time).
+        """
+        prog_token_id = inputs.pop("prog_token_id", None)
+        vision_start_token_id = inputs.pop("vision_start_token_id", None)
+        vision_end_token_id = inputs.pop("vision_end_token_id", None)
+        video_merge_size = inputs.pop("video_merge_size", 14)
+
+        # Qwen3-VL doesn't reliably populate `last_hidden_state`; ask for the
+        # full hidden-state tuple and take the last layer. This matches the
+        # `is_qwen3` path in upstream Robometer's `RBM.forward_qwen` (main).
+        outputs = self.model(**inputs, output_hidden_states=True, return_dict=True)
+        hidden_state = (
+            outputs.hidden_states[-1]
+            if getattr(outputs, "hidden_states", None)
+            else outputs.last_hidden_state
+        )
+
+        input_ids = inputs["input_ids"]
+        if self.config.use_per_frame_progress_token:
+            if prog_token_id is None:
+                raise KeyError("`prog_token_id` missing in batch (run RobometerEncoderProcessorStep first)")
+            return self._process_token_extraction(hidden_state, input_ids, prog_token_id=prog_token_id)
+        if self.config.use_multi_image:
+            if vision_start_token_id is None or vision_end_token_id is None:
+                raise KeyError(
+                    "`vision_start_token_id` / `vision_end_token_id` missing in batch "
+                    "(run RobometerEncoderProcessorStep first)"
+                )
+            return self._process_multi_image_frames(
+                hidden_state,
+                input_ids,
+                start_id=vision_start_token_id,
+                end_id=vision_end_token_id,
+            )
+        video_grid_thw = inputs.get("video_grid_thw")
+        if video_grid_thw is None:
+            raise ValueError("video_grid_thw is required for video-mode Robometer inference")
+        if vision_start_token_id is None:
+            raise KeyError("`vision_start_token_id` missing in batch")
+        return self._process_video_frames(
+            hidden_state,
+            input_ids,
+            video_grid_thw,
+            start_id=vision_start_token_id,
+            merge_size=video_merge_size,
+        )
+
+    def _apply_heads_to_hidden_states(self, frame_embeddings: Tensor) -> tuple[Tensor, Tensor]:
+        """Apply progress + success heads to a tensor of frame embeddings.
+
+        Mirrors upstream ``RBM._apply_heads_to_hidden_states``.
+        """
+        progress_out = self.progress_head(frame_embeddings)
+        progress = progress_out if self.config.use_discrete_progress else squeeze_last_safe(progress_out)
+        success = squeeze_last_safe(self.success_head(frame_embeddings))
+        return progress, success
+
+    def _process_token_extraction(
+        self,
+        hidden_state: Tensor,
+        input_ids: Tensor,
+        *,
+        prog_token_id: int,
+    ) -> tuple[Tensor, Tensor]:
+        """Per-frame progress/success from ``<|prog_token|>`` positions.
+
+        Mirrors the progress-sample branch of upstream
+        ``RBM._process_token_extraction``.
+        """
+        token_mask = input_ids == prog_token_id
+        batch_indices, positions = token_mask.nonzero(as_tuple=True)
+        if positions.numel() == 0:
+            raise ValueError("`<|prog_token|>` not found in any sequence")
+
+        per_sample_hidden = [
+            hidden_state[i, positions[batch_indices == i]] for i in range(input_ids.shape[0])
+        ]
+        progress_list, success_list = [], []
+        for embeddings in per_sample_hidden:
+            if embeddings.shape[0] == 0:
+                raise ValueError("`<|prog_token|>` missing in a sequence")
+            progress, success = self._apply_heads_to_hidden_states(embeddings)
+            progress_list.append(progress)
+            success_list.append(success)
+
+        return torch.stack(progress_list), torch.stack(success_list)
+
+    def _process_multi_image_frames(
+        self,
+        hidden_state: Tensor,
+        input_ids: Tensor,
+        *,
+        start_id: int,
+        end_id: int,
+    ) -> tuple[Tensor, Tensor]:
+        """Per-frame progress/success in multi-image mode (Qwen-VL).
+
+        Mirrors upstream ``RBM._process_multi_image_frames`` (progress-sample
+        branch only — we don't run preference at inference).
+        """
+        progress_list, success_list = [], []
+        for batch_idx in range(input_ids.shape[0]):
+            seq_ids = input_ids[batch_idx]
+            seq_hidden = hidden_state[batch_idx]
+            frame_embeddings = self._extract_hidden_states_from_token_pairs(
+                seq_hidden, seq_ids, start_id, end_id
+            )
+            progress, success = self._apply_heads_to_hidden_states(frame_embeddings)
+            progress_list.append(progress)
+            success_list.append(success)
+
+        return torch.stack(progress_list), torch.stack(success_list)
+
+    def _extract_hidden_states_from_token_pairs(
+        self,
+        hidden_state: Tensor,
+        input_ids: Tensor,
+        start_id: int,
+        end_id: int,
+    ) -> Tensor:
+        start_positions = (input_ids == start_id).nonzero(as_tuple=True)[0]
+        end_positions = (input_ids == end_id).nonzero(as_tuple=True)[0]
+        if start_positions.numel() == 0:
+            raise ValueError("`<|vision_start|>` not found in sequence")
+        if start_positions.numel() != end_positions.numel():
+            raise ValueError(
+                f"Mismatched vision token counts: {start_positions.numel()} start vs "
+                f"{end_positions.numel()} end"
+            )
+
+        frames: list[Tensor] = []
+        for start, end in zip(start_positions.tolist(), end_positions.tolist(), strict=True):
+            if start >= end:
+                raise ValueError(f"Invalid vision token pair: start={start} end={end}")
+            patch_tokens = hidden_state[start + 1 : end]
+            if patch_tokens.shape[0] == 0:
+                frames.append((hidden_state[start] + hidden_state[end]) / 2.0)
+                continue
+
+            pooling = self.config.frame_pooling
+            if pooling == "mean":
+                frames.append(patch_tokens.mean(dim=0))
+            elif pooling == "boundary":
+                frames.append(patch_tokens[-1])
+            else:  # attention
+                scores = (
+                    self.frame_pool_attn(patch_tokens).squeeze(-1)
+                    / self.config.frame_pooling_attn_temperature
+                )
+                weights = torch.softmax(scores, dim=0).unsqueeze(-1)
+                frames.append((weights * patch_tokens).sum(dim=0))
+
+        return torch.stack(frames)
+
+    def _process_video_frames(
+        self,
+        hidden_state: Tensor,
+        input_ids: Tensor,
+        video_grid_thw: Tensor,
+        *,
+        start_id: int,
+        merge_size: int,
+    ) -> tuple[Tensor, Tensor]:
+        """Per-frame progress/success in video mode (Qwen-VL).
+
+        Mirrors upstream ``RBM._process_video_frames`` /
+        ``RBM._extract_progress_from_trajectory`` (progress-sample branch
+        only — preference is not run at inference). In particular,
+        ``average_temporal_patches=False`` reads the *boundary* token at
+        ``cursor + tokens_per_frame`` to match upstream byte-for-byte.
+        """
+        progress_list, success_list = [], []
+        for batch_idx in range(input_ids.shape[0]):
+            seq_ids = input_ids[batch_idx]
+            seq_hidden = hidden_state[batch_idx]
+            start_positions = (seq_ids == start_id).nonzero(as_tuple=True)[0]
+            if start_positions.numel() == 0:
+                raise ValueError("`<|vision_start|>` not found in sequence")
+            t_dim, h_dim, w_dim = (int(x) for x in video_grid_thw[batch_idx].tolist())
+            tokens_per_frame = (h_dim * w_dim) // (merge_size**2)
+
+            cursor = start_positions[0].item()
+            frame_embeddings: list[Tensor] = []
+            for _ in range(t_dim):
+                if self.config.average_temporal_patches:
+                    patch = seq_hidden[cursor : cursor + tokens_per_frame]
+                    frame_embeddings.append(patch.mean(dim=0))
+                else:
+                    # Upstream takes the position *one past* the patch span as
+                    # the per-frame boundary; see
+                    # `RBM._extract_progress_from_trajectory`.
+                    frame_embeddings.append(seq_hidden[cursor + tokens_per_frame])
+                cursor += tokens_per_frame
+
+            stacked = torch.stack(frame_embeddings)
+            progress, success = self._apply_heads_to_hidden_states(stacked)
+            progress_list.append(progress)
+            success_list.append(success)
+
+        return torch.stack(progress_list), torch.stack(success_list)
@@ -0,0 +1,348 @@
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Robometer pre/post processing pipelines."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Any
+
+import numpy as np
+import torch
+from PIL import Image
+from torch import Tensor
+
+from lerobot.configs import PipelineFeatureType, PolicyFeature
+from lerobot.processor import (
+    AddBatchDimensionProcessorStep,
+    DeviceProcessorStep,
+    PolicyAction,
+    PolicyProcessorPipeline,
+    ProcessorStep,
+    ProcessorStepRegistry,
+    policy_action_to_transition,
+)
+from lerobot.rewards.robometer.configuration_robometer import RobometerConfig
+from lerobot.rewards.robometer.modeling_robometer import (
+    ROBOMETER_FEATURE_PREFIX,
+    ROBOMETER_SPECIAL_TOKENS,
+)
+from lerobot.types import EnvTransition, TransitionKey
+from lerobot.utils.constants import (
+    OBS_IMAGES,
+    POLICY_POSTPROCESSOR_DEFAULT_NAME,
+    POLICY_PREPROCESSOR_DEFAULT_NAME,
+)
+from lerobot.utils.import_utils import _transformers_available, require_package
+
+if TYPE_CHECKING or _transformers_available:
+    from transformers import AutoProcessor
+else:
+    AutoProcessor = None
+
+PROGRESS_PROMPT = (
+    "The task for the robot is '{task}'. Given the trajectory video, predict "
+    "the task progress at each frame, how far along the robot is towards "
+    "completing the task, a float between 0 and 1, where 0 is the starting "
+    "state and 1 is when the task is completed. If the robot is not "
+    "performing the same task, predict 0 progress."
+)
+
+
+def _frames_to_pil(frames: np.ndarray) -> list[Image.Image]:
+    """Convert ``(T, H, W, C)`` uint8 frames to a list of PIL images."""
+    if frames.ndim != 4:
+        raise ValueError(f"Expected (T,H,W,C) frames; got shape {frames.shape}")
+    if frames.dtype != np.uint8:
+        frames = np.clip(frames, 0, 255).astype(np.uint8)
+    return [Image.fromarray(frames[i]) for i in range(frames.shape[0])]
+
+
+def _video_to_numpy(video: Tensor, *, max_frames: int | None) -> np.ndarray:
+    """Convert one trajectory tensor to a ``(T, H, W, C) uint8`` numpy array."""
+    if max_frames is not None:
+        video = video[-max_frames:]
+    if video.shape[1] in (1, 3):
+        video = video.permute(0, 2, 3, 1)
+    elif video.shape[-1] not in (1, 3):
+        raise ValueError(f"Expected channel dim of size 1 or 3, got shape {tuple(video.shape)}")
+
+    array = video.detach().cpu().numpy()
+    if np.issubdtype(array.dtype, np.floating) and array.size > 0 and array.max() <= 1.0:
+        array = array * 255.0
+    return np.clip(array, 0, 255).astype(np.uint8)
+
+
+def _expand_tasks(task: Any, *, batch_size: int, default: str | None) -> list[str]:
+    if task is None:
+        task = default
+    if task is None:
+        raise KeyError("Robometer expected a task description in complementary data")
+    if isinstance(task, str):
+        return [task] * batch_size
+    if isinstance(task, tuple):
+        task = list(task)
+    if not (isinstance(task, list) and all(isinstance(item, str) for item in task)):
+        raise TypeError(f"Robometer task must be a string or list of strings, got {type(task)}")
+    if len(task) == 1 and batch_size > 1:
+        return task * batch_size
+    if len(task) != batch_size:
+        raise ValueError(f"Expected {batch_size} tasks, got {len(task)}")
+    return task
+
+
+@dataclass
+@ProcessorStepRegistry.register(name="robometer_encoder")
+class RobometerEncoderProcessorStep(ProcessorStep):
+    """Encode raw frames + task into Qwen-VL tensors for the Robometer model.
+
+    Loads a :class:`~transformers.AutoProcessor` matching ``base_model_id`` and
+    registers Robometer's special tokens on the tokenizer. The matching
+    embedding resize happens model-side in
+    :meth:`RobometerRewardModel.__init__`. This step owns the tokenizer — the
+    model itself never needs one — and is the EO1-style boundary between
+    pre-processing and modeling.
+
+    At call time the step reads:
+
+    - ``observation[image_key]``: ``(B, T, C, H, W)`` or ``(B, C, H, W)`` frames.
+    - ``complementary_data[task_key]``: a string or list of strings.
+
+    and writes ``observation[f"{ROBOMETER_FEATURE_PREFIX}<name>"]`` for:
+
+    - the Qwen-VL processor outputs: ``input_ids``, ``attention_mask``,
+      ``pixel_values``, ``image_grid_thw``, ``video_grid_thw``, ...
+    - Robometer-specific token ids consumed by the model heads:
+      ``prog_token_id``, ``vision_start_token_id``, ``vision_end_token_id``,
+      ``video_merge_size``.
+    """
+
+    base_model_id: str = "Qwen/Qwen3-VL-4B-Instruct"
+    image_key: str = OBS_IMAGES + ".top"
+    task_key: str = "task"
+    default_task: str | None = None
+    max_frames: int | None = 8
+    use_multi_image: bool = True
+    use_per_frame_progress_token: bool = True
+    max_length: int = 1024
+
+    _processor: Any = field(default=None, init=False, repr=False)
+
+    def __post_init__(self) -> None:
+        require_package("transformers", extra="robometer")
+        require_package("qwen-vl-utils", extra="robometer", import_name="qwen_vl_utils")
+
+        self._processor = AutoProcessor.from_pretrained(
+            self.base_model_id,
+            trust_remote_code=True,
+            do_sample_frames=False,
+            padding_side="right",
+        )
+
+        # Register Robometer's special tokens on the tokenizer. The matching
+        # embedding resize happens model-side in `RobometerRewardModel.__init__`.
+        tokenizer = self._processor.tokenizer
+        # Qwen tokenizers may not define a pad token, but batched prompts/videos
+        # require padding, so reuse EOS as the padding token.
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+        for token in ROBOMETER_SPECIAL_TOKENS:
+            if token not in tokenizer.get_vocab():
+                tokenizer.add_special_tokens({"additional_special_tokens": [token]})
+
+    def __call__(self, transition: EnvTransition) -> EnvTransition:
+        observation = transition.get(TransitionKey.OBSERVATION)
+        complementary = transition.get(TransitionKey.COMPLEMENTARY_DATA) or {}
+        if not isinstance(observation, dict):
+            raise ValueError("RobometerEncoderProcessorStep requires an observation dict")
+
+        if self.image_key not in observation:
+            raise KeyError(f"Robometer expected image key {self.image_key!r} in observation")
+
+        frames = observation[self.image_key]
+        tensor = frames.detach().cpu() if isinstance(frames, Tensor) else torch.as_tensor(frames)
+        if tensor.ndim == 4:
+            tensor = tensor.unsqueeze(1)
+        elif tensor.ndim != 5:
+            raise ValueError(
+                f"Expected Robometer frames with shape (B,C,H,W) or (B,T,C,H,W); got {tuple(tensor.shape)}"
+            )
+
+        batch_size = tensor.shape[0]
+        tasks = _expand_tasks(
+            complementary.get(self.task_key, self.default_task),
+            batch_size=batch_size,
+            default=self.default_task,
+        )
+
+        samples = [
+            (_video_to_numpy(tensor[i], max_frames=self.max_frames), tasks[i]) for i in range(batch_size)
+        ]
+        encoded = self.encode_samples(samples)
+
+        new_observation = dict(observation)
+        for key, value in encoded.items():
+            new_observation[f"{ROBOMETER_FEATURE_PREFIX}{key}"] = value
+
+        new_transition = transition.copy()
+        new_transition[TransitionKey.OBSERVATION] = new_observation
+        return new_transition
+
+    def encode_samples(self, samples: list[tuple[np.ndarray, str]]) -> dict[str, Tensor]:
+        """Run the Qwen-VL processor on a list of ``(frames, task)`` samples.
+
+        Used internally by ``__call__`` and exposed for callers that want to
+        run the encoder on a single trajectory without building an
+        :class:`EnvTransition` (see ``examples/dataset/create_robometer_progress_videos.py``).
+        """
+        from qwen_vl_utils import process_vision_info
+
+        conversations = [self._build_conversation(frames, task) for frames, task in samples]
+
+        texts = [
+            self._processor.apply_chat_template(
+                msg,
+                tokenize=False,
+                add_generation_prompt=False,
+                add_vision_id=True,
+                enable_thinking=False,
+                fps=1,
+            )
+            for msg in conversations
+        ]
+
+        process_kwargs: dict[str, Any] = {
+            "return_video_kwargs": True,
+            "return_video_metadata": True,
+        }
+        image_processor = getattr(self._processor, "image_processor", None)
+        if image_processor is not None and hasattr(image_processor, "patch_size"):
+            process_kwargs["image_patch_size"] = image_processor.patch_size
+
+        image_inputs, video_inputs, video_kwargs = process_vision_info(conversations, **process_kwargs)
+
+        videos: list[Any] | None = None
+        video_metadatas: list[Any] | None = None
+        if video_inputs:
+            if isinstance(video_inputs[0], tuple) and len(video_inputs[0]) == 2:
+                videos_seq, metadatas_seq = zip(*video_inputs, strict=False)
+                videos = list(videos_seq)
+                video_metadatas = list(metadatas_seq)
+            else:
+                videos = list(video_inputs)
+
+        processor_kwargs: dict[str, Any] = {
+            "text": texts,
+            "images": image_inputs,
+            "padding": True,
+            "truncation": False,
+            "max_length": self.max_length,
+            "return_tensors": "pt",
+            "do_resize": False,
+        }
+        if videos is not None:
+            processor_kwargs["videos"] = videos
+        if video_metadatas is not None:
+            processor_kwargs["video_metadata"] = video_metadatas
+        if video_kwargs:
+            processor_kwargs.update(video_kwargs)
+
+        encoded = self._processor(**processor_kwargs)
+
+        # Write Robometer-specific token ids and the video patch merge size into
+        # the encoded batch so `RobometerRewardModel` doesn't need its own
+        # tokenizer at inference (EO1-style separation: the processor owns the
+        # tokenizer, the model owns the backbone and heads).
+        tokenizer = self._processor.tokenizer
+        encoded["prog_token_id"] = tokenizer.convert_tokens_to_ids("<|prog_token|>")
+        encoded["vision_start_token_id"] = tokenizer.convert_tokens_to_ids("<|vision_start|>")
+        encoded["vision_end_token_id"] = tokenizer.convert_tokens_to_ids("<|vision_end|>")
+        video_processor = getattr(self._processor, "video_processor", None)
+        encoded["video_merge_size"] = int(getattr(video_processor, "merge_size", 14))
+        return encoded
+
+    def _build_conversation(self, frames: np.ndarray, task: str) -> list[dict[str, Any]]:
+        pil_frames = _frames_to_pil(frames)
+        prompt = PROGRESS_PROMPT.format(task=task)
+        content: list[dict[str, Any]] = [{"type": "text", "text": prompt}]
+
+        if self.use_multi_image:
+            for image in pil_frames:
+                content.append({"type": "image", "image": image})
+                if self.use_per_frame_progress_token:
+                    content.append({"type": "text", "text": "<|prog_token|>"})
+        else:
+            content.append({"type": "video", "video": pil_frames, "sample_fps": 1.0})
+
+        return [{"role": "user", "content": content}]
+
+    def transform_features(
+        self, features: dict[PipelineFeatureType, dict[str, PolicyFeature]]
+    ) -> dict[PipelineFeatureType, dict[str, PolicyFeature]]:
+        # The Qwen-VL processor produces variable-length sequence tensors that
+        # don't fit the static `PolicyFeature(shape=...)` mould; we deliberately
+        # do not advertise the new observation keys here.
+        return features
+
+    def get_config(self) -> dict[str, Any]:
+        return {
+            "base_model_id": self.base_model_id,
+            "image_key": self.image_key,
+            "task_key": self.task_key,
+            "default_task": self.default_task,
+            "max_frames": self.max_frames,
+            "use_multi_image": self.use_multi_image,
+            "use_per_frame_progress_token": self.use_per_frame_progress_token,
+            "max_length": self.max_length,
+        }
+
+
+def make_robometer_pre_post_processors(
+    config: RobometerConfig,
+    dataset_stats: dict[str, dict[str, Any]] | None = None,
+) -> tuple[
+    PolicyProcessorPipeline[dict[str, Any], dict[str, Any]],
+    PolicyProcessorPipeline[PolicyAction, PolicyAction],
+]:
+    """Pipeline that pre-encodes frames + task into Qwen-VL tensors.
+
+    The preprocessor adds a batch dimension if needed, runs Robometer's
+    encoder, and moves everything to the configured device. The
+    postprocessor is the identity since Robometer outputs a single reward
+    tensor (no action to un-normalise).
+    """
+    del dataset_stats  # Robometer has its own normalisation inside the Qwen-VL processor.
+
+    preprocessor = PolicyProcessorPipeline[dict[str, Any], dict[str, Any]](
+        steps=[
+            AddBatchDimensionProcessorStep(),
+            RobometerEncoderProcessorStep(
+                base_model_id=config.base_model_id,
+                image_key=config.image_key,
+                task_key=config.task_key,
+                default_task=config.default_task,
+                max_frames=config.max_frames,
+                use_multi_image=config.use_multi_image,
+                use_per_frame_progress_token=config.use_per_frame_progress_token,
+            ),
+            DeviceProcessorStep(device=config.device or "cpu"),
+        ],
+        name=POLICY_PREPROCESSOR_DEFAULT_NAME,
+    )
+    postprocessor = PolicyProcessorPipeline(
+        name=POLICY_POSTPROCESSOR_DEFAULT_NAME,
+        to_transition=policy_action_to_transition,
+    )
+    return preprocessor, postprocessor
@@ -0,0 +1,151 @@
+#!/usr/bin/env python
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Re-save a Robometer checkpoint in LeRobot HF format.
+
+LeRobot's reward model format is ``config.json`` (a draccus-encoded
+:class:`~lerobot.rewards.robometer.RobometerConfig`) plus a single
+``model.safetensors`` containing the merged base + heads weights. The
+released checkpoint at ``lilkm/robometer-4b`` already follows this layout;
+this script is for converting other Robometer variants (e.g. a future
+upstream release or a local training run) into the same format.
+
+Example:
+
+.. code-block:: shell
+
+   lerobot-export-robometer \\
+       --src robometer/Robometer-4B \\
+       --dst ./robometer-4b-lerobot
+"""
+
+from __future__ import annotations
+
+import argparse
+import logging
+from pathlib import Path
+
+from lerobot.rewards.robometer import RobometerConfig, RobometerRewardModel
+from lerobot.rewards.robometer._upstream_loader import apply_upstream_checkpoint
+from lerobot.utils.utils import init_logging
+
+
+def export_robometer_to_lerobot(
+    src: str,
+    dst: str | Path,
+    *,
+    device: str = "cpu",
+    dataset_repo_id: str = "",
+    write_model_card: bool = True,
+) -> Path:
+    """Load Robometer from ``src`` and re-save it under ``dst`` in LeRobot HF format.
+
+    Produces ``config.json``, ``model.safetensors``, and (optionally) ``README.md``.
+
+    Args:
+        src: Upstream source. Hugging Face repo id (``"robometer/Robometer-4B"``,
+            optionally ``"...@revision"``) or a local snapshot directory.
+        dst: Output directory. ``config.json`` and ``model.safetensors`` are
+            written here.
+        device: Where to place the model during loading. Defaults to CPU; use
+            ``"cuda"`` if you want to verify on GPU before saving.
+        dataset_repo_id: Hugging Face dataset id the model was trained on
+            (e.g. ``"robometer/RBM-1M"``). Written into the model card's
+            ``datasets:`` metadata. Leave empty if not applicable.
+        write_model_card: Generate a ``README.md`` using LeRobot's reward
+            model card template. Disable if you want to write the README
+            yourself.
+
+    Returns:
+        The resolved output directory.
+    """
+    # A fresh ``RobometerConfig`` has ``vlm_config=None``, which routes
+    # ``__init__`` through the upstream-matching path: download base Qwen,
+    # resize embeddings per ``ROBOMETER_SPECIAL_TOKENS``. ``apply_upstream_checkpoint``
+    # then resizes again (if needed) to match the upstream checkpoint's vocab
+    # and overlays its weights. ``_save_pretrained`` snapshots the resulting
+    # post-resize architecture into ``vlm_config`` for fast future loads.
+    cfg = RobometerConfig(pretrained_path=src, device=device)
+    model = RobometerRewardModel(cfg)
+    apply_upstream_checkpoint(model, src)
+    model.to(device)
+    model.eval()
+
+    dst = Path(dst)
+    dst.mkdir(parents=True, exist_ok=True)
+    model.save_pretrained(str(dst))
+
+    if write_model_card:
+        card = model.generate_model_card(
+            dataset_repo_id=dataset_repo_id,
+            model_type=model.config.type,
+            license=model.config.license,
+            tags=model.config.tags,
+        )
+        card.save(str(dst / "README.md"))
+
+    return dst
+
+
+def _parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+    parser.add_argument(
+        "--src",
+        default="robometer/Robometer-4B",
+        help="Upstream Robometer source (HF repo id or local directory).",
+    )
+    parser.add_argument(
+        "--dst",
+        required=True,
+        help="Output directory for the LeRobot-format checkpoint.",
+    )
+    parser.add_argument(
+        "--device",
+        default="cpu",
+        help="Torch device to load the model on (default: cpu). Conversion only "
+        "needs CPU; use cuda if you also want to smoke-test inference.",
+    )
+    parser.add_argument(
+        "--dataset",
+        default="",
+        help="Optional Hugging Face dataset id used for training "
+        "(e.g. `robometer/RBM-1M`). Written into the auto-generated model card's "
+        "`datasets:` metadata.",
+    )
+    parser.add_argument(
+        "--no-readme",
+        action="store_true",
+        help="Skip writing README.md. Use if you want to author the model card by hand.",
+    )
+    return parser.parse_args()
+
+
+def main() -> None:
+    init_logging()
+    args = _parse_args()
+    out = export_robometer_to_lerobot(
+        src=args.src,
+        dst=args.dst,
+        device=args.device,
+        dataset_repo_id=args.dataset,
+        write_model_card=not args.no_readme,
+    )
+    logging.info("Saved LeRobot-format Robometer checkpoint to %s", out)
+
+
+if __name__ == "__main__":
+    main()
@@ -41,8 +41,6 @@ For more details, see the [Physical Intelligence π₀ blog post](https://www.ph
 For more details, see the [Physical Intelligence π₀.₅ blog post](https://www.physicalintelligence.company/blog/pi05).
 {% elif model_name == "gaussian_actor" %}
 This is a Gaussian Actor policy (Gaussian policy with a tanh squash) — the policy-side component used by [Soft Actor-Critic (SAC)](https://huggingface.co/papers/1801.01290) and related maximum-entropy continuous-control algorithms.
-{% elif model_name == "reward_classifier" %}
-A reward classifier is a lightweight neural network that scores observations or trajectories for task success, providing a learned reward signal or offline evaluation when explicit rewards are unavailable.
 {% else %}
 _Model type not recognized — please update this template._
 {% endif %}
@@ -13,6 +13,8 @@
 A reward classifier is a lightweight neural network that scores observations or trajectories for task success, providing a learned reward signal or offline evaluation when explicit rewards are unavailable.
 {% elif model_name == "sarm" %}
 A Success-Aware Reward Model (SARM) predicts a dense reward signal from observations, typically used downstream for reinforcement learning or human-in-the-loop fine-tuning when task success is not directly observable.
+{% elif model_name == "robometer" %}
+Robometer is a zero-shot general-purpose robotic reward model built on a fine-tuned Qwen3-VL backbone with progress, preference, and success heads. Given a video and a task description it outputs a per-frame progress signal in [0, 1] and a per-frame success probability — suitable for offline reward labelling and for low-frequency reward signals during RL fine-tuning of robot policies.
 {% else %}
 _Reward model type not recognized — please update this template._
 {% endif %}
@@ -0,0 +1,299 @@
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for the in-tree Robometer reward model."""
+
+from __future__ import annotations
+
+from types import SimpleNamespace
+
+import pytest
+import torch
+
+from lerobot.configs.rewards import RewardModelConfig
+from lerobot.rewards.factory import get_reward_model_class, make_reward_model_config
+from lerobot.rewards.robometer import RobometerConfig
+from lerobot.rewards.robometer.modeling_robometer import (
+    ROBOMETER_FEATURE_PREFIX,
+    convert_bins_to_continuous,
+    decode_progress_outputs,
+)
+
+
+class _FakeQwenConfig:
+    """Stand-in for a Qwen3-VL config (the `model.config` attribute).
+
+    ``to_dict`` matches HF's ``PretrainedConfig.to_dict`` closely enough for
+    ``RobometerRewardModel._save_pretrained`` to snapshot a meaningful
+    ``vlm_config`` into the saved ``config.json`` and for the reload path
+    to round-trip through ``AutoConfig.for_model``.
+    """
+
+    def __init__(self, hidden_dim: int = 8, vocab_size: int = 100) -> None:
+        self.text_config = SimpleNamespace(hidden_size=hidden_dim, vocab_size=vocab_size)
+        self._hidden_dim = hidden_dim
+        self._vocab_size = vocab_size
+
+    def to_dict(self) -> dict:
+        return {
+            "model_type": "fake_qwen",
+            "text_config": {
+                "hidden_size": self._hidden_dim,
+                "vocab_size": self._vocab_size,
+            },
+        }
+
+
+class _FakeEmbeddings(torch.nn.Module):
+    def __init__(self, num_embeddings: int = 100) -> None:
+        super().__init__()
+        self.num_embeddings = num_embeddings
+
+
+class _FakeBaseModel(torch.nn.Module):
+    """Stand-in for the Qwen3-VL backbone during tests.
+
+    Provides the minimum surface `RobometerRewardModel.__init__` and
+    `_compute_rbm_logits` rely on: a `parameters()` iterator (for dtype +
+    device), a `config.text_config.hidden_size`, a `config.to_dict()` so
+    `_save_pretrained` can snapshot `vlm_config`,
+    `get_input_embeddings()` / `resize_token_embeddings()` so the fresh-init
+    embed resize is a no-op, and a forward that returns a `SimpleNamespace`
+    with a `hidden_states` tuple.
+    """
+
+    def __init__(self, hidden_dim: int = 8) -> None:
+        super().__init__()
+        self._param = torch.nn.Parameter(torch.zeros(1))
+        self.hidden_dim = hidden_dim
+        self.config = _FakeQwenConfig(hidden_dim)
+        self._embeddings = _FakeEmbeddings()
+
+    def get_input_embeddings(self) -> _FakeEmbeddings:
+        return self._embeddings
+
+    def resize_token_embeddings(self, new_size: int) -> None:
+        self._embeddings.num_embeddings = new_size
+
+    def forward(self, **kwargs):  # noqa: ARG002 - intentional kwargs sink
+        input_ids = kwargs["input_ids"]
+        return SimpleNamespace(
+            hidden_states=(torch.zeros(input_ids.shape[0], input_ids.shape[1], self.hidden_dim),),
+            last_hidden_state=torch.zeros(input_ids.shape[0], input_ids.shape[1], self.hidden_dim),
+        )
+
+
+class _FakeTokenizer:
+    """Minimal stand-in for an HF tokenizer.
+
+    ``RobometerConfig.__post_init__`` uses ``len(tokenizer)`` to compute the
+    deterministic resize target ``len(tokenizer) + len(ROBOMETER_SPECIAL_TOKENS)``,
+    so a working ``__len__`` is all we need.
+    """
+
+    def __init__(self, length: int = 100) -> None:
+        self._length = length
+
+    def __len__(self) -> int:
+        return self._length
+
+
+def _patch_build(monkeypatch) -> None:
+    """Stub out the HF AutoX calls so Robometer construction stays cheap in tests.
+
+    Covers (EO-1 style — no model-side override hooks):
+    * ``AutoConfig.from_pretrained`` (config side) — used by
+      ``RobometerConfig.__post_init__`` to snapshot the backbone config.
+    * ``AutoTokenizer.from_pretrained`` (config side) — used by
+      ``__post_init__`` to compute ``len(tokenizer) + 5``.
+    * ``AutoConfig.for_model``                       — used by
+      ``RobometerConfig.vlm_backbone_config`` when rebuilding for ``from_config``.
+    * ``AutoModelForImageTextToText.from_pretrained`` — fresh-training path
+      (``pretrained_path is None``).
+    * ``AutoModelForImageTextToText.from_config``    — checkpoint-reload path
+      (``pretrained_path`` is set).
+    """
+    from lerobot.rewards.robometer import configuration_robometer, modeling_robometer
+
+    monkeypatch.setattr(
+        modeling_robometer.AutoModelForImageTextToText,
+        "from_pretrained",
+        lambda *args, **kwargs: _FakeBaseModel(hidden_dim=8),
+    )
+    monkeypatch.setattr(
+        modeling_robometer.AutoModelForImageTextToText,
+        "from_config",
+        lambda *args, **kwargs: _FakeBaseModel(hidden_dim=8),
+    )
+    monkeypatch.setattr(
+        configuration_robometer.AutoConfig,
+        "for_model",
+        lambda *args, **kwargs: _FakeQwenConfig(hidden_dim=8),
+    )
+    monkeypatch.setattr(
+        configuration_robometer.AutoConfig,
+        "from_pretrained",
+        lambda *args, **kwargs: _FakeQwenConfig(hidden_dim=8),
+    )
+    monkeypatch.setattr(
+        configuration_robometer.AutoTokenizer,
+        "from_pretrained",
+        lambda *args, **kwargs: _FakeTokenizer(length=100),
+    )
+
+
+def _make_batch(features: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
+    """Build a `compute_reward`-ready batch using Robometer's namespaced keys."""
+    return {f"{ROBOMETER_FEATURE_PREFIX}{key}": value for key, value in features.items()}
+
+
+def test_robometer_config_registered(monkeypatch):
+    _patch_build(monkeypatch)
+    assert "robometer" in RewardModelConfig.get_known_choices()
+    assert RewardModelConfig.get_choice_class("robometer") is RobometerConfig
+    assert isinstance(make_reward_model_config("robometer", device="cpu"), RobometerConfig)
+
+
+def test_robometer_factory_returns_in_tree_class():
+    from lerobot.rewards.robometer.modeling_robometer import RobometerRewardModel
+
+    assert get_reward_model_class("robometer") is RobometerRewardModel
+
+
+def test_convert_bins_to_continuous_returns_expected_values():
+    # Two frames: first peaks at bin 0 (center 0.0), second peaks at bin 9 (center 1.0).
+    bin_logits = torch.full((2, 10), -10.0)
+    bin_logits[0, 0] = 10.0
+    bin_logits[1, -1] = 10.0
+    values = convert_bins_to_continuous(bin_logits)
+    assert values.shape == (2,)
+    assert torch.allclose(values, torch.tensor([0.0, 1.0]), atol=1e-3)
+
+
+def test_decode_progress_outputs_returns_last_frame_values():
+    progress = torch.tensor([[0.1, 0.9], [0.4, 0.6]])
+    success_logits = torch.tensor([[0.0, 5.0], [0.0, -5.0]])
+
+    outputs = decode_progress_outputs(progress, success_logits, is_discrete_mode=False)
+
+    assert outputs["progress_pred"] == [pytest.approx([0.1, 0.9]), pytest.approx([0.4, 0.6])]
+    assert outputs["success_probs"][0][-1] == pytest.approx(torch.sigmoid(torch.tensor(5.0)).item(), abs=1e-3)
+    assert outputs["success_probs"][1][-1] == pytest.approx(
+        torch.sigmoid(torch.tensor(-5.0)).item(), abs=1e-3
+    )
+
+
+def test_decode_progress_outputs_discrete_mode_softmaxes_over_bins():
+    # 2 frames, peaks at bin 0 and bin 9 → continuous predictions 0.0 and 1.0
+    bin_logits = torch.full((1, 2, 10), -10.0)
+    bin_logits[0, 0, 0] = 10.0
+    bin_logits[0, 1, -1] = 10.0
+
+    outputs = decode_progress_outputs(bin_logits, success_logits=None, is_discrete_mode=True)
+
+    assert outputs["success_probs"] == []
+    assert outputs["progress_pred"][0] == pytest.approx([0.0, 1.0], abs=1e-3)
+
+
+def test_robometer_compute_reward_reads_pre_encoded_inputs(monkeypatch):
+    from lerobot.rewards.robometer.modeling_robometer import RobometerRewardModel
+
+    progress = torch.tensor([[0.1, 0.9], [0.4, 0.6]])
+    success_logits = torch.tensor([[0.0, 5.0], [0.0, -5.0]])
+    _patch_build(monkeypatch)
+
+    cfg = RobometerConfig(device="cpu", reward_output="progress", progress_loss_type="l2")
+    model = RobometerRewardModel(cfg)
+    # Bypass the Qwen3-VL forward + head extraction with deterministic logits.
+    monkeypatch.setattr(model, "_compute_rbm_logits", lambda _inputs: (progress, success_logits))
+
+    batch = _make_batch({"input_ids": torch.zeros(2, 2, dtype=torch.long)})
+    rewards = model.compute_reward(batch)
+
+    assert torch.allclose(rewards, torch.tensor([0.9, 0.6]))
+
+
+def test_robometer_compute_reward_can_return_binary_success(monkeypatch):
+    from lerobot.rewards.robometer.modeling_robometer import RobometerRewardModel
+
+    progress = torch.tensor([[0.1, 0.9], [0.4, 0.6]])
+    success_logits = torch.tensor([[0.0, 5.0], [0.0, -5.0]])  # sigmoid(5) > 0.5; sigmoid(-5) < 0.5
+    _patch_build(monkeypatch)
+
+    cfg = RobometerConfig(
+        device="cpu",
+        reward_output="success",
+        success_threshold=0.5,
+        progress_loss_type="l2",
+    )
+    model = RobometerRewardModel(cfg)
+    monkeypatch.setattr(model, "_compute_rbm_logits", lambda _inputs: (progress, success_logits))
+
+    batch = _make_batch({"input_ids": torch.zeros(2, 2, dtype=torch.long)})
+    rewards = model.compute_reward(batch)
+
+    assert torch.equal(rewards, torch.tensor([1.0, 0.0]))
+
+
+def test_robometer_compute_reward_errors_when_inputs_missing(monkeypatch):
+    from lerobot.rewards.robometer.modeling_robometer import RobometerRewardModel
+
+    _patch_build(monkeypatch)
+
+    cfg = RobometerConfig(device="cpu", progress_loss_type="l2")
+    model = RobometerRewardModel(cfg)
+
+    with pytest.raises(KeyError, match=r"observation\.robometer\.input_ids"):
+        model.compute_reward({})
+
+
+def test_robometer_save_pretrained_roundtrips(monkeypatch, tmp_path):
+    """Saving and reloading a Robometer model in LeRobot HF format must produce
+    a single ``model.safetensors`` + ``config.json`` (no Hydra ``config.yaml``).
+    """
+    from huggingface_hub.constants import CONFIG_NAME, SAFETENSORS_SINGLE_FILE
+
+    from lerobot.rewards.robometer.modeling_robometer import RobometerRewardModel
+
+    _patch_build(monkeypatch)
+    cfg = RobometerConfig(
+        device="cpu",
+        pretrained_path="robometer/Robometer-4B",
+        # Knobs the user might tweak — must survive the round-trip.
+        image_key="observation.images.cam_top",
+        task_key="task",
+        reward_output="success",
+        success_threshold=0.7,
+        progress_loss_type="l2",
+    )
+    model = RobometerRewardModel(cfg)
+    model.save_pretrained(str(tmp_path))
+
+    # Exactly the files LeRobot's HubMixin promises.
+    assert (tmp_path / CONFIG_NAME).exists()
+    assert (tmp_path / SAFETENSORS_SINGLE_FILE).exists()
+    assert not (tmp_path / "config.yaml").exists()  # we want HF-style, not Hydra
+
+    # Reload from the local directory: no Hub fetch, no YAML overlay. The
+    # base class drives subclass dispatch via the `type` field in config.json.
+    reloaded_cfg = RewardModelConfig.from_pretrained(str(tmp_path))
+    assert isinstance(reloaded_cfg, RobometerConfig)
+    reloaded_cfg.pretrained_path = str(tmp_path)  # mimic lerobot-train's `validate()`
+    reloaded = RobometerRewardModel.from_pretrained(str(tmp_path), config=reloaded_cfg)
+
+    assert reloaded.config.image_key == "observation.images.cam_top"
+    assert reloaded.config.task_key == "task"
+    assert reloaded.config.reward_output == "success"
+    assert reloaded.config.success_threshold == 0.7
+    assert reloaded.config.progress_loss_type == "l2"  # came back from config.json
@@ -1,5 +1,5 @@
 version = 1
-revision = 2
+revision = 3
 requires-python = ">=3.12"
 resolution-markers = [
    "(python_full_version >= '3.15' and platform_machine == 'AMD64' and sys_platform == 'linux') or (python_full_version >= '3.15' and platform_machine == 'x86_64' and sys_platform == 'linux')",
@@ -1142,7 +1142,7 @@ name = "decord"
 version = "0.6.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "numpy", marker = "(platform_machine != 'arm64' and sys_platform == 'darwin') or (platform_machine == 'AMD64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
+    { name = "numpy", marker = "(platform_machine != 'arm64' and platform_machine != 's390x' and sys_platform == 'darwin') or (platform_machine == 'AMD64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 's390x' and sys_platform != 'darwin' and sys_platform != 'linux')" },
 ]
 wheels = [
    { url = "https://files.pythonhosted.org/packages/11/79/936af42edf90a7bd4e41a6cac89c913d4b47fa48a26b042d5129a9242ee3/decord-0.6.0-py3-none-manylinux2010_x86_64.whl", hash = "sha256:51997f20be8958e23b7c4061ba45d0efcd86bffd5fe81c695d0befee0d442976", size = 13602299, upload-time = "2021-06-14T21:30:55.486Z" },
@@ -2972,6 +2972,11 @@ qwen-vl-utils-dep = [
 reachy2 = [
    { name = "reachy2-sdk" },
 ]
+robometer = [
+    { name = "peft" },
+    { name = "qwen-vl-utils" },
+    { name = "transformers" },
+]
 robstride = [
    { name = "python-can" },
 ]
@@ -3122,6 +3127,7 @@ requires-dist = [
    { name = "lerobot", extras = ["peft"], marker = "extra == 'all'" },
    { name = "lerobot", extras = ["peft-dep"], marker = "extra == 'groot'" },
    { name = "lerobot", extras = ["peft-dep"], marker = "extra == 'peft'" },
+    { name = "lerobot", extras = ["peft-dep"], marker = "extra == 'robometer'" },
    { name = "lerobot", extras = ["peft-dep"], marker = "extra == 'wallx'" },
    { name = "lerobot", extras = ["phone"], marker = "extra == 'all'" },
    { name = "lerobot", extras = ["pi"], marker = "extra == 'all'" },
@@ -3139,6 +3145,7 @@ requires-dist = [
    { name = "lerobot", extras = ["pyzmq-dep"], marker = "extra == 'lekiwi'" },
    { name = "lerobot", extras = ["pyzmq-dep"], marker = "extra == 'unitree-g1'" },
    { name = "lerobot", extras = ["qwen-vl-utils-dep"], marker = "extra == 'eo1'" },
+    { name = "lerobot", extras = ["qwen-vl-utils-dep"], marker = "extra == 'robometer'" },
    { name = "lerobot", extras = ["qwen-vl-utils-dep"], marker = "extra == 'sarm'" },
    { name = "lerobot", extras = ["qwen-vl-utils-dep"], marker = "extra == 'wallx'" },
    { name = "lerobot", extras = ["reachy2"], marker = "extra == 'all'" },
@@ -3160,6 +3167,7 @@ requires-dist = [
    { name = "lerobot", extras = ["transformers-dep"], marker = "extra == 'multi-task-dit'" },
    { name = "lerobot", extras = ["transformers-dep"], marker = "extra == 'peft'" },
    { name = "lerobot", extras = ["transformers-dep"], marker = "extra == 'pi'" },
+    { name = "lerobot", extras = ["transformers-dep"], marker = "extra == 'robometer'" },
    { name = "lerobot", extras = ["transformers-dep"], marker = "extra == 'sarm'" },
    { name = "lerobot", extras = ["transformers-dep"], marker = "extra == 'smolvla'" },
    { name = "lerobot", extras = ["transformers-dep"], marker = "extra == 'wallx'" },
@@ -3227,7 +3235,7 @@ requires-dist = [
    { name = "transformers", marker = "extra == 'transformers-dep'", specifier = ">=5.4.0,<5.6.0" },
    { name = "wandb", marker = "extra == 'training'", specifier = ">=0.24.0,<0.25.0" },
 ]
-provides-extras = ["dataset", "training", "hardware", "viz", "core-scripts", "evaluation", "dataset-viz", "av-dep", "pygame-dep", "placo-dep", "transformers-dep", "grpcio-dep", "can-dep", "peft-dep", "scipy-dep", "diffusers-dep", "qwen-vl-utils-dep", "matplotlib-dep", "pyserial-dep", "deepdiff-dep", "pynput-dep", "pyzmq-dep", "feetech", "dynamixel", "damiao", "robstride", "openarms", "gamepad", "hopejr", "lekiwi", "unitree-g1", "reachy2", "kinematics", "intelrealsense", "phone", "diffusion", "wallx", "pi", "smolvla", "multi-task-dit", "groot", "sarm", "xvla", "eo1", "hilserl", "async", "peft", "dev", "notebook", "test", "video-benchmark", "aloha", "pusht", "libero", "metaworld", "all"]
+provides-extras = ["dataset", "training", "hardware", "viz", "core-scripts", "evaluation", "dataset-viz", "av-dep", "pygame-dep", "placo-dep", "transformers-dep", "grpcio-dep", "can-dep", "peft-dep", "scipy-dep", "diffusers-dep", "qwen-vl-utils-dep", "matplotlib-dep", "pyserial-dep", "deepdiff-dep", "pynput-dep", "pyzmq-dep", "feetech", "dynamixel", "damiao", "robstride", "openarms", "gamepad", "hopejr", "lekiwi", "unitree-g1", "reachy2", "kinematics", "intelrealsense", "phone", "diffusion", "wallx", "pi", "smolvla", "multi-task-dit", "groot", "sarm", "robometer", "xvla", "eo1", "hilserl", "async", "peft", "dev", "notebook", "test", "video-benchmark", "aloha", "pusht", "libero", "metaworld", "all"]

 [[package]]
 name = "librt"
Author	SHA1	Message	Date
Khalil Meftah	015c88cf0d	Frame count is now derived from the upstream .npy length	2026-05-18 10:57:16 +02:00
Khalil Meftah	0164725af8	fix decord	2026-05-18 10:39:51 +02:00
Khalil Meftah	34274c6f70	scripts: add Robometer parity checks (upstream example videos + LIBERO)	2026-05-17 15:41:31 +02:00
Khalil Meftah	f6a13b1338	Add Robometer reward model	2026-05-17 14:59:23 +02:00