lerobot/scripts/parity_robometer_upstream_examples.py

#!/usr/bin/env python
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
"""Run LeRobot Robometer parity against upstream Robometer's bundled examples.

Upstream Robometer ships three reference videos with their pre-computed
progress / success outputs at
``third_party/robometer/scripts/example_videos/``::

    soar_put_green_stick_in_brown_bowl.mp4
        + soar_put_green_stick_in_brown_bowl_rewards.npy            (progress)
        + soar_put_green_stick_in_brown_bowl_rewards_success_probs.npy (success)
    berkeley_rpt_stack_cup.mp4
        + berkeley_rpt_stack_cup_rewards.npy
        + berkeley_rpt_stack_cup_rewards_success_probs.npy
    jaco_play_pick_up_green_cup.mp4
        + pick_up_green_cup_rewards.npy
        + pick_up_green_cup_rewards_success_probs.npy

This script:
1. Decodes each video at upstream's sampling fps using ``av`` (PyAV), with the
   same linspace-over-total-frames logic as upstream's ``extract_frames``.
2. Runs the LeRobot ``RobometerRewardModel`` on those frames + the task from
   upstream's README.
3. Compares per-frame progress / success to the pre-saved upstream outputs.

This means you do **not** need to install upstream Robometer to confirm parity.

Run::

    uv run python scripts/parity_robometer_upstream_examples.py \\
        --lerobot-model lilkm/robometer-4b \\
        --device cuda \\
        --decoder decord

The number of frames sampled per video is derived from the length of each
upstream ``.npy`` reference, so the script does not need a ``--fps`` argument
(the README documents ``fps=3`` for SOAR / Berkeley, but the Jaco Play
reference was generated with a different fps).
"""

from __future__ import annotations

import argparse
import sys
from pathlib import Path

import numpy as np
import torch

from lerobot.configs.rewards import RewardModelConfig
from lerobot.rewards.robometer import RobometerConfig, RobometerRewardModel
from lerobot.rewards.robometer.modeling_robometer import decode_progress_outputs
from lerobot.rewards.robometer.processor_robometer import RobometerEncoderProcessorStep

try:
    import decord  # type: ignore

    _HAS_DECORD = True
except ImportError:
    decord = None  # type: ignore
    _HAS_DECORD = False

try:
    import av

    _HAS_AV = True
except ImportError:
    av = None  # type: ignore
    _HAS_AV = False

EXAMPLES = [
    {
        "name": "soar_put_green_stick_in_brown_bowl",
        "video": "soar_put_green_stick_in_brown_bowl.mp4",
        "task": "Put green stick in brown bowl",
        "progress_npy": "soar_put_green_stick_in_brown_bowl_rewards.npy",
        "success_npy": "soar_put_green_stick_in_brown_bowl_rewards_success_probs.npy",
    },
    {
        "name": "berkeley_rpt_stack_cup",
        "video": "berkeley_rpt_stack_cup.mp4",
        "task": "Pick up the yellow cup and stack it on the other cup",
        "progress_npy": "berkeley_rpt_stack_cup_rewards.npy",
        "success_npy": "berkeley_rpt_stack_cup_rewards_success_probs.npy",
    },
    {
        "name": "jaco_play_pick_up_green_cup",
        "video": "jaco_play_pick_up_green_cup.mp4",
        "task": "Pick up the green cup",
        "progress_npy": "pick_up_green_cup_rewards.npy",
        "success_npy": "pick_up_green_cup_rewards_success_probs.npy",
    },
]


def _extract_frames_decord(video_path: Path, num_frames: int) -> tuple[np.ndarray, str]:
    """Sample ``num_frames`` indices uniformly from the video using decord.

    Mirrors upstream's ``extract_frames`` indexing
    (``third_party/robometer/scripts/example_inference.py``): a
    ``np.linspace(0, total_frames-1, num_frames)`` lookup over decord's
    ``VideoReader``. We pass ``num_frames`` explicitly (derived from the
    upstream reference output length) so we don't have to guess what ``fps``
    upstream actually used when generating each saved ``.npy`` — the file
    length is the ground truth.
    """
    vr = decord.VideoReader(str(video_path), num_threads=1)
    total_frames = len(vr)
    if total_frames == 0:
        raise RuntimeError(f"No decodable frames in {video_path}.")
    desired_frames = max(1, min(int(num_frames), total_frames))
    indices = np.linspace(0, total_frames - 1, desired_frames, dtype=int).tolist()
    frames = vr.get_batch(indices).asnumpy()
    native_fps = float(vr.get_avg_fps()) or 1.0
    return frames, f"decord total={total_frames} native_fps={native_fps:.3f}"


def _extract_frames_av(video_path: Path, num_frames: int) -> tuple[np.ndarray, str]:
    """PyAV fallback for environments without decord.

    PyAV and decord can disagree on ``total_frames`` for the same container,
    so the sampled frame indices can drift. Install ``decord`` for a real
    parity check; this fallback is for smoke tests only.
    """
    container = av.open(str(video_path))
    stream = container.streams.video[0]
    native_fps = float(stream.average_rate) if stream.average_rate else float(stream.guessed_rate or 30.0)
    rgb_frames: list[np.ndarray] = []
    for frame in container.decode(stream):
        rgb_frames.append(frame.to_ndarray(format="rgb24"))
    container.close()
    total_frames = len(rgb_frames)
    if total_frames == 0:
        raise RuntimeError(f"No decodable frames in {video_path}.")
    desired_frames = max(1, min(int(num_frames), total_frames))
    indices = np.linspace(0, total_frames - 1, desired_frames, dtype=int)
    frames = np.stack([rgb_frames[i] for i in indices])
    return frames, f"av total={total_frames} native_fps={native_fps:.3f}"


def _extract_frames(video_path: Path, num_frames: int, prefer: str) -> tuple[np.ndarray, str]:
    """Decoder dispatch. ``prefer`` is ``"decord"`` | ``"av"`` | ``"auto"``."""
    if prefer == "decord":
        if not _HAS_DECORD:
            raise RuntimeError("decord requested but not installed (`uv pip install decord`).")
        return _extract_frames_decord(video_path, num_frames)
    if prefer == "av":
        if not _HAS_AV:
            raise RuntimeError("av requested but not installed.")
        return _extract_frames_av(video_path, num_frames)
    # auto
    if _HAS_DECORD:
        return _extract_frames_decord(video_path, num_frames)
    if _HAS_AV:
        return _extract_frames_av(video_path, num_frames)
    raise RuntimeError("No video decoder available (install `decord` or `av`).")


def _pearson(a: np.ndarray, b: np.ndarray) -> float:
    """Pearson correlation; returns 1.0 for constant inputs (no signal to align)."""
    a = a.astype(np.float64)
    b = b.astype(np.float64)
    if a.size < 2:
        return 1.0
    da = a - a.mean()
    db = b - b.mean()
    denom = float(np.sqrt((da * da).sum()) * np.sqrt((db * db).sum()))
    if denom == 0:
        return 1.0
    return float((da * db).sum() / denom)


def _run_lerobot(
    model: RobometerRewardModel,
    encoder: RobometerEncoderProcessorStep,
    frames: np.ndarray,
    task: str,
) -> tuple[np.ndarray, np.ndarray]:
    batch = encoder.encode_samples([(frames, task)])
    device = next(model.model.parameters()).device
    inputs = {key: value.to(device) if hasattr(value, "to") else value for key, value in batch.items()}
    model.eval()
    with torch.no_grad():
        progress_logits, success_logits = model._compute_rbm_logits(inputs)
    decoded = decode_progress_outputs(
        progress_logits, success_logits, is_discrete_mode=model.config.use_discrete_progress
    )
    progress = np.asarray(decoded["progress_pred"][0], dtype=np.float32)
    success = (
        np.asarray(decoded["success_probs"][0], dtype=np.float32)
        if decoded["success_probs"]
        else np.array([], dtype=np.float32)
    )
    return progress, success


def _compare(
    name: str,
    lerobot: np.ndarray,
    upstream: np.ndarray,
    *,
    atol: float,
    pearson_min: float,
) -> bool:
    if lerobot.shape != upstream.shape:
        print(f"  {name:8s}  SHAPE MISMATCH lerobot={lerobot.shape} upstream={upstream.shape}")
        return False
    abs_diff = np.abs(lerobot - upstream)
    pearson = _pearson(lerobot, upstream)
    abs_ok = bool(abs_diff.max() <= atol)
    pearson_ok = bool(pearson >= pearson_min)
    verdict = "PASS" if (abs_ok or pearson_ok) else "FAIL"
    print(
        f"  {name:8s}  shape={lerobot.shape}  max|Δ|={abs_diff.max():.3e}  "
        f"mean|Δ|={abs_diff.mean():.3e}  pearson={pearson:.4f}  "
        f"(atol={atol:.0e} pearson_min={pearson_min:.3f}) -> {verdict}"
    )
    return abs_ok or pearson_ok


def main() -> int:
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--examples-dir",
        type=Path,
        default=Path("third_party/robometer/scripts/example_videos"),
        help="Directory containing the upstream Robometer example mp4s + .npy outputs.",
    )
    parser.add_argument(
        "--lerobot-model",
        default="lilkm/robometer-4b",
        help="LeRobot-format Robometer Hub repo id or local path.",
    )
    parser.add_argument(
        "--device",
        default="cuda" if torch.cuda.is_available() else "cpu",
        help="Device for the LeRobot model.",
    )
    parser.add_argument(
        "--decoder",
        choices=("auto", "decord", "av"),
        default="auto",
        help=(
            "Video decoder. ``auto`` prefers decord (matches upstream) and falls back to av. "
            "Force ``decord`` for a clean parity check."
        ),
    )
    parser.add_argument(
        "--progress-atol",
        type=float,
        default=1e-2,
        help="Absolute tolerance for the progress array. Default 1e-2 covers CUDA bf16 noise.",
    )
    parser.add_argument(
        "--success-atol",
        type=float,
        default=1e-1,
        help=(
            "Absolute tolerance for the success array. Looser than progress because "
            "``sigmoid`` amplifies logit-space noise near 0.5."
        ),
    )
    parser.add_argument(
        "--pearson-min",
        type=float,
        default=0.99,
        help="Minimum Pearson correlation for a PASS verdict (per array).",
    )
    args = parser.parse_args()

    if args.decoder == "av" or (args.decoder == "auto" and not _HAS_DECORD):
        print(
            "WARNING: using PyAV decoder. PyAV's total-frame count can differ from decord's, "
            "which propagates into different sampled-frame indices. Install `decord` and "
            "re-run for a clean parity check.",
            file=sys.stderr,
        )

    examples_dir = args.examples_dir.resolve()
    if not examples_dir.is_dir():
        print(f"ERROR: examples dir {examples_dir} does not exist.", file=sys.stderr)
        return 2

    # Sanity-check the LeRobot config is a RobometerConfig before loading weights.
    cfg = RewardModelConfig.from_pretrained(args.lerobot_model)
    if not isinstance(cfg, RobometerConfig):
        print(f"ERROR: {args.lerobot_model!r} did not resolve to a RobometerConfig.", file=sys.stderr)
        return 2

    print(f"Loading LeRobot Robometer from {args.lerobot_model} on {args.device}...")
    cfg.pretrained_path = args.lerobot_model
    cfg.device = args.device
    model = RobometerRewardModel.from_pretrained(args.lerobot_model, config=cfg)
    encoder = RobometerEncoderProcessorStep(
        base_model_id=model.config.base_model_id,
        use_multi_image=model.config.use_multi_image,
        use_per_frame_progress_token=model.config.use_per_frame_progress_token,
        max_frames=None,
    )

    all_ok = True
    for ex in EXAMPLES:
        video_path = examples_dir / ex["video"]
        upstream_progress_path = examples_dir / ex["progress_npy"]
        upstream_success_path = examples_dir / ex["success_npy"]

        missing = [p for p in (video_path, upstream_progress_path, upstream_success_path) if not p.exists()]
        if missing:
            print(f"[skip] {ex['name']}: missing {[str(m) for m in missing]}")
            all_ok = False
            continue

        print(f"\n=== {ex['name']} ===")
        print(f"  task: {ex['task']!r}")

        # Trust the upstream reference array as the source of truth for how
        # many frames to sample. The README documents fps=3 for SOAR/Berkeley
        # but Jaco Play was generated with a different fps, so any hardcoded
        # ``--fps`` mismatches at least one example. The npy length always
        # tells us what upstream actually used.
        upstream_progress = np.load(upstream_progress_path).astype(np.float32)
        upstream_success = np.load(upstream_success_path).astype(np.float32)
        target_num_frames = int(upstream_progress.shape[0])
        frames, decoder_info = _extract_frames(video_path, target_num_frames, prefer=args.decoder)
        print(
            f"  decoded {frames.shape[0]} frames (matches upstream npy length); "
            f"shape={frames.shape}  [{decoder_info}]"
        )

        progress, success = _run_lerobot(model, encoder, frames, ex["task"])

        progress_ok = _compare(
            "progress",
            progress,
            upstream_progress,
            atol=args.progress_atol,
            pearson_min=args.pearson_min,
        )
        success_ok = _compare(
            "success",
            success,
            upstream_success,
            atol=args.success_atol,
            pearson_min=args.pearson_min,
        )
        verdict = "PASS" if (progress_ok and success_ok) else "FAIL"
        print(f"  -> {verdict}")
        all_ok = all_ok and progress_ok and success_ok

    print()
    if all_ok:
        print("All upstream example parity checks passed.")
        return 0
    print("Some upstream example parity checks FAILED.")
    return 1


if __name__ == "__main__":
    sys.exit(main())