mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-20 11:09:59 +00:00
363 lines
13 KiB
Python
363 lines
13 KiB
Python
#!/usr/bin/env python
|
|
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
|
"""Run LeRobot Robometer parity against upstream Robometer's bundled examples.
|
|
|
|
Upstream Robometer ships three reference videos with their pre-computed
|
|
progress / success outputs at
|
|
``third_party/robometer/scripts/example_videos/``::
|
|
|
|
soar_put_green_stick_in_brown_bowl.mp4
|
|
+ soar_put_green_stick_in_brown_bowl_rewards.npy (progress)
|
|
+ soar_put_green_stick_in_brown_bowl_rewards_success_probs.npy (success)
|
|
berkeley_rpt_stack_cup.mp4
|
|
+ berkeley_rpt_stack_cup_rewards.npy
|
|
+ berkeley_rpt_stack_cup_rewards_success_probs.npy
|
|
jaco_play_pick_up_green_cup.mp4
|
|
+ pick_up_green_cup_rewards.npy
|
|
+ pick_up_green_cup_rewards_success_probs.npy
|
|
|
|
This script:
|
|
1. Decodes each video with ``decord`` (falling back to ``av`` / PyAV), sampling
as many frames as the upstream reference ``.npy`` holds, using the same
linspace-over-total-frames logic as upstream's ``extract_frames``.
|
|
2. Runs the LeRobot ``RobometerRewardModel`` on those frames + the task from
|
|
upstream's README.
|
|
3. Compares per-frame progress / success to the pre-saved upstream outputs.
|
|
|
|
This means you do **not** need to install upstream Robometer to confirm parity.
|
|
|
|
Run::
|
|
|
|
uv run python scripts/parity_robometer_upstream_examples.py \\
|
|
--lerobot-model lilkm/robometer-4b \\
|
|
--device cuda \\
|
|
--decoder decord
|
|
|
|
The number of frames sampled per video is derived from the length of each
|
|
upstream ``.npy`` reference, so the script does not need a ``--fps`` argument
|
|
(the README documents ``fps=3`` for SOAR / Berkeley, but the Jaco Play
|
|
reference was generated with a different fps).
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import numpy as np
|
|
import torch
|
|
|
|
from lerobot.configs.rewards import RewardModelConfig
|
|
from lerobot.rewards.robometer import RobometerConfig, RobometerRewardModel
|
|
from lerobot.rewards.robometer.modeling_robometer import decode_progress_outputs
|
|
from lerobot.rewards.robometer.processor_robometer import RobometerEncoderProcessorStep
|
|
|
|
try:
|
|
import decord # type: ignore
|
|
|
|
_HAS_DECORD = True
|
|
except ImportError:
|
|
decord = None # type: ignore
|
|
_HAS_DECORD = False
|
|
|
|
try:
|
|
import av
|
|
|
|
_HAS_AV = True
|
|
except ImportError:
|
|
av = None # type: ignore
|
|
_HAS_AV = False
|
|
|
|
# One row per bundled upstream example: (name, task prompt, stem of the .npy
# reference files). The video is always ``<name>.mp4``; the Jaco Play reference
# arrays use a shorter stem than the video, hence the separate third column.
_EXAMPLE_SPECS = (
    (
        "soar_put_green_stick_in_brown_bowl",
        "Put green stick in brown bowl",
        "soar_put_green_stick_in_brown_bowl",
    ),
    (
        "berkeley_rpt_stack_cup",
        "Pick up the yellow cup and stack it on the other cup",
        "berkeley_rpt_stack_cup",
    ),
    (
        "jaco_play_pick_up_green_cup",
        "Pick up the green cup",
        "pick_up_green_cup",
    ),
)

# Expanded example records consumed by ``main``: file names are relative to the
# examples directory.
EXAMPLES = [
    {
        "name": example_name,
        "video": f"{example_name}.mp4",
        "task": task_prompt,
        "progress_npy": f"{npy_stem}_rewards.npy",
        "success_npy": f"{npy_stem}_rewards_success_probs.npy",
    }
    for example_name, task_prompt, npy_stem in _EXAMPLE_SPECS
]
|
|
|
|
|
|
def _extract_frames_decord(video_path: Path, num_frames: int) -> tuple[np.ndarray, str]:
    """Uniformly sample ``num_frames`` frames from ``video_path`` with decord.

    Follows the indexing of upstream's ``extract_frames``
    (``third_party/robometer/scripts/example_inference.py``): frame indices
    come from ``np.linspace(0, total_frames-1, num_frames)`` over a decord
    ``VideoReader``. The caller supplies ``num_frames`` (taken from the length
    of the upstream reference output) so no assumption about the ``fps``
    upstream used is needed.

    Args:
        video_path: Video file to decode.
        num_frames: Requested number of frames; clamped to
            ``[1, total_frames]``.

    Returns:
        ``(frames, info)`` where ``frames`` is the batch returned by decord
        converted to a NumPy array and ``info`` is a short description of the
        decoder, total frame count and native fps.

    Raises:
        RuntimeError: If the reader reports zero frames.
    """
    reader = decord.VideoReader(str(video_path), num_threads=1)
    available = len(reader)
    if available == 0:
        raise RuntimeError(f"No decodable frames in {video_path}.")

    # Clamp the request into [1, available].
    wanted = int(num_frames)
    if wanted > available:
        wanted = available
    if wanted < 1:
        wanted = 1

    picks = np.linspace(0, available - 1, wanted, dtype=int).tolist()
    sampled = reader.get_batch(picks).asnumpy()

    # A falsy (zero) average fps is reported as 1.0.
    avg_fps = float(reader.get_avg_fps())
    native_fps = avg_fps if avg_fps else 1.0
    return sampled, f"decord total={available} native_fps={native_fps:.3f}"
|
|
|
|
|
|
def _extract_frames_av(video_path: Path, num_frames: int) -> tuple[np.ndarray, str]:
    """PyAV fallback for environments without decord.

    PyAV and decord can disagree on ``total_frames`` for the same container,
    so the sampled frame indices can drift. Install ``decord`` for a real
    parity check; this fallback is for smoke tests only.

    Every frame of the first video stream is decoded to RGB, then
    ``num_frames`` of them are picked with
    ``np.linspace(0, total_frames - 1, num_frames)``.

    Args:
        video_path: Video file to decode.
        num_frames: Requested number of frames; clamped to
            ``[1, total_frames]``.

    Returns:
        ``(frames, info)`` where ``frames`` stacks the selected RGB frames
        along a new leading axis and ``info`` is a short description of the
        decoder, total frame count and native fps.

    Raises:
        RuntimeError: If no frame could be decoded.
    """
    # Open the container as a context manager so it is closed even when the
    # stream lookup or decoding raises (the previous explicit ``close()`` was
    # skipped on any exception, leaking the file handle).
    with av.open(str(video_path)) as container:
        stream = container.streams.video[0]
        # Prefer the stream's average rate; fall back to the guessed rate and
        # finally to 30 fps. Only used for the informational string.
        native_fps = float(stream.average_rate) if stream.average_rate else float(stream.guessed_rate or 30.0)
        rgb_frames: list[np.ndarray] = [
            frame.to_ndarray(format="rgb24") for frame in container.decode(stream)
        ]

    total_frames = len(rgb_frames)
    if total_frames == 0:
        raise RuntimeError(f"No decodable frames in {video_path}.")
    desired_frames = max(1, min(int(num_frames), total_frames))
    indices = np.linspace(0, total_frames - 1, desired_frames, dtype=int)
    frames = np.stack([rgb_frames[i] for i in indices])
    return frames, f"av total={total_frames} native_fps={native_fps:.3f}"
|
|
|
|
|
|
def _extract_frames(video_path: Path, num_frames: int, prefer: str) -> tuple[np.ndarray, str]:
    """Pick a video decoder and sample ``num_frames`` frames with it.

    Args:
        video_path: Video file to decode.
        num_frames: Number of frames to sample.
        prefer: ``"decord"`` or ``"av"`` to force a backend; any other value
            (normally ``"auto"``) tries decord first, then PyAV.

    Returns:
        The ``(frames, info)`` pair produced by the selected backend.

    Raises:
        RuntimeError: If the forced backend is not installed, or if no
            backend is available in auto mode.
    """
    if prefer == "decord":
        if _HAS_DECORD:
            return _extract_frames_decord(video_path, num_frames)
        raise RuntimeError("decord requested but not installed (`uv pip install decord`).")

    if prefer == "av":
        if _HAS_AV:
            return _extract_frames_av(video_path, num_frames)
        raise RuntimeError("av requested but not installed.")

    # Auto mode: first installed backend wins, decord ahead of PyAV.
    backends = (
        (_HAS_DECORD, _extract_frames_decord),
        (_HAS_AV, _extract_frames_av),
    )
    for installed, extractor in backends:
        if installed:
            return extractor(video_path, num_frames)
    raise RuntimeError("No video decoder available (install `decord` or `av`).")
|
|
|
|
|
|
def _pearson(a: np.ndarray, b: np.ndarray) -> float:
    """Return the Pearson correlation coefficient of ``a`` and ``b``.

    Both inputs are converted to float64 before any arithmetic. Degenerate
    inputs have no signal to align and yield ``1.0``: this covers ``a`` having
    fewer than two elements and either input having zero spread around its
    mean.
    """
    xs = a.astype(np.float64)
    ys = b.astype(np.float64)
    if xs.size < 2:
        return 1.0

    x_centered = xs - xs.mean()
    y_centered = ys - ys.mean()

    x_norm = np.sqrt(np.sum(x_centered * x_centered))
    y_norm = np.sqrt(np.sum(y_centered * y_centered))
    scale = float(x_norm * y_norm)
    if scale == 0:
        # At least one input is constant.
        return 1.0

    covariance = np.sum(x_centered * y_centered)
    return float(covariance / scale)
|
|
|
|
|
|
def _run_lerobot(
    model: RobometerRewardModel,
    encoder: RobometerEncoderProcessorStep,
    frames: np.ndarray,
    task: str,
) -> tuple[np.ndarray, np.ndarray]:
    """Run the LeRobot Robometer model on one (frames, task) sample.

    The sample is encoded with ``encoder``, moved to the device of the model's
    parameters, pushed through the model in eval mode without gradient
    tracking, and decoded with ``decode_progress_outputs``.

    Returns:
        ``(progress, success)`` as float32 arrays for the single sample.
        ``success`` is an empty array when the decoded ``"success_probs"``
        entry is falsy.
    """
    encoded = encoder.encode_samples([(frames, task)])
    target_device = next(model.model.parameters()).device

    # Move everything that exposes ``.to`` onto the model's device; pass the
    # remaining entries through untouched.
    inputs = {}
    for key, value in encoded.items():
        if hasattr(value, "to"):
            value = value.to(target_device)
        inputs[key] = value

    model.eval()
    with torch.no_grad():
        progress_logits, success_logits = model._compute_rbm_logits(inputs)
        decoded = decode_progress_outputs(
            progress_logits,
            success_logits,
            is_discrete_mode=model.config.use_discrete_progress,
        )

    progress = np.asarray(decoded["progress_pred"][0], dtype=np.float32)
    if decoded["success_probs"]:
        success = np.asarray(decoded["success_probs"][0], dtype=np.float32)
    else:
        success = np.array([], dtype=np.float32)
    return progress, success
|
|
|
|
|
|
def _compare(
    name: str,
    lerobot: np.ndarray,
    upstream: np.ndarray,
    *,
    atol: float,
    pearson_min: float,
) -> bool:
    """Compare a LeRobot output array against the upstream reference and print a verdict.

    The arrays pass when either the maximum absolute difference is within
    ``atol`` or the Pearson correlation reaches ``pearson_min``. The
    correlation is only taken into account when it is well defined, i.e. both
    arrays have at least two elements and a non-zero value range; otherwise
    the verdict rests on ``atol`` alone and the correlation is printed as
    ``n/a``.

    Args:
        name: Label printed in front of the report line.
        lerobot: Values produced by the LeRobot model.
        upstream: Reference values loaded from the upstream ``.npy``.
        atol: Maximum tolerated absolute difference.
        pearson_min: Minimum Pearson correlation accepted as a pass.

    Returns:
        ``True`` on PASS, ``False`` on FAIL, shape mismatch, or empty input.
    """
    if lerobot.shape != upstream.shape:
        print(f" {name:8s} SHAPE MISMATCH lerobot={lerobot.shape} upstream={upstream.shape}")
        return False

    if lerobot.size == 0:
        # ``ndarray.max()`` raises on empty input, and an empty comparison
        # verifies nothing, so report a failure instead of crashing.
        print(f" {name:8s} shape={lerobot.shape} EMPTY (nothing to compare) -> FAIL")
        return False

    abs_diff = np.abs(lerobot - upstream)
    max_abs_diff = abs_diff.max()
    abs_ok = bool(max_abs_diff <= atol)

    # ``_pearson`` returns 1.0 for single-element or constant inputs. Letting
    # that placeholder satisfy the ``or`` below would pass arrays whose values
    # differ arbitrarily, so the correlation only counts when both arrays
    # actually vary.
    has_signal = (
        lerobot.size >= 2
        and bool(np.ptp(lerobot) > 0)
        and bool(np.ptp(upstream) > 0)
    )
    if has_signal:
        pearson = _pearson(lerobot, upstream)
        pearson_ok = bool(pearson >= pearson_min)
        pearson_text = f"{pearson:.4f}"
    else:
        pearson_ok = False
        pearson_text = "n/a"

    passed = abs_ok or pearson_ok
    verdict = "PASS" if passed else "FAIL"
    print(
        f" {name:8s} shape={lerobot.shape} max|Δ|={max_abs_diff:.3e} "
        f"mean|Δ|={abs_diff.mean():.3e} pearson={pearson_text} "
        f"(atol={atol:.0e} pearson_min={pearson_min:.3f}) -> {verdict}"
    )
    return passed
|
|
|
|
|
|
def _build_arg_parser() -> argparse.ArgumentParser:
    """Assemble the command-line interface; the module docstring is the description."""
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--examples-dir",
        type=Path,
        default=Path("third_party/robometer/scripts/example_videos"),
        help="Directory containing the upstream Robometer example mp4s + .npy outputs.",
    )
    parser.add_argument(
        "--lerobot-model",
        default="lilkm/robometer-4b",
        help="LeRobot-format Robometer Hub repo id or local path.",
    )
    parser.add_argument(
        "--device",
        default="cuda" if torch.cuda.is_available() else "cpu",
        help="Device for the LeRobot model.",
    )
    parser.add_argument(
        "--decoder",
        choices=("auto", "decord", "av"),
        default="auto",
        help=(
            "Video decoder. ``auto`` prefers decord (matches upstream) and falls back to av. "
            "Force ``decord`` for a clean parity check."
        ),
    )
    parser.add_argument(
        "--progress-atol",
        type=float,
        default=1e-2,
        help="Absolute tolerance for the progress array. Default 1e-2 covers CUDA bf16 noise.",
    )
    parser.add_argument(
        "--success-atol",
        type=float,
        default=1e-1,
        help=(
            "Absolute tolerance for the success array. Looser than progress because "
            "``sigmoid`` amplifies logit-space noise near 0.5."
        ),
    )
    parser.add_argument(
        "--pearson-min",
        type=float,
        default=0.99,
        help="Minimum Pearson correlation for a PASS verdict (per array).",
    )
    return parser


def main() -> int:
    """Run the parity check over every entry of ``EXAMPLES``.

    Returns:
        ``0`` when every example passes, ``1`` when at least one example fails
        or is skipped because files are missing, and ``2`` when the examples
        directory does not exist or the model config is not a
        ``RobometerConfig``.
    """
    args = _build_arg_parser().parse_args()

    pyav_selected = args.decoder == "av" or (args.decoder == "auto" and not _HAS_DECORD)
    if pyav_selected:
        print(
            "WARNING: using PyAV decoder. PyAV's total-frame count can differ from decord's, "
            "which propagates into different sampled-frame indices. Install `decord` and "
            "re-run for a clean parity check.",
            file=sys.stderr,
        )

    examples_dir = args.examples_dir.resolve()
    if not examples_dir.is_dir():
        print(f"ERROR: examples dir {examples_dir} does not exist.", file=sys.stderr)
        return 2

    # Check the config type first so a wrong repo id is rejected before any
    # weights are loaded.
    cfg = RewardModelConfig.from_pretrained(args.lerobot_model)
    if not isinstance(cfg, RobometerConfig):
        print(f"ERROR: {args.lerobot_model!r} did not resolve to a RobometerConfig.", file=sys.stderr)
        return 2

    print(f"Loading LeRobot Robometer from {args.lerobot_model} on {args.device}...")
    cfg.pretrained_path = args.lerobot_model
    cfg.device = args.device
    model = RobometerRewardModel.from_pretrained(args.lerobot_model, config=cfg)
    model_cfg = model.config
    encoder = RobometerEncoderProcessorStep(
        base_model_id=model_cfg.base_model_id,
        use_multi_image=model_cfg.use_multi_image,
        use_per_frame_progress_token=model_cfg.use_per_frame_progress_token,
        max_frames=None,
    )

    every_example_ok = True
    for ex in EXAMPLES:
        video_path = examples_dir / ex["video"]
        upstream_progress_path = examples_dir / ex["progress_npy"]
        upstream_success_path = examples_dir / ex["success_npy"]

        required = (video_path, upstream_progress_path, upstream_success_path)
        missing = [path for path in required if not path.exists()]
        if missing:
            # A skipped example counts against the overall result.
            print(f"[skip] {ex['name']}: missing {[str(m) for m in missing]}")
            every_example_ok = False
            continue

        print(f"\n=== {ex['name']} ===")
        print(f" task: {ex['task']!r}")

        # The upstream reference array decides how many frames to sample: the
        # README documents fps=3 for SOAR/Berkeley, but Jaco Play was produced
        # with a different fps, so the npy length is the only value that is
        # right for every example.
        upstream_progress = np.load(upstream_progress_path).astype(np.float32)
        upstream_success = np.load(upstream_success_path).astype(np.float32)
        target_num_frames = int(upstream_progress.shape[0])
        frames, decoder_info = _extract_frames(video_path, target_num_frames, prefer=args.decoder)
        print(
            f" decoded {frames.shape[0]} frames (matches upstream npy length); "
            f"shape={frames.shape} [{decoder_info}]"
        )

        progress, success = _run_lerobot(model, encoder, frames, ex["task"])

        progress_ok = _compare(
            "progress",
            progress,
            upstream_progress,
            atol=args.progress_atol,
            pearson_min=args.pearson_min,
        )
        success_ok = _compare(
            "success",
            success,
            upstream_success,
            atol=args.success_atol,
            pearson_min=args.pearson_min,
        )

        example_ok = progress_ok and success_ok
        print(f" -> {'PASS' if example_ok else 'FAIL'}")
        if not example_ok:
            every_example_ok = False

    print()
    if not every_example_ok:
        print("Some upstream example parity checks FAILED.")
        return 1
    print("All upstream example parity checks passed.")
    return 0
|
|
|
|
|
|
# Script entry point: the value returned by ``main`` becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|