mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-20 02:59:50 +00:00
Compare commits
4 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 015c88cf0d | |||
| 0164725af8 | |||
| 34274c6f70 | |||
| f6a13b1338 |
@@ -0,0 +1,244 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Create videos with a Robometer progress overlay for one LeRobot dataset episode.
|
||||
|
||||
This is a lightweight smoke-test utility for Robometer checkpoints. It downloads
|
||||
one episode video, samples a small number of frames, runs Robometer on those
|
||||
frames, and reuses the progress overlay renderer from
|
||||
``examples/dataset/create_progress_videos.py``.
|
||||
|
||||
Example:
|
||||
|
||||
uv run python examples/dataset/create_robometer_progress_videos.py \\
|
||||
--repo-id lerobot/aloha_mobile_cabinet \\
|
||||
--episode 0 \\
|
||||
--reward-model-path lilkm/robometer-4b \\
|
||||
--device cuda
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from examples.dataset.create_progress_videos import (
|
||||
composite_progress_video,
|
||||
convert_mp4_to_gif,
|
||||
download_episode_metadata,
|
||||
download_video_file,
|
||||
load_episode_meta,
|
||||
)
|
||||
from lerobot.rewards.robometer import RobometerConfig, RobometerRewardModel
|
||||
from lerobot.rewards.robometer.modeling_robometer import decode_progress_outputs
|
||||
from lerobot.rewards.robometer.processor_robometer import RobometerEncoderProcessorStep
|
||||
from lerobot.utils.utils import init_logging
|
||||
|
||||
|
||||
def _default_device() -> str:
|
||||
return "cuda" if torch.cuda.is_available() else "cpu"
|
||||
|
||||
|
||||
def sample_episode_frames(
|
||||
video_path: Path,
|
||||
*,
|
||||
from_timestamp: float,
|
||||
to_timestamp: float,
|
||||
fps: float,
|
||||
num_frames: int,
|
||||
) -> tuple[np.ndarray, np.ndarray]:
|
||||
"""Sample RGB frames uniformly from an episode video segment.
|
||||
|
||||
Returns:
|
||||
``(frames, frame_indices)`` where ``frames`` is ``(T,H,W,C)`` uint8 RGB
|
||||
and ``frame_indices`` are local episode frame indices used for overlay.
|
||||
"""
|
||||
if num_frames <= 0:
|
||||
raise ValueError(f"num_frames must be positive, got {num_frames}")
|
||||
|
||||
duration_seconds = to_timestamp - from_timestamp
|
||||
total_frames = max(int(round(duration_seconds * fps)), 1)
|
||||
frame_indices = np.linspace(0, total_frames - 1, num=min(num_frames, total_frames), dtype=int)
|
||||
|
||||
capture = cv2.VideoCapture(str(video_path))
|
||||
frames: list[np.ndarray] = []
|
||||
try:
|
||||
for frame_idx in frame_indices:
|
||||
timestamp = from_timestamp + frame_idx / fps
|
||||
capture.set(cv2.CAP_PROP_POS_MSEC, timestamp * 1000)
|
||||
ret, frame_bgr = capture.read()
|
||||
if not ret:
|
||||
logging.warning("Could not read frame %d at %.3fs", frame_idx, timestamp)
|
||||
continue
|
||||
frames.append(cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB))
|
||||
finally:
|
||||
capture.release()
|
||||
|
||||
if not frames:
|
||||
raise RuntimeError(f"No frames could be sampled from {video_path}")
|
||||
|
||||
return np.stack(frames), frame_indices[: len(frames)]
|
||||
|
||||
|
||||
def predict_robometer_progress(
|
||||
frames: np.ndarray,
|
||||
*,
|
||||
task: str,
|
||||
reward_model_path: str,
|
||||
device: str,
|
||||
) -> list[float]:
|
||||
"""Run Robometer and return per-sampled-frame progress predictions."""
|
||||
config = RobometerConfig(pretrained_path=reward_model_path, device=device, max_frames=None)
|
||||
model = RobometerRewardModel.from_pretrained(reward_model_path, config=config)
|
||||
|
||||
encoder = RobometerEncoderProcessorStep(
|
||||
base_model_id=model.config.base_model_id,
|
||||
use_multi_image=model.config.use_multi_image,
|
||||
use_per_frame_progress_token=model.config.use_per_frame_progress_token,
|
||||
max_frames=None,
|
||||
)
|
||||
batch = encoder.encode_samples([(frames, task)])
|
||||
|
||||
model_device = next(model.model.parameters()).device
|
||||
inputs = {key: value.to(model_device) if hasattr(value, "to") else value for key, value in batch.items()}
|
||||
|
||||
model.eval()
|
||||
with torch.no_grad():
|
||||
progress_logits, success_logits = model._compute_rbm_logits(inputs)
|
||||
|
||||
decoded = decode_progress_outputs(
|
||||
progress_logits,
|
||||
success_logits,
|
||||
is_discrete_mode=model.config.use_discrete_progress,
|
||||
)
|
||||
return decoded["progress_pred"][0]
|
||||
|
||||
|
||||
def process_dataset(
|
||||
repo_id: str,
|
||||
episode: int,
|
||||
reward_model_path: str,
|
||||
device: str,
|
||||
camera_key: str | None,
|
||||
output_dir: Path,
|
||||
num_frames: int,
|
||||
task: str | None = None,
|
||||
create_gif: bool = False,
|
||||
) -> Path:
|
||||
safe_name = repo_id.replace("/", "_")
|
||||
logging.info("Processing %s episode %d with Robometer %s", repo_id, episode, reward_model_path)
|
||||
|
||||
local_path = download_episode_metadata(repo_id, episode)
|
||||
episode_meta = load_episode_meta(local_path, episode, camera_key)
|
||||
video_path = download_video_file(repo_id, local_path, episode_meta["video_rel"])
|
||||
|
||||
task_name = task or episode_meta.get("task_name", "")
|
||||
if not task_name:
|
||||
raise ValueError("No task found in dataset metadata. Pass --task explicitly.")
|
||||
|
||||
frames, frame_indices = sample_episode_frames(
|
||||
video_path,
|
||||
from_timestamp=episode_meta["from_ts"],
|
||||
to_timestamp=episode_meta["to_ts"],
|
||||
fps=episode_meta["fps"],
|
||||
num_frames=num_frames,
|
||||
)
|
||||
logging.info("Sampled %d frames for Robometer inference", len(frames))
|
||||
|
||||
progress = predict_robometer_progress(
|
||||
frames,
|
||||
task=task_name,
|
||||
reward_model_path=reward_model_path,
|
||||
device=device,
|
||||
)
|
||||
progress_data = np.stack([frame_indices, np.asarray(progress, dtype=np.float32)], axis=1)
|
||||
logging.info("Progress predictions: %s", [round(float(value), 3) for value in progress])
|
||||
|
||||
output_path = output_dir / f"{safe_name}_ep{episode}_robometer_progress.mp4"
|
||||
final_path = composite_progress_video(
|
||||
video_path=video_path,
|
||||
from_timestamp=episode_meta["from_ts"],
|
||||
to_timestamp=episode_meta["to_ts"],
|
||||
progress_data=progress_data,
|
||||
output_path=output_path,
|
||||
fps=episode_meta["fps"],
|
||||
task_name=task_name,
|
||||
)
|
||||
|
||||
if create_gif:
|
||||
final_path = convert_mp4_to_gif(final_path)
|
||||
return final_path
|
||||
|
||||
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Create MP4/GIF videos with Robometer progress overlay for dataset episodes."
|
||||
)
|
||||
parser.add_argument("--repo-id", required=True, help="Hugging Face LeRobot dataset repo id.")
|
||||
parser.add_argument("--episode", type=int, required=True, help="Episode index to visualize.")
|
||||
parser.add_argument(
|
||||
"--reward-model-path",
|
||||
default="lilkm/robometer-4b",
|
||||
help="Robometer checkpoint path or Hub repo id (e.g. lilkm/robometer-4b).",
|
||||
)
|
||||
parser.add_argument("--device", default=_default_device(), help="Torch device for Robometer inference.")
|
||||
parser.add_argument(
|
||||
"--camera-key",
|
||||
default=None,
|
||||
help="Camera observation key (e.g. observation.images.top). Auto-selects first camera if omitted.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--task", default=None, help="Task description override if dataset metadata lacks one."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num-frames",
|
||||
type=int,
|
||||
default=8,
|
||||
help="Number of episode frames to sample for Robometer inference.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-dir",
|
||||
type=Path,
|
||||
default=Path("progress_videos"),
|
||||
help="Directory to write output files.",
|
||||
)
|
||||
parser.add_argument("--gif", action="store_true", help="Also generate a GIF from the MP4 output.")
|
||||
args = parser.parse_args()
|
||||
|
||||
init_logging()
|
||||
args.output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
result = process_dataset(
|
||||
repo_id=args.repo_id,
|
||||
episode=args.episode,
|
||||
reward_model_path=args.reward_model_path,
|
||||
device=args.device,
|
||||
camera_key=args.camera_key,
|
||||
output_dir=args.output_dir,
|
||||
num_frames=args.num_frames,
|
||||
task=args.task,
|
||||
create_gif=args.gif,
|
||||
)
|
||||
logging.info("Output: %s", result)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -204,6 +204,7 @@ groot = [
|
||||
"flash-attn>=2.5.9,<3.0.0 ; sys_platform != 'darwin'"
|
||||
]
|
||||
sarm = ["lerobot[transformers-dep]", "pydantic>=2.0.0,<3.0.0", "faker>=33.0.0,<35.0.0", "lerobot[matplotlib-dep]", "lerobot[qwen-vl-utils-dep]"]
|
||||
robometer = ["lerobot[transformers-dep]", "lerobot[qwen-vl-utils-dep]", "lerobot[peft-dep]"]
|
||||
xvla = ["lerobot[transformers-dep]"]
|
||||
eo1 = ["lerobot[transformers-dep]", "lerobot[qwen-vl-utils-dep]"]
|
||||
hilserl = ["lerobot[transformers-dep]", "lerobot[dataset]", "gym-hil>=0.1.13,<0.2.0", "lerobot[grpcio-dep]", "lerobot[placo-dep]"]
|
||||
@@ -302,6 +303,7 @@ lerobot-imgtransform-viz="lerobot.scripts.lerobot_imgtransform_viz:main"
|
||||
lerobot-edit-dataset="lerobot.scripts.lerobot_edit_dataset:main"
|
||||
lerobot-setup-can="lerobot.scripts.lerobot_setup_can:main"
|
||||
lerobot-rollout="lerobot.scripts.lerobot_rollout:main"
|
||||
lerobot-export-robometer="lerobot.scripts.lerobot_export_robometer:main"
|
||||
|
||||
# ---------------- Tool Configurations ----------------
|
||||
|
||||
|
||||
@@ -0,0 +1,164 @@
|
||||
#!/usr/bin/env python
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
"""Pinpoint exactly which rows of ``embed_tokens`` / ``lm_head`` differ.
|
||||
|
||||
Useful follow-up to ``scripts/verify_robometer_export.py`` when the verifier
|
||||
reports a small tail of differing keys but you want to know whether the
|
||||
diff is:
|
||||
|
||||
1. Concentrated in the 5 special-token rows added by ``resize_token_embeddings``
|
||||
(expected non-determinism: mean-resize sampling differs between runs).
|
||||
2. Spread across the full vocabulary (would point to a real loading bug).
|
||||
|
||||
Also confirms whether ``apply_upstream_checkpoint`` actually overwrites the
|
||||
embed/lm-head tensors when loading the upstream state dict (vs. silently
|
||||
skipping them due to a key mismatch).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
|
||||
import torch
|
||||
from safetensors.torch import load_file
|
||||
|
||||
from lerobot.configs.rewards import RewardModelConfig
|
||||
from lerobot.rewards.robometer import RobometerConfig, RobometerRewardModel
|
||||
from lerobot.rewards.robometer._upstream_loader import (
|
||||
_download_robometer_snapshot,
|
||||
_remap_state_dict_keys,
|
||||
_resolve_checkpoint_safetensors_files,
|
||||
apply_upstream_checkpoint,
|
||||
)
|
||||
|
||||
EMBED_KEY = "model.model.language_model.embed_tokens.weight"
|
||||
LMHEAD_KEY = "model.lm_head.weight"
|
||||
|
||||
|
||||
def _load_upstream(path: str) -> RobometerRewardModel:
|
||||
cfg = RobometerConfig(pretrained_path=path, device="cpu")
|
||||
model = RobometerRewardModel(cfg)
|
||||
apply_upstream_checkpoint(model, path)
|
||||
model.eval()
|
||||
return model
|
||||
|
||||
|
||||
def _load_lerobot(path: str) -> RobometerRewardModel:
|
||||
cfg = RewardModelConfig.from_pretrained(path)
|
||||
if not isinstance(cfg, RobometerConfig):
|
||||
raise TypeError(f"Expected RobometerConfig, got {type(cfg)}")
|
||||
cfg.pretrained_path = path
|
||||
cfg.device = "cpu"
|
||||
return RobometerRewardModel.from_pretrained(path, config=cfg)
|
||||
|
||||
|
||||
def _inspect_upstream_state_dict(upstream_path: str, model: RobometerRewardModel) -> None:
|
||||
"""Dump the upstream state-dict view of the embed/lm-head tensors.
|
||||
|
||||
Loads the raw upstream safetensors (pre-remap), runs the remapper, and
|
||||
reports whether the embed/lm-head keys survive into the merged dict that
|
||||
eventually hits ``model.load_state_dict``.
|
||||
"""
|
||||
snapshot_dir = _download_robometer_snapshot(upstream_path)
|
||||
files = _resolve_checkpoint_safetensors_files(snapshot_dir)
|
||||
merged: dict[str, torch.Tensor] = {}
|
||||
for path in files:
|
||||
merged.update(load_file(str(path)))
|
||||
remapped = _remap_state_dict_keys(merged, model)
|
||||
|
||||
print(f"\n=== Upstream state-dict inspection (snapshot at {snapshot_dir}) ===")
|
||||
print(f"raw keys (before remap) : {len(merged)}")
|
||||
print(f"keys after remap : {len(remapped)}")
|
||||
print(f"model expects (state_dict): {len(model.state_dict())}")
|
||||
|
||||
expected = set(model.state_dict())
|
||||
present_after_remap = set(remapped) & expected
|
||||
print(f"keys present after remap : {len(present_after_remap)}")
|
||||
|
||||
missing_keys = expected - set(remapped)
|
||||
print(f"keys missing from remap : {len(missing_keys)}")
|
||||
if missing_keys:
|
||||
sample = list(missing_keys)[:10]
|
||||
print(f" sample missing keys : {sample}")
|
||||
|
||||
unexpected_keys = set(remapped) - expected
|
||||
print(f"keys unexpected by model : {len(unexpected_keys)}")
|
||||
if unexpected_keys:
|
||||
sample = list(unexpected_keys)[:10]
|
||||
print(f" sample unexpected keys : {sample}")
|
||||
|
||||
for key in (EMBED_KEY, LMHEAD_KEY):
|
||||
present = key in remapped
|
||||
shape = tuple(remapped[key].shape) if present else None
|
||||
print(f" {key:60s} present={present}, shape={shape}")
|
||||
|
||||
|
||||
def _diff_embed(name: str, a: torch.Tensor, b: torch.Tensor, special_token_count: int) -> None:
|
||||
a = a.float()
|
||||
b = b.float()
|
||||
if a.shape != b.shape:
|
||||
print(f"❌ {name} shape mismatch: {tuple(a.shape)} vs {tuple(b.shape)}")
|
||||
return
|
||||
|
||||
abs_diff = (a - b).abs()
|
||||
per_row_max = abs_diff.max(dim=1).values
|
||||
nz_rows = (per_row_max > 0).nonzero(as_tuple=True)[0].tolist()
|
||||
print(f"\n=== {name} (shape {tuple(a.shape)}) ===")
|
||||
print(f"global max|Δ| = {abs_diff.max().item():.3e}")
|
||||
print(f"rows with any diff = {len(nz_rows)}")
|
||||
if nz_rows:
|
||||
first = nz_rows[:10]
|
||||
last = nz_rows[-10:]
|
||||
print(f" first nonzero rows = {first}")
|
||||
print(f" last nonzero rows = {last}")
|
||||
vocab_size = a.shape[0]
|
||||
base_vocab = vocab_size - special_token_count
|
||||
special_rows = list(range(base_vocab, vocab_size))
|
||||
in_special = [r for r in nz_rows if r in special_rows]
|
||||
out_special = [r for r in nz_rows if r not in special_rows]
|
||||
print(
|
||||
f" diffs in special-token rows ({base_vocab}..{vocab_size - 1}): {len(in_special)}/{special_token_count}"
|
||||
)
|
||||
print(f" diffs in base-vocab rows (0..{base_vocab - 1}) : {len(out_special)}")
|
||||
for r in special_rows:
|
||||
print(
|
||||
f" row {r}: max|Δ|={per_row_max[r].item():.3e}, "
|
||||
f"upstream_norm={a[r].norm().item():.3e}, lerobot_norm={b[r].norm().item():.3e}"
|
||||
)
|
||||
|
||||
|
||||
def main() -> int:
|
||||
parser = argparse.ArgumentParser(
|
||||
description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
|
||||
)
|
||||
parser.add_argument("--upstream", required=True)
|
||||
parser.add_argument("--lerobot", required=True)
|
||||
parser.add_argument(
|
||||
"--special-token-count",
|
||||
type=int,
|
||||
default=5,
|
||||
help="Number of special tokens Robometer adds. Defaults to len(ROBOMETER_SPECIAL_TOKENS)=5.",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
print(f"Loading upstream: {args.upstream}")
|
||||
upstream = _load_upstream(args.upstream)
|
||||
print(f"Loading LeRobot-format: {args.lerobot}")
|
||||
lerobot = _load_lerobot(args.lerobot)
|
||||
|
||||
_inspect_upstream_state_dict(args.upstream, upstream)
|
||||
|
||||
sd_u, sd_l = upstream.state_dict(), lerobot.state_dict()
|
||||
|
||||
for key in (EMBED_KEY, LMHEAD_KEY):
|
||||
if key not in sd_u or key not in sd_l:
|
||||
print(f"❌ key missing: {key} (upstream={key in sd_u}, lerobot={key in sd_l})")
|
||||
continue
|
||||
_diff_embed(key, sd_u[key], sd_l[key], args.special_token_count)
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
@@ -0,0 +1,168 @@
|
||||
#!/usr/bin/env python
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
"""Extract one LIBERO episode for Robometer parity testing.
|
||||
|
||||
Loads a LeRobot LIBERO (or any video-bearing LeRobot) dataset, picks one
|
||||
episode, samples ``--num-frames`` frames uniformly across its duration
|
||||
(matching upstream Robometer's default of 8 frames), and saves them to
|
||||
``.npz`` plus a sidecar ``.txt`` task file.
|
||||
|
||||
The ``.npz`` layout (``frames`` key, ``(T, H, W, C) uint8``) is what upstream
|
||||
``example_inference_local.py`` consumes, so the same file feeds both pipelines
|
||||
and frame sampling cannot drift.
|
||||
|
||||
Workflow:
|
||||
|
||||
1. Run this script (LeRobot env) to produce ``frames.npz`` + ``task.txt``.
|
||||
2. Pass them to upstream ``scripts/example_inference_local.py``
|
||||
(upstream env) to produce reference progress / success outputs.
|
||||
3. Pass the same ``frames.npz`` to ``scripts/parity_robometer.py``
|
||||
(LeRobot env) to compare both sides.
|
||||
|
||||
Example:
|
||||
|
||||
uv run python scripts/extract_libero_episode_for_parity.py \\
|
||||
--repo-id lerobot/libero_10_image \\
|
||||
--episode 0 \\
|
||||
--num-frames 8 \\
|
||||
--out-dir /tmp/libero_ep0
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from lerobot.configs.types import FeatureType
|
||||
from lerobot.datasets.lerobot_dataset import LeRobotDataset
|
||||
|
||||
|
||||
def _pick_visual_feature(features: dict, requested: str | None) -> str:
|
||||
"""Return a visual feature key, preferring ``requested`` when given."""
|
||||
visual_keys = [
|
||||
key
|
||||
for key, ft in features.items()
|
||||
if getattr(ft, "type", None) == FeatureType.VISUAL or ft.get("dtype", "") == "video"
|
||||
]
|
||||
if not visual_keys:
|
||||
raise ValueError(f"Dataset has no visual feature; available: {list(features)}")
|
||||
if requested is not None:
|
||||
if requested not in visual_keys:
|
||||
raise ValueError(f"Camera key {requested!r} not in dataset visual features {visual_keys}")
|
||||
return requested
|
||||
return visual_keys[0]
|
||||
|
||||
|
||||
def _frame_uint8_hwc(tensor: torch.Tensor) -> np.ndarray:
|
||||
"""Convert a LeRobotDataset video frame to ``uint8`` ``(H, W, C)`` RGB."""
|
||||
arr = tensor.detach().cpu().numpy()
|
||||
if arr.ndim == 3 and arr.shape[0] in (1, 3):
|
||||
arr = arr.transpose(1, 2, 0)
|
||||
if arr.dtype != np.uint8:
|
||||
arr = np.clip(arr * 255.0 if arr.max() <= 1.0 + 1e-3 else arr, 0, 255).astype(np.uint8)
|
||||
if arr.shape[-1] == 1:
|
||||
arr = np.repeat(arr, 3, axis=-1)
|
||||
return arr
|
||||
|
||||
|
||||
def main() -> int:
|
||||
parser = argparse.ArgumentParser(
|
||||
description=__doc__,
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--repo-id",
|
||||
default="lerobot/libero_10_image",
|
||||
help="LeRobot LIBERO (or other) dataset repo id (default: lerobot/libero_10_image).",
|
||||
)
|
||||
parser.add_argument("--episode", type=int, default=0, help="Episode index.")
|
||||
parser.add_argument(
|
||||
"--camera-key",
|
||||
default=None,
|
||||
help="Visual feature key (e.g. observation.images.image). Auto-selects first if omitted.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num-frames",
|
||||
type=int,
|
||||
default=8,
|
||||
help="Number of frames to sample uniformly (default: 8 — Robometer's training-time default).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--out-dir",
|
||||
type=Path,
|
||||
default=Path("outputs/robometer_parity/libero"),
|
||||
help="Directory to write frames.npz / task.txt / frame_indices.npy.",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
print(f"Loading {args.repo_id} (episode {args.episode})...")
|
||||
dataset = LeRobotDataset(args.repo_id, episodes=[args.episode])
|
||||
|
||||
camera_key = _pick_visual_feature(dataset.features, args.camera_key)
|
||||
print(f"Using camera key: {camera_key}")
|
||||
|
||||
ep_from = int(dataset.episode_data_index["from"][0].item())
|
||||
ep_to = int(dataset.episode_data_index["to"][0].item())
|
||||
total_frames = ep_to - ep_from
|
||||
if total_frames <= 0:
|
||||
print(f"ERROR: episode {args.episode} has no frames.", file=sys.stderr)
|
||||
return 1
|
||||
print(f"Episode has {total_frames} frames; sampling {args.num_frames} uniformly.")
|
||||
|
||||
indices = np.linspace(0, total_frames - 1, num=min(args.num_frames, total_frames), dtype=int)
|
||||
frames: list[np.ndarray] = []
|
||||
task: str = ""
|
||||
for offset in indices:
|
||||
sample = dataset[ep_from + int(offset)]
|
||||
frame_tensor = sample[camera_key]
|
||||
frames.append(_frame_uint8_hwc(frame_tensor))
|
||||
if not task:
|
||||
task = sample.get("task", "") or ""
|
||||
|
||||
if not task:
|
||||
print("ERROR: episode has no task description in metadata.", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
frames_array = np.stack(frames)
|
||||
|
||||
args.out_dir.mkdir(parents=True, exist_ok=True)
|
||||
frames_path = args.out_dir / "frames.npz"
|
||||
task_path = args.out_dir / "task.txt"
|
||||
indices_path = args.out_dir / "frame_indices.npy"
|
||||
|
||||
np.savez(frames_path, frames=frames_array)
|
||||
task_path.write_text(task + "\n", encoding="utf-8")
|
||||
np.save(indices_path, indices)
|
||||
|
||||
print()
|
||||
print(f"Wrote {frames_path} (shape={frames_array.shape}, dtype={frames_array.dtype})")
|
||||
print(f"Wrote {task_path} (task={task!r})")
|
||||
print(f"Wrote {indices_path} (frame_indices={indices.tolist()})")
|
||||
print()
|
||||
print("Next steps:")
|
||||
print(" # in upstream env (where `robometer` is importable):")
|
||||
print(
|
||||
f" python third_party/robometer/scripts/example_inference_local.py \\\n"
|
||||
f" --model-path robometer/Robometer-4B \\\n"
|
||||
f" --video {frames_path} \\\n"
|
||||
f' --task "{task}" \\\n'
|
||||
f" --out {args.out_dir / 'upstream.npy'}"
|
||||
)
|
||||
print()
|
||||
print(" # back in LeRobot env:")
|
||||
print(
|
||||
f" uv run python scripts/parity_robometer.py \\\n"
|
||||
f" --frames {frames_path} \\\n"
|
||||
f' --task "{task}" \\\n'
|
||||
f" --upstream-progress {args.out_dir / 'upstream.npy'} \\\n"
|
||||
f" --upstream-success {args.out_dir / 'upstream_success_probs.npy'}"
|
||||
)
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
@@ -0,0 +1,232 @@
|
||||
#!/usr/bin/env python
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
"""Functional parity check: LeRobot Robometer vs. upstream Robometer.
|
||||
|
||||
Runs the in-tree :class:`RobometerRewardModel` on the same frames + task that
|
||||
upstream Robometer was run on, and compares per-frame progress / success
|
||||
predictions against reference outputs saved by upstream's
|
||||
``scripts/example_inference_local.py``.
|
||||
|
||||
Workflow:
|
||||
|
||||
1. In the upstream Robometer environment (where ``robometer`` is importable),
|
||||
run::
|
||||
|
||||
python third_party/robometer/scripts/example_inference_local.py \\
|
||||
--model-path robometer/Robometer-4B \\
|
||||
--video /path/to/episode.mp4 \\
|
||||
--task "Open the drawer" \\
|
||||
--fps 1.0 \\
|
||||
--out /tmp/robometer_upstream.npy
|
||||
|
||||
This produces:
|
||||
- ``/tmp/robometer_upstream.npy`` (progress predictions)
|
||||
- ``/tmp/robometer_upstream_success_probs.npy`` (success probabilities)
|
||||
|
||||
2. Extract the exact same frames the upstream script used, save as ``.npz``::
|
||||
|
||||
# quick helper: extract frames at the same fps and save as .npz
|
||||
python -c "
|
||||
from third_party.robometer.scripts.example_inference_local import load_frames_input
|
||||
import numpy as np
|
||||
frames = load_frames_input('/path/to/episode.mp4', fps=1.0, max_frames=512)
|
||||
np.savez('/tmp/robometer_frames.npz', frames=frames)
|
||||
"
|
||||
|
||||
3. In this LeRobot env, run this script::
|
||||
|
||||
uv run python scripts/parity_robometer.py \\
|
||||
--frames /tmp/robometer_frames.npz \\
|
||||
--task "Open the drawer" \\
|
||||
--upstream-progress /tmp/robometer_upstream.npy \\
|
||||
--upstream-success /tmp/robometer_upstream_success_probs.npy \\
|
||||
--lerobot-model lilkm/robometer-4b
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from lerobot.configs.rewards import RewardModelConfig
|
||||
from lerobot.rewards.robometer import RobometerConfig, RobometerRewardModel
|
||||
from lerobot.rewards.robometer.modeling_robometer import decode_progress_outputs
|
||||
from lerobot.rewards.robometer.processor_robometer import RobometerEncoderProcessorStep
|
||||
|
||||
|
||||
def _load_frames(path: str) -> np.ndarray:
|
||||
"""Load frames from .npy/.npz. Expects (T, H, W, C) uint8."""
|
||||
if path.endswith(".npy"):
|
||||
frames = np.load(path)
|
||||
elif path.endswith(".npz"):
|
||||
with np.load(path, allow_pickle=False) as npz:
|
||||
frames = npz["frames"].copy() if "frames" in npz else next(iter(npz.values())).copy()
|
||||
else:
|
||||
raise ValueError(f"Frames must be .npy or .npz (got {path!r}).")
|
||||
|
||||
if frames.dtype != np.uint8:
|
||||
frames = np.clip(frames, 0, 255).astype(np.uint8)
|
||||
if frames.ndim != 4:
|
||||
raise ValueError(f"Frames must be 4D (T,H,W,C); got shape {frames.shape}.")
|
||||
if frames.shape[-1] not in (1, 3):
|
||||
# Probably (T,C,H,W) — transpose
|
||||
if frames.shape[1] in (1, 3):
|
||||
frames = frames.transpose(0, 2, 3, 1)
|
||||
else:
|
||||
raise ValueError(f"Cannot interpret frame channel layout: {frames.shape}.")
|
||||
return frames
|
||||
|
||||
|
||||
def _run_lerobot(
|
||||
frames: np.ndarray,
|
||||
task: str,
|
||||
model_path: str,
|
||||
device: str,
|
||||
) -> tuple[np.ndarray, np.ndarray]:
|
||||
"""Run LeRobot's Robometer on the given frames; return (progress, success)."""
|
||||
cfg = RobometerConfig(pretrained_path=model_path, device=device, max_frames=None)
|
||||
model = RobometerRewardModel.from_pretrained(model_path, config=cfg)
|
||||
|
||||
encoder = RobometerEncoderProcessorStep(
|
||||
base_model_id=model.config.base_model_id,
|
||||
use_multi_image=model.config.use_multi_image,
|
||||
use_per_frame_progress_token=model.config.use_per_frame_progress_token,
|
||||
max_frames=None,
|
||||
)
|
||||
batch = encoder.encode_samples([(frames, task)])
|
||||
|
||||
model_device = next(model.model.parameters()).device
|
||||
inputs = {key: value.to(model_device) if hasattr(value, "to") else value for key, value in batch.items()}
|
||||
|
||||
model.eval()
|
||||
with torch.no_grad():
|
||||
progress_logits, success_logits = model._compute_rbm_logits(inputs)
|
||||
|
||||
decoded = decode_progress_outputs(
|
||||
progress_logits,
|
||||
success_logits,
|
||||
is_discrete_mode=model.config.use_discrete_progress,
|
||||
)
|
||||
progress = np.asarray(decoded["progress_pred"][0], dtype=np.float32)
|
||||
success = (
|
||||
np.asarray(decoded["success_probs"][0], dtype=np.float32)
|
||||
if decoded["success_probs"]
|
||||
else np.array([], dtype=np.float32)
|
||||
)
|
||||
return progress, success
|
||||
|
||||
|
||||
def _compare(name: str, lerobot: np.ndarray, upstream: np.ndarray, atol: float, rtol: float) -> bool:
|
||||
print(f"\n=== {name} ===")
|
||||
if lerobot.shape != upstream.shape:
|
||||
print(f"shape mismatch: lerobot={lerobot.shape} upstream={upstream.shape}")
|
||||
return False
|
||||
|
||||
abs_diff = np.abs(lerobot - upstream)
|
||||
rel_diff = abs_diff / (np.abs(upstream) + 1e-12)
|
||||
print(f"shape : {lerobot.shape}")
|
||||
print(f"max |Δ| : {abs_diff.max():.3e}")
|
||||
print(f"mean |Δ| : {abs_diff.mean():.3e}")
|
||||
print(f"max rel |Δ| : {rel_diff.max():.3e}")
|
||||
print(f"lerobot[:5] : {lerobot[:5]}")
|
||||
print(f"upstream[:5] : {upstream[:5]}")
|
||||
|
||||
within_tol = bool(np.allclose(lerobot, upstream, atol=atol, rtol=rtol))
|
||||
print(f"allclose(atol={atol}, rtol={rtol}) -> {within_tol}")
|
||||
return within_tol
|
||||
|
||||
|
||||
def main() -> int:
|
||||
parser = argparse.ArgumentParser(
|
||||
description=__doc__,
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--frames",
|
||||
required=True,
|
||||
help=".npy / .npz file with the exact frames upstream was run on (T,H,W,C uint8).",
|
||||
)
|
||||
parser.add_argument("--task", required=True, help="Task instruction string.")
|
||||
parser.add_argument(
|
||||
"--upstream-progress",
|
||||
required=True,
|
||||
help="Reference progress .npy saved by upstream example_inference_local.py.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--upstream-success",
|
||||
default=None,
|
||||
help="Optional reference success_probs .npy. If omitted, success comparison is skipped.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--lerobot-model",
|
||||
default="lilkm/robometer-4b",
|
||||
help="LeRobot-format Robometer Hub repo id or local path.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--device",
|
||||
default="cuda" if torch.cuda.is_available() else "cpu",
|
||||
help="Device for the LeRobot model (default: cuda if available).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--atol",
|
||||
type=float,
|
||||
default=1e-3,
|
||||
help="Absolute tolerance for allclose (default: 1e-3; bf16 round-trip headroom).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--rtol",
|
||||
type=float,
|
||||
default=1e-2,
|
||||
help="Relative tolerance for allclose (default: 1e-2).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--out-prefix",
|
||||
default="lerobot_robometer_outputs",
|
||||
help="Save the LeRobot outputs as <prefix>_progress.npy / <prefix>_success.npy.",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
# 0. Sanity: confirm the LeRobot config is a RobometerConfig.
|
||||
cfg = RewardModelConfig.from_pretrained(args.lerobot_model)
|
||||
if not isinstance(cfg, RobometerConfig):
|
||||
print(f"ERROR: {args.lerobot_model!r} does not resolve to a RobometerConfig.", file=sys.stderr)
|
||||
return 2
|
||||
|
||||
# 1. Load frames + task + upstream reference outputs.
|
||||
frames = _load_frames(args.frames)
|
||||
upstream_progress = np.load(args.upstream_progress).astype(np.float32)
|
||||
upstream_success = (
|
||||
np.load(args.upstream_success).astype(np.float32) if args.upstream_success is not None else None
|
||||
)
|
||||
|
||||
print(f"Loaded {frames.shape[0]} frames at {frames.shape[1:]}, task={args.task!r}")
|
||||
print(f"LeRobot model: {args.lerobot_model} device: {args.device}")
|
||||
|
||||
# 2. Run LeRobot pipeline.
|
||||
progress, success = _run_lerobot(frames, args.task, args.lerobot_model, args.device)
|
||||
np.save(f"{args.out_prefix}_progress.npy", progress)
|
||||
if success.size > 0:
|
||||
np.save(f"{args.out_prefix}_success.npy", success)
|
||||
print(f"Saved LeRobot outputs to {args.out_prefix}_progress.npy / _success.npy")
|
||||
|
||||
# 3. Compare to upstream references.
|
||||
progress_ok = _compare("progress", progress, upstream_progress, args.atol, args.rtol)
|
||||
if upstream_success is not None and success.size > 0:
|
||||
success_ok = _compare("success_probs", success, upstream_success, args.atol, args.rtol)
|
||||
else:
|
||||
success_ok = True
|
||||
print("\n(skipping success comparison — upstream success file not provided)")
|
||||
|
||||
print()
|
||||
if progress_ok and success_ok:
|
||||
print("Parity check passed.")
|
||||
return 0
|
||||
print("Parity check FAILED.")
|
||||
return 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
@@ -0,0 +1,362 @@
|
||||
#!/usr/bin/env python
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
"""Run LeRobot Robometer parity against upstream Robometer's bundled examples.
|
||||
|
||||
Upstream Robometer ships three reference videos with their pre-computed
|
||||
progress / success outputs at
|
||||
``third_party/robometer/scripts/example_videos/``::
|
||||
|
||||
soar_put_green_stick_in_brown_bowl.mp4
|
||||
+ soar_put_green_stick_in_brown_bowl_rewards.npy (progress)
|
||||
+ soar_put_green_stick_in_brown_bowl_rewards_success_probs.npy (success)
|
||||
berkeley_rpt_stack_cup.mp4
|
||||
+ berkeley_rpt_stack_cup_rewards.npy
|
||||
+ berkeley_rpt_stack_cup_rewards_success_probs.npy
|
||||
jaco_play_pick_up_green_cup.mp4
|
||||
+ pick_up_green_cup_rewards.npy
|
||||
+ pick_up_green_cup_rewards_success_probs.npy
|
||||
|
||||
This script:
|
||||
1. Decodes each video at upstream's sampling fps using ``av`` (PyAV), with the
|
||||
same linspace-over-total-frames logic as upstream's ``extract_frames``.
|
||||
2. Runs the LeRobot ``RobometerRewardModel`` on those frames + the task from
|
||||
upstream's README.
|
||||
3. Compares per-frame progress / success to the pre-saved upstream outputs.
|
||||
|
||||
This means you do **not** need to install upstream Robometer to confirm parity.
|
||||
|
||||
Run::
|
||||
|
||||
uv run python scripts/parity_robometer_upstream_examples.py \\
|
||||
--lerobot-model lilkm/robometer-4b \\
|
||||
--device cuda \\
|
||||
--decoder decord
|
||||
|
||||
The number of frames sampled per video is derived from the length of each
|
||||
upstream ``.npy`` reference, so the script does not need a ``--fps`` argument
|
||||
(the README documents ``fps=3`` for SOAR / Berkeley, but the Jaco Play
|
||||
reference was generated with a different fps).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from lerobot.configs.rewards import RewardModelConfig
|
||||
from lerobot.rewards.robometer import RobometerConfig, RobometerRewardModel
|
||||
from lerobot.rewards.robometer.modeling_robometer import decode_progress_outputs
|
||||
from lerobot.rewards.robometer.processor_robometer import RobometerEncoderProcessorStep
|
||||
|
||||
try:
|
||||
import decord # type: ignore
|
||||
|
||||
_HAS_DECORD = True
|
||||
except ImportError:
|
||||
decord = None # type: ignore
|
||||
_HAS_DECORD = False
|
||||
|
||||
try:
|
||||
import av
|
||||
|
||||
_HAS_AV = True
|
||||
except ImportError:
|
||||
av = None # type: ignore
|
||||
_HAS_AV = False
|
||||
|
||||
EXAMPLES = [
|
||||
{
|
||||
"name": "soar_put_green_stick_in_brown_bowl",
|
||||
"video": "soar_put_green_stick_in_brown_bowl.mp4",
|
||||
"task": "Put green stick in brown bowl",
|
||||
"progress_npy": "soar_put_green_stick_in_brown_bowl_rewards.npy",
|
||||
"success_npy": "soar_put_green_stick_in_brown_bowl_rewards_success_probs.npy",
|
||||
},
|
||||
{
|
||||
"name": "berkeley_rpt_stack_cup",
|
||||
"video": "berkeley_rpt_stack_cup.mp4",
|
||||
"task": "Pick up the yellow cup and stack it on the other cup",
|
||||
"progress_npy": "berkeley_rpt_stack_cup_rewards.npy",
|
||||
"success_npy": "berkeley_rpt_stack_cup_rewards_success_probs.npy",
|
||||
},
|
||||
{
|
||||
"name": "jaco_play_pick_up_green_cup",
|
||||
"video": "jaco_play_pick_up_green_cup.mp4",
|
||||
"task": "Pick up the green cup",
|
||||
"progress_npy": "pick_up_green_cup_rewards.npy",
|
||||
"success_npy": "pick_up_green_cup_rewards_success_probs.npy",
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
def _extract_frames_decord(video_path: Path, num_frames: int) -> tuple[np.ndarray, str]:
|
||||
"""Sample ``num_frames`` indices uniformly from the video using decord.
|
||||
|
||||
Mirrors upstream's ``extract_frames`` indexing
|
||||
(``third_party/robometer/scripts/example_inference.py``): a
|
||||
``np.linspace(0, total_frames-1, num_frames)`` lookup over decord's
|
||||
``VideoReader``. We pass ``num_frames`` explicitly (derived from the
|
||||
upstream reference output length) so we don't have to guess what ``fps``
|
||||
upstream actually used when generating each saved ``.npy`` — the file
|
||||
length is the ground truth.
|
||||
"""
|
||||
vr = decord.VideoReader(str(video_path), num_threads=1)
|
||||
total_frames = len(vr)
|
||||
if total_frames == 0:
|
||||
raise RuntimeError(f"No decodable frames in {video_path}.")
|
||||
desired_frames = max(1, min(int(num_frames), total_frames))
|
||||
indices = np.linspace(0, total_frames - 1, desired_frames, dtype=int).tolist()
|
||||
frames = vr.get_batch(indices).asnumpy()
|
||||
native_fps = float(vr.get_avg_fps()) or 1.0
|
||||
return frames, f"decord total={total_frames} native_fps={native_fps:.3f}"
|
||||
|
||||
|
||||
def _extract_frames_av(video_path: Path, num_frames: int) -> tuple[np.ndarray, str]:
|
||||
"""PyAV fallback for environments without decord.
|
||||
|
||||
PyAV and decord can disagree on ``total_frames`` for the same container,
|
||||
so the sampled frame indices can drift. Install ``decord`` for a real
|
||||
parity check; this fallback is for smoke tests only.
|
||||
"""
|
||||
container = av.open(str(video_path))
|
||||
stream = container.streams.video[0]
|
||||
native_fps = float(stream.average_rate) if stream.average_rate else float(stream.guessed_rate or 30.0)
|
||||
rgb_frames: list[np.ndarray] = []
|
||||
for frame in container.decode(stream):
|
||||
rgb_frames.append(frame.to_ndarray(format="rgb24"))
|
||||
container.close()
|
||||
total_frames = len(rgb_frames)
|
||||
if total_frames == 0:
|
||||
raise RuntimeError(f"No decodable frames in {video_path}.")
|
||||
desired_frames = max(1, min(int(num_frames), total_frames))
|
||||
indices = np.linspace(0, total_frames - 1, desired_frames, dtype=int)
|
||||
frames = np.stack([rgb_frames[i] for i in indices])
|
||||
return frames, f"av total={total_frames} native_fps={native_fps:.3f}"
|
||||
|
||||
|
||||
def _extract_frames(video_path: Path, num_frames: int, prefer: str) -> tuple[np.ndarray, str]:
|
||||
"""Decoder dispatch. ``prefer`` is ``"decord"`` | ``"av"`` | ``"auto"``."""
|
||||
if prefer == "decord":
|
||||
if not _HAS_DECORD:
|
||||
raise RuntimeError("decord requested but not installed (`uv pip install decord`).")
|
||||
return _extract_frames_decord(video_path, num_frames)
|
||||
if prefer == "av":
|
||||
if not _HAS_AV:
|
||||
raise RuntimeError("av requested but not installed.")
|
||||
return _extract_frames_av(video_path, num_frames)
|
||||
# auto
|
||||
if _HAS_DECORD:
|
||||
return _extract_frames_decord(video_path, num_frames)
|
||||
if _HAS_AV:
|
||||
return _extract_frames_av(video_path, num_frames)
|
||||
raise RuntimeError("No video decoder available (install `decord` or `av`).")
|
||||
|
||||
|
||||
def _pearson(a: np.ndarray, b: np.ndarray) -> float:
|
||||
"""Pearson correlation; returns 1.0 for constant inputs (no signal to align)."""
|
||||
a = a.astype(np.float64)
|
||||
b = b.astype(np.float64)
|
||||
if a.size < 2:
|
||||
return 1.0
|
||||
da = a - a.mean()
|
||||
db = b - b.mean()
|
||||
denom = float(np.sqrt((da * da).sum()) * np.sqrt((db * db).sum()))
|
||||
if denom == 0:
|
||||
return 1.0
|
||||
return float((da * db).sum() / denom)
|
||||
|
||||
|
||||
def _run_lerobot(
|
||||
model: RobometerRewardModel,
|
||||
encoder: RobometerEncoderProcessorStep,
|
||||
frames: np.ndarray,
|
||||
task: str,
|
||||
) -> tuple[np.ndarray, np.ndarray]:
|
||||
batch = encoder.encode_samples([(frames, task)])
|
||||
device = next(model.model.parameters()).device
|
||||
inputs = {key: value.to(device) if hasattr(value, "to") else value for key, value in batch.items()}
|
||||
model.eval()
|
||||
with torch.no_grad():
|
||||
progress_logits, success_logits = model._compute_rbm_logits(inputs)
|
||||
decoded = decode_progress_outputs(
|
||||
progress_logits, success_logits, is_discrete_mode=model.config.use_discrete_progress
|
||||
)
|
||||
progress = np.asarray(decoded["progress_pred"][0], dtype=np.float32)
|
||||
success = (
|
||||
np.asarray(decoded["success_probs"][0], dtype=np.float32)
|
||||
if decoded["success_probs"]
|
||||
else np.array([], dtype=np.float32)
|
||||
)
|
||||
return progress, success
|
||||
|
||||
|
||||
def _compare(
|
||||
name: str,
|
||||
lerobot: np.ndarray,
|
||||
upstream: np.ndarray,
|
||||
*,
|
||||
atol: float,
|
||||
pearson_min: float,
|
||||
) -> bool:
|
||||
if lerobot.shape != upstream.shape:
|
||||
print(f" {name:8s} SHAPE MISMATCH lerobot={lerobot.shape} upstream={upstream.shape}")
|
||||
return False
|
||||
abs_diff = np.abs(lerobot - upstream)
|
||||
pearson = _pearson(lerobot, upstream)
|
||||
abs_ok = bool(abs_diff.max() <= atol)
|
||||
pearson_ok = bool(pearson >= pearson_min)
|
||||
verdict = "PASS" if (abs_ok or pearson_ok) else "FAIL"
|
||||
print(
|
||||
f" {name:8s} shape={lerobot.shape} max|Δ|={abs_diff.max():.3e} "
|
||||
f"mean|Δ|={abs_diff.mean():.3e} pearson={pearson:.4f} "
|
||||
f"(atol={atol:.0e} pearson_min={pearson_min:.3f}) -> {verdict}"
|
||||
)
|
||||
return abs_ok or pearson_ok
|
||||
|
||||
|
||||
def main() -> int:
|
||||
parser = argparse.ArgumentParser(
|
||||
description=__doc__,
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--examples-dir",
|
||||
type=Path,
|
||||
default=Path("third_party/robometer/scripts/example_videos"),
|
||||
help="Directory containing the upstream Robometer example mp4s + .npy outputs.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--lerobot-model",
|
||||
default="lilkm/robometer-4b",
|
||||
help="LeRobot-format Robometer Hub repo id or local path.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--device",
|
||||
default="cuda" if torch.cuda.is_available() else "cpu",
|
||||
help="Device for the LeRobot model.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--decoder",
|
||||
choices=("auto", "decord", "av"),
|
||||
default="auto",
|
||||
help=(
|
||||
"Video decoder. ``auto`` prefers decord (matches upstream) and falls back to av. "
|
||||
"Force ``decord`` for a clean parity check."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--progress-atol",
|
||||
type=float,
|
||||
default=1e-2,
|
||||
help="Absolute tolerance for the progress array. Default 1e-2 covers CUDA bf16 noise.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--success-atol",
|
||||
type=float,
|
||||
default=1e-1,
|
||||
help=(
|
||||
"Absolute tolerance for the success array. Looser than progress because "
|
||||
"``sigmoid`` amplifies logit-space noise near 0.5."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--pearson-min",
|
||||
type=float,
|
||||
default=0.99,
|
||||
help="Minimum Pearson correlation for a PASS verdict (per array).",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.decoder == "av" or (args.decoder == "auto" and not _HAS_DECORD):
|
||||
print(
|
||||
"WARNING: using PyAV decoder. PyAV's total-frame count can differ from decord's, "
|
||||
"which propagates into different sampled-frame indices. Install `decord` and "
|
||||
"re-run for a clean parity check.",
|
||||
file=sys.stderr,
|
||||
)
|
||||
|
||||
examples_dir = args.examples_dir.resolve()
|
||||
if not examples_dir.is_dir():
|
||||
print(f"ERROR: examples dir {examples_dir} does not exist.", file=sys.stderr)
|
||||
return 2
|
||||
|
||||
# Sanity-check the LeRobot config is a RobometerConfig before loading weights.
|
||||
cfg = RewardModelConfig.from_pretrained(args.lerobot_model)
|
||||
if not isinstance(cfg, RobometerConfig):
|
||||
print(f"ERROR: {args.lerobot_model!r} did not resolve to a RobometerConfig.", file=sys.stderr)
|
||||
return 2
|
||||
|
||||
print(f"Loading LeRobot Robometer from {args.lerobot_model} on {args.device}...")
|
||||
cfg.pretrained_path = args.lerobot_model
|
||||
cfg.device = args.device
|
||||
model = RobometerRewardModel.from_pretrained(args.lerobot_model, config=cfg)
|
||||
encoder = RobometerEncoderProcessorStep(
|
||||
base_model_id=model.config.base_model_id,
|
||||
use_multi_image=model.config.use_multi_image,
|
||||
use_per_frame_progress_token=model.config.use_per_frame_progress_token,
|
||||
max_frames=None,
|
||||
)
|
||||
|
||||
all_ok = True
|
||||
for ex in EXAMPLES:
|
||||
video_path = examples_dir / ex["video"]
|
||||
upstream_progress_path = examples_dir / ex["progress_npy"]
|
||||
upstream_success_path = examples_dir / ex["success_npy"]
|
||||
|
||||
missing = [p for p in (video_path, upstream_progress_path, upstream_success_path) if not p.exists()]
|
||||
if missing:
|
||||
print(f"[skip] {ex['name']}: missing {[str(m) for m in missing]}")
|
||||
all_ok = False
|
||||
continue
|
||||
|
||||
print(f"\n=== {ex['name']} ===")
|
||||
print(f" task: {ex['task']!r}")
|
||||
|
||||
# Trust the upstream reference array as the source of truth for how
|
||||
# many frames to sample. The README documents fps=3 for SOAR/Berkeley
|
||||
# but Jaco Play was generated with a different fps, so any hardcoded
|
||||
# ``--fps`` mismatches at least one example. The npy length always
|
||||
# tells us what upstream actually used.
|
||||
upstream_progress = np.load(upstream_progress_path).astype(np.float32)
|
||||
upstream_success = np.load(upstream_success_path).astype(np.float32)
|
||||
target_num_frames = int(upstream_progress.shape[0])
|
||||
frames, decoder_info = _extract_frames(video_path, target_num_frames, prefer=args.decoder)
|
||||
print(
|
||||
f" decoded {frames.shape[0]} frames (matches upstream npy length); "
|
||||
f"shape={frames.shape} [{decoder_info}]"
|
||||
)
|
||||
|
||||
progress, success = _run_lerobot(model, encoder, frames, ex["task"])
|
||||
|
||||
progress_ok = _compare(
|
||||
"progress",
|
||||
progress,
|
||||
upstream_progress,
|
||||
atol=args.progress_atol,
|
||||
pearson_min=args.pearson_min,
|
||||
)
|
||||
success_ok = _compare(
|
||||
"success",
|
||||
success,
|
||||
upstream_success,
|
||||
atol=args.success_atol,
|
||||
pearson_min=args.pearson_min,
|
||||
)
|
||||
verdict = "PASS" if (progress_ok and success_ok) else "FAIL"
|
||||
print(f" -> {verdict}")
|
||||
all_ok = all_ok and progress_ok and success_ok
|
||||
|
||||
print()
|
||||
if all_ok:
|
||||
print("All upstream example parity checks passed.")
|
||||
return 0
|
||||
print("Some upstream example parity checks FAILED.")
|
||||
return 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
@@ -0,0 +1,149 @@
|
||||
#!/usr/bin/env python
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
"""Verify that a LeRobot-format Robometer is byte-equivalent to its upstream source.
|
||||
|
||||
Run this once after publishing a LeRobot-format Robometer to the Hub, before
|
||||
flipping the default `RobometerConfig.pretrained_path` to it. It loads both
|
||||
the upstream snapshot and the re-exported copy, compares state dicts, and
|
||||
prints a clear pass/fail summary.
|
||||
|
||||
Example:
|
||||
|
||||
python scripts/verify_robometer_export.py \\
|
||||
--upstream robometer/Robometer-4B \\
|
||||
--lerobot lerobot/robometer-4b
|
||||
|
||||
python scripts/verify_robometer_export.py \\
|
||||
--upstream robometer/Robometer-4B \\
|
||||
--lerobot ./robometer-4b-lerobot # local folder also works
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
|
||||
from lerobot.configs.rewards import RewardModelConfig
|
||||
from lerobot.rewards.robometer import RobometerConfig, RobometerRewardModel
|
||||
from lerobot.rewards.robometer._upstream_loader import apply_upstream_checkpoint
|
||||
|
||||
|
||||
def _load_upstream(path: str) -> RobometerRewardModel:
|
||||
# Fresh ``RobometerConfig`` (``vlm_config=None``) triggers
|
||||
# ``RobometerRewardModel.__init__``'s upstream-matching path: download
|
||||
# base Qwen, resize for ROBOMETER_SPECIAL_TOKENS. The subsequent
|
||||
# ``apply_upstream_checkpoint`` call resizes again if the checkpoint's
|
||||
# vocab differs (e.g. upstream was trained against an older Qwen).
|
||||
cfg = RobometerConfig(pretrained_path=path, device="cpu")
|
||||
model = RobometerRewardModel(cfg)
|
||||
apply_upstream_checkpoint(model, path)
|
||||
model.eval()
|
||||
return model
|
||||
|
||||
|
||||
def _load_lerobot(path: str) -> RobometerRewardModel:
|
||||
cfg = RewardModelConfig.from_pretrained(path)
|
||||
if not isinstance(cfg, RobometerConfig):
|
||||
raise TypeError(f"Expected RobometerConfig in LeRobot export, got {type(cfg)}")
|
||||
cfg.pretrained_path = path
|
||||
cfg.device = "cpu"
|
||||
return RobometerRewardModel.from_pretrained(path, config=cfg)
|
||||
|
||||
|
||||
def compare_state_dicts(a: RobometerRewardModel, b: RobometerRewardModel) -> bool:
|
||||
sd_a, sd_b = a.state_dict(), b.state_dict()
|
||||
keys_a, keys_b = set(sd_a), set(sd_b)
|
||||
|
||||
missing = keys_a - keys_b
|
||||
extra = keys_b - keys_a
|
||||
if missing:
|
||||
print(f"❌ {len(missing)} keys missing in LeRobot-format model (sample: {list(missing)[:5]})")
|
||||
if extra:
|
||||
print(f"❌ {len(extra)} extra keys in LeRobot-format model (sample: {list(extra)[:5]})")
|
||||
if missing or extra:
|
||||
return False
|
||||
|
||||
diff_summary: list[tuple[str, float]] = []
|
||||
for key in sorted(keys_a):
|
||||
ta, tb = sd_a[key], sd_b[key]
|
||||
if ta.shape != tb.shape:
|
||||
print(f"❌ shape mismatch at {key}: {tuple(ta.shape)} vs {tuple(tb.shape)}")
|
||||
return False
|
||||
# Compare in float to avoid bfloat16 equality quirks.
|
||||
max_abs = (ta.float() - tb.float()).abs().max().item()
|
||||
if max_abs > 0:
|
||||
diff_summary.append((key, max_abs))
|
||||
|
||||
if not diff_summary:
|
||||
print(f"✅ All {len(keys_a)} parameters identical")
|
||||
return True
|
||||
|
||||
# Some keys differ; show worst offenders.
|
||||
diff_summary.sort(key=lambda kv: kv[1], reverse=True)
|
||||
print(f"⚠️ {len(diff_summary)} keys differ. Top 10 by max abs diff:")
|
||||
for key, value in diff_summary[:10]:
|
||||
print(f" {key:60s} max|Δ| = {value:.3e}")
|
||||
|
||||
# Tolerance: bf16 round-trips can introduce ULP-level noise but no real
|
||||
# change. Allow up to 1e-3 absolute difference; anything larger is a real
|
||||
# divergence.
|
||||
worst = diff_summary[0][1]
|
||||
if worst < 1e-3:
|
||||
print(f"✅ Worst diff {worst:.3e} is within bf16 round-trip tolerance")
|
||||
return True
|
||||
print(f"❌ Worst diff {worst:.3e} exceeds tolerance (1e-3)")
|
||||
return False
|
||||
|
||||
|
||||
def main() -> int:
|
||||
parser = argparse.ArgumentParser(
|
||||
description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
|
||||
)
|
||||
parser.add_argument("--upstream", required=True, help="Upstream Robometer repo id or local path.")
|
||||
parser.add_argument("--lerobot", required=True, help="LeRobot-format Robometer repo id or local path.")
|
||||
args = parser.parse_args()
|
||||
|
||||
print(f"Loading upstream: {args.upstream}")
|
||||
upstream = _load_upstream(args.upstream)
|
||||
print(f"Loading LeRobot-format: {args.lerobot}")
|
||||
lerobot = _load_lerobot(args.lerobot)
|
||||
|
||||
print("\n=== Config comparison ===")
|
||||
config_ok = True
|
||||
for field in [
|
||||
"base_model_id",
|
||||
"torch_dtype",
|
||||
"use_multi_image",
|
||||
"use_per_frame_progress_token",
|
||||
"average_temporal_patches",
|
||||
"frame_pooling",
|
||||
"frame_pooling_attn_temperature",
|
||||
"progress_loss_type",
|
||||
"progress_discrete_bins",
|
||||
]:
|
||||
a, b = getattr(upstream.config, field), getattr(lerobot.config, field)
|
||||
field_ok = a == b
|
||||
config_ok = config_ok and field_ok
|
||||
ok = "✅" if field_ok else "❌"
|
||||
print(f" {ok} {field}: upstream={a!r}, lerobot={b!r}")
|
||||
|
||||
print("\n=== State-dict comparison ===")
|
||||
state_dict_ok = compare_state_dicts(upstream, lerobot)
|
||||
|
||||
print()
|
||||
if config_ok and state_dict_ok:
|
||||
print("🎉 Verification passed — safe to flip the default.")
|
||||
return 0
|
||||
print("⛔ Verification failed — DO NOT flip the default.")
|
||||
return 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
@@ -89,9 +89,16 @@ class RewardModelConfig(draccus.ChoiceRegistry, HubMixin, abc.ABC):
|
||||
def reward_delta_indices(self) -> list | None: # type: ignore[type-arg]
|
||||
return None
|
||||
|
||||
@abc.abstractmethod
|
||||
def get_optimizer_preset(self) -> OptimizerConfig:
|
||||
raise NotImplementedError
|
||||
def get_optimizer_preset(self) -> OptimizerConfig | None:
|
||||
"""Default optimizer for this reward model, or ``None`` for zero-shot models.
|
||||
|
||||
Trainable reward models (e.g. SARM, Classifier) must override this with a
|
||||
concrete optimizer config. Zero-shot reward models (e.g. Robometer) leave
|
||||
the default ``None`` — they error out earlier via the
|
||||
:attr:`~lerobot.rewards.pretrained.PreTrainedRewardModel.is_trainable`
|
||||
check in ``lerobot-train``.
|
||||
"""
|
||||
return None
|
||||
|
||||
def get_scheduler_preset(self) -> LRSchedulerConfig | None:
|
||||
return None
|
||||
|
||||
@@ -20,11 +20,13 @@ from .factory import (
|
||||
make_reward_pre_post_processors as make_reward_pre_post_processors,
|
||||
)
|
||||
from .pretrained import PreTrainedRewardModel as PreTrainedRewardModel
|
||||
from .robometer.configuration_robometer import RobometerConfig as RobometerConfig
|
||||
from .sarm.configuration_sarm import SARMConfig as SARMConfig
|
||||
|
||||
__all__ = [
|
||||
# Configuration classes
|
||||
"RewardClassifierConfig",
|
||||
"RobometerConfig",
|
||||
"SARMConfig",
|
||||
# Base class
|
||||
"PreTrainedRewardModel",
|
||||
|
||||
@@ -24,6 +24,7 @@ from lerobot.configs.rewards import RewardModelConfig
|
||||
from lerobot.processor import PolicyAction, PolicyProcessorPipeline
|
||||
from lerobot.rewards.classifier.configuration_classifier import RewardClassifierConfig
|
||||
from lerobot.rewards.pretrained import PreTrainedRewardModel
|
||||
from lerobot.rewards.robometer.configuration_robometer import RobometerConfig
|
||||
from lerobot.rewards.sarm.configuration_sarm import SARMConfig
|
||||
|
||||
|
||||
@@ -36,7 +37,7 @@ def get_reward_model_class(name: str) -> type[PreTrainedRewardModel]:
|
||||
|
||||
Args:
|
||||
name: The name of the reward model. Supported names are "reward_classifier",
|
||||
"sarm".
|
||||
"sarm", "robometer".
|
||||
|
||||
Returns:
|
||||
The reward model class corresponding to the given name.
|
||||
@@ -52,6 +53,10 @@ def get_reward_model_class(name: str) -> type[PreTrainedRewardModel]:
|
||||
from lerobot.rewards.sarm.modeling_sarm import SARMRewardModel
|
||||
|
||||
return SARMRewardModel
|
||||
elif name == "robometer":
|
||||
from lerobot.rewards.robometer.modeling_robometer import RobometerRewardModel
|
||||
|
||||
return RobometerRewardModel
|
||||
else:
|
||||
try:
|
||||
return _get_reward_model_cls_from_name(name=name)
|
||||
@@ -68,7 +73,7 @@ def make_reward_model_config(reward_type: str, **kwargs) -> RewardModelConfig:
|
||||
|
||||
Args:
|
||||
reward_type: The type of the reward model. Supported types include
|
||||
"reward_classifier", "sarm".
|
||||
"reward_classifier", "sarm", "robometer".
|
||||
**kwargs: Keyword arguments to be passed to the configuration class constructor.
|
||||
|
||||
Returns:
|
||||
@@ -81,6 +86,8 @@ def make_reward_model_config(reward_type: str, **kwargs) -> RewardModelConfig:
|
||||
return RewardClassifierConfig(**kwargs)
|
||||
elif reward_type == "sarm":
|
||||
return SARMConfig(**kwargs)
|
||||
elif reward_type == "robometer":
|
||||
return RobometerConfig(**kwargs)
|
||||
else:
|
||||
try:
|
||||
config_cls = RewardModelConfig.get_choice_class(reward_type)
|
||||
@@ -160,6 +167,13 @@ def make_reward_pre_post_processors(
|
||||
dataset_stats=kwargs.get("dataset_stats"),
|
||||
dataset_meta=kwargs.get("dataset_meta"),
|
||||
)
|
||||
elif isinstance(reward_cfg, RobometerConfig):
|
||||
from lerobot.rewards.robometer.processor_robometer import make_robometer_pre_post_processors
|
||||
|
||||
return make_robometer_pre_post_processors(
|
||||
config=reward_cfg,
|
||||
dataset_stats=kwargs.get("dataset_stats"),
|
||||
)
|
||||
|
||||
else:
|
||||
try:
|
||||
|
||||
@@ -0,0 +1,19 @@
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from .configuration_robometer import RobometerConfig
|
||||
from .modeling_robometer import RobometerRewardModel
|
||||
from .processor_robometer import make_robometer_pre_post_processors
|
||||
|
||||
__all__ = ["RobometerConfig", "RobometerRewardModel", "make_robometer_pre_post_processors"]
|
||||
@@ -0,0 +1,229 @@
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Upstream/legacy Robometer checkpoint loader.
|
||||
|
||||
This module is **only** used by the one-time conversion tooling
|
||||
(:mod:`lerobot.scripts.lerobot_export_robometer` and
|
||||
``scripts/verify_robometer_export.py``). It supports:
|
||||
|
||||
- Sharded upstream checkpoints (``model-0000X-of-Y.safetensors`` + index).
|
||||
- PEFT/LoRA adapter checkpoints (``adapter_config.json`` + adapter weights).
|
||||
- Local snapshot directories or Hugging Face Hub repo ids.
|
||||
|
||||
Once :class:`~lerobot.rewards.robometer.RobometerRewardModel` is loaded
|
||||
through this module, calling ``save_pretrained`` writes the canonical
|
||||
LeRobot-native layout (single ``model.safetensors`` + ``config.json``) that
|
||||
the base loader understands.
|
||||
|
||||
The runtime path
|
||||
(:meth:`~lerobot.rewards.pretrained.PreTrainedRewardModel.from_pretrained`)
|
||||
does **not** import this file. It is safe to delete once you no longer need
|
||||
the conversion tooling.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from huggingface_hub import snapshot_download
|
||||
from safetensors.torch import load_file
|
||||
from torch import Tensor, nn
|
||||
|
||||
from lerobot.utils.import_utils import require_package
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _download_robometer_snapshot(
|
||||
pretrained_path: str,
|
||||
*,
|
||||
hub_token: str | None = None,
|
||||
) -> Path:
|
||||
"""Resolve a Robometer snapshot directory.
|
||||
|
||||
- If ``pretrained_path`` is an existing local directory, return it directly.
|
||||
- Otherwise treat ``pretrained_path`` as a Hugging Face repo id (optionally
|
||||
with ``@revision``) and download it via ``snapshot_download``.
|
||||
"""
|
||||
local_candidate = Path(pretrained_path)
|
||||
if local_candidate.is_dir():
|
||||
return local_candidate
|
||||
|
||||
if "@" in pretrained_path:
|
||||
repo_id, revision = pretrained_path.split("@", 1)
|
||||
else:
|
||||
repo_id, revision = pretrained_path, None
|
||||
|
||||
return Path(
|
||||
snapshot_download(
|
||||
repo_id=repo_id,
|
||||
revision=revision,
|
||||
token=hub_token,
|
||||
allow_patterns=[
|
||||
"*.json",
|
||||
"*.safetensors",
|
||||
"*.bin",
|
||||
"*.txt",
|
||||
"*.model",
|
||||
"tokenizer*",
|
||||
"special_tokens_map.json",
|
||||
],
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def _maybe_apply_peft(base_model: Any, snapshot_dir: Path) -> Any:
|
||||
adapter_config = snapshot_dir / "adapter_config.json"
|
||||
if not adapter_config.exists():
|
||||
return base_model
|
||||
|
||||
require_package("peft", extra="peft-dep")
|
||||
from peft import PeftModel
|
||||
|
||||
return PeftModel.from_pretrained(base_model, str(snapshot_dir))
|
||||
|
||||
|
||||
def _remap_state_dict_keys(state_dict: dict[str, Tensor], model: nn.Module) -> dict[str, Tensor]:
|
||||
"""Try a few common prefix swaps so PEFT-wrapped checkpoints load cleanly."""
|
||||
model_keys = set(model.state_dict().keys())
|
||||
remapped: dict[str, Tensor] = {}
|
||||
|
||||
for key, value in state_dict.items():
|
||||
if key in model_keys:
|
||||
remapped[key] = value
|
||||
continue
|
||||
|
||||
candidates: list[str] = []
|
||||
if key.startswith("model.model."):
|
||||
candidates.append(key.replace("model.model.", "model.base_model.model.model.", 1))
|
||||
candidates.append(key.replace("model.model.", "model.", 1))
|
||||
if key.startswith("model."):
|
||||
candidates.append(f"model.{key}")
|
||||
candidates.append(key.replace("model.", "", 1))
|
||||
else:
|
||||
candidates.append(f"model.{key}")
|
||||
if key.startswith("model.") and not key.startswith("model.base_model."):
|
||||
parts = key.split(".", 1)
|
||||
if len(parts) == 2:
|
||||
candidates.append(f"model.base_model.{parts[1]}")
|
||||
|
||||
for candidate in candidates:
|
||||
if candidate in model_keys:
|
||||
remapped[candidate] = value
|
||||
break
|
||||
else:
|
||||
remapped[key] = value
|
||||
|
||||
return remapped
|
||||
|
||||
|
||||
def _resolve_checkpoint_safetensors_files(snapshot_dir: Path) -> list[Path]:
|
||||
"""Pick the safetensors files that hold the full model weights.
|
||||
|
||||
When ``model.safetensors.index.json`` is present, only the files it lists are
|
||||
loaded. Otherwise any ``model*.safetensors`` shards are preferred over
|
||||
sidecar files. Falls back to every ``*.safetensors`` in the snapshot.
|
||||
"""
|
||||
index_path = snapshot_dir / "model.safetensors.index.json"
|
||||
if index_path.exists():
|
||||
with index_path.open() as f:
|
||||
weight_map = json.load(f).get("weight_map", {})
|
||||
indexed = sorted(
|
||||
{snapshot_dir / name for name in weight_map.values() if (snapshot_dir / name).exists()}
|
||||
)
|
||||
if indexed:
|
||||
return indexed
|
||||
|
||||
model_shards = sorted(snapshot_dir.glob("model*.safetensors"))
|
||||
if model_shards:
|
||||
return model_shards
|
||||
|
||||
return sorted(snapshot_dir.glob("*.safetensors"))
|
||||
|
||||
|
||||
def apply_upstream_checkpoint(
|
||||
model: nn.Module,
|
||||
pretrained_path: str,
|
||||
*,
|
||||
hub_token: str | None = None,
|
||||
) -> None:
|
||||
"""Load an upstream (sharded / PEFT) Robometer checkpoint into ``model``.
|
||||
|
||||
Downloads the snapshot, optionally applies PEFT wrapping, merges sharded
|
||||
``.safetensors`` files in memory, remaps PEFT-prefixed keys, and loads them
|
||||
into ``model`` non-strictly. ``model`` must already be constructed with the
|
||||
matching Robometer architecture (e.g. via
|
||||
:class:`~lerobot.rewards.robometer.RobometerRewardModel` ``__init__``).
|
||||
"""
|
||||
snapshot_dir = _download_robometer_snapshot(pretrained_path, hub_token=hub_token)
|
||||
|
||||
# PEFT adapter checkpoints wrap the base model before weight loading so the
|
||||
# remapper can place adapter tensors at the right prefix.
|
||||
base_model = getattr(model, "model", None)
|
||||
if base_model is not None:
|
||||
wrapped = _maybe_apply_peft(base_model, snapshot_dir)
|
||||
if wrapped is not base_model:
|
||||
model.model = wrapped
|
||||
|
||||
files = _resolve_checkpoint_safetensors_files(snapshot_dir)
|
||||
if not files:
|
||||
logger.warning("No *.safetensors files in %s; using freshly initialised heads", snapshot_dir)
|
||||
return
|
||||
|
||||
merged: dict[str, Tensor] = {}
|
||||
for path in files:
|
||||
merged.update(load_file(str(path)))
|
||||
|
||||
remapped = _remap_state_dict_keys(merged, model)
|
||||
|
||||
# Defensive vocab-match. With the corrected resize logic
|
||||
# (``_resize_embeddings_for_robometer`` uses ``len(tokenizer) + 5``),
|
||||
# a freshly built ``RobometerRewardModel`` should already share the same
|
||||
# vocabulary as the upstream checkpoint (e.g. 151,674 for
|
||||
# ``robometer/Robometer-4B``). This block stays in place as a safety net
|
||||
# in case a future upstream variant uses a different vocab — we never
|
||||
# want ``load_state_dict`` to trip on a silent shape mismatch.
|
||||
base_model = getattr(model, "model", None)
|
||||
if base_model is not None and hasattr(base_model, "get_input_embeddings"):
|
||||
for key in (
|
||||
"model.model.language_model.embed_tokens.weight",
|
||||
"model.language_model.embed_tokens.weight",
|
||||
"model.embed_tokens.weight",
|
||||
):
|
||||
tensor = remapped.get(key)
|
||||
if tensor is None:
|
||||
continue
|
||||
ckpt_vocab = int(tensor.shape[0])
|
||||
current_vocab = int(base_model.get_input_embeddings().num_embeddings)
|
||||
if ckpt_vocab != current_vocab:
|
||||
logger.info(
|
||||
"Resizing model embed table %d -> %d to match upstream checkpoint vocab "
|
||||
"(upstream was trained against a different Qwen revision).",
|
||||
current_vocab,
|
||||
ckpt_vocab,
|
||||
)
|
||||
base_model.resize_token_embeddings(ckpt_vocab)
|
||||
break
|
||||
|
||||
missing, unexpected = model.load_state_dict(remapped, strict=False)
|
||||
if missing:
|
||||
logger.debug("Robometer checkpoint missing %d keys (sample: %s)", len(missing), missing[:5])
|
||||
if unexpected:
|
||||
logger.debug(
|
||||
"Robometer checkpoint had %d unexpected keys (sample: %s)", len(unexpected), unexpected[:5]
|
||||
)
|
||||
@@ -0,0 +1,162 @@
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from copy import deepcopy
|
||||
from dataclasses import dataclass, field
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
from lerobot.configs import FeatureType, NormalizationMode, PolicyFeature
|
||||
from lerobot.configs.rewards import RewardModelConfig
|
||||
from lerobot.utils.constants import OBS_IMAGES
|
||||
from lerobot.utils.import_utils import _transformers_available, require_package
|
||||
|
||||
if TYPE_CHECKING or _transformers_available:
|
||||
from transformers import AutoConfig, AutoTokenizer
|
||||
else:
|
||||
AutoConfig = None # type: ignore[assignment]
|
||||
AutoTokenizer = None # type: ignore[assignment]
|
||||
|
||||
|
||||
@RewardModelConfig.register_subclass("robometer")
|
||||
@dataclass
|
||||
class RobometerConfig(RewardModelConfig):
|
||||
"""Configuration for the Robometer reward model."""
|
||||
|
||||
pretrained_path: str | None = "lilkm/Robometer-4B"
|
||||
image_key: str = OBS_IMAGES + ".top"
|
||||
task_key: str = "task"
|
||||
default_task: str | None = None
|
||||
|
||||
max_frames: int | None = 8
|
||||
reward_output: str = "progress" # "progress" or "success"
|
||||
success_threshold: float = 0.5
|
||||
|
||||
license: str | None = "apache-2.0"
|
||||
tags: list[str] | None = field(
|
||||
default_factory=lambda: ["reward-model", "vision-language", "qwen3-vl", "zero-shot"]
|
||||
)
|
||||
|
||||
base_model_id: str = "Qwen/Qwen3-VL-4B-Instruct"
|
||||
torch_dtype: str = "bfloat16"
|
||||
use_multi_image: bool = True
|
||||
use_per_frame_progress_token: bool = True
|
||||
average_temporal_patches: bool = True
|
||||
frame_pooling: str = "mean" # "mean" | "boundary" | "attention"
|
||||
frame_pooling_attn_temperature: float = 1.0
|
||||
progress_loss_type: str = "discrete" # "l1" | "l2" | "discrete"
|
||||
progress_discrete_bins: int = 10
|
||||
|
||||
# Serialised Qwen backbone config (post-resize). Always populated by
|
||||
# ``__post_init__`` from ``base_model_id`` + ``len(tokenizer) + 5``, so it
|
||||
# is never ``None`` after construction (EO-1 style). Saved into
|
||||
# ``config.json`` automatically by the base ``_save_pretrained``.
|
||||
vlm_config: dict[str, Any] | None = None
|
||||
|
||||
input_features: dict[str, PolicyFeature] = field(default_factory=dict)
|
||||
output_features: dict[str, PolicyFeature] = field(default_factory=dict)
|
||||
normalization_mapping: dict[str, NormalizationMode] = field(
|
||||
default_factory=lambda: {
|
||||
"VISUAL": NormalizationMode.IDENTITY,
|
||||
"REWARD": NormalizationMode.IDENTITY,
|
||||
}
|
||||
)
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
super().__post_init__()
|
||||
if self.reward_output not in {"progress", "success"}:
|
||||
raise ValueError(f"reward_output must be 'progress' or 'success', got {self.reward_output!r}")
|
||||
if self.max_frames is not None and self.max_frames < 1:
|
||||
raise ValueError(f"max_frames must be >= 1, got {self.max_frames}")
|
||||
if self.frame_pooling not in {"mean", "boundary", "attention"}:
|
||||
raise ValueError(f"frame_pooling must be mean/boundary/attention; got {self.frame_pooling!r}")
|
||||
if self.frame_pooling_attn_temperature <= 0:
|
||||
raise ValueError("frame_pooling_attn_temperature must be > 0")
|
||||
if self.progress_loss_type not in {"l1", "l2", "discrete"}:
|
||||
raise ValueError(f"progress_loss_type must be l1/l2/discrete; got {self.progress_loss_type!r}")
|
||||
if self.use_per_frame_progress_token and not self.use_multi_image:
|
||||
raise ValueError("use_per_frame_progress_token=True requires use_multi_image=True")
|
||||
|
||||
if self.image_key not in self.input_features:
|
||||
self.input_features[self.image_key] = PolicyFeature(shape=(3, 224, 224), type=FeatureType.VISUAL)
|
||||
self.output_features.setdefault("progress", PolicyFeature(shape=(1,), type=FeatureType.REWARD))
|
||||
self.output_features.setdefault("success", PolicyFeature(shape=(1,), type=FeatureType.REWARD))
|
||||
|
||||
# Deterministically populate ``vlm_config`` so it is never ``None``
|
||||
# after construction (mirrors EO-1's ``__post_init__`` snapshot).
|
||||
# The target vocab matches upstream Robometer's runtime resize
|
||||
# ``base_model.resize_token_embeddings(len(processor.tokenizer))`` —
|
||||
# see ``third_party/robometer/.../setup_utils.py`` —
|
||||
# i.e. ``len(tokenizer) + len(ROBOMETER_SPECIAL_TOKENS)``.
|
||||
#
|
||||
# For ``Qwen/Qwen3-VL-4B-Instruct`` this gives 151,669 + 5 = 151,674,
|
||||
# which is exactly the published ``robometer/Robometer-4B`` checkpoint
|
||||
# vocab. NB: ``text_config.vocab_size`` in the raw Qwen config is the
|
||||
# padded embedding-table size (151,936), not the tokenizer length —
|
||||
# we override it with the tokenizer-driven value to stay consistent
|
||||
# with upstream.
|
||||
if self.vlm_config is None:
|
||||
require_package("transformers", extra="robometer")
|
||||
# Local import avoids a top-level cycle (modeling_robometer imports
|
||||
# this module). ``ROBOMETER_SPECIAL_TOKENS`` is the single source
|
||||
# of truth for the resize delta.
|
||||
from lerobot.rewards.robometer.modeling_robometer import ROBOMETER_SPECIAL_TOKENS
|
||||
|
||||
vlm = AutoConfig.from_pretrained(self.base_model_id).to_dict()
|
||||
tokenizer = AutoTokenizer.from_pretrained(self.base_model_id)
|
||||
text_config = vlm.get("text_config")
|
||||
if not isinstance(text_config, dict):
|
||||
raise ValueError(
|
||||
f"Backbone config for {self.base_model_id!r} has no nested `text_config`; "
|
||||
"Robometer expects a Qwen-VL-style config."
|
||||
)
|
||||
text_config["vocab_size"] = len(tokenizer) + len(ROBOMETER_SPECIAL_TOKENS)
|
||||
self.vlm_config = vlm
|
||||
|
||||
@property
|
||||
def use_discrete_progress(self) -> bool:
|
||||
"""Whether the progress head outputs distribution logits over bins."""
|
||||
return self.progress_loss_type.lower() == "discrete"
|
||||
|
||||
@property
|
||||
def vlm_backbone_config(self):
|
||||
"""Reconstruct the Qwen backbone config from :attr:`vlm_config`.
|
||||
|
||||
``vlm_config`` is always populated after :meth:`__post_init__`
|
||||
(either fresh, computed from the tokenizer, or loaded from a saved
|
||||
``config.json`` via draccus).
|
||||
"""
|
||||
require_package("transformers", extra="robometer")
|
||||
config_dict = deepcopy(self.vlm_config)
|
||||
model_type = config_dict.pop("model_type", None)
|
||||
if model_type is None:
|
||||
raise ValueError("vlm_config must include `model_type` to reconstruct the backbone config")
|
||||
return AutoConfig.for_model(model_type, **config_dict)
|
||||
|
||||
@property
|
||||
def observation_delta_indices(self) -> list[int] | None:
|
||||
return None
|
||||
|
||||
@property
|
||||
def action_delta_indices(self) -> None:
|
||||
return None
|
||||
|
||||
@property
|
||||
def reward_delta_indices(self) -> None:
|
||||
return None
|
||||
|
||||
def validate_features(self) -> None:
|
||||
if self.image_key not in self.input_features:
|
||||
raise ValueError(f"Robometer requires image input feature {self.image_key!r}")
|
||||
@@ -0,0 +1,493 @@
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Robometer reward model.
|
||||
|
||||
- Qwen3-VL backbone (default: ``Qwen/Qwen3-VL-4B-Instruct``).
|
||||
- Progress + success heads at inference; the preference head is preserved in the
|
||||
state dict but not queried.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
import torch
|
||||
from torch import Tensor, nn
|
||||
|
||||
from lerobot.rewards.pretrained import PreTrainedRewardModel
|
||||
from lerobot.rewards.robometer.configuration_robometer import RobometerConfig
|
||||
from lerobot.utils.constants import OBS_PREFIX
|
||||
from lerobot.utils.import_utils import _transformers_available, require_package
|
||||
|
||||
if TYPE_CHECKING or _transformers_available:
|
||||
from transformers import AutoModelForImageTextToText
|
||||
else:
|
||||
AutoModelForImageTextToText = None # type: ignore[assignment]
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Namespace for Robometer's pre-encoded Qwen-VL observation tensors. The
|
||||
# processor writes both Qwen-VL tensors and Robometer-specific token ids /
|
||||
# metadata here; the model reads them at inference (no tokenizer needed in
|
||||
# the model — EO1-style separation).
|
||||
ROBOMETER_FEATURE_PREFIX = f"{OBS_PREFIX}robometer."
|
||||
ROBOMETER_QWEN_INPUT_KEYS = (
|
||||
"input_ids",
|
||||
"attention_mask",
|
||||
"pixel_values",
|
||||
"pixel_values_videos",
|
||||
"image_grid_thw",
|
||||
"video_grid_thw",
|
||||
"second_per_grid_ts",
|
||||
)
|
||||
ROBOMETER_METADATA_KEYS = (
|
||||
"prog_token_id",
|
||||
"vision_start_token_id",
|
||||
"vision_end_token_id",
|
||||
"video_merge_size",
|
||||
)
|
||||
ROBOMETER_INPUT_KEYS = ROBOMETER_QWEN_INPUT_KEYS + ROBOMETER_METADATA_KEYS
|
||||
|
||||
# Order matters: the released checkpoint resized `embed_tokens` after adding
|
||||
# these tokens in this order, so changing the set or order would silently
|
||||
# misalign the saved embedding rows with their token ids. `<|reward_token|>`
|
||||
# and `<|sim_token|>` are vestigial (never read by any head) but still occupy
|
||||
# rows the checkpoint expects.
|
||||
ROBOMETER_SPECIAL_TOKENS = (
|
||||
"<|split_token|>",
|
||||
"<|reward_token|>",
|
||||
"<|pref_token|>",
|
||||
"<|sim_token|>",
|
||||
"<|prog_token|>",
|
||||
)
|
||||
|
||||
|
||||
def convert_bins_to_continuous(bin_logits: Tensor) -> Tensor:
|
||||
"""Collapse per-bin logits into a single value in ``[0, 1]``.
|
||||
|
||||
The discrete progress head outputs ``num_bins`` logits per frame. Bins are
|
||||
evenly spaced centers in ``[0, 1]``; the continuous prediction is the
|
||||
softmax-weighted mean of those centers.
|
||||
"""
|
||||
bin_probs = torch.softmax(bin_logits, dim=-1)
|
||||
num_bins = bin_logits.shape[-1]
|
||||
bin_centers = torch.linspace(0.0, 1.0, num_bins, device=bin_logits.device, dtype=bin_logits.dtype)
|
||||
return (bin_probs * bin_centers).sum(dim=-1)
|
||||
|
||||
|
||||
def squeeze_last_safe(x: Tensor) -> Tensor:
|
||||
"""Drop a trailing singleton dim only when present.
|
||||
|
||||
Matches the upstream helper of the same name in
|
||||
``robometer.models.rbm`` (kept module-level and non-underscored to mirror
|
||||
upstream).
|
||||
"""
|
||||
return x.squeeze(-1) if x.ndim > 1 and x.shape[-1] == 1 else x
|
||||
|
||||
|
||||
def _torch_dtype(name: str) -> torch.dtype:
|
||||
dtype = getattr(torch, name, None)
|
||||
if isinstance(dtype, torch.dtype):
|
||||
return dtype
|
||||
raise ValueError(f"Unknown torch dtype: {name!r}")
|
||||
|
||||
|
||||
class RobometerPredictionHead(nn.Sequential):
|
||||
"""Small MLP head used for Robometer's progress / success / preference outputs.
|
||||
|
||||
Subclasses ``nn.Sequential`` (not ``nn.Module``) so the ``state_dict`` keys
|
||||
stay flat (``progress_head.0.weight``, ``progress_head.1.weight``, ...) and
|
||||
remain byte-compatible with the published ``lilkm/robometer-4b`` checkpoint.
|
||||
"""
|
||||
|
||||
def __init__(self, hidden_dim: int, output_size: int, *, dropout: float, with_sigmoid: bool) -> None:
|
||||
layers: list[nn.Module] = [
|
||||
nn.Linear(hidden_dim, hidden_dim // 2),
|
||||
nn.LayerNorm(hidden_dim // 2),
|
||||
nn.GELU(),
|
||||
nn.Dropout(dropout),
|
||||
nn.Linear(hidden_dim // 2, output_size),
|
||||
]
|
||||
if with_sigmoid:
|
||||
layers.append(nn.Sigmoid())
|
||||
super().__init__(*layers)
|
||||
|
||||
|
||||
def decode_progress_outputs(
|
||||
progress_logits: Tensor | None,
|
||||
success_logits: Tensor | None,
|
||||
*,
|
||||
is_discrete_mode: bool,
|
||||
) -> dict[str, list[list[float]]]:
|
||||
"""Decode RBM head outputs into per-frame floats.
|
||||
|
||||
Args:
|
||||
progress_logits: ``(B, T)`` (continuous) or ``(B, T, num_bins)`` (discrete).
|
||||
success_logits: ``(B, T)`` raw logits, ``sigmoid``-ed to probabilities.
|
||||
is_discrete_mode: if True the progress logits get a softmax over bins
|
||||
and are projected onto bin centers via :func:`convert_bins_to_continuous`.
|
||||
|
||||
Returns:
|
||||
Dict with ``progress_pred`` and ``success_probs``, each a list of
|
||||
length ``B`` of per-frame float lists.
|
||||
"""
|
||||
progress_pred: list[list[float]] = []
|
||||
success_probs: list[list[float]] = []
|
||||
|
||||
if progress_logits is not None:
|
||||
for sample_logits in progress_logits:
|
||||
if is_discrete_mode:
|
||||
continuous = convert_bins_to_continuous(sample_logits.detach().float().cpu())
|
||||
progress_pred.append(continuous.flatten().tolist())
|
||||
else:
|
||||
progress_pred.append(sample_logits.detach().float().cpu().flatten().tolist())
|
||||
|
||||
if success_logits is not None:
|
||||
for sample_logits in success_logits:
|
||||
success_probs.append(torch.sigmoid(sample_logits.detach().float().cpu()).flatten().tolist())
|
||||
|
||||
return {"progress_pred": progress_pred, "success_probs": success_probs}
|
||||
|
||||
|
||||
class RobometerRewardModel(PreTrainedRewardModel):
|
||||
"""Robometer reward model: Qwen3-VL backbone + progress/success heads."""
|
||||
|
||||
name = "robometer"
|
||||
config_class = RobometerConfig
|
||||
|
||||
def __init__(self, config: RobometerConfig, *, dropout: float = 0.1) -> None:
|
||||
require_package("transformers", extra="robometer")
|
||||
super().__init__(config)
|
||||
self.config = config
|
||||
|
||||
# Two backbone-build paths (EO-1 style, branched on ``pretrained_path``):
|
||||
#
|
||||
# - Fresh training (``pretrained_path is None``): download the base
|
||||
# Qwen weights and resize the embed table to match
|
||||
# ``vlm_config.text_config.vocab_size`` — populated deterministically
|
||||
# in ``RobometerConfig.__post_init__`` as
|
||||
# ``len(tokenizer) + len(ROBOMETER_SPECIAL_TOKENS)``, mirroring
|
||||
# upstream Robometer's ``_add_special_tokens_and_resize`` in
|
||||
# ``third_party/robometer/.../setup_utils.py``.
|
||||
#
|
||||
# - Loading a saved checkpoint (``pretrained_path`` is set): rebuild
|
||||
# the empty architecture from ``vlm_config`` via
|
||||
# ``AutoModelForImageTextToText.from_config`` so the subsequent
|
||||
# ``model.safetensors`` load is a direct fill of the right shape —
|
||||
# no redundant Qwen weight download.
|
||||
torch_dtype = _torch_dtype(config.torch_dtype)
|
||||
if config.pretrained_path is None:
|
||||
self.model = AutoModelForImageTextToText.from_pretrained(
|
||||
config.base_model_id,
|
||||
dtype=torch_dtype,
|
||||
trust_remote_code=True,
|
||||
)
|
||||
target_vocab = config.vlm_config["text_config"]["vocab_size"]
|
||||
self.model.resize_token_embeddings(target_vocab)
|
||||
else:
|
||||
self.model = AutoModelForImageTextToText.from_config(
|
||||
config.vlm_backbone_config,
|
||||
dtype=torch_dtype,
|
||||
trust_remote_code=True,
|
||||
)
|
||||
|
||||
# All Qwen-VL backbones Robometer supports expose `text_config.hidden_size`.
|
||||
# Falls back to the top-level `hidden_size` so future non-multimodal
|
||||
# variants would still resolve.
|
||||
backbone_config = self.model.config
|
||||
text_config = getattr(backbone_config, "text_config", None)
|
||||
hidden_size = getattr(text_config, "hidden_size", None) if text_config is not None else None
|
||||
if hidden_size is None:
|
||||
hidden_size = getattr(backbone_config, "hidden_size", None)
|
||||
if hidden_size is None:
|
||||
raise AttributeError(
|
||||
f"Could not infer hidden_size from backbone config of {config.base_model_id}"
|
||||
)
|
||||
hidden_dim = int(hidden_size)
|
||||
|
||||
# Robometer's three prediction heads + frame-pool attention. The
|
||||
# preference head is preserved to match the published state-dict layout
|
||||
# even though only progress + success are consumed at inference, and
|
||||
# `frame_pool_attn` is always allocated so checkpoints trained with
|
||||
# `frame_pooling="attention"` load without remapping.
|
||||
progress_output = config.progress_discrete_bins if config.use_discrete_progress else 1
|
||||
self.progress_head = RobometerPredictionHead(
|
||||
hidden_dim,
|
||||
progress_output,
|
||||
dropout=dropout,
|
||||
with_sigmoid=not config.use_discrete_progress,
|
||||
)
|
||||
self.preference_head = RobometerPredictionHead(hidden_dim, 1, dropout=dropout, with_sigmoid=False)
|
||||
self.success_head = RobometerPredictionHead(hidden_dim, 1, dropout=dropout, with_sigmoid=False)
|
||||
self.frame_pool_attn = nn.Linear(hidden_dim, 1, bias=False)
|
||||
|
||||
# Match the dtype of the loaded base model so weight loading is a no-op cast.
|
||||
model_dtype = next(self.model.parameters()).dtype
|
||||
self.progress_head.to(dtype=model_dtype)
|
||||
self.preference_head.to(dtype=model_dtype)
|
||||
self.success_head.to(dtype=model_dtype)
|
||||
self.frame_pool_attn.to(dtype=model_dtype)
|
||||
|
||||
def compute_reward(self, batch: dict[str, Tensor]) -> Tensor:
|
||||
inputs = {
|
||||
key: batch[f"{ROBOMETER_FEATURE_PREFIX}{key}"]
|
||||
for key in ROBOMETER_INPUT_KEYS
|
||||
if f"{ROBOMETER_FEATURE_PREFIX}{key}" in batch
|
||||
}
|
||||
if "input_ids" not in inputs:
|
||||
raise KeyError(
|
||||
f"Robometer batch missing pre-encoded inputs (expected "
|
||||
f"`{ROBOMETER_FEATURE_PREFIX}input_ids`). Make sure the "
|
||||
"RobometerEncoderProcessorStep ran before `compute_reward`."
|
||||
)
|
||||
|
||||
device = next(self.model.parameters()).device
|
||||
inputs = {key: value.to(device) if hasattr(value, "to") else value for key, value in inputs.items()}
|
||||
|
||||
self.eval()
|
||||
with torch.no_grad():
|
||||
progress_logits, success_logits = self._compute_rbm_logits(inputs)
|
||||
|
||||
decoded = decode_progress_outputs(
|
||||
progress_logits,
|
||||
success_logits,
|
||||
is_discrete_mode=self.config.use_discrete_progress,
|
||||
)
|
||||
values = (
|
||||
decoded["success_probs"] if self.config.reward_output == "success" else decoded["progress_pred"]
|
||||
)
|
||||
|
||||
rewards = torch.stack([torch.as_tensor(seq, dtype=torch.float32)[-1] for seq in values])
|
||||
if self.config.reward_output == "success":
|
||||
rewards = (rewards > self.config.success_threshold).float()
|
||||
return rewards.to(self.config.device or "cpu")
|
||||
|
||||
def _compute_rbm_logits(
|
||||
self,
|
||||
inputs: dict[str, Any],
|
||||
) -> tuple[Tensor, Tensor]:
|
||||
"""Run the Qwen3-VL backbone and apply Robometer's heads.
|
||||
|
||||
``inputs`` is the encoded batch produced by
|
||||
:class:`RobometerEncoderProcessorStep`. It carries Qwen tensors as well
|
||||
as Robometer-specific metadata (``prog_token_id``,
|
||||
``vision_start_token_id``, ``vision_end_token_id``, ``video_merge_size``)
|
||||
— the metadata is popped here so the rest can be forwarded straight to
|
||||
the Qwen model.
|
||||
|
||||
Returns ``(progress_logits, success_logits)``. Shapes:
|
||||
|
||||
- ``progress_logits``: ``(B, T)`` (continuous) or ``(B, T, num_bins)`` (discrete).
|
||||
- ``success_logits``: ``(B, T)`` raw logits (sigmoid happens at decode time).
|
||||
"""
|
||||
prog_token_id = inputs.pop("prog_token_id", None)
|
||||
vision_start_token_id = inputs.pop("vision_start_token_id", None)
|
||||
vision_end_token_id = inputs.pop("vision_end_token_id", None)
|
||||
video_merge_size = inputs.pop("video_merge_size", 14)
|
||||
|
||||
# Qwen3-VL doesn't reliably populate `last_hidden_state`; ask for the
|
||||
# full hidden-state tuple and take the last layer. This matches the
|
||||
# `is_qwen3` path in upstream Robometer's `RBM.forward_qwen` (main).
|
||||
outputs = self.model(**inputs, output_hidden_states=True, return_dict=True)
|
||||
hidden_state = (
|
||||
outputs.hidden_states[-1]
|
||||
if getattr(outputs, "hidden_states", None)
|
||||
else outputs.last_hidden_state
|
||||
)
|
||||
|
||||
input_ids = inputs["input_ids"]
|
||||
if self.config.use_per_frame_progress_token:
|
||||
if prog_token_id is None:
|
||||
raise KeyError("`prog_token_id` missing in batch (run RobometerEncoderProcessorStep first)")
|
||||
return self._process_token_extraction(hidden_state, input_ids, prog_token_id=prog_token_id)
|
||||
if self.config.use_multi_image:
|
||||
if vision_start_token_id is None or vision_end_token_id is None:
|
||||
raise KeyError(
|
||||
"`vision_start_token_id` / `vision_end_token_id` missing in batch "
|
||||
"(run RobometerEncoderProcessorStep first)"
|
||||
)
|
||||
return self._process_multi_image_frames(
|
||||
hidden_state,
|
||||
input_ids,
|
||||
start_id=vision_start_token_id,
|
||||
end_id=vision_end_token_id,
|
||||
)
|
||||
video_grid_thw = inputs.get("video_grid_thw")
|
||||
if video_grid_thw is None:
|
||||
raise ValueError("video_grid_thw is required for video-mode Robometer inference")
|
||||
if vision_start_token_id is None:
|
||||
raise KeyError("`vision_start_token_id` missing in batch")
|
||||
return self._process_video_frames(
|
||||
hidden_state,
|
||||
input_ids,
|
||||
video_grid_thw,
|
||||
start_id=vision_start_token_id,
|
||||
merge_size=video_merge_size,
|
||||
)
|
||||
|
||||
def _apply_heads_to_hidden_states(self, frame_embeddings: Tensor) -> tuple[Tensor, Tensor]:
|
||||
"""Apply progress + success heads to a tensor of frame embeddings.
|
||||
|
||||
Mirrors upstream ``RBM._apply_heads_to_hidden_states``.
|
||||
"""
|
||||
progress_out = self.progress_head(frame_embeddings)
|
||||
progress = progress_out if self.config.use_discrete_progress else squeeze_last_safe(progress_out)
|
||||
success = squeeze_last_safe(self.success_head(frame_embeddings))
|
||||
return progress, success
|
||||
|
||||
def _process_token_extraction(
|
||||
self,
|
||||
hidden_state: Tensor,
|
||||
input_ids: Tensor,
|
||||
*,
|
||||
prog_token_id: int,
|
||||
) -> tuple[Tensor, Tensor]:
|
||||
"""Per-frame progress/success from ``<|prog_token|>`` positions.
|
||||
|
||||
Mirrors the progress-sample branch of upstream
|
||||
``RBM._process_token_extraction``.
|
||||
"""
|
||||
token_mask = input_ids == prog_token_id
|
||||
batch_indices, positions = token_mask.nonzero(as_tuple=True)
|
||||
if positions.numel() == 0:
|
||||
raise ValueError("`<|prog_token|>` not found in any sequence")
|
||||
|
||||
per_sample_hidden = [
|
||||
hidden_state[i, positions[batch_indices == i]] for i in range(input_ids.shape[0])
|
||||
]
|
||||
progress_list, success_list = [], []
|
||||
for embeddings in per_sample_hidden:
|
||||
if embeddings.shape[0] == 0:
|
||||
raise ValueError("`<|prog_token|>` missing in a sequence")
|
||||
progress, success = self._apply_heads_to_hidden_states(embeddings)
|
||||
progress_list.append(progress)
|
||||
success_list.append(success)
|
||||
|
||||
return torch.stack(progress_list), torch.stack(success_list)
|
||||
|
||||
def _process_multi_image_frames(
|
||||
self,
|
||||
hidden_state: Tensor,
|
||||
input_ids: Tensor,
|
||||
*,
|
||||
start_id: int,
|
||||
end_id: int,
|
||||
) -> tuple[Tensor, Tensor]:
|
||||
"""Per-frame progress/success in multi-image mode (Qwen-VL).
|
||||
|
||||
Mirrors upstream ``RBM._process_multi_image_frames`` (progress-sample
|
||||
branch only — we don't run preference at inference).
|
||||
"""
|
||||
progress_list, success_list = [], []
|
||||
for batch_idx in range(input_ids.shape[0]):
|
||||
seq_ids = input_ids[batch_idx]
|
||||
seq_hidden = hidden_state[batch_idx]
|
||||
frame_embeddings = self._extract_hidden_states_from_token_pairs(
|
||||
seq_hidden, seq_ids, start_id, end_id
|
||||
)
|
||||
progress, success = self._apply_heads_to_hidden_states(frame_embeddings)
|
||||
progress_list.append(progress)
|
||||
success_list.append(success)
|
||||
|
||||
return torch.stack(progress_list), torch.stack(success_list)
|
||||
|
||||
def _extract_hidden_states_from_token_pairs(
|
||||
self,
|
||||
hidden_state: Tensor,
|
||||
input_ids: Tensor,
|
||||
start_id: int,
|
||||
end_id: int,
|
||||
) -> Tensor:
|
||||
start_positions = (input_ids == start_id).nonzero(as_tuple=True)[0]
|
||||
end_positions = (input_ids == end_id).nonzero(as_tuple=True)[0]
|
||||
if start_positions.numel() == 0:
|
||||
raise ValueError("`<|vision_start|>` not found in sequence")
|
||||
if start_positions.numel() != end_positions.numel():
|
||||
raise ValueError(
|
||||
f"Mismatched vision token counts: {start_positions.numel()} start vs "
|
||||
f"{end_positions.numel()} end"
|
||||
)
|
||||
|
||||
frames: list[Tensor] = []
|
||||
for start, end in zip(start_positions.tolist(), end_positions.tolist(), strict=True):
|
||||
if start >= end:
|
||||
raise ValueError(f"Invalid vision token pair: start={start} end={end}")
|
||||
patch_tokens = hidden_state[start + 1 : end]
|
||||
if patch_tokens.shape[0] == 0:
|
||||
frames.append((hidden_state[start] + hidden_state[end]) / 2.0)
|
||||
continue
|
||||
|
||||
pooling = self.config.frame_pooling
|
||||
if pooling == "mean":
|
||||
frames.append(patch_tokens.mean(dim=0))
|
||||
elif pooling == "boundary":
|
||||
frames.append(patch_tokens[-1])
|
||||
else: # attention
|
||||
scores = (
|
||||
self.frame_pool_attn(patch_tokens).squeeze(-1)
|
||||
/ self.config.frame_pooling_attn_temperature
|
||||
)
|
||||
weights = torch.softmax(scores, dim=0).unsqueeze(-1)
|
||||
frames.append((weights * patch_tokens).sum(dim=0))
|
||||
|
||||
return torch.stack(frames)
|
||||
|
||||
def _process_video_frames(
|
||||
self,
|
||||
hidden_state: Tensor,
|
||||
input_ids: Tensor,
|
||||
video_grid_thw: Tensor,
|
||||
*,
|
||||
start_id: int,
|
||||
merge_size: int,
|
||||
) -> tuple[Tensor, Tensor]:
|
||||
"""Per-frame progress/success in video mode (Qwen-VL).
|
||||
|
||||
Mirrors upstream ``RBM._process_video_frames`` /
|
||||
``RBM._extract_progress_from_trajectory`` (progress-sample branch
|
||||
only — preference is not run at inference). In particular,
|
||||
``average_temporal_patches=False`` reads the *boundary* token at
|
||||
``cursor + tokens_per_frame`` to match upstream byte-for-byte.
|
||||
"""
|
||||
progress_list, success_list = [], []
|
||||
for batch_idx in range(input_ids.shape[0]):
|
||||
seq_ids = input_ids[batch_idx]
|
||||
seq_hidden = hidden_state[batch_idx]
|
||||
start_positions = (seq_ids == start_id).nonzero(as_tuple=True)[0]
|
||||
if start_positions.numel() == 0:
|
||||
raise ValueError("`<|vision_start|>` not found in sequence")
|
||||
t_dim, h_dim, w_dim = (int(x) for x in video_grid_thw[batch_idx].tolist())
|
||||
tokens_per_frame = (h_dim * w_dim) // (merge_size**2)
|
||||
|
||||
cursor = start_positions[0].item()
|
||||
frame_embeddings: list[Tensor] = []
|
||||
for _ in range(t_dim):
|
||||
if self.config.average_temporal_patches:
|
||||
patch = seq_hidden[cursor : cursor + tokens_per_frame]
|
||||
frame_embeddings.append(patch.mean(dim=0))
|
||||
else:
|
||||
# Upstream takes the position *one past* the patch span as
|
||||
# the per-frame boundary; see
|
||||
# `RBM._extract_progress_from_trajectory`.
|
||||
frame_embeddings.append(seq_hidden[cursor + tokens_per_frame])
|
||||
cursor += tokens_per_frame
|
||||
|
||||
stacked = torch.stack(frame_embeddings)
|
||||
progress, success = self._apply_heads_to_hidden_states(stacked)
|
||||
progress_list.append(progress)
|
||||
success_list.append(success)
|
||||
|
||||
return torch.stack(progress_list), torch.stack(success_list)
|
||||
@@ -0,0 +1,348 @@
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Robometer pre/post processing pipelines."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from PIL import Image
|
||||
from torch import Tensor
|
||||
|
||||
from lerobot.configs import PipelineFeatureType, PolicyFeature
|
||||
from lerobot.processor import (
|
||||
AddBatchDimensionProcessorStep,
|
||||
DeviceProcessorStep,
|
||||
PolicyAction,
|
||||
PolicyProcessorPipeline,
|
||||
ProcessorStep,
|
||||
ProcessorStepRegistry,
|
||||
policy_action_to_transition,
|
||||
)
|
||||
from lerobot.rewards.robometer.configuration_robometer import RobometerConfig
|
||||
from lerobot.rewards.robometer.modeling_robometer import (
|
||||
ROBOMETER_FEATURE_PREFIX,
|
||||
ROBOMETER_SPECIAL_TOKENS,
|
||||
)
|
||||
from lerobot.types import EnvTransition, TransitionKey
|
||||
from lerobot.utils.constants import (
|
||||
OBS_IMAGES,
|
||||
POLICY_POSTPROCESSOR_DEFAULT_NAME,
|
||||
POLICY_PREPROCESSOR_DEFAULT_NAME,
|
||||
)
|
||||
from lerobot.utils.import_utils import _transformers_available, require_package
|
||||
|
||||
if TYPE_CHECKING or _transformers_available:
|
||||
from transformers import AutoProcessor
|
||||
else:
|
||||
AutoProcessor = None
|
||||
|
||||
PROGRESS_PROMPT = (
|
||||
"The task for the robot is '{task}'. Given the trajectory video, predict "
|
||||
"the task progress at each frame, how far along the robot is towards "
|
||||
"completing the task, a float between 0 and 1, where 0 is the starting "
|
||||
"state and 1 is when the task is completed. If the robot is not "
|
||||
"performing the same task, predict 0 progress."
|
||||
)
|
||||
|
||||
|
||||
def _frames_to_pil(frames: np.ndarray) -> list[Image.Image]:
|
||||
"""Convert ``(T, H, W, C)`` uint8 frames to a list of PIL images."""
|
||||
if frames.ndim != 4:
|
||||
raise ValueError(f"Expected (T,H,W,C) frames; got shape {frames.shape}")
|
||||
if frames.dtype != np.uint8:
|
||||
frames = np.clip(frames, 0, 255).astype(np.uint8)
|
||||
return [Image.fromarray(frames[i]) for i in range(frames.shape[0])]
|
||||
|
||||
|
||||
def _video_to_numpy(video: Tensor, *, max_frames: int | None) -> np.ndarray:
|
||||
"""Convert one trajectory tensor to a ``(T, H, W, C) uint8`` numpy array."""
|
||||
if max_frames is not None:
|
||||
video = video[-max_frames:]
|
||||
if video.shape[1] in (1, 3):
|
||||
video = video.permute(0, 2, 3, 1)
|
||||
elif video.shape[-1] not in (1, 3):
|
||||
raise ValueError(f"Expected channel dim of size 1 or 3, got shape {tuple(video.shape)}")
|
||||
|
||||
array = video.detach().cpu().numpy()
|
||||
if np.issubdtype(array.dtype, np.floating) and array.size > 0 and array.max() <= 1.0:
|
||||
array = array * 255.0
|
||||
return np.clip(array, 0, 255).astype(np.uint8)
|
||||
|
||||
|
||||
def _expand_tasks(task: Any, *, batch_size: int, default: str | None) -> list[str]:
|
||||
if task is None:
|
||||
task = default
|
||||
if task is None:
|
||||
raise KeyError("Robometer expected a task description in complementary data")
|
||||
if isinstance(task, str):
|
||||
return [task] * batch_size
|
||||
if isinstance(task, tuple):
|
||||
task = list(task)
|
||||
if not (isinstance(task, list) and all(isinstance(item, str) for item in task)):
|
||||
raise TypeError(f"Robometer task must be a string or list of strings, got {type(task)}")
|
||||
if len(task) == 1 and batch_size > 1:
|
||||
return task * batch_size
|
||||
if len(task) != batch_size:
|
||||
raise ValueError(f"Expected {batch_size} tasks, got {len(task)}")
|
||||
return task
|
||||
|
||||
|
||||
@dataclass
|
||||
@ProcessorStepRegistry.register(name="robometer_encoder")
|
||||
class RobometerEncoderProcessorStep(ProcessorStep):
|
||||
"""Encode raw frames + task into Qwen-VL tensors for the Robometer model.
|
||||
|
||||
Loads a :class:`~transformers.AutoProcessor` matching ``base_model_id`` and
|
||||
registers Robometer's special tokens on the tokenizer. The matching
|
||||
embedding resize happens model-side in
|
||||
:meth:`RobometerRewardModel.__init__`. This step owns the tokenizer — the
|
||||
model itself never needs one — and is the EO1-style boundary between
|
||||
pre-processing and modeling.
|
||||
|
||||
At call time the step reads:
|
||||
|
||||
- ``observation[image_key]``: ``(B, T, C, H, W)`` or ``(B, C, H, W)`` frames.
|
||||
- ``complementary_data[task_key]``: a string or list of strings.
|
||||
|
||||
and writes ``observation[f"{ROBOMETER_FEATURE_PREFIX}<name>"]`` for:
|
||||
|
||||
- the Qwen-VL processor outputs: ``input_ids``, ``attention_mask``,
|
||||
``pixel_values``, ``image_grid_thw``, ``video_grid_thw``, ...
|
||||
- Robometer-specific token ids consumed by the model heads:
|
||||
``prog_token_id``, ``vision_start_token_id``, ``vision_end_token_id``,
|
||||
``video_merge_size``.
|
||||
"""
|
||||
|
||||
base_model_id: str = "Qwen/Qwen3-VL-4B-Instruct"
|
||||
image_key: str = OBS_IMAGES + ".top"
|
||||
task_key: str = "task"
|
||||
default_task: str | None = None
|
||||
max_frames: int | None = 8
|
||||
use_multi_image: bool = True
|
||||
use_per_frame_progress_token: bool = True
|
||||
max_length: int = 1024
|
||||
|
||||
_processor: Any = field(default=None, init=False, repr=False)
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
require_package("transformers", extra="robometer")
|
||||
require_package("qwen-vl-utils", extra="robometer", import_name="qwen_vl_utils")
|
||||
|
||||
self._processor = AutoProcessor.from_pretrained(
|
||||
self.base_model_id,
|
||||
trust_remote_code=True,
|
||||
do_sample_frames=False,
|
||||
padding_side="right",
|
||||
)
|
||||
|
||||
# Register Robometer's special tokens on the tokenizer. The matching
|
||||
# embedding resize happens model-side in `RobometerRewardModel.__init__`.
|
||||
tokenizer = self._processor.tokenizer
|
||||
# Qwen tokenizers may not define a pad token, but batched prompts/videos
|
||||
# require padding, so reuse EOS as the padding token.
|
||||
if tokenizer.pad_token is None:
|
||||
tokenizer.pad_token = tokenizer.eos_token
|
||||
for token in ROBOMETER_SPECIAL_TOKENS:
|
||||
if token not in tokenizer.get_vocab():
|
||||
tokenizer.add_special_tokens({"additional_special_tokens": [token]})
|
||||
|
||||
def __call__(self, transition: EnvTransition) -> EnvTransition:
|
||||
observation = transition.get(TransitionKey.OBSERVATION)
|
||||
complementary = transition.get(TransitionKey.COMPLEMENTARY_DATA) or {}
|
||||
if not isinstance(observation, dict):
|
||||
raise ValueError("RobometerEncoderProcessorStep requires an observation dict")
|
||||
|
||||
if self.image_key not in observation:
|
||||
raise KeyError(f"Robometer expected image key {self.image_key!r} in observation")
|
||||
|
||||
frames = observation[self.image_key]
|
||||
tensor = frames.detach().cpu() if isinstance(frames, Tensor) else torch.as_tensor(frames)
|
||||
if tensor.ndim == 4:
|
||||
tensor = tensor.unsqueeze(1)
|
||||
elif tensor.ndim != 5:
|
||||
raise ValueError(
|
||||
f"Expected Robometer frames with shape (B,C,H,W) or (B,T,C,H,W); got {tuple(tensor.shape)}"
|
||||
)
|
||||
|
||||
batch_size = tensor.shape[0]
|
||||
tasks = _expand_tasks(
|
||||
complementary.get(self.task_key, self.default_task),
|
||||
batch_size=batch_size,
|
||||
default=self.default_task,
|
||||
)
|
||||
|
||||
samples = [
|
||||
(_video_to_numpy(tensor[i], max_frames=self.max_frames), tasks[i]) for i in range(batch_size)
|
||||
]
|
||||
encoded = self.encode_samples(samples)
|
||||
|
||||
new_observation = dict(observation)
|
||||
for key, value in encoded.items():
|
||||
new_observation[f"{ROBOMETER_FEATURE_PREFIX}{key}"] = value
|
||||
|
||||
new_transition = transition.copy()
|
||||
new_transition[TransitionKey.OBSERVATION] = new_observation
|
||||
return new_transition
|
||||
|
||||
def encode_samples(self, samples: list[tuple[np.ndarray, str]]) -> dict[str, Tensor]:
|
||||
"""Run the Qwen-VL processor on a list of ``(frames, task)`` samples.
|
||||
|
||||
Used internally by ``__call__`` and exposed for callers that want to
|
||||
run the encoder on a single trajectory without building an
|
||||
:class:`EnvTransition` (see ``examples/dataset/create_robometer_progress_videos.py``).
|
||||
"""
|
||||
from qwen_vl_utils import process_vision_info
|
||||
|
||||
conversations = [self._build_conversation(frames, task) for frames, task in samples]
|
||||
|
||||
texts = [
|
||||
self._processor.apply_chat_template(
|
||||
msg,
|
||||
tokenize=False,
|
||||
add_generation_prompt=False,
|
||||
add_vision_id=True,
|
||||
enable_thinking=False,
|
||||
fps=1,
|
||||
)
|
||||
for msg in conversations
|
||||
]
|
||||
|
||||
process_kwargs: dict[str, Any] = {
|
||||
"return_video_kwargs": True,
|
||||
"return_video_metadata": True,
|
||||
}
|
||||
image_processor = getattr(self._processor, "image_processor", None)
|
||||
if image_processor is not None and hasattr(image_processor, "patch_size"):
|
||||
process_kwargs["image_patch_size"] = image_processor.patch_size
|
||||
|
||||
image_inputs, video_inputs, video_kwargs = process_vision_info(conversations, **process_kwargs)
|
||||
|
||||
videos: list[Any] | None = None
|
||||
video_metadatas: list[Any] | None = None
|
||||
if video_inputs:
|
||||
if isinstance(video_inputs[0], tuple) and len(video_inputs[0]) == 2:
|
||||
videos_seq, metadatas_seq = zip(*video_inputs, strict=False)
|
||||
videos = list(videos_seq)
|
||||
video_metadatas = list(metadatas_seq)
|
||||
else:
|
||||
videos = list(video_inputs)
|
||||
|
||||
processor_kwargs: dict[str, Any] = {
|
||||
"text": texts,
|
||||
"images": image_inputs,
|
||||
"padding": True,
|
||||
"truncation": False,
|
||||
"max_length": self.max_length,
|
||||
"return_tensors": "pt",
|
||||
"do_resize": False,
|
||||
}
|
||||
if videos is not None:
|
||||
processor_kwargs["videos"] = videos
|
||||
if video_metadatas is not None:
|
||||
processor_kwargs["video_metadata"] = video_metadatas
|
||||
if video_kwargs:
|
||||
processor_kwargs.update(video_kwargs)
|
||||
|
||||
encoded = self._processor(**processor_kwargs)
|
||||
|
||||
# Write Robometer-specific token ids and the video patch merge size into
|
||||
# the encoded batch so `RobometerRewardModel` doesn't need its own
|
||||
# tokenizer at inference (EO1-style separation: the processor owns the
|
||||
# tokenizer, the model owns the backbone and heads).
|
||||
tokenizer = self._processor.tokenizer
|
||||
encoded["prog_token_id"] = tokenizer.convert_tokens_to_ids("<|prog_token|>")
|
||||
encoded["vision_start_token_id"] = tokenizer.convert_tokens_to_ids("<|vision_start|>")
|
||||
encoded["vision_end_token_id"] = tokenizer.convert_tokens_to_ids("<|vision_end|>")
|
||||
video_processor = getattr(self._processor, "video_processor", None)
|
||||
encoded["video_merge_size"] = int(getattr(video_processor, "merge_size", 14))
|
||||
return encoded
|
||||
|
||||
def _build_conversation(self, frames: np.ndarray, task: str) -> list[dict[str, Any]]:
|
||||
pil_frames = _frames_to_pil(frames)
|
||||
prompt = PROGRESS_PROMPT.format(task=task)
|
||||
content: list[dict[str, Any]] = [{"type": "text", "text": prompt}]
|
||||
|
||||
if self.use_multi_image:
|
||||
for image in pil_frames:
|
||||
content.append({"type": "image", "image": image})
|
||||
if self.use_per_frame_progress_token:
|
||||
content.append({"type": "text", "text": "<|prog_token|>"})
|
||||
else:
|
||||
content.append({"type": "video", "video": pil_frames, "sample_fps": 1.0})
|
||||
|
||||
return [{"role": "user", "content": content}]
|
||||
|
||||
def transform_features(
|
||||
self, features: dict[PipelineFeatureType, dict[str, PolicyFeature]]
|
||||
) -> dict[PipelineFeatureType, dict[str, PolicyFeature]]:
|
||||
# The Qwen-VL processor produces variable-length sequence tensors that
|
||||
# don't fit the static `PolicyFeature(shape=...)` mould; we deliberately
|
||||
# do not advertise the new observation keys here.
|
||||
return features
|
||||
|
||||
def get_config(self) -> dict[str, Any]:
|
||||
return {
|
||||
"base_model_id": self.base_model_id,
|
||||
"image_key": self.image_key,
|
||||
"task_key": self.task_key,
|
||||
"default_task": self.default_task,
|
||||
"max_frames": self.max_frames,
|
||||
"use_multi_image": self.use_multi_image,
|
||||
"use_per_frame_progress_token": self.use_per_frame_progress_token,
|
||||
"max_length": self.max_length,
|
||||
}
|
||||
|
||||
|
||||
def make_robometer_pre_post_processors(
|
||||
config: RobometerConfig,
|
||||
dataset_stats: dict[str, dict[str, Any]] | None = None,
|
||||
) -> tuple[
|
||||
PolicyProcessorPipeline[dict[str, Any], dict[str, Any]],
|
||||
PolicyProcessorPipeline[PolicyAction, PolicyAction],
|
||||
]:
|
||||
"""Pipeline that pre-encodes frames + task into Qwen-VL tensors.
|
||||
|
||||
The preprocessor adds a batch dimension if needed, runs Robometer's
|
||||
encoder, and moves everything to the configured device. The
|
||||
postprocessor is the identity since Robometer outputs a single reward
|
||||
tensor (no action to un-normalise).
|
||||
"""
|
||||
del dataset_stats # Robometer has its own normalisation inside the Qwen-VL processor.
|
||||
|
||||
preprocessor = PolicyProcessorPipeline[dict[str, Any], dict[str, Any]](
|
||||
steps=[
|
||||
AddBatchDimensionProcessorStep(),
|
||||
RobometerEncoderProcessorStep(
|
||||
base_model_id=config.base_model_id,
|
||||
image_key=config.image_key,
|
||||
task_key=config.task_key,
|
||||
default_task=config.default_task,
|
||||
max_frames=config.max_frames,
|
||||
use_multi_image=config.use_multi_image,
|
||||
use_per_frame_progress_token=config.use_per_frame_progress_token,
|
||||
),
|
||||
DeviceProcessorStep(device=config.device or "cpu"),
|
||||
],
|
||||
name=POLICY_PREPROCESSOR_DEFAULT_NAME,
|
||||
)
|
||||
postprocessor = PolicyProcessorPipeline(
|
||||
name=POLICY_POSTPROCESSOR_DEFAULT_NAME,
|
||||
to_transition=policy_action_to_transition,
|
||||
)
|
||||
return preprocessor, postprocessor
|
||||
@@ -0,0 +1,151 @@
|
||||
#!/usr/bin/env python
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Re-save a Robometer checkpoint in LeRobot HF format.
|
||||
|
||||
LeRobot's reward model format is ``config.json`` (a draccus-encoded
|
||||
:class:`~lerobot.rewards.robometer.RobometerConfig`) plus a single
|
||||
``model.safetensors`` containing the merged base + heads weights. The
|
||||
released checkpoint at ``lilkm/robometer-4b`` already follows this layout;
|
||||
this script is for converting other Robometer variants (e.g. a future
|
||||
upstream release or a local training run) into the same format.
|
||||
|
||||
Example:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
lerobot-export-robometer \\
|
||||
--src robometer/Robometer-4B \\
|
||||
--dst ./robometer-4b-lerobot
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
from lerobot.rewards.robometer import RobometerConfig, RobometerRewardModel
|
||||
from lerobot.rewards.robometer._upstream_loader import apply_upstream_checkpoint
|
||||
from lerobot.utils.utils import init_logging
|
||||
|
||||
|
||||
def export_robometer_to_lerobot(
|
||||
src: str,
|
||||
dst: str | Path,
|
||||
*,
|
||||
device: str = "cpu",
|
||||
dataset_repo_id: str = "",
|
||||
write_model_card: bool = True,
|
||||
) -> Path:
|
||||
"""Load Robometer from ``src`` and re-save it under ``dst`` in LeRobot HF format.
|
||||
|
||||
Produces ``config.json``, ``model.safetensors``, and (optionally) ``README.md``.
|
||||
|
||||
Args:
|
||||
src: Upstream source. Hugging Face repo id (``"robometer/Robometer-4B"``,
|
||||
optionally ``"...@revision"``) or a local snapshot directory.
|
||||
dst: Output directory. ``config.json`` and ``model.safetensors`` are
|
||||
written here.
|
||||
device: Where to place the model during loading. Defaults to CPU; use
|
||||
``"cuda"`` if you want to verify on GPU before saving.
|
||||
dataset_repo_id: Hugging Face dataset id the model was trained on
|
||||
(e.g. ``"robometer/RBM-1M"``). Written into the model card's
|
||||
``datasets:`` metadata. Leave empty if not applicable.
|
||||
write_model_card: Generate a ``README.md`` using LeRobot's reward
|
||||
model card template. Disable if you want to write the README
|
||||
yourself.
|
||||
|
||||
Returns:
|
||||
The resolved output directory.
|
||||
"""
|
||||
# A fresh ``RobometerConfig`` has ``vlm_config=None``, which routes
|
||||
# ``__init__`` through the upstream-matching path: download base Qwen,
|
||||
# resize embeddings per ``ROBOMETER_SPECIAL_TOKENS``. ``apply_upstream_checkpoint``
|
||||
# then resizes again (if needed) to match the upstream checkpoint's vocab
|
||||
# and overlays its weights. ``_save_pretrained`` snapshots the resulting
|
||||
# post-resize architecture into ``vlm_config`` for fast future loads.
|
||||
cfg = RobometerConfig(pretrained_path=src, device=device)
|
||||
model = RobometerRewardModel(cfg)
|
||||
apply_upstream_checkpoint(model, src)
|
||||
model.to(device)
|
||||
model.eval()
|
||||
|
||||
dst = Path(dst)
|
||||
dst.mkdir(parents=True, exist_ok=True)
|
||||
model.save_pretrained(str(dst))
|
||||
|
||||
if write_model_card:
|
||||
card = model.generate_model_card(
|
||||
dataset_repo_id=dataset_repo_id,
|
||||
model_type=model.config.type,
|
||||
license=model.config.license,
|
||||
tags=model.config.tags,
|
||||
)
|
||||
card.save(str(dst / "README.md"))
|
||||
|
||||
return dst
|
||||
|
||||
|
||||
def _parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(
|
||||
description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
|
||||
)
|
||||
parser.add_argument(
|
||||
"--src",
|
||||
default="robometer/Robometer-4B",
|
||||
help="Upstream Robometer source (HF repo id or local directory).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dst",
|
||||
required=True,
|
||||
help="Output directory for the LeRobot-format checkpoint.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--device",
|
||||
default="cpu",
|
||||
help="Torch device to load the model on (default: cpu). Conversion only "
|
||||
"needs CPU; use cuda if you also want to smoke-test inference.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dataset",
|
||||
default="",
|
||||
help="Optional Hugging Face dataset id used for training "
|
||||
"(e.g. `robometer/RBM-1M`). Written into the auto-generated model card's "
|
||||
"`datasets:` metadata.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--no-readme",
|
||||
action="store_true",
|
||||
help="Skip writing README.md. Use if you want to author the model card by hand.",
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def main() -> None:
|
||||
init_logging()
|
||||
args = _parse_args()
|
||||
out = export_robometer_to_lerobot(
|
||||
src=args.src,
|
||||
dst=args.dst,
|
||||
device=args.device,
|
||||
dataset_repo_id=args.dataset,
|
||||
write_model_card=not args.no_readme,
|
||||
)
|
||||
logging.info("Saved LeRobot-format Robometer checkpoint to %s", out)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -41,8 +41,6 @@ For more details, see the [Physical Intelligence π₀ blog post](https://www.ph
|
||||
For more details, see the [Physical Intelligence π₀.₅ blog post](https://www.physicalintelligence.company/blog/pi05).
|
||||
{% elif model_name == "gaussian_actor" %}
|
||||
This is a Gaussian Actor policy (Gaussian policy with a tanh squash) — the policy-side component used by [Soft Actor-Critic (SAC)](https://huggingface.co/papers/1801.01290) and related maximum-entropy continuous-control algorithms.
|
||||
{% elif model_name == "reward_classifier" %}
|
||||
A reward classifier is a lightweight neural network that scores observations or trajectories for task success, providing a learned reward signal or offline evaluation when explicit rewards are unavailable.
|
||||
{% else %}
|
||||
_Model type not recognized — please update this template._
|
||||
{% endif %}
|
||||
|
||||
@@ -13,6 +13,8 @@
|
||||
A reward classifier is a lightweight neural network that scores observations or trajectories for task success, providing a learned reward signal or offline evaluation when explicit rewards are unavailable.
|
||||
{% elif model_name == "sarm" %}
|
||||
A Success-Aware Reward Model (SARM) predicts a dense reward signal from observations, typically used downstream for reinforcement learning or human-in-the-loop fine-tuning when task success is not directly observable.
|
||||
{% elif model_name == "robometer" %}
|
||||
Robometer is a zero-shot general-purpose robotic reward model built on a fine-tuned Qwen3-VL backbone with progress, preference, and success heads. Given a video and a task description it outputs a per-frame progress signal in [0, 1] and a per-frame success probability — suitable for offline reward labelling and for low-frequency reward signals during RL fine-tuning of robot policies.
|
||||
{% else %}
|
||||
_Reward model type not recognized — please update this template._
|
||||
{% endif %}
|
||||
|
||||
@@ -0,0 +1,299 @@
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Tests for the in-tree Robometer reward model."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from types import SimpleNamespace
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from lerobot.configs.rewards import RewardModelConfig
|
||||
from lerobot.rewards.factory import get_reward_model_class, make_reward_model_config
|
||||
from lerobot.rewards.robometer import RobometerConfig
|
||||
from lerobot.rewards.robometer.modeling_robometer import (
|
||||
ROBOMETER_FEATURE_PREFIX,
|
||||
convert_bins_to_continuous,
|
||||
decode_progress_outputs,
|
||||
)
|
||||
|
||||
|
||||
class _FakeQwenConfig:
|
||||
"""Stand-in for a Qwen3-VL config (the `model.config` attribute).
|
||||
|
||||
``to_dict`` matches HF's ``PretrainedConfig.to_dict`` closely enough for
|
||||
``RobometerRewardModel._save_pretrained`` to snapshot a meaningful
|
||||
``vlm_config`` into the saved ``config.json`` and for the reload path
|
||||
to round-trip through ``AutoConfig.for_model``.
|
||||
"""
|
||||
|
||||
def __init__(self, hidden_dim: int = 8, vocab_size: int = 100) -> None:
|
||||
self.text_config = SimpleNamespace(hidden_size=hidden_dim, vocab_size=vocab_size)
|
||||
self._hidden_dim = hidden_dim
|
||||
self._vocab_size = vocab_size
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
return {
|
||||
"model_type": "fake_qwen",
|
||||
"text_config": {
|
||||
"hidden_size": self._hidden_dim,
|
||||
"vocab_size": self._vocab_size,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
class _FakeEmbeddings(torch.nn.Module):
|
||||
def __init__(self, num_embeddings: int = 100) -> None:
|
||||
super().__init__()
|
||||
self.num_embeddings = num_embeddings
|
||||
|
||||
|
||||
class _FakeBaseModel(torch.nn.Module):
|
||||
"""Stand-in for the Qwen3-VL backbone during tests.
|
||||
|
||||
Provides the minimum surface `RobometerRewardModel.__init__` and
|
||||
`_compute_rbm_logits` rely on: a `parameters()` iterator (for dtype +
|
||||
device), a `config.text_config.hidden_size`, a `config.to_dict()` so
|
||||
`_save_pretrained` can snapshot `vlm_config`,
|
||||
`get_input_embeddings()` / `resize_token_embeddings()` so the fresh-init
|
||||
embed resize is a no-op, and a forward that returns a `SimpleNamespace`
|
||||
with a `hidden_states` tuple.
|
||||
"""
|
||||
|
||||
def __init__(self, hidden_dim: int = 8) -> None:
|
||||
super().__init__()
|
||||
self._param = torch.nn.Parameter(torch.zeros(1))
|
||||
self.hidden_dim = hidden_dim
|
||||
self.config = _FakeQwenConfig(hidden_dim)
|
||||
self._embeddings = _FakeEmbeddings()
|
||||
|
||||
def get_input_embeddings(self) -> _FakeEmbeddings:
|
||||
return self._embeddings
|
||||
|
||||
def resize_token_embeddings(self, new_size: int) -> None:
|
||||
self._embeddings.num_embeddings = new_size
|
||||
|
||||
def forward(self, **kwargs): # noqa: ARG002 - intentional kwargs sink
|
||||
input_ids = kwargs["input_ids"]
|
||||
return SimpleNamespace(
|
||||
hidden_states=(torch.zeros(input_ids.shape[0], input_ids.shape[1], self.hidden_dim),),
|
||||
last_hidden_state=torch.zeros(input_ids.shape[0], input_ids.shape[1], self.hidden_dim),
|
||||
)
|
||||
|
||||
|
||||
class _FakeTokenizer:
|
||||
"""Minimal stand-in for an HF tokenizer.
|
||||
|
||||
``RobometerConfig.__post_init__`` uses ``len(tokenizer)`` to compute the
|
||||
deterministic resize target ``len(tokenizer) + len(ROBOMETER_SPECIAL_TOKENS)``,
|
||||
so a working ``__len__`` is all we need.
|
||||
"""
|
||||
|
||||
def __init__(self, length: int = 100) -> None:
|
||||
self._length = length
|
||||
|
||||
def __len__(self) -> int:
|
||||
return self._length
|
||||
|
||||
|
||||
def _patch_build(monkeypatch) -> None:
|
||||
"""Stub out the HF AutoX calls so Robometer construction stays cheap in tests.
|
||||
|
||||
Covers (EO-1 style — no model-side override hooks):
|
||||
* ``AutoConfig.from_pretrained`` (config side) — used by
|
||||
``RobometerConfig.__post_init__`` to snapshot the backbone config.
|
||||
* ``AutoTokenizer.from_pretrained`` (config side) — used by
|
||||
``__post_init__`` to compute ``len(tokenizer) + 5``.
|
||||
* ``AutoConfig.for_model`` — used by
|
||||
``RobometerConfig.vlm_backbone_config`` when rebuilding for ``from_config``.
|
||||
* ``AutoModelForImageTextToText.from_pretrained`` — fresh-training path
|
||||
(``pretrained_path is None``).
|
||||
* ``AutoModelForImageTextToText.from_config`` — checkpoint-reload path
|
||||
(``pretrained_path`` is set).
|
||||
"""
|
||||
from lerobot.rewards.robometer import configuration_robometer, modeling_robometer
|
||||
|
||||
monkeypatch.setattr(
|
||||
modeling_robometer.AutoModelForImageTextToText,
|
||||
"from_pretrained",
|
||||
lambda *args, **kwargs: _FakeBaseModel(hidden_dim=8),
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
modeling_robometer.AutoModelForImageTextToText,
|
||||
"from_config",
|
||||
lambda *args, **kwargs: _FakeBaseModel(hidden_dim=8),
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
configuration_robometer.AutoConfig,
|
||||
"for_model",
|
||||
lambda *args, **kwargs: _FakeQwenConfig(hidden_dim=8),
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
configuration_robometer.AutoConfig,
|
||||
"from_pretrained",
|
||||
lambda *args, **kwargs: _FakeQwenConfig(hidden_dim=8),
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
configuration_robometer.AutoTokenizer,
|
||||
"from_pretrained",
|
||||
lambda *args, **kwargs: _FakeTokenizer(length=100),
|
||||
)
|
||||
|
||||
|
||||
def _make_batch(features: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
|
||||
"""Build a `compute_reward`-ready batch using Robometer's namespaced keys."""
|
||||
return {f"{ROBOMETER_FEATURE_PREFIX}{key}": value for key, value in features.items()}
|
||||
|
||||
|
||||
def test_robometer_config_registered(monkeypatch):
|
||||
_patch_build(monkeypatch)
|
||||
assert "robometer" in RewardModelConfig.get_known_choices()
|
||||
assert RewardModelConfig.get_choice_class("robometer") is RobometerConfig
|
||||
assert isinstance(make_reward_model_config("robometer", device="cpu"), RobometerConfig)
|
||||
|
||||
|
||||
def test_robometer_factory_returns_in_tree_class():
|
||||
from lerobot.rewards.robometer.modeling_robometer import RobometerRewardModel
|
||||
|
||||
assert get_reward_model_class("robometer") is RobometerRewardModel
|
||||
|
||||
|
||||
def test_convert_bins_to_continuous_returns_expected_values():
|
||||
# Two frames: first peaks at bin 0 (center 0.0), second peaks at bin 9 (center 1.0).
|
||||
bin_logits = torch.full((2, 10), -10.0)
|
||||
bin_logits[0, 0] = 10.0
|
||||
bin_logits[1, -1] = 10.0
|
||||
values = convert_bins_to_continuous(bin_logits)
|
||||
assert values.shape == (2,)
|
||||
assert torch.allclose(values, torch.tensor([0.0, 1.0]), atol=1e-3)
|
||||
|
||||
|
||||
def test_decode_progress_outputs_returns_last_frame_values():
|
||||
progress = torch.tensor([[0.1, 0.9], [0.4, 0.6]])
|
||||
success_logits = torch.tensor([[0.0, 5.0], [0.0, -5.0]])
|
||||
|
||||
outputs = decode_progress_outputs(progress, success_logits, is_discrete_mode=False)
|
||||
|
||||
assert outputs["progress_pred"] == [pytest.approx([0.1, 0.9]), pytest.approx([0.4, 0.6])]
|
||||
assert outputs["success_probs"][0][-1] == pytest.approx(torch.sigmoid(torch.tensor(5.0)).item(), abs=1e-3)
|
||||
assert outputs["success_probs"][1][-1] == pytest.approx(
|
||||
torch.sigmoid(torch.tensor(-5.0)).item(), abs=1e-3
|
||||
)
|
||||
|
||||
|
||||
def test_decode_progress_outputs_discrete_mode_softmaxes_over_bins():
|
||||
# 2 frames, peaks at bin 0 and bin 9 → continuous predictions 0.0 and 1.0
|
||||
bin_logits = torch.full((1, 2, 10), -10.0)
|
||||
bin_logits[0, 0, 0] = 10.0
|
||||
bin_logits[0, 1, -1] = 10.0
|
||||
|
||||
outputs = decode_progress_outputs(bin_logits, success_logits=None, is_discrete_mode=True)
|
||||
|
||||
assert outputs["success_probs"] == []
|
||||
assert outputs["progress_pred"][0] == pytest.approx([0.0, 1.0], abs=1e-3)
|
||||
|
||||
|
||||
def test_robometer_compute_reward_reads_pre_encoded_inputs(monkeypatch):
|
||||
from lerobot.rewards.robometer.modeling_robometer import RobometerRewardModel
|
||||
|
||||
progress = torch.tensor([[0.1, 0.9], [0.4, 0.6]])
|
||||
success_logits = torch.tensor([[0.0, 5.0], [0.0, -5.0]])
|
||||
_patch_build(monkeypatch)
|
||||
|
||||
cfg = RobometerConfig(device="cpu", reward_output="progress", progress_loss_type="l2")
|
||||
model = RobometerRewardModel(cfg)
|
||||
# Bypass the Qwen3-VL forward + head extraction with deterministic logits.
|
||||
monkeypatch.setattr(model, "_compute_rbm_logits", lambda _inputs: (progress, success_logits))
|
||||
|
||||
batch = _make_batch({"input_ids": torch.zeros(2, 2, dtype=torch.long)})
|
||||
rewards = model.compute_reward(batch)
|
||||
|
||||
assert torch.allclose(rewards, torch.tensor([0.9, 0.6]))
|
||||
|
||||
|
||||
def test_robometer_compute_reward_can_return_binary_success(monkeypatch):
|
||||
from lerobot.rewards.robometer.modeling_robometer import RobometerRewardModel
|
||||
|
||||
progress = torch.tensor([[0.1, 0.9], [0.4, 0.6]])
|
||||
success_logits = torch.tensor([[0.0, 5.0], [0.0, -5.0]]) # sigmoid(5) > 0.5; sigmoid(-5) < 0.5
|
||||
_patch_build(monkeypatch)
|
||||
|
||||
cfg = RobometerConfig(
|
||||
device="cpu",
|
||||
reward_output="success",
|
||||
success_threshold=0.5,
|
||||
progress_loss_type="l2",
|
||||
)
|
||||
model = RobometerRewardModel(cfg)
|
||||
monkeypatch.setattr(model, "_compute_rbm_logits", lambda _inputs: (progress, success_logits))
|
||||
|
||||
batch = _make_batch({"input_ids": torch.zeros(2, 2, dtype=torch.long)})
|
||||
rewards = model.compute_reward(batch)
|
||||
|
||||
assert torch.equal(rewards, torch.tensor([1.0, 0.0]))
|
||||
|
||||
|
||||
def test_robometer_compute_reward_errors_when_inputs_missing(monkeypatch):
|
||||
from lerobot.rewards.robometer.modeling_robometer import RobometerRewardModel
|
||||
|
||||
_patch_build(monkeypatch)
|
||||
|
||||
cfg = RobometerConfig(device="cpu", progress_loss_type="l2")
|
||||
model = RobometerRewardModel(cfg)
|
||||
|
||||
with pytest.raises(KeyError, match=r"observation\.robometer\.input_ids"):
|
||||
model.compute_reward({})
|
||||
|
||||
|
||||
def test_robometer_save_pretrained_roundtrips(monkeypatch, tmp_path):
|
||||
"""Saving and reloading a Robometer model in LeRobot HF format must produce
|
||||
a single ``model.safetensors`` + ``config.json`` (no Hydra ``config.yaml``).
|
||||
"""
|
||||
from huggingface_hub.constants import CONFIG_NAME, SAFETENSORS_SINGLE_FILE
|
||||
|
||||
from lerobot.rewards.robometer.modeling_robometer import RobometerRewardModel
|
||||
|
||||
_patch_build(monkeypatch)
|
||||
cfg = RobometerConfig(
|
||||
device="cpu",
|
||||
pretrained_path="robometer/Robometer-4B",
|
||||
# Knobs the user might tweak — must survive the round-trip.
|
||||
image_key="observation.images.cam_top",
|
||||
task_key="task",
|
||||
reward_output="success",
|
||||
success_threshold=0.7,
|
||||
progress_loss_type="l2",
|
||||
)
|
||||
model = RobometerRewardModel(cfg)
|
||||
model.save_pretrained(str(tmp_path))
|
||||
|
||||
# Exactly the files LeRobot's HubMixin promises.
|
||||
assert (tmp_path / CONFIG_NAME).exists()
|
||||
assert (tmp_path / SAFETENSORS_SINGLE_FILE).exists()
|
||||
assert not (tmp_path / "config.yaml").exists() # we want HF-style, not Hydra
|
||||
|
||||
# Reload from the local directory: no Hub fetch, no YAML overlay. The
|
||||
# base class drives subclass dispatch via the `type` field in config.json.
|
||||
reloaded_cfg = RewardModelConfig.from_pretrained(str(tmp_path))
|
||||
assert isinstance(reloaded_cfg, RobometerConfig)
|
||||
reloaded_cfg.pretrained_path = str(tmp_path) # mimic lerobot-train's `validate()`
|
||||
reloaded = RobometerRewardModel.from_pretrained(str(tmp_path), config=reloaded_cfg)
|
||||
|
||||
assert reloaded.config.image_key == "observation.images.cam_top"
|
||||
assert reloaded.config.task_key == "task"
|
||||
assert reloaded.config.reward_output == "success"
|
||||
assert reloaded.config.success_threshold == 0.7
|
||||
assert reloaded.config.progress_loss_type == "l2" # came back from config.json
|
||||
@@ -1,5 +1,5 @@
|
||||
version = 1
|
||||
revision = 2
|
||||
revision = 3
|
||||
requires-python = ">=3.12"
|
||||
resolution-markers = [
|
||||
"(python_full_version >= '3.15' and platform_machine == 'AMD64' and sys_platform == 'linux') or (python_full_version >= '3.15' and platform_machine == 'x86_64' and sys_platform == 'linux')",
|
||||
@@ -1142,7 +1142,7 @@ name = "decord"
|
||||
version = "0.6.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "numpy", marker = "(platform_machine != 'arm64' and sys_platform == 'darwin') or (platform_machine == 'AMD64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
|
||||
{ name = "numpy", marker = "(platform_machine != 'arm64' and platform_machine != 's390x' and sys_platform == 'darwin') or (platform_machine == 'AMD64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 's390x' and sys_platform != 'darwin' and sys_platform != 'linux')" },
|
||||
]
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/11/79/936af42edf90a7bd4e41a6cac89c913d4b47fa48a26b042d5129a9242ee3/decord-0.6.0-py3-none-manylinux2010_x86_64.whl", hash = "sha256:51997f20be8958e23b7c4061ba45d0efcd86bffd5fe81c695d0befee0d442976", size = 13602299, upload-time = "2021-06-14T21:30:55.486Z" },
|
||||
@@ -2972,6 +2972,11 @@ qwen-vl-utils-dep = [
|
||||
reachy2 = [
|
||||
{ name = "reachy2-sdk" },
|
||||
]
|
||||
robometer = [
|
||||
{ name = "peft" },
|
||||
{ name = "qwen-vl-utils" },
|
||||
{ name = "transformers" },
|
||||
]
|
||||
robstride = [
|
||||
{ name = "python-can" },
|
||||
]
|
||||
@@ -3122,6 +3127,7 @@ requires-dist = [
|
||||
{ name = "lerobot", extras = ["peft"], marker = "extra == 'all'" },
|
||||
{ name = "lerobot", extras = ["peft-dep"], marker = "extra == 'groot'" },
|
||||
{ name = "lerobot", extras = ["peft-dep"], marker = "extra == 'peft'" },
|
||||
{ name = "lerobot", extras = ["peft-dep"], marker = "extra == 'robometer'" },
|
||||
{ name = "lerobot", extras = ["peft-dep"], marker = "extra == 'wallx'" },
|
||||
{ name = "lerobot", extras = ["phone"], marker = "extra == 'all'" },
|
||||
{ name = "lerobot", extras = ["pi"], marker = "extra == 'all'" },
|
||||
@@ -3139,6 +3145,7 @@ requires-dist = [
|
||||
{ name = "lerobot", extras = ["pyzmq-dep"], marker = "extra == 'lekiwi'" },
|
||||
{ name = "lerobot", extras = ["pyzmq-dep"], marker = "extra == 'unitree-g1'" },
|
||||
{ name = "lerobot", extras = ["qwen-vl-utils-dep"], marker = "extra == 'eo1'" },
|
||||
{ name = "lerobot", extras = ["qwen-vl-utils-dep"], marker = "extra == 'robometer'" },
|
||||
{ name = "lerobot", extras = ["qwen-vl-utils-dep"], marker = "extra == 'sarm'" },
|
||||
{ name = "lerobot", extras = ["qwen-vl-utils-dep"], marker = "extra == 'wallx'" },
|
||||
{ name = "lerobot", extras = ["reachy2"], marker = "extra == 'all'" },
|
||||
@@ -3160,6 +3167,7 @@ requires-dist = [
|
||||
{ name = "lerobot", extras = ["transformers-dep"], marker = "extra == 'multi-task-dit'" },
|
||||
{ name = "lerobot", extras = ["transformers-dep"], marker = "extra == 'peft'" },
|
||||
{ name = "lerobot", extras = ["transformers-dep"], marker = "extra == 'pi'" },
|
||||
{ name = "lerobot", extras = ["transformers-dep"], marker = "extra == 'robometer'" },
|
||||
{ name = "lerobot", extras = ["transformers-dep"], marker = "extra == 'sarm'" },
|
||||
{ name = "lerobot", extras = ["transformers-dep"], marker = "extra == 'smolvla'" },
|
||||
{ name = "lerobot", extras = ["transformers-dep"], marker = "extra == 'wallx'" },
|
||||
@@ -3227,7 +3235,7 @@ requires-dist = [
|
||||
{ name = "transformers", marker = "extra == 'transformers-dep'", specifier = ">=5.4.0,<5.6.0" },
|
||||
{ name = "wandb", marker = "extra == 'training'", specifier = ">=0.24.0,<0.25.0" },
|
||||
]
|
||||
provides-extras = ["dataset", "training", "hardware", "viz", "core-scripts", "evaluation", "dataset-viz", "av-dep", "pygame-dep", "placo-dep", "transformers-dep", "grpcio-dep", "can-dep", "peft-dep", "scipy-dep", "diffusers-dep", "qwen-vl-utils-dep", "matplotlib-dep", "pyserial-dep", "deepdiff-dep", "pynput-dep", "pyzmq-dep", "feetech", "dynamixel", "damiao", "robstride", "openarms", "gamepad", "hopejr", "lekiwi", "unitree-g1", "reachy2", "kinematics", "intelrealsense", "phone", "diffusion", "wallx", "pi", "smolvla", "multi-task-dit", "groot", "sarm", "xvla", "eo1", "hilserl", "async", "peft", "dev", "notebook", "test", "video-benchmark", "aloha", "pusht", "libero", "metaworld", "all"]
|
||||
provides-extras = ["dataset", "training", "hardware", "viz", "core-scripts", "evaluation", "dataset-viz", "av-dep", "pygame-dep", "placo-dep", "transformers-dep", "grpcio-dep", "can-dep", "peft-dep", "scipy-dep", "diffusers-dep", "qwen-vl-utils-dep", "matplotlib-dep", "pyserial-dep", "deepdiff-dep", "pynput-dep", "pyzmq-dep", "feetech", "dynamixel", "damiao", "robstride", "openarms", "gamepad", "hopejr", "lekiwi", "unitree-g1", "reachy2", "kinematics", "intelrealsense", "phone", "diffusion", "wallx", "pi", "smolvla", "multi-task-dit", "groot", "sarm", "robometer", "xvla", "eo1", "hilserl", "async", "peft", "dev", "notebook", "test", "video-benchmark", "aloha", "pusht", "libero", "metaworld", "all"]
|
||||
|
||||
[[package]]
|
||||
name = "librt"
|
||||
|
||||
Reference in New Issue
Block a user