optmize topreward input processing (#3660)

This commit is contained in:
Haoming Song
2026-05-25 22:07:45 +08:00
committed by GitHub
parent 616663cd9f
commit 3b5b94dbd6
10 changed files with 300 additions and 281 deletions
+1 -1
View File
@@ -53,7 +53,7 @@ or, with `uv` from a source checkout:
uv sync --extra topreward uv sync --extra topreward
``` ```
This pulls in `transformers` and `qwen-vl-utils`. The first time you run TOPReward, Hugging Face will also download the VLM weights from the Hub (~16 GB for Qwen3-VL-8B-Instruct). A GPU is strongly recommended. This pulls in `transformers`. The first time you run TOPReward, Hugging Face will also download the VLM weights from the Hub (~16 GB for Qwen3-VL-8B-Instruct). A GPU is strongly recommended.
## Model Inputs and Outputs ## Model Inputs and Outputs
+1 -1
View File
@@ -209,7 +209,7 @@ groot = [
"flash-attn>=2.5.9,<3.0.0 ; sys_platform != 'darwin'" "flash-attn>=2.5.9,<3.0.0 ; sys_platform != 'darwin'"
] ]
sarm = ["lerobot[transformers-dep]", "pydantic>=2.0.0,<3.0.0", "faker>=33.0.0,<35.0.0", "lerobot[matplotlib-dep]", "lerobot[qwen-vl-utils-dep]"] sarm = ["lerobot[transformers-dep]", "pydantic>=2.0.0,<3.0.0", "faker>=33.0.0,<35.0.0", "lerobot[matplotlib-dep]", "lerobot[qwen-vl-utils-dep]"]
topreward = ["lerobot[transformers-dep]", "lerobot[qwen-vl-utils-dep]"] topreward = ["lerobot[transformers-dep]"]
xvla = ["lerobot[transformers-dep]"] xvla = ["lerobot[transformers-dep]"]
eo1 = ["lerobot[transformers-dep]", "lerobot[qwen-vl-utils-dep]"] eo1 = ["lerobot[transformers-dep]", "lerobot[qwen-vl-utils-dep]"]
hilserl = ["lerobot[transformers-dep]", "lerobot[dataset]", "gym-hil>=0.1.13,<0.2.0", "lerobot[grpcio-dep]", "lerobot[placo-dep]"] hilserl = ["lerobot[transformers-dep]", "lerobot[dataset]", "gym-hil>=0.1.13,<0.2.0", "lerobot[grpcio-dep]", "lerobot[placo-dep]"]
@@ -107,10 +107,10 @@ def compute_instruction_rewards_for_prefixes(
else: else:
prefix_lengths = np.unique(np.linspace(1, num_frames, num_samples).round().astype(np.int64)) prefix_lengths = np.unique(np.linspace(1, num_frames, num_samples).round().astype(np.int64))
episode_frames = torch.stack([dataset[ep_start + i][image_key] for i in range(num_frames)])
rewards: list[float] = [] rewards: list[float] = []
for length in prefix_lengths: for length in prefix_lengths:
frames = torch.stack([dataset[ep_start + i][image_key] for i in range(int(length))]) frames = episode_frames[: int(length)].unsqueeze(0) # (1, T, C, H, W)
frames = frames.unsqueeze(0) # (1, T, C, H, W)
transition = { transition = {
TransitionKey.OBSERVATION: {image_key: frames}, TransitionKey.OBSERVATION: {image_key: frames},
@@ -146,7 +146,6 @@ def compute_topreward_progress(
device: str = "cuda", device: str = "cuda",
num_samples: int | None = None, num_samples: int | None = None,
fps: float | None = None, fps: float | None = None,
reduction: str | None = None,
episodes: list[int] | None = None, episodes: list[int] | None = None,
) -> Path: ) -> Path:
"""Run TOPReward over a dataset and write per-frame progress.""" """Run TOPReward over a dataset and write per-frame progress."""
@@ -154,10 +153,10 @@ def compute_topreward_progress(
logging.info(f"Loading TOPReward config from: {reward_model_path}") logging.info(f"Loading TOPReward config from: {reward_model_path}")
model = TOPRewardModel.from_pretrained(reward_model_path) model = TOPRewardModel.from_pretrained(reward_model_path)
config = model.config config = model.config
config.device = device
if vlm_name is not None and vlm_name != config.vlm_name: if vlm_name is not None and vlm_name != config.vlm_name:
logging.info(f"Overriding vlm_name from config: {config.vlm_name} -> {vlm_name}") logging.info(f"Overriding vlm_name from config: {config.vlm_name} -> {vlm_name}")
config.vlm_name = vlm_name config.vlm_name = vlm_name
config.device = device
model = TOPRewardModel(config) model = TOPRewardModel(config)
else: else:
config_kwargs: dict[str, Any] = {"device": device} config_kwargs: dict[str, Any] = {"device": device}
@@ -165,8 +164,6 @@ def compute_topreward_progress(
config_kwargs["vlm_name"] = vlm_name config_kwargs["vlm_name"] = vlm_name
if fps is not None: if fps is not None:
config_kwargs["fps"] = fps config_kwargs["fps"] = fps
if reduction is not None:
config_kwargs["reduction"] = reduction
config = TOPRewardConfig(**config_kwargs) config = TOPRewardConfig(**config_kwargs)
logging.info(f"Constructing TOPReward with VLM: {config.vlm_name}") logging.info(f"Constructing TOPReward with VLM: {config.vlm_name}")
model = TOPRewardModel(config) model = TOPRewardModel(config)
@@ -302,9 +299,6 @@ Examples:
help="Process only these episode indices (e.g. --episodes 0 or --episodes 0 5 10).", help="Process only these episode indices (e.g. --episodes 0 or --episodes 0 5 10).",
) )
parser.add_argument("--fps", type=float, default=None, help="Override TOPRewardConfig.fps.") parser.add_argument("--fps", type=float, default=None, help="Override TOPRewardConfig.fps.")
parser.add_argument(
"--reduction", type=str, default=None, choices=["mean", "sum"], help="Override reduction."
)
parser.add_argument( parser.add_argument(
"--push-to-hub", action="store_true", help="Upload to the dataset repo on HuggingFace Hub." "--push-to-hub", action="store_true", help="Upload to the dataset repo on HuggingFace Hub."
) )
@@ -321,7 +315,6 @@ Examples:
device=args.device, device=args.device,
num_samples=args.num_samples, num_samples=args.num_samples,
fps=args.fps, fps=args.fps,
reduction=args.reduction,
episodes=args.episodes, episodes=args.episodes,
) )
@@ -22,8 +22,8 @@ from lerobot.utils.constants import OBS_IMAGES
# Default prompt scaffolding from the upstream TOPReward paper / reference # Default prompt scaffolding from the upstream TOPReward paper / reference
# implementation (``QwenClient.compute_instruction_reward``). The prompt # implementation (``QwenClient.compute_instruction_reward``). The prompt
# computes the log-likelihood of the suffix ``f"{instruction} ... True"`` # scores the terminal ``True`` token in ``f"{instruction} ... True"``
# given the video, then reduces those token log-probs to a scalar reward. # given the video.
DEFAULT_PROMPT_PREFIX = ( DEFAULT_PROMPT_PREFIX = (
"The above video shows a robot manipulation trajectory that completes the following task: " "The above video shows a robot manipulation trajectory that completes the following task: "
) )
@@ -67,8 +67,6 @@ class TOPRewardConfig(RewardModelConfig):
add_chat_template: If ``True``, wrap the full prompt with the add_chat_template: If ``True``, wrap the full prompt with the
tokenizer's chat template before tokenisation (matches tokenizer's chat template before tokenisation (matches
upstream ``add_chat_template=True``). upstream ``add_chat_template=True``).
reduction: Reduction over per-token log-probs of the suffix
tokens (``"mean"`` or ``"sum"``).
success_threshold: Optional log-prob threshold. If finite, success_threshold: Optional log-prob threshold. If finite,
:meth:`TOPRewardModel.compute_reward` returns :meth:`TOPRewardModel.compute_reward` returns
``(reward > success_threshold).float()`` instead of the raw ``(reward > success_threshold).float()`` instead of the raw
@@ -96,7 +94,6 @@ class TOPRewardConfig(RewardModelConfig):
prompt_suffix_template: str = DEFAULT_PROMPT_SUFFIX_TEMPLATE prompt_suffix_template: str = DEFAULT_PROMPT_SUFFIX_TEMPLATE
add_chat_template: bool = False add_chat_template: bool = False
reduction: str = "mean"
success_threshold: float = float("-inf") success_threshold: float = float("-inf")
max_input_length: int = 32768 max_input_length: int = 32768
@@ -116,8 +113,6 @@ class TOPRewardConfig(RewardModelConfig):
def __post_init__(self) -> None: def __post_init__(self) -> None:
super().__post_init__() super().__post_init__()
if self.reduction not in {"mean", "sum"}:
raise ValueError(f"reduction must be 'mean' or 'sum', got {self.reduction!r}")
if self.max_frames is not None and self.max_frames < 1: if self.max_frames is not None and self.max_frames < 1:
raise ValueError(f"max_frames must be >= 1, got {self.max_frames}") raise ValueError(f"max_frames must be >= 1, got {self.max_frames}")
if self.fps <= 0: if self.fps <= 0:
@@ -29,15 +29,16 @@ and returns that log-likelihood as the reward signal.
Inference recipe: Inference recipe:
1. The processor builds a chat-style prompt, tokenises it, and emits 1. The processor builds a chat-style prompt, tokenises it, and emits
``input_ids``, ``attention_mask``, vision tensors, and ``prompt_length``. ``input_ids``, ``attention_mask``, vision tensors, and ``labels``.
2. The model label-masks everything before ``prompt_length`` with ``-100``. The processor label-masks everything except the terminal answer token with
3. Forward the full token sequence through the VLM. ``-100``.
4. Read per-token log-probabilities of the unmasked suffix tokens from the 2. Forward the full token sequence through the VLM.
logits and reduce them (mean or sum) into a scalar reward. 3. Read the terminal answer token log-probability from the logits as the
scalar reward.
With the default ``prompt_suffix_template`` and ``prompt_length = input_len - 1`` With the default ``prompt_suffix_template``, the only unmasked token is the
(mirrored from upstream), the only unmasked token is the literal ``"True"`` literal ``"True"`` at the end — the reward is
at the end — the reward is ``log P("True" | video + prompt + instruction)``. ``log P("True" | video + prompt + instruction)``.
This LeRobot port is **inference-only and not trainable** — :meth:`forward` This LeRobot port is **inference-only and not trainable** — :meth:`forward`
is intentionally inherited from :class:`PreTrainedRewardModel` and raises is intentionally inherited from :class:`PreTrainedRewardModel` and raises
@@ -66,6 +67,7 @@ from huggingface_hub import HfApi, hf_hub_download
from huggingface_hub.constants import CONFIG_NAME from huggingface_hub.constants import CONFIG_NAME
from huggingface_hub.errors import HfHubHTTPError from huggingface_hub.errors import HfHubHTTPError
from torch import Tensor from torch import Tensor
from torch.nn.functional import cross_entropy
from lerobot.configs.rewards import RewardModelConfig from lerobot.configs.rewards import RewardModelConfig
from lerobot.rewards.pretrained import PreTrainedRewardModel from lerobot.rewards.pretrained import PreTrainedRewardModel
@@ -116,57 +118,29 @@ class TOPRewardModel(PreTrainedRewardModel):
def compute_reward(self, batch: dict[str, Any]) -> Tensor: def compute_reward(self, batch: dict[str, Any]) -> Tensor:
"""Return one log-prob reward per sample in the batch.""" """Return one log-prob reward per sample in the batch."""
inputs = { inputs: dict[str, Any] = {}
key: batch[f"{TOPREWARD_FEATURE_PREFIX}{key}"] for key in TOPREWARD_INPUT_KEYS:
for key in TOPREWARD_INPUT_KEYS batch_key = f"{TOPREWARD_FEATURE_PREFIX}{key}"
if f"{TOPREWARD_FEATURE_PREFIX}{key}" in batch if batch_key not in batch:
}
if "input_ids" not in inputs:
raise KeyError( raise KeyError(
f"TOPReward batch missing pre-encoded inputs (expected " f"TOPReward batch missing `{batch_key}`. Make sure the "
f"`{TOPREWARD_FEATURE_PREFIX}input_ids`). Make sure the "
"TOPRewardEncoderProcessorStep ran before `compute_reward`." "TOPRewardEncoderProcessorStep ran before `compute_reward`."
) )
inputs[key] = batch[batch_key]
prompt_lengths = inputs.pop("prompt_length")
device = next(self.model.parameters()).device device = next(self.model.parameters()).device
inputs = {key: value.to(device) if hasattr(value, "to") else value for key, value in inputs.items()} inputs = {key: value.to(device) if hasattr(value, "to") else value for key, value in inputs.items()}
labels = inputs.pop("labels")
labels = inputs["input_ids"].clone() inputs["logits_to_keep"] = 2
for i, plen in enumerate(prompt_lengths.tolist()):
labels[i, : int(plen)] = -100
if "attention_mask" in inputs:
labels = labels.masked_fill(inputs["attention_mask"] == 0, -100)
self.eval() self.eval()
with torch.no_grad(): with torch.no_grad():
outputs = self.model(**inputs) outputs = self.model(**inputs)
logits = outputs.logits
logits = outputs.logits[:, :-1, :] rewards = -cross_entropy(logits[:, -2, :].float(), labels[:, -1], reduction="none")
target_labels = labels[:, 1:]
log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
mask = target_labels != -100
safe_targets = target_labels.masked_fill(~mask, 0)
token_log_probs = log_probs.gather(-1, safe_targets.unsqueeze(-1)).squeeze(-1)
batch_size = inputs["input_ids"].shape[0]
rewards = []
for i in range(batch_size):
sample_log_probs = token_log_probs[i][mask[i]]
if sample_log_probs.numel() == 0:
raise RuntimeError(
"TOPReward could not isolate any suffix tokens to score. Check that "
"`prompt_suffix_template` produces at least one tokenised character."
)
if self.config.reduction == "sum":
rewards.append(sample_log_probs.sum().item())
else:
rewards.append(sample_log_probs.mean().item())
out = torch.as_tensor(rewards, dtype=torch.float32)
if np.isfinite(self.config.success_threshold): if np.isfinite(self.config.success_threshold):
out = (out > self.config.success_threshold).float() rewards = (rewards > self.config.success_threshold).float()
return out.to(self.config.device or "cpu") return rewards.to(self.config.device or "cpu")
def _save_pretrained(self, save_directory: Path) -> None: def _save_pretrained(self, save_directory: Path) -> None:
"""Save ``config.json`` only.""" """Save ``config.json`` only."""
@@ -19,9 +19,7 @@ from __future__ import annotations
from dataclasses import dataclass, field from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any from typing import TYPE_CHECKING, Any
import numpy as np
import torch import torch
from PIL import Image
from torch import Tensor from torch import Tensor
from lerobot.configs import PipelineFeatureType, PolicyFeature from lerobot.configs import PipelineFeatureType, PolicyFeature
@@ -60,39 +58,33 @@ _TRUE_ANSWER = "True"
TOPREWARD_VLM_INPUT_KEYS = ( TOPREWARD_VLM_INPUT_KEYS = (
"input_ids", "input_ids",
"attention_mask", "attention_mask",
"pixel_values",
"pixel_values_videos", "pixel_values_videos",
"image_grid_thw",
"video_grid_thw", "video_grid_thw",
"second_per_grid_ts",
"mm_token_type_ids", "mm_token_type_ids",
) )
TOPREWARD_METADATA_KEYS = ("prompt_length",) TOPREWARD_INPUT_KEYS = TOPREWARD_VLM_INPUT_KEYS + ("labels",)
TOPREWARD_INPUT_KEYS = TOPREWARD_VLM_INPUT_KEYS + TOPREWARD_METADATA_KEYS
def _video_to_numpy(video: Tensor, *, max_frames: int | None) -> np.ndarray: def _prepare_video_batch(video: Tensor, *, max_frames: int | None) -> Tensor:
"""Convert one trajectory tensor to a ``(T, H, W, C) uint8`` numpy array.""" """Return videos as ``(B, T, C, H, W)`` uint8 tensors for Qwen3-VL."""
if video.ndim == 4:
video = video.unsqueeze(1)
elif video.ndim != 5:
raise ValueError(
f"Expected TOPReward frames with shape (B,C,H,W) or (B,T,C,H,W); got {tuple(video.shape)}"
)
if max_frames is not None: if max_frames is not None:
video = video[-max_frames:] video = video[:, -max_frames:]
if video.shape[1] in (1, 3): if video.shape[-1] in (1, 3):
video = video.permute(0, 2, 3, 1) video = video.permute(0, 1, 4, 2, 3)
elif video.shape[-1] not in (1, 3): elif video.shape[2] not in (1, 3):
raise ValueError(f"Expected channel dim of size 1 or 3, got shape {tuple(video.shape)}") raise ValueError(f"Expected channel dim of size 1 or 3, got shape {tuple(video.shape)}")
array = video.detach().cpu().numpy() if video.is_floating_point():
if np.issubdtype(array.dtype, np.floating) and array.size > 0 and array.max() <= 1.0: video = video * 255.0
array = array * 255.0
return np.clip(array, 0, 255).astype(np.uint8)
return video.clamp(0, 255).to(torch.uint8).contiguous()
def _frames_to_pil(frames: np.ndarray) -> list[Image.Image]:
"""Convert ``(T, H, W, C)`` uint8 frames to a list of PIL images."""
if frames.ndim != 4:
raise ValueError(f"Expected (T,H,W,C) frames; got shape {frames.shape}")
if frames.dtype != np.uint8:
frames = np.clip(frames, 0, 255).astype(np.uint8)
return [Image.fromarray(frames[i]) for i in range(frames.shape[0])]
def _expand_tasks(task: Any, *, batch_size: int, default: str | None) -> list[str]: def _expand_tasks(task: Any, *, batch_size: int, default: str | None) -> list[str]:
@@ -120,10 +112,9 @@ class TOPRewardEncoderProcessorStep(ProcessorStep):
Loads a :class:`~transformers.AutoProcessor` matching ``vlm_name`` and Loads a :class:`~transformers.AutoProcessor` matching ``vlm_name`` and
builds the full chat prompt including the instruction suffix. The builds the full chat prompt including the instruction suffix. The
resulting ``input_ids``, ``attention_mask``, vision tensors, and a resulting ``input_ids``, ``attention_mask``, vision tensors, and
per-sample ``prompt_length`` integer are written under the ``labels`` are written under the ``observation.topreward.*`` namespace
``observation.topreward.*`` namespace so the model can label-mask and so the model can score without re-tokenising.
forward without re-tokenising.
At call time the step reads: At call time the step reads:
@@ -131,7 +122,7 @@ class TOPRewardEncoderProcessorStep(ProcessorStep):
- ``complementary_data[task_key]``: a string or list of strings. - ``complementary_data[task_key]``: a string or list of strings.
and writes ``observation[f"{TOPREWARD_FEATURE_PREFIX}<name>"]`` for the and writes ``observation[f"{TOPREWARD_FEATURE_PREFIX}<name>"]`` for the
Qwen-VL tensors plus ``prompt_length``. Qwen-VL tensors plus ``labels``.
""" """
vlm_name: str = "Qwen/Qwen3-VL-8B-Instruct" vlm_name: str = "Qwen/Qwen3-VL-8B-Instruct"
@@ -149,35 +140,26 @@ class TOPRewardEncoderProcessorStep(ProcessorStep):
def __post_init__(self) -> None: def __post_init__(self) -> None:
require_package("transformers", extra="topreward") require_package("transformers", extra="topreward")
require_package("qwen-vl-utils", extra="topreward", import_name="qwen_vl_utils")
self._processor = AutoProcessor.from_pretrained(self.vlm_name, trust_remote_code=True) self._processor = AutoProcessor.from_pretrained(self.vlm_name, trust_remote_code=True)
def __call__(self, transition: EnvTransition) -> EnvTransition: def __call__(self, transition: EnvTransition) -> EnvTransition:
observation = transition.get(TransitionKey.OBSERVATION) observation = transition.get(TransitionKey.OBSERVATION)
complementary = transition.get(TransitionKey.COMPLEMENTARY_DATA) or {} complementary = transition.get(TransitionKey.COMPLEMENTARY_DATA) or {}
if not isinstance(observation, dict):
raise ValueError("TOPRewardEncoderProcessorStep requires an observation dict")
if self.image_key not in observation: if self.image_key not in observation:
raise KeyError(f"TOPReward expected image key {self.image_key!r} in observation") raise KeyError(f"TOPReward expected image key {self.image_key!r} in observation")
frames = observation[self.image_key] frames = observation[self.image_key]
tensor = frames.detach().cpu() if isinstance(frames, Tensor) else torch.as_tensor(frames) videos = frames.detach().cpu() if isinstance(frames, Tensor) else torch.as_tensor(frames)
if tensor.ndim == 4: videos = _prepare_video_batch(videos, max_frames=self.max_frames)
tensor = tensor.unsqueeze(1)
elif tensor.ndim != 5:
raise ValueError(
f"Expected TOPReward frames with shape (B,C,H,W) or (B,T,C,H,W); got {tuple(tensor.shape)}"
)
batch_size = tensor.shape[0] batch_size = videos.shape[0]
tasks = _expand_tasks( tasks = _expand_tasks(
complementary.get(self.task_key, self.default_task), complementary.get(self.task_key, self.default_task),
batch_size=batch_size, batch_size=batch_size,
default=self.default_task, default=self.default_task,
) )
encoded = self._encode_batch(tensor, tasks) encoded = self._encode_batch(videos, tasks, batch_size)
new_observation = dict(observation) new_observation = dict(observation)
for key, value in encoded.items(): for key, value in encoded.items():
@@ -187,34 +169,33 @@ class TOPRewardEncoderProcessorStep(ProcessorStep):
new_transition[TransitionKey.OBSERVATION] = new_observation new_transition[TransitionKey.OBSERVATION] = new_observation
return new_transition return new_transition
def _encode_batch(self, tensor: Tensor, tasks: list[str]) -> dict[str, Any]: def _encode_batch(self, videos: Tensor, tasks: list[str], batch_size) -> dict[str, Any]:
"""Tokenise a batch of (frames, task) pairs into Qwen-VL tensors. """Tokenise a batch of (frames, task) pairs into Qwen-VL tensors.
Processes samples one at a time (each may have a different token The loop only builds per-sample chat strings. Tokenisation, padding,
length due to different numbers of vision patches), then pads / video preprocessing, and label construction are batched.
stacks the results.
""" """
from qwen_vl_utils import process_vision_info
batch_size = tensor.shape[0] texts: list[str] = []
all_encoded: list[dict[str, Any]] = [] video_metadata = [
all_prompt_lengths: list[int] = [] {
"total_num_frames": int(videos.shape[1]),
for i in range(batch_size): "fps": float(self.fps),
frames_np = _video_to_numpy(tensor[i], max_frames=self.max_frames) "frames_indices": list(range(int(videos.shape[1]))),
pil_frames = _frames_to_pil(frames_np) }
task = tasks[i] for _ in range(batch_size)
]
instruction_suffix = self.prompt_suffix_template.format(instruction=task)
eos_token = self._processor.tokenizer.eos_token eos_token = self._processor.tokenizer.eos_token
for i in range(batch_size):
instruction_suffix = self.prompt_suffix_template.format(instruction=tasks[i])
if self.add_chat_template: if self.add_chat_template:
suffix_for_template = instruction_suffix.removesuffix(_TRUE_ANSWER).rstrip() suffix_for_template = instruction_suffix.removesuffix(_TRUE_ANSWER).rstrip()
templated_messages = [ templated_messages = [
{ {
"role": "user", "role": "user",
"content": [ "content": [
{"type": "video", "video": pil_frames, "fps": self.fps}, {"type": "video", "video": videos[i], "fps": self.fps},
{"type": "text", "text": f"{self.prompt_prefix}{suffix_for_template}"}, {"type": "text", "text": f"{self.prompt_prefix}{suffix_for_template}"},
], ],
} }
@@ -223,13 +204,12 @@ class TOPRewardEncoderProcessorStep(ProcessorStep):
templated_messages, tokenize=False, add_generation_prompt=True templated_messages, tokenize=False, add_generation_prompt=True
) )
full_text = f"{prompt_chat}{_TRUE_ANSWER}" full_text = f"{prompt_chat}{_TRUE_ANSWER}"
image_inputs, video_inputs = process_vision_info(templated_messages)
else: else:
user_messages = [ user_messages = [
{ {
"role": "user", "role": "user",
"content": [ "content": [
{"type": "video", "video": pil_frames, "fps": self.fps}, {"type": "video", "video": videos[i], "fps": self.fps},
{"type": "text", "text": self.prompt_prefix}, {"type": "text", "text": self.prompt_prefix},
], ],
} }
@@ -240,59 +220,29 @@ class TOPRewardEncoderProcessorStep(ProcessorStep):
if eos_token is not None: if eos_token is not None:
prompt_chat = prompt_chat.split(eos_token)[0] prompt_chat = prompt_chat.split(eos_token)[0]
full_text = f"{prompt_chat}{instruction_suffix}" full_text = f"{prompt_chat}{instruction_suffix}"
image_inputs, video_inputs = process_vision_info(user_messages)
inputs = self._processor( texts.append(full_text)
text=[full_text],
images=image_inputs, result = self._processor(
videos=video_inputs, text=texts,
videos=videos,
video_metadata=video_metadata,
do_sample_frames=False,
padding=True, padding=True,
padding_side="left",
return_tensors="pt", return_tensors="pt",
) )
input_ids = result["input_ids"]
input_len = int(inputs["input_ids"].shape[-1]) if input_ids.shape[-1] > self.max_length:
if input_len > self.max_length:
raise ValueError( raise ValueError(
f"TOPReward input length {input_len} exceeds max_length " f"TOPReward input length {input_ids.shape[-1]} exceeds max_length "
f"{self.max_length}; lower `max_frames` or raise `max_length`." f"{self.max_length}; lower `max_frames` or raise `max_length`."
) )
prompt_length = input_len - 1 labels = torch.full_like(input_ids, -100)
all_encoded.append(inputs) labels[:, -1] = input_ids[:, -1]
all_prompt_lengths.append(prompt_length) result["labels"] = labels
result = dict(all_encoded[0]) if batch_size == 1 else self._pad_and_stack(all_encoded)
result["prompt_length"] = torch.tensor(all_prompt_lengths, dtype=torch.long)
return result
@staticmethod
def _pad_and_stack(encoded_list: list[dict[str, Any]]) -> dict[str, Any]:
"""Right-pad and stack per-sample encoded dicts into a batch."""
keys = [k for k in encoded_list[0] if isinstance(encoded_list[0][k], Tensor)]
max_len = max(enc["input_ids"].shape[-1] for enc in encoded_list)
result: dict[str, Any] = {}
for key in keys:
tensors = [enc[key] for enc in encoded_list]
if key in ("input_ids", "attention_mask"):
padded = []
pad_value = 0
for t in tensors:
pad_size = max_len - t.shape[-1]
if pad_size > 0:
padded.append(torch.nn.functional.pad(t, (0, pad_size), value=pad_value))
else:
padded.append(t)
result[key] = torch.cat(padded, dim=0)
else:
# Vision tensors (pixel_values_videos, image_grid_thw, etc.) are expected
# to have matching shapes since max_frames is applied uniformly per batch
result[key] = torch.cat(tensors, dim=0)
for key in encoded_list[0]:
if key not in result:
result[key] = encoded_list[0][key]
return result return result
def transform_features( def transform_features(
@@ -325,8 +275,8 @@ def make_topreward_pre_post_processors(
"""Pipeline that pre-encodes frames + task into Qwen-VL tensors. """Pipeline that pre-encodes frames + task into Qwen-VL tensors.
The preprocessor adds a batch dimension if needed, runs TOPReward's The preprocessor adds a batch dimension if needed, runs TOPReward's
encoder (which tokenises the full prompt and emits ``prompt_length``), encoder (which tokenises the full prompt and emits ``labels``), and
and moves everything to the configured device. The postprocessor is moves everything to the configured device. The postprocessor is
the identity since TOPReward outputs a single reward tensor. the identity since TOPReward outputs a single reward tensor.
""" """
preprocessor = PolicyProcessorPipeline[dict[str, Any], dict[str, Any]]( preprocessor = PolicyProcessorPipeline[dict[str, Any], dict[str, Any]](
+70 -23
View File
@@ -24,7 +24,7 @@ import torch
from lerobot.configs.rewards import RewardModelConfig from lerobot.configs.rewards import RewardModelConfig
from lerobot.rewards.factory import get_reward_model_class, make_reward_model_config from lerobot.rewards.factory import get_reward_model_class, make_reward_model_config
from lerobot.rewards.topreward import TOPRewardConfig from lerobot.rewards.topreward import TOPRewardConfig
from lerobot.rewards.topreward.processor_topreward import TOPREWARD_FEATURE_PREFIX from lerobot.rewards.topreward.processor_topreward import TOPREWARD_FEATURE_PREFIX, TOPREWARD_INPUT_KEYS
from tests.utils import skip_if_package_missing from tests.utils import skip_if_package_missing
@@ -45,20 +45,23 @@ class _FakeQwenModel(torch.nn.Module):
def from_pretrained(cls, *args, **kwargs): # noqa: ARG003 def from_pretrained(cls, *args, **kwargs): # noqa: ARG003
return cls() return cls()
def forward(self, input_ids, attention_mask=None, labels=None, **kwargs): # noqa: ARG002 def forward( # noqa: ARG002
self, input_ids, attention_mask=None, labels=None, logits_to_keep=0, **kwargs
):
batch_size, seq_len = input_ids.shape batch_size, seq_len = input_ids.shape
vocab_size = 1000 vocab_size = 1000
logits = torch.zeros(batch_size, seq_len, vocab_size) logits = torch.zeros(batch_size, seq_len, vocab_size)
# Place a controlled log-prob at the target token position so the # Place a controlled log-prob at the target token position so the
# model returns a predictable reward value. # model returns a predictable reward value.
# The label-masked suffix is the last token (prompt_length = seq_len - 1). # The label-masked suffix is the last token.
# After the causal-LM shift (logits[:, :-1], labels[:, 1:]) the scored # After the causal-LM shift (logits[:, :-1], labels[:, 1:]) the scored
# position is logits[:, -2, :] predicting labels[:, -1]. # position is logits[:, -2, :] predicting labels[:, -1].
# We set logits so that log_softmax at the target token ≈ _reward_value. # We set logits so that log_softmax at the target token ≈ _reward_value.
if labels is not None:
for i in range(batch_size): for i in range(batch_size):
target_idx = int(input_ids[i, -1].item()) target_idx = int(input_ids[i, -1].item())
logits[i, -2, target_idx] = self._reward_value * -10 # high logit -> high log-prob logits[i, -2, target_idx] = self._reward_value * -10 # high logit -> high log-prob
if logits_to_keep:
logits = logits[:, -logits_to_keep:, :]
return SimpleNamespace(logits=logits) return SimpleNamespace(logits=logits)
@@ -72,17 +75,39 @@ def _patch_build(monkeypatch) -> None:
def _make_batch( def _make_batch(
input_ids: torch.Tensor, input_ids: torch.Tensor,
attention_mask: torch.Tensor | None = None, attention_mask: torch.Tensor | None = None,
prompt_length: torch.Tensor | None = None, labels: torch.Tensor | None = None,
*,
omit: str | None = None,
) -> dict[str, torch.Tensor]: ) -> dict[str, torch.Tensor]:
"""Build a ``compute_reward``-ready batch using TOPReward's namespaced keys.""" """Build a ``compute_reward``-ready batch using TOPReward's namespaced keys."""
batch: dict[str, torch.Tensor] = {f"{TOPREWARD_FEATURE_PREFIX}input_ids": input_ids} batch_size, seq_len = input_ids.shape
if attention_mask is not None: if attention_mask is None:
batch[f"{TOPREWARD_FEATURE_PREFIX}attention_mask"] = attention_mask attention_mask = torch.ones(batch_size, seq_len, dtype=torch.long)
if prompt_length is not None: batch: dict[str, torch.Tensor] = {}
batch[f"{TOPREWARD_FEATURE_PREFIX}prompt_length"] = prompt_length if labels is not None:
batch[f"{TOPREWARD_FEATURE_PREFIX}labels"] = labels
batch.update(
{
f"{TOPREWARD_FEATURE_PREFIX}input_ids": input_ids,
f"{TOPREWARD_FEATURE_PREFIX}attention_mask": attention_mask,
f"{TOPREWARD_FEATURE_PREFIX}pixel_values_videos": torch.zeros(
batch_size, 1536, dtype=torch.float32
),
f"{TOPREWARD_FEATURE_PREFIX}video_grid_thw": torch.ones(batch_size, 3, dtype=torch.long),
f"{TOPREWARD_FEATURE_PREFIX}mm_token_type_ids": torch.zeros_like(input_ids),
}
)
if omit is not None:
batch.pop(f"{TOPREWARD_FEATURE_PREFIX}{omit}", None)
return batch return batch
def _terminal_labels(input_ids: torch.Tensor) -> torch.Tensor:
labels = torch.full_like(input_ids, -100)
labels[:, -1] = input_ids[:, -1]
return labels
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Registry + factory # Registry + factory
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@@ -105,11 +130,6 @@ def test_topreward_factory_returns_in_tree_class():
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
def test_topreward_config_rejects_bad_reduction():
with pytest.raises(ValueError, match="reduction must be"):
TOPRewardConfig(device="cpu", reduction="median")
def test_topreward_config_rejects_zero_max_frames(): def test_topreward_config_rejects_zero_max_frames():
with pytest.raises(ValueError, match="max_frames must be >= 1"): with pytest.raises(ValueError, match="max_frames must be >= 1"):
TOPRewardConfig(device="cpu", max_frames=0) TOPRewardConfig(device="cpu", max_frames=0)
@@ -142,9 +162,9 @@ def test_topreward_compute_reward_returns_one_scalar_per_sample(monkeypatch):
input_ids = torch.randint(0, 100, (2, 10)) input_ids = torch.randint(0, 100, (2, 10))
attention_mask = torch.ones(2, 10, dtype=torch.long) attention_mask = torch.ones(2, 10, dtype=torch.long)
prompt_length = torch.tensor([9, 9]) # unmask only the last token labels = _terminal_labels(input_ids)
batch = _make_batch(input_ids, attention_mask, prompt_length) batch = _make_batch(input_ids, attention_mask, labels)
rewards = model.compute_reward(batch) rewards = model.compute_reward(batch)
assert rewards.shape == (2,) assert rewards.shape == (2,)
@@ -162,9 +182,9 @@ def test_topreward_compute_reward_applies_success_threshold(monkeypatch):
input_ids = torch.randint(0, 100, (2, 10)) input_ids = torch.randint(0, 100, (2, 10))
attention_mask = torch.ones(2, 10, dtype=torch.long) attention_mask = torch.ones(2, 10, dtype=torch.long)
prompt_length = torch.tensor([9, 9]) labels = _terminal_labels(input_ids)
batch = _make_batch(input_ids, attention_mask, prompt_length) batch = _make_batch(input_ids, attention_mask, labels)
rewards = model.compute_reward(batch) rewards = model.compute_reward(batch)
assert rewards.shape == (2,) assert rewards.shape == (2,)
@@ -180,7 +200,37 @@ def test_topreward_compute_reward_errors_when_inputs_missing(monkeypatch):
model = TOPRewardModel(cfg) model = TOPRewardModel(cfg)
with pytest.raises(KeyError, match=r"observation\.topreward\.input_ids"): with pytest.raises(KeyError, match=r"observation\.topreward\.input_ids"):
model.compute_reward({}) model.compute_reward(_make_batch(torch.randint(0, 100, (1, 10)), omit="input_ids"))
@skip_if_package_missing("transformers")
def test_topreward_compute_reward_errors_when_labels_missing(monkeypatch):
from lerobot.rewards.topreward.modeling_topreward import TOPRewardModel
_patch_build(monkeypatch)
cfg = TOPRewardConfig(device="cpu")
model = TOPRewardModel(cfg)
input_ids = torch.randint(0, 100, (1, 10))
with pytest.raises(KeyError, match=r"observation\.topreward\.labels"):
model.compute_reward(_make_batch(input_ids, labels=None))
@skip_if_package_missing("transformers")
def test_topreward_compute_reward_requires_all_encoder_keys(monkeypatch):
from lerobot.rewards.topreward.modeling_topreward import TOPRewardModel
_patch_build(monkeypatch)
cfg = TOPRewardConfig(device="cpu")
model = TOPRewardModel(cfg)
input_ids = torch.randint(0, 100, (1, 10))
labels = _terminal_labels(input_ids)
required_encoder_keys = set(TOPREWARD_INPUT_KEYS) - {"input_ids", "labels"}
for key in required_encoder_keys:
with pytest.raises(KeyError, match=rf"observation\.topreward\.{key}"):
model.compute_reward(_make_batch(input_ids, labels=labels, omit=key))
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@@ -198,7 +248,6 @@ def test_topreward_save_pretrained_writes_only_config_json(monkeypatch, tmp_path
cfg = TOPRewardConfig( cfg = TOPRewardConfig(
device="cpu", device="cpu",
vlm_name="Qwen/Qwen3-VL-8B-Instruct", vlm_name="Qwen/Qwen3-VL-8B-Instruct",
reduction="sum",
fps=4.0, fps=4.0,
image_key="observation.images.front", image_key="observation.images.front",
) )
@@ -217,7 +266,6 @@ def test_topreward_from_pretrained_local_dir_roundtrips_config(monkeypatch, tmp_
cfg = TOPRewardConfig( cfg = TOPRewardConfig(
device="cpu", device="cpu",
vlm_name="Qwen/Qwen3-VL-8B-Instruct", vlm_name="Qwen/Qwen3-VL-8B-Instruct",
reduction="sum",
fps=4.0, fps=4.0,
image_key="observation.images.front", image_key="observation.images.front",
add_chat_template=True, add_chat_template=True,
@@ -229,7 +277,6 @@ def test_topreward_from_pretrained_local_dir_roundtrips_config(monkeypatch, tmp_
assert isinstance(reloaded.config, TOPRewardConfig) assert isinstance(reloaded.config, TOPRewardConfig)
assert reloaded.config.vlm_name == "Qwen/Qwen3-VL-8B-Instruct" assert reloaded.config.vlm_name == "Qwen/Qwen3-VL-8B-Instruct"
assert reloaded.config.reduction == "sum"
assert reloaded.config.fps == 4.0 assert reloaded.config.fps == 4.0
assert reloaded.config.image_key == "observation.images.front" assert reloaded.config.image_key == "observation.images.front"
assert reloaded.config.add_chat_template is True assert reloaded.config.add_chat_template is True
+80
View File
@@ -0,0 +1,80 @@
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""End-to-end TOPReward smoke test with the real Qwen3-VL model."""
import os
import pytest
import torch
pytest.importorskip("transformers")
from lerobot.rewards.topreward.configuration_topreward import TOPRewardConfig # noqa: E402
from lerobot.rewards.topreward.modeling_topreward import TOPRewardModel # noqa: E402
from lerobot.rewards.topreward.processor_topreward import ( # noqa: E402
TOPREWARD_FEATURE_PREFIX,
TOPREWARD_INPUT_KEYS,
make_topreward_pre_post_processors,
)
from tests.utils import require_cuda # noqa: E402
pytestmark = pytest.mark.skipif(
os.environ.get("CI") == "true" or os.environ.get("GITHUB_ACTIONS") == "true",
reason="This test requires downloading and loading Qwen3-VL and is not meant for CI",
)
def _make_dummy_topreward_batch(image_key: str, task_key: str) -> dict[str, object]:
num_frames = 4
image_size = 64
frames = torch.zeros(1, num_frames, 3, image_size, image_size, dtype=torch.uint8)
for frame_idx in range(num_frames):
frames[0, frame_idx, 0].fill_(min(frame_idx * 48, 255))
frames[0, frame_idx, 1].fill_(96)
frames[0, frame_idx, 2].fill_(192)
return {
image_key: frames,
task_key: ["pick up the red cube"],
}
@require_cuda
def test_topreward_full_qwen3vl_preprocessor_to_compute_reward():
cfg = TOPRewardConfig(
vlm_name="Qwen/Qwen3-VL-8B-Instruct",
device="cuda",
max_frames=4,
fps=2.0,
max_input_length=4096,
)
preprocessor, _ = make_topreward_pre_post_processors(cfg)
encoded_batch = preprocessor(_make_dummy_topreward_batch(cfg.image_key, cfg.task_key))
for key in TOPREWARD_INPUT_KEYS:
assert f"{TOPREWARD_FEATURE_PREFIX}{key}" in encoded_batch
model = TOPRewardModel(cfg)
try:
model.to(cfg.device)
model.eval()
rewards = model.compute_reward(encoded_batch)
finally:
del model
torch.cuda.empty_cache()
assert rewards.shape == (1,)
assert rewards.dtype == torch.float32
assert torch.isfinite(rewards).all()
+52 -70
View File
@@ -16,71 +16,71 @@
from __future__ import annotations from __future__ import annotations
import numpy as np
import pytest import pytest
import torch import torch
from lerobot.configs import FeatureType, PipelineFeatureType, PolicyFeature from lerobot.configs import FeatureType, PipelineFeatureType, PolicyFeature
from lerobot.rewards.topreward.processor_topreward import ( from lerobot.rewards.topreward.processor_topreward import (
TOPREWARD_FEATURE_PREFIX, TOPREWARD_FEATURE_PREFIX,
TOPREWARD_INPUT_KEYS,
_expand_tasks, _expand_tasks,
_video_to_numpy, _prepare_video_batch,
) )
from lerobot.types import TransitionKey from lerobot.types import TransitionKey
from tests.utils import skip_if_package_missing from tests.utils import skip_if_package_missing
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# _video_to_numpy — pure (T, C, H, W) -> (T, H, W, C) uint8 conversion # _prepare_video_batch — raw image/video batch -> (B, T, C, H, W) uint8
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
def test_video_to_numpy_chw_float_is_converted_to_thwc_uint8(): def test_prepare_video_batch_batched_chw_float_is_converted_to_uint8():
video = torch.rand(4, 3, 8, 8) video = torch.rand(2, 4, 3, 8, 8)
array = _video_to_numpy(video, max_frames=None) tensor = _prepare_video_batch(video, max_frames=None)
assert array.shape == (4, 8, 8, 3) assert tensor.shape == (2, 4, 3, 8, 8)
assert array.dtype == np.uint8 assert tensor.dtype == torch.uint8
assert array.min() >= 0 and array.max() <= 255 assert tensor.min() >= 0 and tensor.max() <= 255
def test_video_to_numpy_already_thwc_uint8_passes_through(): def test_prepare_video_batch_batched_thwc_uint8_is_permuted_to_channel_first():
video = torch.randint(0, 256, (3, 8, 8, 3), dtype=torch.uint8) video = torch.randint(0, 256, (2, 3, 8, 8, 3), dtype=torch.uint8)
array = _video_to_numpy(video, max_frames=None) tensor = _prepare_video_batch(video, max_frames=None)
assert array.shape == (3, 8, 8, 3) assert tensor.shape == (2, 3, 3, 8, 8)
assert array.dtype == np.uint8 assert tensor.dtype == torch.uint8
def test_video_to_numpy_max_frames_tail_crops_recent_frames(): def test_prepare_video_batch_max_frames_tail_crops_recent_frames():
video = torch.zeros(10, 3, 4, 4) video = torch.zeros(1, 10, 3, 4, 4)
for t in range(10): for t in range(10):
video[t] = t / 9.0 video[:, t] = t / 9.0
array = _video_to_numpy(video, max_frames=3) tensor = _prepare_video_batch(video, max_frames=3)
assert array.shape == (3, 4, 4, 3) assert tensor.shape == (1, 3, 3, 4, 4)
assert int(array[0, 0, 0, 0]) == int(round(7 / 9 * 255)) assert int(tensor[0, 0, 0, 0, 0]) == int(7 / 9 * 255)
assert int(array[-1, 0, 0, 0]) == 255 assert int(tensor[0, -1, 0, 0, 0]) == 255
def test_video_to_numpy_rejects_3d_input(): def test_prepare_video_batch_rejects_3d_input():
with pytest.raises(ValueError, match="Expected channel dim"): with pytest.raises(ValueError, match="Expected TOPReward frames"):
_video_to_numpy(torch.zeros(4, 8, 8), max_frames=None) _prepare_video_batch(torch.zeros(4, 8, 8), max_frames=None)
def test_video_to_numpy_floats_above_one_pass_through_without_rescaling(): def test_prepare_video_batch_floats_above_one_are_rescaled_and_clipped():
video = torch.full((1, 3, 2, 2), 5.0) video = torch.full((1, 1, 3, 2, 2), 5.0)
array = _video_to_numpy(video, max_frames=None) tensor = _prepare_video_batch(video, max_frames=None)
assert array.shape == (1, 2, 2, 3) assert tensor.shape == (1, 1, 3, 2, 2)
assert int(array.max()) == 5 assert int(tensor.max()) == 255
def test_video_to_numpy_clips_very_large_floats_to_uint8_max(): def test_prepare_video_batch_clips_very_large_floats_to_uint8_max():
video = torch.full((1, 3, 2, 2), 300.0) video = torch.full((1, 1, 3, 2, 2), 300.0)
array = _video_to_numpy(video, max_frames=None) tensor = _prepare_video_batch(video, max_frames=None)
assert int(array.max()) == 255 assert int(tensor.max()) == 255
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@@ -124,12 +124,11 @@ def test_expand_tasks_wrong_type_raises():
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Encoder step — stubbed AutoProcessor + process_vision_info # Encoder step — stubbed AutoProcessor
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
def _skip_if_topreward_extras_missing(func): def _skip_if_topreward_extras_missing(func):
func = skip_if_package_missing("qwen-vl-utils", import_name="qwen_vl_utils")(func)
func = skip_if_package_missing("transformers")(func) func = skip_if_package_missing("transformers")(func)
return func return func
@@ -155,32 +154,20 @@ class _FakeAutoProcessor:
def __call__(self, text=None, images=None, videos=None, **kwargs): # noqa: ARG002 def __call__(self, text=None, images=None, videos=None, **kwargs): # noqa: ARG002
seq_len = 10 seq_len = 10
batch_size = len(text) if isinstance(text, list) else 1
return { return {
"input_ids": torch.randint(0, 100, (1, seq_len)), "input_ids": torch.randint(0, 100, (batch_size, seq_len)),
"attention_mask": torch.ones(1, seq_len, dtype=torch.long), "attention_mask": torch.ones(batch_size, seq_len, dtype=torch.long),
"pixel_values_videos": torch.zeros(batch_size, 1536, dtype=torch.float32),
"video_grid_thw": torch.ones(batch_size, 3, dtype=torch.long),
"mm_token_type_ids": torch.zeros(batch_size, seq_len, dtype=torch.long),
} }
def _build_step(monkeypatch, **overrides): def _build_step(monkeypatch, **overrides):
import importlib
import sys
import types
from lerobot.rewards.topreward import processor_topreward from lerobot.rewards.topreward import processor_topreward
from lerobot.utils import import_utils
monkeypatch.setattr(processor_topreward, "AutoProcessor", _FakeAutoProcessor) monkeypatch.setattr(processor_topreward, "AutoProcessor", _FakeAutoProcessor)
# Stub qwen_vl_utils as a real module object (not MagicMock) so
# ``require_package`` / ``find_spec`` don't choke on a missing ``__spec__``.
fake_qwen_vl = types.ModuleType("qwen_vl_utils")
fake_qwen_vl.process_vision_info = lambda messages: (None, None) # type: ignore[attr-defined]
fake_qwen_vl.__spec__ = importlib.machinery.ModuleSpec("qwen_vl_utils", None)
monkeypatch.setitem(sys.modules, "qwen_vl_utils", fake_qwen_vl)
# Clear the require_package cache so the stub is picked up.
import_utils._require_package_cache.pop("qwen_vl_utils", None)
return processor_topreward.TOPRewardEncoderProcessorStep(**overrides) return processor_topreward.TOPRewardEncoderProcessorStep(**overrides)
@@ -192,27 +179,29 @@ def _make_transition(observation: dict, complementary: dict | None = None) -> di
@_skip_if_topreward_extras_missing @_skip_if_topreward_extras_missing
def test_encoder_step_emits_input_ids_and_prompt_length(monkeypatch): def test_encoder_step_emits_input_ids_and_labels(monkeypatch):
"""The processor must emit Qwen-VL tensors including ``input_ids`` and """The processor must emit Qwen-VL tensors including ``input_ids`` and
``prompt_length`` under the ``observation.topreward.*`` namespace.""" ``labels`` under the ``observation.topreward.*`` namespace."""
step = _build_step(monkeypatch) step = _build_step(monkeypatch)
frames_batch = torch.zeros(1, 4, 3, 8, 8) frames_batch = torch.zeros(2, 4, 3, 8, 8)
out = step( out = step(
_make_transition( _make_transition(
observation={"observation.images.top": frames_batch}, observation={"observation.images.top": frames_batch},
complementary={"task": "pick"}, complementary={"task": ["pick", "place"]},
) )
) )
obs_out = out[TransitionKey.OBSERVATION] obs_out = out[TransitionKey.OBSERVATION]
assert f"{TOPREWARD_FEATURE_PREFIX}input_ids" in obs_out for key in TOPREWARD_INPUT_KEYS:
assert f"{TOPREWARD_FEATURE_PREFIX}attention_mask" in obs_out assert f"{TOPREWARD_FEATURE_PREFIX}{key}" in obs_out
assert f"{TOPREWARD_FEATURE_PREFIX}prompt_length" in obs_out
prompt_length = obs_out[f"{TOPREWARD_FEATURE_PREFIX}prompt_length"] input_ids = obs_out[f"{TOPREWARD_FEATURE_PREFIX}input_ids"]
assert prompt_length.dtype == torch.long labels = obs_out[f"{TOPREWARD_FEATURE_PREFIX}labels"]
assert prompt_length.shape == (1,) assert labels.dtype == torch.long
assert labels.shape == (2, 10)
assert labels[:, :-1].eq(-100).all()
assert labels[:, -1].equal(input_ids[:, -1])
@_skip_if_topreward_extras_missing @_skip_if_topreward_extras_missing
@@ -255,10 +244,3 @@ def test_encoder_step_rejects_missing_image_key(monkeypatch):
step = _build_step(monkeypatch, image_key="observation.images.top") step = _build_step(monkeypatch, image_key="observation.images.top")
with pytest.raises(KeyError, match="image key"): with pytest.raises(KeyError, match="image key"):
step(_make_transition(observation={}, complementary={"task": "pick"})) step(_make_transition(observation={}, complementary={"task": "pick"}))
@_skip_if_topreward_extras_missing
def test_encoder_step_rejects_non_dict_observation(monkeypatch):
step = _build_step(monkeypatch)
with pytest.raises(ValueError, match="observation dict"):
step({TransitionKey.OBSERVATION: torch.zeros(1, 3, 8, 8)})
Generated
-2
View File
@@ -3010,7 +3010,6 @@ test = [
{ name = "pytest-timeout" }, { name = "pytest-timeout" },
] ]
topreward = [ topreward = [
{ name = "qwen-vl-utils" },
{ name = "transformers" }, { name = "transformers" },
] ]
training = [ training = [
@@ -3158,7 +3157,6 @@ requires-dist = [
{ name = "lerobot", extras = ["pyzmq-dep"], marker = "extra == 'unitree-g1'" }, { name = "lerobot", extras = ["pyzmq-dep"], marker = "extra == 'unitree-g1'" },
{ name = "lerobot", extras = ["qwen-vl-utils-dep"], marker = "extra == 'eo1'" }, { name = "lerobot", extras = ["qwen-vl-utils-dep"], marker = "extra == 'eo1'" },
{ name = "lerobot", extras = ["qwen-vl-utils-dep"], marker = "extra == 'sarm'" }, { name = "lerobot", extras = ["qwen-vl-utils-dep"], marker = "extra == 'sarm'" },
{ name = "lerobot", extras = ["qwen-vl-utils-dep"], marker = "extra == 'topreward'" },
{ name = "lerobot", extras = ["qwen-vl-utils-dep"], marker = "extra == 'wallx'" }, { name = "lerobot", extras = ["qwen-vl-utils-dep"], marker = "extra == 'wallx'" },
{ name = "lerobot", extras = ["reachy2"], marker = "extra == 'all'" }, { name = "lerobot", extras = ["reachy2"], marker = "extra == 'all'" },
{ name = "lerobot", extras = ["rebot"], marker = "extra == 'all'" }, { name = "lerobot", extras = ["rebot"], marker = "extra == 'all'" },