diff --git a/docs/source/topreward.mdx b/docs/source/topreward.mdx index cb497428a..f84fbed49 100644 --- a/docs/source/topreward.mdx +++ b/docs/source/topreward.mdx @@ -53,7 +53,7 @@ or, with `uv` from a source checkout: uv sync --extra topreward ``` -This pulls in `transformers` and `qwen-vl-utils`. The first time you run TOPReward, Hugging Face will also download the VLM weights from the Hub (~16 GB for Qwen3-VL-8B-Instruct). A GPU is strongly recommended. +This pulls in `transformers`. The first time you run TOPReward, Hugging Face will also download the VLM weights from the Hub (~16 GB for Qwen3-VL-8B-Instruct). A GPU is strongly recommended. ## Model Inputs and Outputs diff --git a/pyproject.toml b/pyproject.toml index 42b01c616..ba7588446 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -209,7 +209,7 @@ groot = [ "flash-attn>=2.5.9,<3.0.0 ; sys_platform != 'darwin'" ] sarm = ["lerobot[transformers-dep]", "pydantic>=2.0.0,<3.0.0", "faker>=33.0.0,<35.0.0", "lerobot[matplotlib-dep]", "lerobot[qwen-vl-utils-dep]"] -topreward = ["lerobot[transformers-dep]", "lerobot[qwen-vl-utils-dep]"] +topreward = ["lerobot[transformers-dep]"] xvla = ["lerobot[transformers-dep]"] eo1 = ["lerobot[transformers-dep]", "lerobot[qwen-vl-utils-dep]"] hilserl = ["lerobot[transformers-dep]", "lerobot[dataset]", "gym-hil>=0.1.13,<0.2.0", "lerobot[grpcio-dep]", "lerobot[placo-dep]"] diff --git a/src/lerobot/rewards/topreward/compute_rabc_weights.py b/src/lerobot/rewards/topreward/compute_rabc_weights.py index 7e2b07d7c..a448654e5 100644 --- a/src/lerobot/rewards/topreward/compute_rabc_weights.py +++ b/src/lerobot/rewards/topreward/compute_rabc_weights.py @@ -107,10 +107,10 @@ def compute_instruction_rewards_for_prefixes( else: prefix_lengths = np.unique(np.linspace(1, num_frames, num_samples).round().astype(np.int64)) + episode_frames = torch.stack([dataset[ep_start + i][image_key] for i in range(num_frames)]) rewards: list[float] = [] for length in prefix_lengths: - frames = torch.stack([dataset[ep_start + i][image_key] for i in range(int(length))]) - frames = frames.unsqueeze(0) # (1, T, C, H, W) + frames = episode_frames[: int(length)].unsqueeze(0) # (1, T, C, H, W) transition = { TransitionKey.OBSERVATION: {image_key: frames}, @@ -146,7 +146,6 @@ def compute_topreward_progress( device: str = "cuda", num_samples: int | None = None, fps: float | None = None, - reduction: str | None = None, episodes: list[int] | None = None, ) -> Path: """Run TOPReward over a dataset and write per-frame progress.""" @@ -154,10 +153,10 @@ def compute_topreward_progress( logging.info(f"Loading TOPReward config from: {reward_model_path}") model = TOPRewardModel.from_pretrained(reward_model_path) config = model.config + config.device = device if vlm_name is not None and vlm_name != config.vlm_name: logging.info(f"Overriding vlm_name from config: {config.vlm_name} -> {vlm_name}") config.vlm_name = vlm_name - config.device = device model = TOPRewardModel(config) else: config_kwargs: dict[str, Any] = {"device": device} @@ -165,8 +164,6 @@ def compute_topreward_progress( config_kwargs["vlm_name"] = vlm_name if fps is not None: config_kwargs["fps"] = fps - if reduction is not None: - config_kwargs["reduction"] = reduction config = TOPRewardConfig(**config_kwargs) logging.info(f"Constructing TOPReward with VLM: {config.vlm_name}") model = TOPRewardModel(config) @@ -302,9 +299,6 @@ Examples: help="Process only these episode indices (e.g. --episodes 0 or --episodes 0 5 10).", ) parser.add_argument("--fps", type=float, default=None, help="Override TOPRewardConfig.fps.") - parser.add_argument( - "--reduction", type=str, default=None, choices=["mean", "sum"], help="Override reduction." - ) parser.add_argument( "--push-to-hub", action="store_true", help="Upload to the dataset repo on HuggingFace Hub." ) @@ -321,7 +315,6 @@ Examples: device=args.device, num_samples=args.num_samples, fps=args.fps, - reduction=args.reduction, episodes=args.episodes, ) diff --git a/src/lerobot/rewards/topreward/configuration_topreward.py b/src/lerobot/rewards/topreward/configuration_topreward.py index cf5ea38a6..7302734c8 100644 --- a/src/lerobot/rewards/topreward/configuration_topreward.py +++ b/src/lerobot/rewards/topreward/configuration_topreward.py @@ -22,8 +22,8 @@ from lerobot.utils.constants import OBS_IMAGES # Default prompt scaffolding from the upstream TOPReward paper / reference # implementation (``QwenClient.compute_instruction_reward``). The prompt -# computes the log-likelihood of the suffix ``f"{instruction} ... True"`` -# given the video, then reduces those token log-probs to a scalar reward. +# scores the terminal ``True`` token in ``f"{instruction} ... True"`` +# given the video. DEFAULT_PROMPT_PREFIX = ( "The above video shows a robot manipulation trajectory that completes the following task: " ) @@ -67,8 +67,6 @@ class TOPRewardConfig(RewardModelConfig): add_chat_template: If ``True``, wrap the full prompt with the tokenizer's chat template before tokenisation (matches upstream ``add_chat_template=True``). - reduction: Reduction over per-token log-probs of the suffix - tokens (``"mean"`` or ``"sum"``). success_threshold: Optional log-prob threshold. If finite, :meth:`TOPRewardModel.compute_reward` returns ``(reward > success_threshold).float()`` instead of the raw @@ -96,7 +94,6 @@ class TOPRewardConfig(RewardModelConfig): prompt_suffix_template: str = DEFAULT_PROMPT_SUFFIX_TEMPLATE add_chat_template: bool = False - reduction: str = "mean" success_threshold: float = float("-inf") max_input_length: int = 32768 @@ -116,8 +113,6 @@ class TOPRewardConfig(RewardModelConfig): def __post_init__(self) -> None: super().__post_init__() - if self.reduction not in {"mean", "sum"}: - raise ValueError(f"reduction must be 'mean' or 'sum', got {self.reduction!r}") if self.max_frames is not None and self.max_frames < 1: raise ValueError(f"max_frames must be >= 1, got {self.max_frames}") if self.fps <= 0: diff --git a/src/lerobot/rewards/topreward/modeling_topreward.py b/src/lerobot/rewards/topreward/modeling_topreward.py index 4f057e737..4958d5449 100644 --- a/src/lerobot/rewards/topreward/modeling_topreward.py +++ b/src/lerobot/rewards/topreward/modeling_topreward.py @@ -29,15 +29,16 @@ and returns that log-likelihood as the reward signal. Inference recipe: 1. The processor builds a chat-style prompt, tokenises it, and emits - ``input_ids``, ``attention_mask``, vision tensors, and ``prompt_length``. -2. The model label-masks everything before ``prompt_length`` with ``-100``. -3. Forward the full token sequence through the VLM. -4. Read per-token log-probabilities of the unmasked suffix tokens from the - logits and reduce them (mean or sum) into a scalar reward. + ``input_ids``, ``attention_mask``, vision tensors, and ``labels``. + The processor label-masks everything except the terminal answer token with + ``-100``. +2. Forward the full token sequence through the VLM. +3. Read the terminal answer token log-probability from the logits as the + scalar reward. -With the default ``prompt_suffix_template`` and ``prompt_length = input_len - 1`` -(mirrored from upstream), the only unmasked token is the literal ``"True"`` -at the end — the reward is ``log P("True" | video + prompt + instruction)``. +With the default ``prompt_suffix_template``, the only unmasked token is the +literal ``"True"`` at the end — the reward is +``log P("True" | video + prompt + instruction)``. This LeRobot port is **inference-only and not trainable** — :meth:`forward` is intentionally inherited from :class:`PreTrainedRewardModel` and raises @@ -66,6 +67,7 @@ from huggingface_hub import HfApi, hf_hub_download from huggingface_hub.constants import CONFIG_NAME from huggingface_hub.errors import HfHubHTTPError from torch import Tensor +from torch.nn.functional import cross_entropy from lerobot.configs.rewards import RewardModelConfig from lerobot.rewards.pretrained import PreTrainedRewardModel @@ -116,57 +118,29 @@ class TOPRewardModel(PreTrainedRewardModel): def compute_reward(self, batch: dict[str, Any]) -> Tensor: """Return one log-prob reward per sample in the batch.""" - inputs = { - key: batch[f"{TOPREWARD_FEATURE_PREFIX}{key}"] - for key in TOPREWARD_INPUT_KEYS - if f"{TOPREWARD_FEATURE_PREFIX}{key}" in batch - } - if "input_ids" not in inputs: - raise KeyError( - f"TOPReward batch missing pre-encoded inputs (expected " - f"`{TOPREWARD_FEATURE_PREFIX}input_ids`). Make sure the " - "TOPRewardEncoderProcessorStep ran before `compute_reward`." - ) + inputs: dict[str, Any] = {} + for key in TOPREWARD_INPUT_KEYS: + batch_key = f"{TOPREWARD_FEATURE_PREFIX}{key}" + if batch_key not in batch: + raise KeyError( + f"TOPReward batch missing `{batch_key}`. Make sure the " + "TOPRewardEncoderProcessorStep ran before `compute_reward`." + ) + inputs[key] = batch[batch_key] - prompt_lengths = inputs.pop("prompt_length") device = next(self.model.parameters()).device inputs = {key: value.to(device) if hasattr(value, "to") else value for key, value in inputs.items()} - - labels = inputs["input_ids"].clone() - for i, plen in enumerate(prompt_lengths.tolist()): - labels[i, : int(plen)] = -100 - if "attention_mask" in inputs: - labels = labels.masked_fill(inputs["attention_mask"] == 0, -100) + labels = inputs.pop("labels") + inputs["logits_to_keep"] = 2 self.eval() with torch.no_grad(): outputs = self.model(**inputs) - - logits = outputs.logits[:, :-1, :] - target_labels = labels[:, 1:] - log_probs = torch.nn.functional.log_softmax(logits, dim=-1) - mask = target_labels != -100 - safe_targets = target_labels.masked_fill(~mask, 0) - token_log_probs = log_probs.gather(-1, safe_targets.unsqueeze(-1)).squeeze(-1) - - batch_size = inputs["input_ids"].shape[0] - rewards = [] - for i in range(batch_size): - sample_log_probs = token_log_probs[i][mask[i]] - if sample_log_probs.numel() == 0: - raise RuntimeError( - "TOPReward could not isolate any suffix tokens to score. Check that " - "`prompt_suffix_template` produces at least one tokenised character." - ) - if self.config.reduction == "sum": - rewards.append(sample_log_probs.sum().item()) - else: - rewards.append(sample_log_probs.mean().item()) - - out = torch.as_tensor(rewards, dtype=torch.float32) + logits = outputs.logits + rewards = -cross_entropy(logits[:, -2, :].float(), labels[:, -1], reduction="none") if np.isfinite(self.config.success_threshold): - out = (out > self.config.success_threshold).float() - return out.to(self.config.device or "cpu") + rewards = (rewards > self.config.success_threshold).float() + return rewards.to(self.config.device or "cpu") def _save_pretrained(self, save_directory: Path) -> None: """Save ``config.json`` only.""" diff --git a/src/lerobot/rewards/topreward/processor_topreward.py b/src/lerobot/rewards/topreward/processor_topreward.py index 6bf73fcb4..ff0646e49 100644 --- a/src/lerobot/rewards/topreward/processor_topreward.py +++ b/src/lerobot/rewards/topreward/processor_topreward.py @@ -19,9 +19,7 @@ from __future__ import annotations from dataclasses import dataclass, field from typing import TYPE_CHECKING, Any -import numpy as np import torch -from PIL import Image from torch import Tensor from lerobot.configs import PipelineFeatureType, PolicyFeature @@ -60,39 +58,33 @@ _TRUE_ANSWER = "True" TOPREWARD_VLM_INPUT_KEYS = ( "input_ids", "attention_mask", - "pixel_values", "pixel_values_videos", - "image_grid_thw", "video_grid_thw", - "second_per_grid_ts", "mm_token_type_ids", ) -TOPREWARD_METADATA_KEYS = ("prompt_length",) -TOPREWARD_INPUT_KEYS = TOPREWARD_VLM_INPUT_KEYS + TOPREWARD_METADATA_KEYS +TOPREWARD_INPUT_KEYS = TOPREWARD_VLM_INPUT_KEYS + ("labels",) -def _video_to_numpy(video: Tensor, *, max_frames: int | None) -> np.ndarray: - """Convert one trajectory tensor to a ``(T, H, W, C) uint8`` numpy array.""" +def _prepare_video_batch(video: Tensor, *, max_frames: int | None) -> Tensor: + """Return videos as ``(B, T, C, H, W)`` uint8 tensors for Qwen3-VL.""" + if video.ndim == 4: + video = video.unsqueeze(1) + elif video.ndim != 5: + raise ValueError( + f"Expected TOPReward frames with shape (B,C,H,W) or (B,T,C,H,W); got {tuple(video.shape)}" + ) + if max_frames is not None: - video = video[-max_frames:] - if video.shape[1] in (1, 3): - video = video.permute(0, 2, 3, 1) - elif video.shape[-1] not in (1, 3): + video = video[:, -max_frames:] + if video.shape[-1] in (1, 3): + video = video.permute(0, 1, 4, 2, 3) + elif video.shape[2] not in (1, 3): raise ValueError(f"Expected channel dim of size 1 or 3, got shape {tuple(video.shape)}") - array = video.detach().cpu().numpy() - if np.issubdtype(array.dtype, np.floating) and array.size > 0 and array.max() <= 1.0: - array = array * 255.0 - return np.clip(array, 0, 255).astype(np.uint8) + if video.is_floating_point(): + video = video * 255.0 - -def _frames_to_pil(frames: np.ndarray) -> list[Image.Image]: - """Convert ``(T, H, W, C)`` uint8 frames to a list of PIL images.""" - if frames.ndim != 4: - raise ValueError(f"Expected (T,H,W,C) frames; got shape {frames.shape}") - if frames.dtype != np.uint8: - frames = np.clip(frames, 0, 255).astype(np.uint8) - return [Image.fromarray(frames[i]) for i in range(frames.shape[0])] + return video.clamp(0, 255).to(torch.uint8).contiguous() def _expand_tasks(task: Any, *, batch_size: int, default: str | None) -> list[str]: @@ -120,10 +112,9 @@ class TOPRewardEncoderProcessorStep(ProcessorStep): Loads a :class:`~transformers.AutoProcessor` matching ``vlm_name`` and builds the full chat prompt including the instruction suffix. The - resulting ``input_ids``, ``attention_mask``, vision tensors, and a - per-sample ``prompt_length`` integer are written under the - ``observation.topreward.*`` namespace so the model can label-mask and - forward without re-tokenising. + resulting ``input_ids``, ``attention_mask``, vision tensors, and + ``labels`` are written under the ``observation.topreward.*`` namespace + so the model can score without re-tokenising. At call time the step reads: @@ -131,7 +122,7 @@ class TOPRewardEncoderProcessorStep(ProcessorStep): - ``complementary_data[task_key]``: a string or list of strings. and writes ``observation[f"{TOPREWARD_FEATURE_PREFIX}"]`` for the - Qwen-VL tensors plus ``prompt_length``. + Qwen-VL tensors plus ``labels``. """ vlm_name: str = "Qwen/Qwen3-VL-8B-Instruct" @@ -149,35 +140,26 @@ class TOPRewardEncoderProcessorStep(ProcessorStep): def __post_init__(self) -> None: require_package("transformers", extra="topreward") - require_package("qwen-vl-utils", extra="topreward", import_name="qwen_vl_utils") self._processor = AutoProcessor.from_pretrained(self.vlm_name, trust_remote_code=True) def __call__(self, transition: EnvTransition) -> EnvTransition: observation = transition.get(TransitionKey.OBSERVATION) complementary = transition.get(TransitionKey.COMPLEMENTARY_DATA) or {} - if not isinstance(observation, dict): - raise ValueError("TOPRewardEncoderProcessorStep requires an observation dict") - if self.image_key not in observation: raise KeyError(f"TOPReward expected image key {self.image_key!r} in observation") frames = observation[self.image_key] - tensor = frames.detach().cpu() if isinstance(frames, Tensor) else torch.as_tensor(frames) - if tensor.ndim == 4: - tensor = tensor.unsqueeze(1) - elif tensor.ndim != 5: - raise ValueError( - f"Expected TOPReward frames with shape (B,C,H,W) or (B,T,C,H,W); got {tuple(tensor.shape)}" - ) + videos = frames.detach().cpu() if isinstance(frames, Tensor) else torch.as_tensor(frames) + videos = _prepare_video_batch(videos, max_frames=self.max_frames) - batch_size = tensor.shape[0] + batch_size = videos.shape[0] tasks = _expand_tasks( complementary.get(self.task_key, self.default_task), batch_size=batch_size, default=self.default_task, ) - encoded = self._encode_batch(tensor, tasks) + encoded = self._encode_batch(videos, tasks, batch_size) new_observation = dict(observation) for key, value in encoded.items(): @@ -187,34 +169,33 @@ class TOPRewardEncoderProcessorStep(ProcessorStep): new_transition[TransitionKey.OBSERVATION] = new_observation return new_transition - def _encode_batch(self, tensor: Tensor, tasks: list[str]) -> dict[str, Any]: + def _encode_batch(self, videos: Tensor, tasks: list[str], batch_size) -> dict[str, Any]: """Tokenise a batch of (frames, task) pairs into Qwen-VL tensors. - Processes samples one at a time (each may have a different token - length due to different numbers of vision patches), then pads / - stacks the results. + The loop only builds per-sample chat strings. Tokenisation, padding, + video preprocessing, and label construction are batched. """ - from qwen_vl_utils import process_vision_info - batch_size = tensor.shape[0] - all_encoded: list[dict[str, Any]] = [] - all_prompt_lengths: list[int] = [] + texts: list[str] = [] + video_metadata = [ + { + "total_num_frames": int(videos.shape[1]), + "fps": float(self.fps), + "frames_indices": list(range(int(videos.shape[1]))), + } + for _ in range(batch_size) + ] + eos_token = self._processor.tokenizer.eos_token for i in range(batch_size): - frames_np = _video_to_numpy(tensor[i], max_frames=self.max_frames) - pil_frames = _frames_to_pil(frames_np) - task = tasks[i] - - instruction_suffix = self.prompt_suffix_template.format(instruction=task) - eos_token = self._processor.tokenizer.eos_token - + instruction_suffix = self.prompt_suffix_template.format(instruction=tasks[i]) if self.add_chat_template: suffix_for_template = instruction_suffix.removesuffix(_TRUE_ANSWER).rstrip() templated_messages = [ { "role": "user", "content": [ - {"type": "video", "video": pil_frames, "fps": self.fps}, + {"type": "video", "video": videos[i], "fps": self.fps}, {"type": "text", "text": f"{self.prompt_prefix}{suffix_for_template}"}, ], } @@ -223,13 +204,12 @@ class TOPRewardEncoderProcessorStep(ProcessorStep): templated_messages, tokenize=False, add_generation_prompt=True ) full_text = f"{prompt_chat}{_TRUE_ANSWER}" - image_inputs, video_inputs = process_vision_info(templated_messages) else: user_messages = [ { "role": "user", "content": [ - {"type": "video", "video": pil_frames, "fps": self.fps}, + {"type": "video", "video": videos[i], "fps": self.fps}, {"type": "text", "text": self.prompt_prefix}, ], } @@ -240,59 +220,29 @@ class TOPRewardEncoderProcessorStep(ProcessorStep): if eos_token is not None: prompt_chat = prompt_chat.split(eos_token)[0] full_text = f"{prompt_chat}{instruction_suffix}" - image_inputs, video_inputs = process_vision_info(user_messages) - inputs = self._processor( - text=[full_text], - images=image_inputs, - videos=video_inputs, - padding=True, - return_tensors="pt", + texts.append(full_text) + + result = self._processor( + text=texts, + videos=videos, + video_metadata=video_metadata, + do_sample_frames=False, + padding=True, + padding_side="left", + return_tensors="pt", + ) + input_ids = result["input_ids"] + + if input_ids.shape[-1] > self.max_length: + raise ValueError( + f"TOPReward input length {input_ids.shape[-1]} exceeds max_length " + f"{self.max_length}; lower `max_frames` or raise `max_length`." ) - input_len = int(inputs["input_ids"].shape[-1]) - if input_len > self.max_length: - raise ValueError( - f"TOPReward input length {input_len} exceeds max_length " - f"{self.max_length}; lower `max_frames` or raise `max_length`." - ) - - prompt_length = input_len - 1 - all_encoded.append(inputs) - all_prompt_lengths.append(prompt_length) - - result = dict(all_encoded[0]) if batch_size == 1 else self._pad_and_stack(all_encoded) - - result["prompt_length"] = torch.tensor(all_prompt_lengths, dtype=torch.long) - return result - - @staticmethod - def _pad_and_stack(encoded_list: list[dict[str, Any]]) -> dict[str, Any]: - """Right-pad and stack per-sample encoded dicts into a batch.""" - keys = [k for k in encoded_list[0] if isinstance(encoded_list[0][k], Tensor)] - max_len = max(enc["input_ids"].shape[-1] for enc in encoded_list) - result: dict[str, Any] = {} - - for key in keys: - tensors = [enc[key] for enc in encoded_list] - if key in ("input_ids", "attention_mask"): - padded = [] - pad_value = 0 - for t in tensors: - pad_size = max_len - t.shape[-1] - if pad_size > 0: - padded.append(torch.nn.functional.pad(t, (0, pad_size), value=pad_value)) - else: - padded.append(t) - result[key] = torch.cat(padded, dim=0) - else: - # Vision tensors (pixel_values_videos, image_grid_thw, etc.) are expected - # to have matching shapes since max_frames is applied uniformly per batch - result[key] = torch.cat(tensors, dim=0) - - for key in encoded_list[0]: - if key not in result: - result[key] = encoded_list[0][key] + labels = torch.full_like(input_ids, -100) + labels[:, -1] = input_ids[:, -1] + result["labels"] = labels return result def transform_features( @@ -325,8 +275,8 @@ def make_topreward_pre_post_processors( """Pipeline that pre-encodes frames + task into Qwen-VL tensors. The preprocessor adds a batch dimension if needed, runs TOPReward's - encoder (which tokenises the full prompt and emits ``prompt_length``), - and moves everything to the configured device. The postprocessor is + encoder (which tokenises the full prompt and emits ``labels``), and + moves everything to the configured device. The postprocessor is the identity since TOPReward outputs a single reward tensor. """ preprocessor = PolicyProcessorPipeline[dict[str, Any], dict[str, Any]]( diff --git a/tests/rewards/test_modeling_topreward.py b/tests/rewards/test_modeling_topreward.py index b2bfa7664..0cd185e12 100644 --- a/tests/rewards/test_modeling_topreward.py +++ b/tests/rewards/test_modeling_topreward.py @@ -24,7 +24,7 @@ import torch from lerobot.configs.rewards import RewardModelConfig from lerobot.rewards.factory import get_reward_model_class, make_reward_model_config from lerobot.rewards.topreward import TOPRewardConfig -from lerobot.rewards.topreward.processor_topreward import TOPREWARD_FEATURE_PREFIX +from lerobot.rewards.topreward.processor_topreward import TOPREWARD_FEATURE_PREFIX, TOPREWARD_INPUT_KEYS from tests.utils import skip_if_package_missing @@ -45,20 +45,23 @@ class _FakeQwenModel(torch.nn.Module): def from_pretrained(cls, *args, **kwargs): # noqa: ARG003 return cls() - def forward(self, input_ids, attention_mask=None, labels=None, **kwargs): # noqa: ARG002 + def forward( # noqa: ARG002 + self, input_ids, attention_mask=None, labels=None, logits_to_keep=0, **kwargs + ): batch_size, seq_len = input_ids.shape vocab_size = 1000 logits = torch.zeros(batch_size, seq_len, vocab_size) # Place a controlled log-prob at the target token position so the # model returns a predictable reward value. - # The label-masked suffix is the last token (prompt_length = seq_len - 1). + # The label-masked suffix is the last token. # After the causal-LM shift (logits[:, :-1], labels[:, 1:]) the scored # position is logits[:, -2, :] predicting labels[:, -1]. # We set logits so that log_softmax at the target token ≈ _reward_value. - if labels is not None: - for i in range(batch_size): - target_idx = int(input_ids[i, -1].item()) - logits[i, -2, target_idx] = self._reward_value * -10 # high logit -> high log-prob + for i in range(batch_size): + target_idx = int(input_ids[i, -1].item()) + logits[i, -2, target_idx] = self._reward_value * -10 # high logit -> high log-prob + if logits_to_keep: + logits = logits[:, -logits_to_keep:, :] return SimpleNamespace(logits=logits) @@ -72,17 +75,39 @@ def _patch_build(monkeypatch) -> None: def _make_batch( input_ids: torch.Tensor, attention_mask: torch.Tensor | None = None, - prompt_length: torch.Tensor | None = None, + labels: torch.Tensor | None = None, + *, + omit: str | None = None, ) -> dict[str, torch.Tensor]: """Build a ``compute_reward``-ready batch using TOPReward's namespaced keys.""" - batch: dict[str, torch.Tensor] = {f"{TOPREWARD_FEATURE_PREFIX}input_ids": input_ids} - if attention_mask is not None: - batch[f"{TOPREWARD_FEATURE_PREFIX}attention_mask"] = attention_mask - if prompt_length is not None: - batch[f"{TOPREWARD_FEATURE_PREFIX}prompt_length"] = prompt_length + batch_size, seq_len = input_ids.shape + if attention_mask is None: + attention_mask = torch.ones(batch_size, seq_len, dtype=torch.long) + batch: dict[str, torch.Tensor] = {} + if labels is not None: + batch[f"{TOPREWARD_FEATURE_PREFIX}labels"] = labels + batch.update( + { + f"{TOPREWARD_FEATURE_PREFIX}input_ids": input_ids, + f"{TOPREWARD_FEATURE_PREFIX}attention_mask": attention_mask, + f"{TOPREWARD_FEATURE_PREFIX}pixel_values_videos": torch.zeros( + batch_size, 1536, dtype=torch.float32 + ), + f"{TOPREWARD_FEATURE_PREFIX}video_grid_thw": torch.ones(batch_size, 3, dtype=torch.long), + f"{TOPREWARD_FEATURE_PREFIX}mm_token_type_ids": torch.zeros_like(input_ids), + } + ) + if omit is not None: + batch.pop(f"{TOPREWARD_FEATURE_PREFIX}{omit}", None) return batch +def _terminal_labels(input_ids: torch.Tensor) -> torch.Tensor: + labels = torch.full_like(input_ids, -100) + labels[:, -1] = input_ids[:, -1] + return labels + + # --------------------------------------------------------------------------- # Registry + factory # --------------------------------------------------------------------------- @@ -105,11 +130,6 @@ def test_topreward_factory_returns_in_tree_class(): # --------------------------------------------------------------------------- -def test_topreward_config_rejects_bad_reduction(): - with pytest.raises(ValueError, match="reduction must be"): - TOPRewardConfig(device="cpu", reduction="median") - - def test_topreward_config_rejects_zero_max_frames(): with pytest.raises(ValueError, match="max_frames must be >= 1"): TOPRewardConfig(device="cpu", max_frames=0) @@ -142,9 +162,9 @@ def test_topreward_compute_reward_returns_one_scalar_per_sample(monkeypatch): input_ids = torch.randint(0, 100, (2, 10)) attention_mask = torch.ones(2, 10, dtype=torch.long) - prompt_length = torch.tensor([9, 9]) # unmask only the last token + labels = _terminal_labels(input_ids) - batch = _make_batch(input_ids, attention_mask, prompt_length) + batch = _make_batch(input_ids, attention_mask, labels) rewards = model.compute_reward(batch) assert rewards.shape == (2,) @@ -162,9 +182,9 @@ def test_topreward_compute_reward_applies_success_threshold(monkeypatch): input_ids = torch.randint(0, 100, (2, 10)) attention_mask = torch.ones(2, 10, dtype=torch.long) - prompt_length = torch.tensor([9, 9]) + labels = _terminal_labels(input_ids) - batch = _make_batch(input_ids, attention_mask, prompt_length) + batch = _make_batch(input_ids, attention_mask, labels) rewards = model.compute_reward(batch) assert rewards.shape == (2,) @@ -180,7 +200,37 @@ def test_topreward_compute_reward_errors_when_inputs_missing(monkeypatch): model = TOPRewardModel(cfg) with pytest.raises(KeyError, match=r"observation\.topreward\.input_ids"): - model.compute_reward({}) + model.compute_reward(_make_batch(torch.randint(0, 100, (1, 10)), omit="input_ids")) + + +@skip_if_package_missing("transformers") +def test_topreward_compute_reward_errors_when_labels_missing(monkeypatch): + from lerobot.rewards.topreward.modeling_topreward import TOPRewardModel + + _patch_build(monkeypatch) + cfg = TOPRewardConfig(device="cpu") + model = TOPRewardModel(cfg) + + input_ids = torch.randint(0, 100, (1, 10)) + with pytest.raises(KeyError, match=r"observation\.topreward\.labels"): + model.compute_reward(_make_batch(input_ids, labels=None)) + + +@skip_if_package_missing("transformers") +def test_topreward_compute_reward_requires_all_encoder_keys(monkeypatch): + from lerobot.rewards.topreward.modeling_topreward import TOPRewardModel + + _patch_build(monkeypatch) + cfg = TOPRewardConfig(device="cpu") + model = TOPRewardModel(cfg) + + input_ids = torch.randint(0, 100, (1, 10)) + labels = _terminal_labels(input_ids) + required_encoder_keys = set(TOPREWARD_INPUT_KEYS) - {"input_ids", "labels"} + + for key in required_encoder_keys: + with pytest.raises(KeyError, match=rf"observation\.topreward\.{key}"): + model.compute_reward(_make_batch(input_ids, labels=labels, omit=key)) # --------------------------------------------------------------------------- @@ -198,7 +248,6 @@ def test_topreward_save_pretrained_writes_only_config_json(monkeypatch, tmp_path cfg = TOPRewardConfig( device="cpu", vlm_name="Qwen/Qwen3-VL-8B-Instruct", - reduction="sum", fps=4.0, image_key="observation.images.front", ) @@ -217,7 +266,6 @@ def test_topreward_from_pretrained_local_dir_roundtrips_config(monkeypatch, tmp_ cfg = TOPRewardConfig( device="cpu", vlm_name="Qwen/Qwen3-VL-8B-Instruct", - reduction="sum", fps=4.0, image_key="observation.images.front", add_chat_template=True, @@ -229,7 +277,6 @@ def test_topreward_from_pretrained_local_dir_roundtrips_config(monkeypatch, tmp_ assert isinstance(reloaded.config, TOPRewardConfig) assert reloaded.config.vlm_name == "Qwen/Qwen3-VL-8B-Instruct" - assert reloaded.config.reduction == "sum" assert reloaded.config.fps == 4.0 assert reloaded.config.image_key == "observation.images.front" assert reloaded.config.add_chat_template is True diff --git a/tests/rewards/test_topreward.py b/tests/rewards/test_topreward.py new file mode 100644 index 000000000..cbf960751 --- /dev/null +++ b/tests/rewards/test_topreward.py @@ -0,0 +1,80 @@ +# Copyright 2026 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""End-to-end TOPReward smoke test with the real Qwen3-VL model.""" + +import os + +import pytest +import torch + +pytest.importorskip("transformers") + +from lerobot.rewards.topreward.configuration_topreward import TOPRewardConfig # noqa: E402 +from lerobot.rewards.topreward.modeling_topreward import TOPRewardModel # noqa: E402 +from lerobot.rewards.topreward.processor_topreward import ( # noqa: E402 + TOPREWARD_FEATURE_PREFIX, + TOPREWARD_INPUT_KEYS, + make_topreward_pre_post_processors, +) +from tests.utils import require_cuda # noqa: E402 + +pytestmark = pytest.mark.skipif( + os.environ.get("CI") == "true" or os.environ.get("GITHUB_ACTIONS") == "true", + reason="This test requires downloading and loading Qwen3-VL and is not meant for CI", +) + + +def _make_dummy_topreward_batch(image_key: str, task_key: str) -> dict[str, object]: + num_frames = 4 + image_size = 64 + frames = torch.zeros(1, num_frames, 3, image_size, image_size, dtype=torch.uint8) + for frame_idx in range(num_frames): + frames[0, frame_idx, 0].fill_(min(frame_idx * 48, 255)) + frames[0, frame_idx, 1].fill_(96) + frames[0, frame_idx, 2].fill_(192) + + return { + image_key: frames, + task_key: ["pick up the red cube"], + } + + +@require_cuda +def test_topreward_full_qwen3vl_preprocessor_to_compute_reward(): + cfg = TOPRewardConfig( + vlm_name="Qwen/Qwen3-VL-8B-Instruct", + device="cuda", + max_frames=4, + fps=2.0, + max_input_length=4096, + ) + + preprocessor, _ = make_topreward_pre_post_processors(cfg) + encoded_batch = preprocessor(_make_dummy_topreward_batch(cfg.image_key, cfg.task_key)) + for key in TOPREWARD_INPUT_KEYS: + assert f"{TOPREWARD_FEATURE_PREFIX}{key}" in encoded_batch + + model = TOPRewardModel(cfg) + try: + model.to(cfg.device) + model.eval() + rewards = model.compute_reward(encoded_batch) + finally: + del model + torch.cuda.empty_cache() + + assert rewards.shape == (1,) + assert rewards.dtype == torch.float32 + assert torch.isfinite(rewards).all() diff --git a/tests/rewards/test_topreward_processor.py b/tests/rewards/test_topreward_processor.py index 75d6e7aeb..df379276e 100644 --- a/tests/rewards/test_topreward_processor.py +++ b/tests/rewards/test_topreward_processor.py @@ -16,71 +16,71 @@ from __future__ import annotations -import numpy as np import pytest import torch from lerobot.configs import FeatureType, PipelineFeatureType, PolicyFeature from lerobot.rewards.topreward.processor_topreward import ( TOPREWARD_FEATURE_PREFIX, + TOPREWARD_INPUT_KEYS, _expand_tasks, - _video_to_numpy, + _prepare_video_batch, ) from lerobot.types import TransitionKey from tests.utils import skip_if_package_missing # --------------------------------------------------------------------------- -# _video_to_numpy — pure (T, C, H, W) -> (T, H, W, C) uint8 conversion +# _prepare_video_batch — raw image/video batch -> (B, T, C, H, W) uint8 # --------------------------------------------------------------------------- -def test_video_to_numpy_chw_float_is_converted_to_thwc_uint8(): - video = torch.rand(4, 3, 8, 8) - array = _video_to_numpy(video, max_frames=None) +def test_prepare_video_batch_batched_chw_float_is_converted_to_uint8(): + video = torch.rand(2, 4, 3, 8, 8) + tensor = _prepare_video_batch(video, max_frames=None) - assert array.shape == (4, 8, 8, 3) - assert array.dtype == np.uint8 - assert array.min() >= 0 and array.max() <= 255 + assert tensor.shape == (2, 4, 3, 8, 8) + assert tensor.dtype == torch.uint8 + assert tensor.min() >= 0 and tensor.max() <= 255 -def test_video_to_numpy_already_thwc_uint8_passes_through(): - video = torch.randint(0, 256, (3, 8, 8, 3), dtype=torch.uint8) - array = _video_to_numpy(video, max_frames=None) +def test_prepare_video_batch_batched_thwc_uint8_is_permuted_to_channel_first(): + video = torch.randint(0, 256, (2, 3, 8, 8, 3), dtype=torch.uint8) + tensor = _prepare_video_batch(video, max_frames=None) - assert array.shape == (3, 8, 8, 3) - assert array.dtype == np.uint8 + assert tensor.shape == (2, 3, 3, 8, 8) + assert tensor.dtype == torch.uint8 -def test_video_to_numpy_max_frames_tail_crops_recent_frames(): - video = torch.zeros(10, 3, 4, 4) +def test_prepare_video_batch_max_frames_tail_crops_recent_frames(): + video = torch.zeros(1, 10, 3, 4, 4) for t in range(10): - video[t] = t / 9.0 + video[:, t] = t / 9.0 - array = _video_to_numpy(video, max_frames=3) + tensor = _prepare_video_batch(video, max_frames=3) - assert array.shape == (3, 4, 4, 3) - assert int(array[0, 0, 0, 0]) == int(round(7 / 9 * 255)) - assert int(array[-1, 0, 0, 0]) == 255 + assert tensor.shape == (1, 3, 3, 4, 4) + assert int(tensor[0, 0, 0, 0, 0]) == int(7 / 9 * 255) + assert int(tensor[0, -1, 0, 0, 0]) == 255 -def test_video_to_numpy_rejects_3d_input(): - with pytest.raises(ValueError, match="Expected channel dim"): - _video_to_numpy(torch.zeros(4, 8, 8), max_frames=None) +def test_prepare_video_batch_rejects_3d_input(): + with pytest.raises(ValueError, match="Expected TOPReward frames"): + _prepare_video_batch(torch.zeros(4, 8, 8), max_frames=None) -def test_video_to_numpy_floats_above_one_pass_through_without_rescaling(): - video = torch.full((1, 3, 2, 2), 5.0) - array = _video_to_numpy(video, max_frames=None) +def test_prepare_video_batch_floats_above_one_are_rescaled_and_clipped(): + video = torch.full((1, 1, 3, 2, 2), 5.0) + tensor = _prepare_video_batch(video, max_frames=None) - assert array.shape == (1, 2, 2, 3) - assert int(array.max()) == 5 + assert tensor.shape == (1, 1, 3, 2, 2) + assert int(tensor.max()) == 255 -def test_video_to_numpy_clips_very_large_floats_to_uint8_max(): - video = torch.full((1, 3, 2, 2), 300.0) - array = _video_to_numpy(video, max_frames=None) +def test_prepare_video_batch_clips_very_large_floats_to_uint8_max(): + video = torch.full((1, 1, 3, 2, 2), 300.0) + tensor = _prepare_video_batch(video, max_frames=None) - assert int(array.max()) == 255 + assert int(tensor.max()) == 255 # --------------------------------------------------------------------------- @@ -124,12 +124,11 @@ def test_expand_tasks_wrong_type_raises(): # --------------------------------------------------------------------------- -# Encoder step — stubbed AutoProcessor + process_vision_info +# Encoder step — stubbed AutoProcessor # --------------------------------------------------------------------------- def _skip_if_topreward_extras_missing(func): - func = skip_if_package_missing("qwen-vl-utils", import_name="qwen_vl_utils")(func) func = skip_if_package_missing("transformers")(func) return func @@ -155,32 +154,20 @@ class _FakeAutoProcessor: def __call__(self, text=None, images=None, videos=None, **kwargs): # noqa: ARG002 seq_len = 10 + batch_size = len(text) if isinstance(text, list) else 1 return { - "input_ids": torch.randint(0, 100, (1, seq_len)), - "attention_mask": torch.ones(1, seq_len, dtype=torch.long), + "input_ids": torch.randint(0, 100, (batch_size, seq_len)), + "attention_mask": torch.ones(batch_size, seq_len, dtype=torch.long), + "pixel_values_videos": torch.zeros(batch_size, 1536, dtype=torch.float32), + "video_grid_thw": torch.ones(batch_size, 3, dtype=torch.long), + "mm_token_type_ids": torch.zeros(batch_size, seq_len, dtype=torch.long), } def _build_step(monkeypatch, **overrides): - import importlib - import sys - import types - from lerobot.rewards.topreward import processor_topreward - from lerobot.utils import import_utils monkeypatch.setattr(processor_topreward, "AutoProcessor", _FakeAutoProcessor) - - # Stub qwen_vl_utils as a real module object (not MagicMock) so - # ``require_package`` / ``find_spec`` don't choke on a missing ``__spec__``. - fake_qwen_vl = types.ModuleType("qwen_vl_utils") - fake_qwen_vl.process_vision_info = lambda messages: (None, None) # type: ignore[attr-defined] - fake_qwen_vl.__spec__ = importlib.machinery.ModuleSpec("qwen_vl_utils", None) - monkeypatch.setitem(sys.modules, "qwen_vl_utils", fake_qwen_vl) - - # Clear the require_package cache so the stub is picked up. - import_utils._require_package_cache.pop("qwen_vl_utils", None) - return processor_topreward.TOPRewardEncoderProcessorStep(**overrides) @@ -192,27 +179,29 @@ def _make_transition(observation: dict, complementary: dict | None = None) -> di @_skip_if_topreward_extras_missing -def test_encoder_step_emits_input_ids_and_prompt_length(monkeypatch): +def test_encoder_step_emits_input_ids_and_labels(monkeypatch): """The processor must emit Qwen-VL tensors including ``input_ids`` and - ``prompt_length`` under the ``observation.topreward.*`` namespace.""" + ``labels`` under the ``observation.topreward.*`` namespace.""" step = _build_step(monkeypatch) - frames_batch = torch.zeros(1, 4, 3, 8, 8) + frames_batch = torch.zeros(2, 4, 3, 8, 8) out = step( _make_transition( observation={"observation.images.top": frames_batch}, - complementary={"task": "pick"}, + complementary={"task": ["pick", "place"]}, ) ) obs_out = out[TransitionKey.OBSERVATION] - assert f"{TOPREWARD_FEATURE_PREFIX}input_ids" in obs_out - assert f"{TOPREWARD_FEATURE_PREFIX}attention_mask" in obs_out - assert f"{TOPREWARD_FEATURE_PREFIX}prompt_length" in obs_out + for key in TOPREWARD_INPUT_KEYS: + assert f"{TOPREWARD_FEATURE_PREFIX}{key}" in obs_out - prompt_length = obs_out[f"{TOPREWARD_FEATURE_PREFIX}prompt_length"] - assert prompt_length.dtype == torch.long - assert prompt_length.shape == (1,) + input_ids = obs_out[f"{TOPREWARD_FEATURE_PREFIX}input_ids"] + labels = obs_out[f"{TOPREWARD_FEATURE_PREFIX}labels"] + assert labels.dtype == torch.long + assert labels.shape == (2, 10) + assert labels[:, :-1].eq(-100).all() + assert labels[:, -1].equal(input_ids[:, -1]) @_skip_if_topreward_extras_missing @@ -255,10 +244,3 @@ def test_encoder_step_rejects_missing_image_key(monkeypatch): step = _build_step(monkeypatch, image_key="observation.images.top") with pytest.raises(KeyError, match="image key"): step(_make_transition(observation={}, complementary={"task": "pick"})) - - -@_skip_if_topreward_extras_missing -def test_encoder_step_rejects_non_dict_observation(monkeypatch): - step = _build_step(monkeypatch) - with pytest.raises(ValueError, match="observation dict"): - step({TransitionKey.OBSERVATION: torch.zeros(1, 3, 8, 8)}) diff --git a/uv.lock b/uv.lock index 2c793451a..60bc9202c 100644 --- a/uv.lock +++ b/uv.lock @@ -3010,7 +3010,6 @@ test = [ { name = "pytest-timeout" }, ] topreward = [ - { name = "qwen-vl-utils" }, { name = "transformers" }, ] training = [ @@ -3158,7 +3157,6 @@ requires-dist = [ { name = "lerobot", extras = ["pyzmq-dep"], marker = "extra == 'unitree-g1'" }, { name = "lerobot", extras = ["qwen-vl-utils-dep"], marker = "extra == 'eo1'" }, { name = "lerobot", extras = ["qwen-vl-utils-dep"], marker = "extra == 'sarm'" }, - { name = "lerobot", extras = ["qwen-vl-utils-dep"], marker = "extra == 'topreward'" }, { name = "lerobot", extras = ["qwen-vl-utils-dep"], marker = "extra == 'wallx'" }, { name = "lerobot", extras = ["reachy2"], marker = "extra == 'all'" }, { name = "lerobot", extras = ["rebot"], marker = "extra == 'all'" },