mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-27 14:39:43 +00:00
optmize topreward input processing (#3660)
This commit is contained in:
@@ -53,7 +53,7 @@ or, with `uv` from a source checkout:
|
||||
uv sync --extra topreward
|
||||
```
|
||||
|
||||
This pulls in `transformers` and `qwen-vl-utils`. The first time you run TOPReward, Hugging Face will also download the VLM weights from the Hub (~16 GB for Qwen3-VL-8B-Instruct). A GPU is strongly recommended.
|
||||
This pulls in `transformers`. The first time you run TOPReward, Hugging Face will also download the VLM weights from the Hub (~16 GB for Qwen3-VL-8B-Instruct). A GPU is strongly recommended.
|
||||
|
||||
## Model Inputs and Outputs
|
||||
|
||||
|
||||
+1
-1
@@ -209,7 +209,7 @@ groot = [
|
||||
"flash-attn>=2.5.9,<3.0.0 ; sys_platform != 'darwin'"
|
||||
]
|
||||
sarm = ["lerobot[transformers-dep]", "pydantic>=2.0.0,<3.0.0", "faker>=33.0.0,<35.0.0", "lerobot[matplotlib-dep]", "lerobot[qwen-vl-utils-dep]"]
|
||||
topreward = ["lerobot[transformers-dep]", "lerobot[qwen-vl-utils-dep]"]
|
||||
topreward = ["lerobot[transformers-dep]"]
|
||||
xvla = ["lerobot[transformers-dep]"]
|
||||
eo1 = ["lerobot[transformers-dep]", "lerobot[qwen-vl-utils-dep]"]
|
||||
hilserl = ["lerobot[transformers-dep]", "lerobot[dataset]", "gym-hil>=0.1.13,<0.2.0", "lerobot[grpcio-dep]", "lerobot[placo-dep]"]
|
||||
|
||||
@@ -107,10 +107,10 @@ def compute_instruction_rewards_for_prefixes(
|
||||
else:
|
||||
prefix_lengths = np.unique(np.linspace(1, num_frames, num_samples).round().astype(np.int64))
|
||||
|
||||
episode_frames = torch.stack([dataset[ep_start + i][image_key] for i in range(num_frames)])
|
||||
rewards: list[float] = []
|
||||
for length in prefix_lengths:
|
||||
frames = torch.stack([dataset[ep_start + i][image_key] for i in range(int(length))])
|
||||
frames = frames.unsqueeze(0) # (1, T, C, H, W)
|
||||
frames = episode_frames[: int(length)].unsqueeze(0) # (1, T, C, H, W)
|
||||
|
||||
transition = {
|
||||
TransitionKey.OBSERVATION: {image_key: frames},
|
||||
@@ -146,7 +146,6 @@ def compute_topreward_progress(
|
||||
device: str = "cuda",
|
||||
num_samples: int | None = None,
|
||||
fps: float | None = None,
|
||||
reduction: str | None = None,
|
||||
episodes: list[int] | None = None,
|
||||
) -> Path:
|
||||
"""Run TOPReward over a dataset and write per-frame progress."""
|
||||
@@ -154,10 +153,10 @@ def compute_topreward_progress(
|
||||
logging.info(f"Loading TOPReward config from: {reward_model_path}")
|
||||
model = TOPRewardModel.from_pretrained(reward_model_path)
|
||||
config = model.config
|
||||
config.device = device
|
||||
if vlm_name is not None and vlm_name != config.vlm_name:
|
||||
logging.info(f"Overriding vlm_name from config: {config.vlm_name} -> {vlm_name}")
|
||||
config.vlm_name = vlm_name
|
||||
config.device = device
|
||||
model = TOPRewardModel(config)
|
||||
else:
|
||||
config_kwargs: dict[str, Any] = {"device": device}
|
||||
@@ -165,8 +164,6 @@ def compute_topreward_progress(
|
||||
config_kwargs["vlm_name"] = vlm_name
|
||||
if fps is not None:
|
||||
config_kwargs["fps"] = fps
|
||||
if reduction is not None:
|
||||
config_kwargs["reduction"] = reduction
|
||||
config = TOPRewardConfig(**config_kwargs)
|
||||
logging.info(f"Constructing TOPReward with VLM: {config.vlm_name}")
|
||||
model = TOPRewardModel(config)
|
||||
@@ -302,9 +299,6 @@ Examples:
|
||||
help="Process only these episode indices (e.g. --episodes 0 or --episodes 0 5 10).",
|
||||
)
|
||||
parser.add_argument("--fps", type=float, default=None, help="Override TOPRewardConfig.fps.")
|
||||
parser.add_argument(
|
||||
"--reduction", type=str, default=None, choices=["mean", "sum"], help="Override reduction."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--push-to-hub", action="store_true", help="Upload to the dataset repo on HuggingFace Hub."
|
||||
)
|
||||
@@ -321,7 +315,6 @@ Examples:
|
||||
device=args.device,
|
||||
num_samples=args.num_samples,
|
||||
fps=args.fps,
|
||||
reduction=args.reduction,
|
||||
episodes=args.episodes,
|
||||
)
|
||||
|
||||
|
||||
@@ -22,8 +22,8 @@ from lerobot.utils.constants import OBS_IMAGES
|
||||
|
||||
# Default prompt scaffolding from the upstream TOPReward paper / reference
|
||||
# implementation (``QwenClient.compute_instruction_reward``). The prompt
|
||||
# computes the log-likelihood of the suffix ``f"{instruction} ... True"``
|
||||
# given the video, then reduces those token log-probs to a scalar reward.
|
||||
# scores the terminal ``True`` token in ``f"{instruction} ... True"``
|
||||
# given the video.
|
||||
DEFAULT_PROMPT_PREFIX = (
|
||||
"The above video shows a robot manipulation trajectory that completes the following task: "
|
||||
)
|
||||
@@ -67,8 +67,6 @@ class TOPRewardConfig(RewardModelConfig):
|
||||
add_chat_template: If ``True``, wrap the full prompt with the
|
||||
tokenizer's chat template before tokenisation (matches
|
||||
upstream ``add_chat_template=True``).
|
||||
reduction: Reduction over per-token log-probs of the suffix
|
||||
tokens (``"mean"`` or ``"sum"``).
|
||||
success_threshold: Optional log-prob threshold. If finite,
|
||||
:meth:`TOPRewardModel.compute_reward` returns
|
||||
``(reward > success_threshold).float()`` instead of the raw
|
||||
@@ -96,7 +94,6 @@ class TOPRewardConfig(RewardModelConfig):
|
||||
prompt_suffix_template: str = DEFAULT_PROMPT_SUFFIX_TEMPLATE
|
||||
add_chat_template: bool = False
|
||||
|
||||
reduction: str = "mean"
|
||||
success_threshold: float = float("-inf")
|
||||
max_input_length: int = 32768
|
||||
|
||||
@@ -116,8 +113,6 @@ class TOPRewardConfig(RewardModelConfig):
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
super().__post_init__()
|
||||
if self.reduction not in {"mean", "sum"}:
|
||||
raise ValueError(f"reduction must be 'mean' or 'sum', got {self.reduction!r}")
|
||||
if self.max_frames is not None and self.max_frames < 1:
|
||||
raise ValueError(f"max_frames must be >= 1, got {self.max_frames}")
|
||||
if self.fps <= 0:
|
||||
|
||||
@@ -29,15 +29,16 @@ and returns that log-likelihood as the reward signal.
|
||||
Inference recipe:
|
||||
|
||||
1. The processor builds a chat-style prompt, tokenises it, and emits
|
||||
``input_ids``, ``attention_mask``, vision tensors, and ``prompt_length``.
|
||||
2. The model label-masks everything before ``prompt_length`` with ``-100``.
|
||||
3. Forward the full token sequence through the VLM.
|
||||
4. Read per-token log-probabilities of the unmasked suffix tokens from the
|
||||
logits and reduce them (mean or sum) into a scalar reward.
|
||||
``input_ids``, ``attention_mask``, vision tensors, and ``labels``.
|
||||
The processor label-masks everything except the terminal answer token with
|
||||
``-100``.
|
||||
2. Forward the full token sequence through the VLM.
|
||||
3. Read the terminal answer token log-probability from the logits as the
|
||||
scalar reward.
|
||||
|
||||
With the default ``prompt_suffix_template`` and ``prompt_length = input_len - 1``
|
||||
(mirrored from upstream), the only unmasked token is the literal ``"True"``
|
||||
at the end — the reward is ``log P("True" | video + prompt + instruction)``.
|
||||
With the default ``prompt_suffix_template``, the only unmasked token is the
|
||||
literal ``"True"`` at the end — the reward is
|
||||
``log P("True" | video + prompt + instruction)``.
|
||||
|
||||
This LeRobot port is **inference-only and not trainable** — :meth:`forward`
|
||||
is intentionally inherited from :class:`PreTrainedRewardModel` and raises
|
||||
@@ -66,6 +67,7 @@ from huggingface_hub import HfApi, hf_hub_download
|
||||
from huggingface_hub.constants import CONFIG_NAME
|
||||
from huggingface_hub.errors import HfHubHTTPError
|
||||
from torch import Tensor
|
||||
from torch.nn.functional import cross_entropy
|
||||
|
||||
from lerobot.configs.rewards import RewardModelConfig
|
||||
from lerobot.rewards.pretrained import PreTrainedRewardModel
|
||||
@@ -116,57 +118,29 @@ class TOPRewardModel(PreTrainedRewardModel):
|
||||
|
||||
def compute_reward(self, batch: dict[str, Any]) -> Tensor:
|
||||
"""Return one log-prob reward per sample in the batch."""
|
||||
inputs = {
|
||||
key: batch[f"{TOPREWARD_FEATURE_PREFIX}{key}"]
|
||||
for key in TOPREWARD_INPUT_KEYS
|
||||
if f"{TOPREWARD_FEATURE_PREFIX}{key}" in batch
|
||||
}
|
||||
if "input_ids" not in inputs:
|
||||
inputs: dict[str, Any] = {}
|
||||
for key in TOPREWARD_INPUT_KEYS:
|
||||
batch_key = f"{TOPREWARD_FEATURE_PREFIX}{key}"
|
||||
if batch_key not in batch:
|
||||
raise KeyError(
|
||||
f"TOPReward batch missing pre-encoded inputs (expected "
|
||||
f"`{TOPREWARD_FEATURE_PREFIX}input_ids`). Make sure the "
|
||||
f"TOPReward batch missing `{batch_key}`. Make sure the "
|
||||
"TOPRewardEncoderProcessorStep ran before `compute_reward`."
|
||||
)
|
||||
inputs[key] = batch[batch_key]
|
||||
|
||||
prompt_lengths = inputs.pop("prompt_length")
|
||||
device = next(self.model.parameters()).device
|
||||
inputs = {key: value.to(device) if hasattr(value, "to") else value for key, value in inputs.items()}
|
||||
|
||||
labels = inputs["input_ids"].clone()
|
||||
for i, plen in enumerate(prompt_lengths.tolist()):
|
||||
labels[i, : int(plen)] = -100
|
||||
if "attention_mask" in inputs:
|
||||
labels = labels.masked_fill(inputs["attention_mask"] == 0, -100)
|
||||
labels = inputs.pop("labels")
|
||||
inputs["logits_to_keep"] = 2
|
||||
|
||||
self.eval()
|
||||
with torch.no_grad():
|
||||
outputs = self.model(**inputs)
|
||||
|
||||
logits = outputs.logits[:, :-1, :]
|
||||
target_labels = labels[:, 1:]
|
||||
log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
|
||||
mask = target_labels != -100
|
||||
safe_targets = target_labels.masked_fill(~mask, 0)
|
||||
token_log_probs = log_probs.gather(-1, safe_targets.unsqueeze(-1)).squeeze(-1)
|
||||
|
||||
batch_size = inputs["input_ids"].shape[0]
|
||||
rewards = []
|
||||
for i in range(batch_size):
|
||||
sample_log_probs = token_log_probs[i][mask[i]]
|
||||
if sample_log_probs.numel() == 0:
|
||||
raise RuntimeError(
|
||||
"TOPReward could not isolate any suffix tokens to score. Check that "
|
||||
"`prompt_suffix_template` produces at least one tokenised character."
|
||||
)
|
||||
if self.config.reduction == "sum":
|
||||
rewards.append(sample_log_probs.sum().item())
|
||||
else:
|
||||
rewards.append(sample_log_probs.mean().item())
|
||||
|
||||
out = torch.as_tensor(rewards, dtype=torch.float32)
|
||||
logits = outputs.logits
|
||||
rewards = -cross_entropy(logits[:, -2, :].float(), labels[:, -1], reduction="none")
|
||||
if np.isfinite(self.config.success_threshold):
|
||||
out = (out > self.config.success_threshold).float()
|
||||
return out.to(self.config.device or "cpu")
|
||||
rewards = (rewards > self.config.success_threshold).float()
|
||||
return rewards.to(self.config.device or "cpu")
|
||||
|
||||
def _save_pretrained(self, save_directory: Path) -> None:
|
||||
"""Save ``config.json`` only."""
|
||||
|
||||
@@ -19,9 +19,7 @@ from __future__ import annotations
|
||||
from dataclasses import dataclass, field
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from PIL import Image
|
||||
from torch import Tensor
|
||||
|
||||
from lerobot.configs import PipelineFeatureType, PolicyFeature
|
||||
@@ -60,39 +58,33 @@ _TRUE_ANSWER = "True"
|
||||
TOPREWARD_VLM_INPUT_KEYS = (
|
||||
"input_ids",
|
||||
"attention_mask",
|
||||
"pixel_values",
|
||||
"pixel_values_videos",
|
||||
"image_grid_thw",
|
||||
"video_grid_thw",
|
||||
"second_per_grid_ts",
|
||||
"mm_token_type_ids",
|
||||
)
|
||||
TOPREWARD_METADATA_KEYS = ("prompt_length",)
|
||||
TOPREWARD_INPUT_KEYS = TOPREWARD_VLM_INPUT_KEYS + TOPREWARD_METADATA_KEYS
|
||||
TOPREWARD_INPUT_KEYS = TOPREWARD_VLM_INPUT_KEYS + ("labels",)
|
||||
|
||||
|
||||
def _video_to_numpy(video: Tensor, *, max_frames: int | None) -> np.ndarray:
|
||||
"""Convert one trajectory tensor to a ``(T, H, W, C) uint8`` numpy array."""
|
||||
def _prepare_video_batch(video: Tensor, *, max_frames: int | None) -> Tensor:
|
||||
"""Return videos as ``(B, T, C, H, W)`` uint8 tensors for Qwen3-VL."""
|
||||
if video.ndim == 4:
|
||||
video = video.unsqueeze(1)
|
||||
elif video.ndim != 5:
|
||||
raise ValueError(
|
||||
f"Expected TOPReward frames with shape (B,C,H,W) or (B,T,C,H,W); got {tuple(video.shape)}"
|
||||
)
|
||||
|
||||
if max_frames is not None:
|
||||
video = video[-max_frames:]
|
||||
if video.shape[1] in (1, 3):
|
||||
video = video.permute(0, 2, 3, 1)
|
||||
elif video.shape[-1] not in (1, 3):
|
||||
video = video[:, -max_frames:]
|
||||
if video.shape[-1] in (1, 3):
|
||||
video = video.permute(0, 1, 4, 2, 3)
|
||||
elif video.shape[2] not in (1, 3):
|
||||
raise ValueError(f"Expected channel dim of size 1 or 3, got shape {tuple(video.shape)}")
|
||||
|
||||
array = video.detach().cpu().numpy()
|
||||
if np.issubdtype(array.dtype, np.floating) and array.size > 0 and array.max() <= 1.0:
|
||||
array = array * 255.0
|
||||
return np.clip(array, 0, 255).astype(np.uint8)
|
||||
if video.is_floating_point():
|
||||
video = video * 255.0
|
||||
|
||||
|
||||
def _frames_to_pil(frames: np.ndarray) -> list[Image.Image]:
|
||||
"""Convert ``(T, H, W, C)`` uint8 frames to a list of PIL images."""
|
||||
if frames.ndim != 4:
|
||||
raise ValueError(f"Expected (T,H,W,C) frames; got shape {frames.shape}")
|
||||
if frames.dtype != np.uint8:
|
||||
frames = np.clip(frames, 0, 255).astype(np.uint8)
|
||||
return [Image.fromarray(frames[i]) for i in range(frames.shape[0])]
|
||||
return video.clamp(0, 255).to(torch.uint8).contiguous()
|
||||
|
||||
|
||||
def _expand_tasks(task: Any, *, batch_size: int, default: str | None) -> list[str]:
|
||||
@@ -120,10 +112,9 @@ class TOPRewardEncoderProcessorStep(ProcessorStep):
|
||||
|
||||
Loads a :class:`~transformers.AutoProcessor` matching ``vlm_name`` and
|
||||
builds the full chat prompt including the instruction suffix. The
|
||||
resulting ``input_ids``, ``attention_mask``, vision tensors, and a
|
||||
per-sample ``prompt_length`` integer are written under the
|
||||
``observation.topreward.*`` namespace so the model can label-mask and
|
||||
forward without re-tokenising.
|
||||
resulting ``input_ids``, ``attention_mask``, vision tensors, and
|
||||
``labels`` are written under the ``observation.topreward.*`` namespace
|
||||
so the model can score without re-tokenising.
|
||||
|
||||
At call time the step reads:
|
||||
|
||||
@@ -131,7 +122,7 @@ class TOPRewardEncoderProcessorStep(ProcessorStep):
|
||||
- ``complementary_data[task_key]``: a string or list of strings.
|
||||
|
||||
and writes ``observation[f"{TOPREWARD_FEATURE_PREFIX}<name>"]`` for the
|
||||
Qwen-VL tensors plus ``prompt_length``.
|
||||
Qwen-VL tensors plus ``labels``.
|
||||
"""
|
||||
|
||||
vlm_name: str = "Qwen/Qwen3-VL-8B-Instruct"
|
||||
@@ -149,35 +140,26 @@ class TOPRewardEncoderProcessorStep(ProcessorStep):
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
require_package("transformers", extra="topreward")
|
||||
require_package("qwen-vl-utils", extra="topreward", import_name="qwen_vl_utils")
|
||||
self._processor = AutoProcessor.from_pretrained(self.vlm_name, trust_remote_code=True)
|
||||
|
||||
def __call__(self, transition: EnvTransition) -> EnvTransition:
|
||||
observation = transition.get(TransitionKey.OBSERVATION)
|
||||
complementary = transition.get(TransitionKey.COMPLEMENTARY_DATA) or {}
|
||||
if not isinstance(observation, dict):
|
||||
raise ValueError("TOPRewardEncoderProcessorStep requires an observation dict")
|
||||
|
||||
if self.image_key not in observation:
|
||||
raise KeyError(f"TOPReward expected image key {self.image_key!r} in observation")
|
||||
|
||||
frames = observation[self.image_key]
|
||||
tensor = frames.detach().cpu() if isinstance(frames, Tensor) else torch.as_tensor(frames)
|
||||
if tensor.ndim == 4:
|
||||
tensor = tensor.unsqueeze(1)
|
||||
elif tensor.ndim != 5:
|
||||
raise ValueError(
|
||||
f"Expected TOPReward frames with shape (B,C,H,W) or (B,T,C,H,W); got {tuple(tensor.shape)}"
|
||||
)
|
||||
videos = frames.detach().cpu() if isinstance(frames, Tensor) else torch.as_tensor(frames)
|
||||
videos = _prepare_video_batch(videos, max_frames=self.max_frames)
|
||||
|
||||
batch_size = tensor.shape[0]
|
||||
batch_size = videos.shape[0]
|
||||
tasks = _expand_tasks(
|
||||
complementary.get(self.task_key, self.default_task),
|
||||
batch_size=batch_size,
|
||||
default=self.default_task,
|
||||
)
|
||||
|
||||
encoded = self._encode_batch(tensor, tasks)
|
||||
encoded = self._encode_batch(videos, tasks, batch_size)
|
||||
|
||||
new_observation = dict(observation)
|
||||
for key, value in encoded.items():
|
||||
@@ -187,34 +169,33 @@ class TOPRewardEncoderProcessorStep(ProcessorStep):
|
||||
new_transition[TransitionKey.OBSERVATION] = new_observation
|
||||
return new_transition
|
||||
|
||||
def _encode_batch(self, tensor: Tensor, tasks: list[str]) -> dict[str, Any]:
|
||||
def _encode_batch(self, videos: Tensor, tasks: list[str], batch_size) -> dict[str, Any]:
|
||||
"""Tokenise a batch of (frames, task) pairs into Qwen-VL tensors.
|
||||
|
||||
Processes samples one at a time (each may have a different token
|
||||
length due to different numbers of vision patches), then pads /
|
||||
stacks the results.
|
||||
The loop only builds per-sample chat strings. Tokenisation, padding,
|
||||
video preprocessing, and label construction are batched.
|
||||
"""
|
||||
from qwen_vl_utils import process_vision_info
|
||||
|
||||
batch_size = tensor.shape[0]
|
||||
all_encoded: list[dict[str, Any]] = []
|
||||
all_prompt_lengths: list[int] = []
|
||||
|
||||
for i in range(batch_size):
|
||||
frames_np = _video_to_numpy(tensor[i], max_frames=self.max_frames)
|
||||
pil_frames = _frames_to_pil(frames_np)
|
||||
task = tasks[i]
|
||||
|
||||
instruction_suffix = self.prompt_suffix_template.format(instruction=task)
|
||||
texts: list[str] = []
|
||||
video_metadata = [
|
||||
{
|
||||
"total_num_frames": int(videos.shape[1]),
|
||||
"fps": float(self.fps),
|
||||
"frames_indices": list(range(int(videos.shape[1]))),
|
||||
}
|
||||
for _ in range(batch_size)
|
||||
]
|
||||
eos_token = self._processor.tokenizer.eos_token
|
||||
|
||||
for i in range(batch_size):
|
||||
instruction_suffix = self.prompt_suffix_template.format(instruction=tasks[i])
|
||||
if self.add_chat_template:
|
||||
suffix_for_template = instruction_suffix.removesuffix(_TRUE_ANSWER).rstrip()
|
||||
templated_messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "video", "video": pil_frames, "fps": self.fps},
|
||||
{"type": "video", "video": videos[i], "fps": self.fps},
|
||||
{"type": "text", "text": f"{self.prompt_prefix}{suffix_for_template}"},
|
||||
],
|
||||
}
|
||||
@@ -223,13 +204,12 @@ class TOPRewardEncoderProcessorStep(ProcessorStep):
|
||||
templated_messages, tokenize=False, add_generation_prompt=True
|
||||
)
|
||||
full_text = f"{prompt_chat}{_TRUE_ANSWER}"
|
||||
image_inputs, video_inputs = process_vision_info(templated_messages)
|
||||
else:
|
||||
user_messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "video", "video": pil_frames, "fps": self.fps},
|
||||
{"type": "video", "video": videos[i], "fps": self.fps},
|
||||
{"type": "text", "text": self.prompt_prefix},
|
||||
],
|
||||
}
|
||||
@@ -240,59 +220,29 @@ class TOPRewardEncoderProcessorStep(ProcessorStep):
|
||||
if eos_token is not None:
|
||||
prompt_chat = prompt_chat.split(eos_token)[0]
|
||||
full_text = f"{prompt_chat}{instruction_suffix}"
|
||||
image_inputs, video_inputs = process_vision_info(user_messages)
|
||||
|
||||
inputs = self._processor(
|
||||
text=[full_text],
|
||||
images=image_inputs,
|
||||
videos=video_inputs,
|
||||
texts.append(full_text)
|
||||
|
||||
result = self._processor(
|
||||
text=texts,
|
||||
videos=videos,
|
||||
video_metadata=video_metadata,
|
||||
do_sample_frames=False,
|
||||
padding=True,
|
||||
padding_side="left",
|
||||
return_tensors="pt",
|
||||
)
|
||||
input_ids = result["input_ids"]
|
||||
|
||||
input_len = int(inputs["input_ids"].shape[-1])
|
||||
if input_len > self.max_length:
|
||||
if input_ids.shape[-1] > self.max_length:
|
||||
raise ValueError(
|
||||
f"TOPReward input length {input_len} exceeds max_length "
|
||||
f"TOPReward input length {input_ids.shape[-1]} exceeds max_length "
|
||||
f"{self.max_length}; lower `max_frames` or raise `max_length`."
|
||||
)
|
||||
|
||||
prompt_length = input_len - 1
|
||||
all_encoded.append(inputs)
|
||||
all_prompt_lengths.append(prompt_length)
|
||||
|
||||
result = dict(all_encoded[0]) if batch_size == 1 else self._pad_and_stack(all_encoded)
|
||||
|
||||
result["prompt_length"] = torch.tensor(all_prompt_lengths, dtype=torch.long)
|
||||
return result
|
||||
|
||||
@staticmethod
|
||||
def _pad_and_stack(encoded_list: list[dict[str, Any]]) -> dict[str, Any]:
|
||||
"""Right-pad and stack per-sample encoded dicts into a batch."""
|
||||
keys = [k for k in encoded_list[0] if isinstance(encoded_list[0][k], Tensor)]
|
||||
max_len = max(enc["input_ids"].shape[-1] for enc in encoded_list)
|
||||
result: dict[str, Any] = {}
|
||||
|
||||
for key in keys:
|
||||
tensors = [enc[key] for enc in encoded_list]
|
||||
if key in ("input_ids", "attention_mask"):
|
||||
padded = []
|
||||
pad_value = 0
|
||||
for t in tensors:
|
||||
pad_size = max_len - t.shape[-1]
|
||||
if pad_size > 0:
|
||||
padded.append(torch.nn.functional.pad(t, (0, pad_size), value=pad_value))
|
||||
else:
|
||||
padded.append(t)
|
||||
result[key] = torch.cat(padded, dim=0)
|
||||
else:
|
||||
# Vision tensors (pixel_values_videos, image_grid_thw, etc.) are expected
|
||||
# to have matching shapes since max_frames is applied uniformly per batch
|
||||
result[key] = torch.cat(tensors, dim=0)
|
||||
|
||||
for key in encoded_list[0]:
|
||||
if key not in result:
|
||||
result[key] = encoded_list[0][key]
|
||||
labels = torch.full_like(input_ids, -100)
|
||||
labels[:, -1] = input_ids[:, -1]
|
||||
result["labels"] = labels
|
||||
return result
|
||||
|
||||
def transform_features(
|
||||
@@ -325,8 +275,8 @@ def make_topreward_pre_post_processors(
|
||||
"""Pipeline that pre-encodes frames + task into Qwen-VL tensors.
|
||||
|
||||
The preprocessor adds a batch dimension if needed, runs TOPReward's
|
||||
encoder (which tokenises the full prompt and emits ``prompt_length``),
|
||||
and moves everything to the configured device. The postprocessor is
|
||||
encoder (which tokenises the full prompt and emits ``labels``), and
|
||||
moves everything to the configured device. The postprocessor is
|
||||
the identity since TOPReward outputs a single reward tensor.
|
||||
"""
|
||||
preprocessor = PolicyProcessorPipeline[dict[str, Any], dict[str, Any]](
|
||||
|
||||
@@ -24,7 +24,7 @@ import torch
|
||||
from lerobot.configs.rewards import RewardModelConfig
|
||||
from lerobot.rewards.factory import get_reward_model_class, make_reward_model_config
|
||||
from lerobot.rewards.topreward import TOPRewardConfig
|
||||
from lerobot.rewards.topreward.processor_topreward import TOPREWARD_FEATURE_PREFIX
|
||||
from lerobot.rewards.topreward.processor_topreward import TOPREWARD_FEATURE_PREFIX, TOPREWARD_INPUT_KEYS
|
||||
from tests.utils import skip_if_package_missing
|
||||
|
||||
|
||||
@@ -45,20 +45,23 @@ class _FakeQwenModel(torch.nn.Module):
|
||||
def from_pretrained(cls, *args, **kwargs): # noqa: ARG003
|
||||
return cls()
|
||||
|
||||
def forward(self, input_ids, attention_mask=None, labels=None, **kwargs): # noqa: ARG002
|
||||
def forward( # noqa: ARG002
|
||||
self, input_ids, attention_mask=None, labels=None, logits_to_keep=0, **kwargs
|
||||
):
|
||||
batch_size, seq_len = input_ids.shape
|
||||
vocab_size = 1000
|
||||
logits = torch.zeros(batch_size, seq_len, vocab_size)
|
||||
# Place a controlled log-prob at the target token position so the
|
||||
# model returns a predictable reward value.
|
||||
# The label-masked suffix is the last token (prompt_length = seq_len - 1).
|
||||
# The label-masked suffix is the last token.
|
||||
# After the causal-LM shift (logits[:, :-1], labels[:, 1:]) the scored
|
||||
# position is logits[:, -2, :] predicting labels[:, -1].
|
||||
# We set logits so that log_softmax at the target token ≈ _reward_value.
|
||||
if labels is not None:
|
||||
for i in range(batch_size):
|
||||
target_idx = int(input_ids[i, -1].item())
|
||||
logits[i, -2, target_idx] = self._reward_value * -10 # high logit -> high log-prob
|
||||
if logits_to_keep:
|
||||
logits = logits[:, -logits_to_keep:, :]
|
||||
return SimpleNamespace(logits=logits)
|
||||
|
||||
|
||||
@@ -72,17 +75,39 @@ def _patch_build(monkeypatch) -> None:
|
||||
def _make_batch(
|
||||
input_ids: torch.Tensor,
|
||||
attention_mask: torch.Tensor | None = None,
|
||||
prompt_length: torch.Tensor | None = None,
|
||||
labels: torch.Tensor | None = None,
|
||||
*,
|
||||
omit: str | None = None,
|
||||
) -> dict[str, torch.Tensor]:
|
||||
"""Build a ``compute_reward``-ready batch using TOPReward's namespaced keys."""
|
||||
batch: dict[str, torch.Tensor] = {f"{TOPREWARD_FEATURE_PREFIX}input_ids": input_ids}
|
||||
if attention_mask is not None:
|
||||
batch[f"{TOPREWARD_FEATURE_PREFIX}attention_mask"] = attention_mask
|
||||
if prompt_length is not None:
|
||||
batch[f"{TOPREWARD_FEATURE_PREFIX}prompt_length"] = prompt_length
|
||||
batch_size, seq_len = input_ids.shape
|
||||
if attention_mask is None:
|
||||
attention_mask = torch.ones(batch_size, seq_len, dtype=torch.long)
|
||||
batch: dict[str, torch.Tensor] = {}
|
||||
if labels is not None:
|
||||
batch[f"{TOPREWARD_FEATURE_PREFIX}labels"] = labels
|
||||
batch.update(
|
||||
{
|
||||
f"{TOPREWARD_FEATURE_PREFIX}input_ids": input_ids,
|
||||
f"{TOPREWARD_FEATURE_PREFIX}attention_mask": attention_mask,
|
||||
f"{TOPREWARD_FEATURE_PREFIX}pixel_values_videos": torch.zeros(
|
||||
batch_size, 1536, dtype=torch.float32
|
||||
),
|
||||
f"{TOPREWARD_FEATURE_PREFIX}video_grid_thw": torch.ones(batch_size, 3, dtype=torch.long),
|
||||
f"{TOPREWARD_FEATURE_PREFIX}mm_token_type_ids": torch.zeros_like(input_ids),
|
||||
}
|
||||
)
|
||||
if omit is not None:
|
||||
batch.pop(f"{TOPREWARD_FEATURE_PREFIX}{omit}", None)
|
||||
return batch
|
||||
|
||||
|
||||
def _terminal_labels(input_ids: torch.Tensor) -> torch.Tensor:
|
||||
labels = torch.full_like(input_ids, -100)
|
||||
labels[:, -1] = input_ids[:, -1]
|
||||
return labels
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Registry + factory
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -105,11 +130,6 @@ def test_topreward_factory_returns_in_tree_class():
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_topreward_config_rejects_bad_reduction():
|
||||
with pytest.raises(ValueError, match="reduction must be"):
|
||||
TOPRewardConfig(device="cpu", reduction="median")
|
||||
|
||||
|
||||
def test_topreward_config_rejects_zero_max_frames():
|
||||
with pytest.raises(ValueError, match="max_frames must be >= 1"):
|
||||
TOPRewardConfig(device="cpu", max_frames=0)
|
||||
@@ -142,9 +162,9 @@ def test_topreward_compute_reward_returns_one_scalar_per_sample(monkeypatch):
|
||||
|
||||
input_ids = torch.randint(0, 100, (2, 10))
|
||||
attention_mask = torch.ones(2, 10, dtype=torch.long)
|
||||
prompt_length = torch.tensor([9, 9]) # unmask only the last token
|
||||
labels = _terminal_labels(input_ids)
|
||||
|
||||
batch = _make_batch(input_ids, attention_mask, prompt_length)
|
||||
batch = _make_batch(input_ids, attention_mask, labels)
|
||||
rewards = model.compute_reward(batch)
|
||||
|
||||
assert rewards.shape == (2,)
|
||||
@@ -162,9 +182,9 @@ def test_topreward_compute_reward_applies_success_threshold(monkeypatch):
|
||||
|
||||
input_ids = torch.randint(0, 100, (2, 10))
|
||||
attention_mask = torch.ones(2, 10, dtype=torch.long)
|
||||
prompt_length = torch.tensor([9, 9])
|
||||
labels = _terminal_labels(input_ids)
|
||||
|
||||
batch = _make_batch(input_ids, attention_mask, prompt_length)
|
||||
batch = _make_batch(input_ids, attention_mask, labels)
|
||||
rewards = model.compute_reward(batch)
|
||||
|
||||
assert rewards.shape == (2,)
|
||||
@@ -180,7 +200,37 @@ def test_topreward_compute_reward_errors_when_inputs_missing(monkeypatch):
|
||||
model = TOPRewardModel(cfg)
|
||||
|
||||
with pytest.raises(KeyError, match=r"observation\.topreward\.input_ids"):
|
||||
model.compute_reward({})
|
||||
model.compute_reward(_make_batch(torch.randint(0, 100, (1, 10)), omit="input_ids"))
|
||||
|
||||
|
||||
@skip_if_package_missing("transformers")
|
||||
def test_topreward_compute_reward_errors_when_labels_missing(monkeypatch):
|
||||
from lerobot.rewards.topreward.modeling_topreward import TOPRewardModel
|
||||
|
||||
_patch_build(monkeypatch)
|
||||
cfg = TOPRewardConfig(device="cpu")
|
||||
model = TOPRewardModel(cfg)
|
||||
|
||||
input_ids = torch.randint(0, 100, (1, 10))
|
||||
with pytest.raises(KeyError, match=r"observation\.topreward\.labels"):
|
||||
model.compute_reward(_make_batch(input_ids, labels=None))
|
||||
|
||||
|
||||
@skip_if_package_missing("transformers")
|
||||
def test_topreward_compute_reward_requires_all_encoder_keys(monkeypatch):
|
||||
from lerobot.rewards.topreward.modeling_topreward import TOPRewardModel
|
||||
|
||||
_patch_build(monkeypatch)
|
||||
cfg = TOPRewardConfig(device="cpu")
|
||||
model = TOPRewardModel(cfg)
|
||||
|
||||
input_ids = torch.randint(0, 100, (1, 10))
|
||||
labels = _terminal_labels(input_ids)
|
||||
required_encoder_keys = set(TOPREWARD_INPUT_KEYS) - {"input_ids", "labels"}
|
||||
|
||||
for key in required_encoder_keys:
|
||||
with pytest.raises(KeyError, match=rf"observation\.topreward\.{key}"):
|
||||
model.compute_reward(_make_batch(input_ids, labels=labels, omit=key))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -198,7 +248,6 @@ def test_topreward_save_pretrained_writes_only_config_json(monkeypatch, tmp_path
|
||||
cfg = TOPRewardConfig(
|
||||
device="cpu",
|
||||
vlm_name="Qwen/Qwen3-VL-8B-Instruct",
|
||||
reduction="sum",
|
||||
fps=4.0,
|
||||
image_key="observation.images.front",
|
||||
)
|
||||
@@ -217,7 +266,6 @@ def test_topreward_from_pretrained_local_dir_roundtrips_config(monkeypatch, tmp_
|
||||
cfg = TOPRewardConfig(
|
||||
device="cpu",
|
||||
vlm_name="Qwen/Qwen3-VL-8B-Instruct",
|
||||
reduction="sum",
|
||||
fps=4.0,
|
||||
image_key="observation.images.front",
|
||||
add_chat_template=True,
|
||||
@@ -229,7 +277,6 @@ def test_topreward_from_pretrained_local_dir_roundtrips_config(monkeypatch, tmp_
|
||||
|
||||
assert isinstance(reloaded.config, TOPRewardConfig)
|
||||
assert reloaded.config.vlm_name == "Qwen/Qwen3-VL-8B-Instruct"
|
||||
assert reloaded.config.reduction == "sum"
|
||||
assert reloaded.config.fps == 4.0
|
||||
assert reloaded.config.image_key == "observation.images.front"
|
||||
assert reloaded.config.add_chat_template is True
|
||||
|
||||
@@ -0,0 +1,80 @@
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""End-to-end TOPReward smoke test with the real Qwen3-VL model."""
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
pytest.importorskip("transformers")
|
||||
|
||||
from lerobot.rewards.topreward.configuration_topreward import TOPRewardConfig # noqa: E402
|
||||
from lerobot.rewards.topreward.modeling_topreward import TOPRewardModel # noqa: E402
|
||||
from lerobot.rewards.topreward.processor_topreward import ( # noqa: E402
|
||||
TOPREWARD_FEATURE_PREFIX,
|
||||
TOPREWARD_INPUT_KEYS,
|
||||
make_topreward_pre_post_processors,
|
||||
)
|
||||
from tests.utils import require_cuda # noqa: E402
|
||||
|
||||
pytestmark = pytest.mark.skipif(
|
||||
os.environ.get("CI") == "true" or os.environ.get("GITHUB_ACTIONS") == "true",
|
||||
reason="This test requires downloading and loading Qwen3-VL and is not meant for CI",
|
||||
)
|
||||
|
||||
|
||||
def _make_dummy_topreward_batch(image_key: str, task_key: str) -> dict[str, object]:
|
||||
num_frames = 4
|
||||
image_size = 64
|
||||
frames = torch.zeros(1, num_frames, 3, image_size, image_size, dtype=torch.uint8)
|
||||
for frame_idx in range(num_frames):
|
||||
frames[0, frame_idx, 0].fill_(min(frame_idx * 48, 255))
|
||||
frames[0, frame_idx, 1].fill_(96)
|
||||
frames[0, frame_idx, 2].fill_(192)
|
||||
|
||||
return {
|
||||
image_key: frames,
|
||||
task_key: ["pick up the red cube"],
|
||||
}
|
||||
|
||||
|
||||
@require_cuda
|
||||
def test_topreward_full_qwen3vl_preprocessor_to_compute_reward():
|
||||
cfg = TOPRewardConfig(
|
||||
vlm_name="Qwen/Qwen3-VL-8B-Instruct",
|
||||
device="cuda",
|
||||
max_frames=4,
|
||||
fps=2.0,
|
||||
max_input_length=4096,
|
||||
)
|
||||
|
||||
preprocessor, _ = make_topreward_pre_post_processors(cfg)
|
||||
encoded_batch = preprocessor(_make_dummy_topreward_batch(cfg.image_key, cfg.task_key))
|
||||
for key in TOPREWARD_INPUT_KEYS:
|
||||
assert f"{TOPREWARD_FEATURE_PREFIX}{key}" in encoded_batch
|
||||
|
||||
model = TOPRewardModel(cfg)
|
||||
try:
|
||||
model.to(cfg.device)
|
||||
model.eval()
|
||||
rewards = model.compute_reward(encoded_batch)
|
||||
finally:
|
||||
del model
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
assert rewards.shape == (1,)
|
||||
assert rewards.dtype == torch.float32
|
||||
assert torch.isfinite(rewards).all()
|
||||
@@ -16,71 +16,71 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from lerobot.configs import FeatureType, PipelineFeatureType, PolicyFeature
|
||||
from lerobot.rewards.topreward.processor_topreward import (
|
||||
TOPREWARD_FEATURE_PREFIX,
|
||||
TOPREWARD_INPUT_KEYS,
|
||||
_expand_tasks,
|
||||
_video_to_numpy,
|
||||
_prepare_video_batch,
|
||||
)
|
||||
from lerobot.types import TransitionKey
|
||||
from tests.utils import skip_if_package_missing
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _video_to_numpy — pure (T, C, H, W) -> (T, H, W, C) uint8 conversion
|
||||
# _prepare_video_batch — raw image/video batch -> (B, T, C, H, W) uint8
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_video_to_numpy_chw_float_is_converted_to_thwc_uint8():
|
||||
video = torch.rand(4, 3, 8, 8)
|
||||
array = _video_to_numpy(video, max_frames=None)
|
||||
def test_prepare_video_batch_batched_chw_float_is_converted_to_uint8():
|
||||
video = torch.rand(2, 4, 3, 8, 8)
|
||||
tensor = _prepare_video_batch(video, max_frames=None)
|
||||
|
||||
assert array.shape == (4, 8, 8, 3)
|
||||
assert array.dtype == np.uint8
|
||||
assert array.min() >= 0 and array.max() <= 255
|
||||
assert tensor.shape == (2, 4, 3, 8, 8)
|
||||
assert tensor.dtype == torch.uint8
|
||||
assert tensor.min() >= 0 and tensor.max() <= 255
|
||||
|
||||
|
||||
def test_video_to_numpy_already_thwc_uint8_passes_through():
|
||||
video = torch.randint(0, 256, (3, 8, 8, 3), dtype=torch.uint8)
|
||||
array = _video_to_numpy(video, max_frames=None)
|
||||
def test_prepare_video_batch_batched_thwc_uint8_is_permuted_to_channel_first():
|
||||
video = torch.randint(0, 256, (2, 3, 8, 8, 3), dtype=torch.uint8)
|
||||
tensor = _prepare_video_batch(video, max_frames=None)
|
||||
|
||||
assert array.shape == (3, 8, 8, 3)
|
||||
assert array.dtype == np.uint8
|
||||
assert tensor.shape == (2, 3, 3, 8, 8)
|
||||
assert tensor.dtype == torch.uint8
|
||||
|
||||
|
||||
def test_video_to_numpy_max_frames_tail_crops_recent_frames():
|
||||
video = torch.zeros(10, 3, 4, 4)
|
||||
def test_prepare_video_batch_max_frames_tail_crops_recent_frames():
|
||||
video = torch.zeros(1, 10, 3, 4, 4)
|
||||
for t in range(10):
|
||||
video[t] = t / 9.0
|
||||
video[:, t] = t / 9.0
|
||||
|
||||
array = _video_to_numpy(video, max_frames=3)
|
||||
tensor = _prepare_video_batch(video, max_frames=3)
|
||||
|
||||
assert array.shape == (3, 4, 4, 3)
|
||||
assert int(array[0, 0, 0, 0]) == int(round(7 / 9 * 255))
|
||||
assert int(array[-1, 0, 0, 0]) == 255
|
||||
assert tensor.shape == (1, 3, 3, 4, 4)
|
||||
assert int(tensor[0, 0, 0, 0, 0]) == int(7 / 9 * 255)
|
||||
assert int(tensor[0, -1, 0, 0, 0]) == 255
|
||||
|
||||
|
||||
def test_video_to_numpy_rejects_3d_input():
|
||||
with pytest.raises(ValueError, match="Expected channel dim"):
|
||||
_video_to_numpy(torch.zeros(4, 8, 8), max_frames=None)
|
||||
def test_prepare_video_batch_rejects_3d_input():
|
||||
with pytest.raises(ValueError, match="Expected TOPReward frames"):
|
||||
_prepare_video_batch(torch.zeros(4, 8, 8), max_frames=None)
|
||||
|
||||
|
||||
def test_video_to_numpy_floats_above_one_pass_through_without_rescaling():
|
||||
video = torch.full((1, 3, 2, 2), 5.0)
|
||||
array = _video_to_numpy(video, max_frames=None)
|
||||
def test_prepare_video_batch_floats_above_one_are_rescaled_and_clipped():
|
||||
video = torch.full((1, 1, 3, 2, 2), 5.0)
|
||||
tensor = _prepare_video_batch(video, max_frames=None)
|
||||
|
||||
assert array.shape == (1, 2, 2, 3)
|
||||
assert int(array.max()) == 5
|
||||
assert tensor.shape == (1, 1, 3, 2, 2)
|
||||
assert int(tensor.max()) == 255
|
||||
|
||||
|
||||
def test_video_to_numpy_clips_very_large_floats_to_uint8_max():
|
||||
video = torch.full((1, 3, 2, 2), 300.0)
|
||||
array = _video_to_numpy(video, max_frames=None)
|
||||
def test_prepare_video_batch_clips_very_large_floats_to_uint8_max():
|
||||
video = torch.full((1, 1, 3, 2, 2), 300.0)
|
||||
tensor = _prepare_video_batch(video, max_frames=None)
|
||||
|
||||
assert int(array.max()) == 255
|
||||
assert int(tensor.max()) == 255
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -124,12 +124,11 @@ def test_expand_tasks_wrong_type_raises():
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Encoder step — stubbed AutoProcessor + process_vision_info
|
||||
# Encoder step — stubbed AutoProcessor
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _skip_if_topreward_extras_missing(func):
|
||||
func = skip_if_package_missing("qwen-vl-utils", import_name="qwen_vl_utils")(func)
|
||||
func = skip_if_package_missing("transformers")(func)
|
||||
return func
|
||||
|
||||
@@ -155,32 +154,20 @@ class _FakeAutoProcessor:
|
||||
|
||||
def __call__(self, text=None, images=None, videos=None, **kwargs): # noqa: ARG002
|
||||
seq_len = 10
|
||||
batch_size = len(text) if isinstance(text, list) else 1
|
||||
return {
|
||||
"input_ids": torch.randint(0, 100, (1, seq_len)),
|
||||
"attention_mask": torch.ones(1, seq_len, dtype=torch.long),
|
||||
"input_ids": torch.randint(0, 100, (batch_size, seq_len)),
|
||||
"attention_mask": torch.ones(batch_size, seq_len, dtype=torch.long),
|
||||
"pixel_values_videos": torch.zeros(batch_size, 1536, dtype=torch.float32),
|
||||
"video_grid_thw": torch.ones(batch_size, 3, dtype=torch.long),
|
||||
"mm_token_type_ids": torch.zeros(batch_size, seq_len, dtype=torch.long),
|
||||
}
|
||||
|
||||
|
||||
def _build_step(monkeypatch, **overrides):
|
||||
import importlib
|
||||
import sys
|
||||
import types
|
||||
|
||||
from lerobot.rewards.topreward import processor_topreward
|
||||
from lerobot.utils import import_utils
|
||||
|
||||
monkeypatch.setattr(processor_topreward, "AutoProcessor", _FakeAutoProcessor)
|
||||
|
||||
# Stub qwen_vl_utils as a real module object (not MagicMock) so
|
||||
# ``require_package`` / ``find_spec`` don't choke on a missing ``__spec__``.
|
||||
fake_qwen_vl = types.ModuleType("qwen_vl_utils")
|
||||
fake_qwen_vl.process_vision_info = lambda messages: (None, None) # type: ignore[attr-defined]
|
||||
fake_qwen_vl.__spec__ = importlib.machinery.ModuleSpec("qwen_vl_utils", None)
|
||||
monkeypatch.setitem(sys.modules, "qwen_vl_utils", fake_qwen_vl)
|
||||
|
||||
# Clear the require_package cache so the stub is picked up.
|
||||
import_utils._require_package_cache.pop("qwen_vl_utils", None)
|
||||
|
||||
return processor_topreward.TOPRewardEncoderProcessorStep(**overrides)
|
||||
|
||||
|
||||
@@ -192,27 +179,29 @@ def _make_transition(observation: dict, complementary: dict | None = None) -> di
|
||||
|
||||
|
||||
@_skip_if_topreward_extras_missing
|
||||
def test_encoder_step_emits_input_ids_and_prompt_length(monkeypatch):
|
||||
def test_encoder_step_emits_input_ids_and_labels(monkeypatch):
|
||||
"""The processor must emit Qwen-VL tensors including ``input_ids`` and
|
||||
``prompt_length`` under the ``observation.topreward.*`` namespace."""
|
||||
``labels`` under the ``observation.topreward.*`` namespace."""
|
||||
step = _build_step(monkeypatch)
|
||||
|
||||
frames_batch = torch.zeros(1, 4, 3, 8, 8)
|
||||
frames_batch = torch.zeros(2, 4, 3, 8, 8)
|
||||
out = step(
|
||||
_make_transition(
|
||||
observation={"observation.images.top": frames_batch},
|
||||
complementary={"task": "pick"},
|
||||
complementary={"task": ["pick", "place"]},
|
||||
)
|
||||
)
|
||||
|
||||
obs_out = out[TransitionKey.OBSERVATION]
|
||||
assert f"{TOPREWARD_FEATURE_PREFIX}input_ids" in obs_out
|
||||
assert f"{TOPREWARD_FEATURE_PREFIX}attention_mask" in obs_out
|
||||
assert f"{TOPREWARD_FEATURE_PREFIX}prompt_length" in obs_out
|
||||
for key in TOPREWARD_INPUT_KEYS:
|
||||
assert f"{TOPREWARD_FEATURE_PREFIX}{key}" in obs_out
|
||||
|
||||
prompt_length = obs_out[f"{TOPREWARD_FEATURE_PREFIX}prompt_length"]
|
||||
assert prompt_length.dtype == torch.long
|
||||
assert prompt_length.shape == (1,)
|
||||
input_ids = obs_out[f"{TOPREWARD_FEATURE_PREFIX}input_ids"]
|
||||
labels = obs_out[f"{TOPREWARD_FEATURE_PREFIX}labels"]
|
||||
assert labels.dtype == torch.long
|
||||
assert labels.shape == (2, 10)
|
||||
assert labels[:, :-1].eq(-100).all()
|
||||
assert labels[:, -1].equal(input_ids[:, -1])
|
||||
|
||||
|
||||
@_skip_if_topreward_extras_missing
|
||||
@@ -255,10 +244,3 @@ def test_encoder_step_rejects_missing_image_key(monkeypatch):
|
||||
step = _build_step(monkeypatch, image_key="observation.images.top")
|
||||
with pytest.raises(KeyError, match="image key"):
|
||||
step(_make_transition(observation={}, complementary={"task": "pick"}))
|
||||
|
||||
|
||||
@_skip_if_topreward_extras_missing
|
||||
def test_encoder_step_rejects_non_dict_observation(monkeypatch):
|
||||
step = _build_step(monkeypatch)
|
||||
with pytest.raises(ValueError, match="observation dict"):
|
||||
step({TransitionKey.OBSERVATION: torch.zeros(1, 3, 8, 8)})
|
||||
|
||||
@@ -3010,7 +3010,6 @@ test = [
|
||||
{ name = "pytest-timeout" },
|
||||
]
|
||||
topreward = [
|
||||
{ name = "qwen-vl-utils" },
|
||||
{ name = "transformers" },
|
||||
]
|
||||
training = [
|
||||
@@ -3158,7 +3157,6 @@ requires-dist = [
|
||||
{ name = "lerobot", extras = ["pyzmq-dep"], marker = "extra == 'unitree-g1'" },
|
||||
{ name = "lerobot", extras = ["qwen-vl-utils-dep"], marker = "extra == 'eo1'" },
|
||||
{ name = "lerobot", extras = ["qwen-vl-utils-dep"], marker = "extra == 'sarm'" },
|
||||
{ name = "lerobot", extras = ["qwen-vl-utils-dep"], marker = "extra == 'topreward'" },
|
||||
{ name = "lerobot", extras = ["qwen-vl-utils-dep"], marker = "extra == 'wallx'" },
|
||||
{ name = "lerobot", extras = ["reachy2"], marker = "extra == 'all'" },
|
||||
{ name = "lerobot", extras = ["rebot"], marker = "extra == 'all'" },
|
||||
|
||||
Reference in New Issue
Block a user