refactor(rewards): clean up TOPReward processor/model

2026-07-23 09:46:00 +00:00 · 2026-05-20 17:39:21 +02:00
parent 70ad322676
commit f6ecb7b955
7 changed files with 568 additions and 928 deletions
@@ -16,67 +16,71 @@

 from __future__ import annotations

-import numpy as np
+from types import SimpleNamespace
+
 import pytest
 import torch

 from lerobot.configs.rewards import RewardModelConfig
 from lerobot.rewards.factory import get_reward_model_class, make_reward_model_config
 from lerobot.rewards.topreward import TOPRewardConfig
-from lerobot.rewards.topreward.modeling_topreward import minmax_normalize_rewards
 from lerobot.rewards.topreward.processor_topreward import TOPREWARD_FEATURE_PREFIX
 from tests.utils import skip_if_package_missing


-class _FakeTokenizer:
-    """Minimal tokenizer surface used by ``TOPRewardModel._compute_log_prob_reward``."""
-
-    eos_token = "<|endoftext|>"
-
-
-class _FakeProcessor:
-    """Stand-in for the Qwen ``AutoProcessor`` returned by ``from_pretrained``."""
-
-    def __init__(self) -> None:
-        self.tokenizer = _FakeTokenizer()
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):  # noqa: ARG003
-        return cls()
-
-
 class _FakeQwenModel(torch.nn.Module):
    """Stand-in for ``Qwen3VLForConditionalGeneration``.

-    Provides the minimum surface ``TOPRewardModel`` touches at construction
-    time (a ``parameters()`` iterator for device inference). Actual
-    ``_compute_log_prob_reward`` calls are bypassed by monkey-patching the
-    method directly in the tests, so we never invoke ``self.model(...)``.
+    Returns a ``SimpleNamespace`` with ``logits`` of a controlled shape so
+    the log-prob extraction path in ``compute_reward`` can be exercised
+    without downloading real VLM weights.
    """

    def __init__(self) -> None:
        super().__init__()
        self._param = torch.nn.Parameter(torch.zeros(1))
+        self._reward_value: float = -1.5

    @classmethod
    def from_pretrained(cls, *args, **kwargs):  # noqa: ARG003
        return cls()

+    def forward(self, input_ids, attention_mask=None, labels=None, **kwargs):  # noqa: ARG002
+        batch_size, seq_len = input_ids.shape
+        vocab_size = 1000
+        logits = torch.zeros(batch_size, seq_len, vocab_size)
+        # Boost the logit of the target token at the scored position so the
+        # resulting reward is predictable.
+        # Assumes only the last token is left unmasked (prompt_length = seq_len - 1).
+        # After the causal-LM shift (logits[:, :-1], labels[:, 1:]) the scored
+        # position is logits[:, -2, :] predicting labels[:, -1].
+        # The target logit is -10 * _reward_value (15.0 by default), so its log-prob is just below 0.
+        if labels is not None:
+            for i in range(batch_size):
+                target_idx = int(input_ids[i, -1].item())
+                logits[i, -2, target_idx] = self._reward_value * -10  # high logit -> high log-prob
+        return SimpleNamespace(logits=logits)
+

 def _patch_build(monkeypatch) -> None:
    """Stub out HF AutoX so TOPReward construction is cheap and offline."""
    from lerobot.rewards.topreward import modeling_topreward

    monkeypatch.setattr(modeling_topreward, "Qwen3VLForConditionalGeneration", _FakeQwenModel)
-    monkeypatch.setattr(modeling_topreward, "AutoProcessor", _FakeProcessor)


-def _make_batch(frames: list[np.ndarray], tasks: list[str]) -> dict[str, list]:
+def _make_batch(
+    input_ids: torch.Tensor,
+    attention_mask: torch.Tensor | None = None,
+    prompt_length: torch.Tensor | None = None,
+) -> dict[str, torch.Tensor]:
    """Build a ``compute_reward``-ready batch using TOPReward's namespaced keys."""
-    return {
-        f"{TOPREWARD_FEATURE_PREFIX}frames": frames,
-        f"{TOPREWARD_FEATURE_PREFIX}task": tasks,
-    }
+    batch: dict[str, torch.Tensor] = {f"{TOPREWARD_FEATURE_PREFIX}input_ids": input_ids}
+    if attention_mask is not None:
+        batch[f"{TOPREWARD_FEATURE_PREFIX}attention_mask"] = attention_mask
+    if prompt_length is not None:
+        batch[f"{TOPREWARD_FEATURE_PREFIX}prompt_length"] = prompt_length
+    return batch


 # ---------------------------------------------------------------------------
@@ -121,32 +125,6 @@ def test_topreward_config_rejects_suffix_without_instruction_placeholder():
        TOPRewardConfig(device="cpu", prompt_suffix_template="no placeholder here")


-# ---------------------------------------------------------------------------
-# minmax_normalize_rewards — pure math helper
-# ---------------------------------------------------------------------------
-
-
-def test_minmax_normalize_rewards_maps_min_and_max_to_zero_and_one():
-    values = minmax_normalize_rewards([-3.0, -1.0, 0.0, -2.0])
-    assert values.shape == (4,)
-    assert values[0] == pytest.approx(0.0)
-    assert values[2] == pytest.approx(1.0)
-    # Monotonicity preserved within the input range.
-    assert values[3] == pytest.approx(1.0 / 3.0, abs=1e-6)
-
-
-def test_minmax_normalize_rewards_handles_singleton_and_flat_inputs():
-    # Single element -> mapped to 1.0 (no information to scale).
-    assert minmax_normalize_rewards([42.0]).tolist() == [1.0]
-    # All-equal values -> all ones (avoid divide-by-zero).
-    assert minmax_normalize_rewards([0.5, 0.5, 0.5]).tolist() == [1.0, 1.0, 1.0]
-
-
-def test_minmax_normalize_rewards_empty_input_returns_empty_array():
-    out = minmax_normalize_rewards([])
-    assert out.shape == (0,)
-
-
 # ---------------------------------------------------------------------------
 # compute_reward
 # ---------------------------------------------------------------------------
@@ -154,55 +132,43 @@ def test_minmax_normalize_rewards_empty_input_returns_empty_array():

@skip_if_package_missing("transformers")
 def test_topreward_compute_reward_returns_one_scalar_per_sample(monkeypatch):
+    """``compute_reward`` must return a ``(B,)`` float32 tensor with one
+    log-prob reward per sample, consuming pre-encoded Qwen-VL tensors."""
    from lerobot.rewards.topreward.modeling_topreward import TOPRewardModel

    _patch_build(monkeypatch)
    cfg = TOPRewardConfig(device="cpu")
    model = TOPRewardModel(cfg)

-    captured = []
-
-    def fake_log_prob(self, frames, instruction):  # noqa: ARG002
-        captured.append((frames.shape, instruction))
-        return -1.5
-
-    monkeypatch.setattr(TOPRewardModel, "_compute_log_prob_reward", fake_log_prob)
-
-    frames_a = np.zeros((4, 8, 8, 3), dtype=np.uint8)
-    frames_b = np.zeros((6, 8, 8, 3), dtype=np.uint8)
-    batch = _make_batch([frames_a, frames_b], ["pick the cube", "open the drawer"])
+    input_ids = torch.randint(0, 100, (2, 10))
+    attention_mask = torch.ones(2, 10, dtype=torch.long)
+    prompt_length = torch.tensor([9, 9])  # unmask only the last token

+    batch = _make_batch(input_ids, attention_mask, prompt_length)
    rewards = model.compute_reward(batch)

    assert rewards.shape == (2,)
    assert rewards.dtype == torch.float32
-    assert torch.allclose(rewards, torch.tensor([-1.5, -1.5]))
-    # `_compute_log_prob_reward` was called once per sample with the right tasks.
-    assert [task for _, task in captured] == ["pick the cube", "open the drawer"]
-    assert [shape[0] for shape, _ in captured] == [4, 6]


@skip_if_package_missing("transformers")
 def test_topreward_compute_reward_applies_success_threshold(monkeypatch):
-    """When ``success_threshold`` is finite, the model returns binary success
-    instead of the raw log-prob — useful as a drop-in success detector."""
+    """When ``success_threshold`` is finite, the model returns binary success."""
    from lerobot.rewards.topreward.modeling_topreward import TOPRewardModel

    _patch_build(monkeypatch)
-    cfg = TOPRewardConfig(device="cpu", success_threshold=-2.0)
+    cfg = TOPRewardConfig(device="cpu", success_threshold=0.0)
    model = TOPRewardModel(cfg)

-    rewards_in = iter([-1.5, -3.0])  # first above threshold, second below
-    monkeypatch.setattr(
-        TOPRewardModel,
-        "_compute_log_prob_reward",
-        lambda _self, _frames, _instr: next(rewards_in),
-    )
+    input_ids = torch.randint(0, 100, (2, 10))
+    attention_mask = torch.ones(2, 10, dtype=torch.long)
+    prompt_length = torch.tensor([9, 9])

-    frames = [np.zeros((2, 8, 8, 3), dtype=np.uint8), np.zeros((2, 8, 8, 3), dtype=np.uint8)]
-    rewards = model.compute_reward(_make_batch(frames, ["task", "task"]))
+    batch = _make_batch(input_ids, attention_mask, prompt_length)
+    rewards = model.compute_reward(batch)

-    assert torch.equal(rewards, torch.tensor([1.0, 0.0]))
+    assert rewards.shape == (2,)
+    assert set(rewards.tolist()).issubset({0.0, 1.0})


@skip_if_package_missing("transformers")
@@ -213,137 +179,10 @@ def test_topreward_compute_reward_errors_when_inputs_missing(monkeypatch):
    cfg = TOPRewardConfig(device="cpu")
    model = TOPRewardModel(cfg)

-    with pytest.raises(KeyError, match=r"observation\.topreward\."):
+    with pytest.raises(KeyError, match=r"observation\.topreward\.input_ids"):
        model.compute_reward({})


-@skip_if_package_missing("transformers")
-def test_topreward_compute_reward_errors_when_batch_sizes_mismatch(monkeypatch):
-    """frames and task lists must have matching lengths — a stale processor
-    that produces only one task for a multi-sample batch should surface as
-    an explicit error, not a silent zip truncation."""
-    from lerobot.rewards.topreward.modeling_topreward import TOPRewardModel
-
-    _patch_build(monkeypatch)
-    cfg = TOPRewardConfig(device="cpu")
-    model = TOPRewardModel(cfg)
-    monkeypatch.setattr(
-        TOPRewardModel,
-        "_compute_log_prob_reward",
-        lambda _self, _frames, _instr: 0.0,
-    )
-
-    frames = [np.zeros((2, 8, 8, 3), dtype=np.uint8), np.zeros((2, 8, 8, 3), dtype=np.uint8)]
-    with pytest.raises(ValueError, match="task batch size"):
-        model.compute_reward(_make_batch(frames, ["only one task"]))
-
-
-# ---------------------------------------------------------------------------
-# predict_curves
-# ---------------------------------------------------------------------------
-
-
-@skip_if_package_missing("transformers")
-def test_topreward_predict_curves_runs_one_forward_per_prefix(monkeypatch):
-    """``predict_curves`` must call the VLM once per prefix length per
-    trajectory and write min-max-normalised values back into the curve."""
-    from lerobot.rewards.topreward.modeling_topreward import TOPRewardModel
-
-    _patch_build(monkeypatch)
-    cfg = TOPRewardConfig(device="cpu")
-    model = TOPRewardModel(cfg)
-
-    # Simulate a strictly increasing log-prob curve as the prefix grows.
-    call_log: list[int] = []
-
-    def fake_log_prob(self, frames, instruction):  # noqa: ARG002
-        call_log.append(int(frames.shape[0]))
-        return float(frames.shape[0])  # log-prob = prefix length
-
-    monkeypatch.setattr(TOPRewardModel, "_compute_log_prob_reward", fake_log_prob)
-
-    frames = np.zeros((5, 8, 8, 3), dtype=np.uint8)
-    batch = _make_batch([frames], ["lift the cup"])
-    out = model.predict_curves(batch)
-
-    # One forward per prefix length, in order.
-    assert call_log == [1, 2, 3, 4, 5]
-    # (B, T_max) shape, padded with NaN beyond each trajectory's length.
-    assert out["progress"].shape == (1, 5)
-    # Strictly increasing raw rewards -> min-max-normalised to [0, 1] linearly.
-    expected = torch.tensor([[0.0, 0.25, 0.5, 0.75, 1.0]])
-    assert torch.allclose(out["progress"], expected, atol=1e-6)
-
-
-@skip_if_package_missing("transformers")
-def test_topreward_predict_curves_sparse_dense_interpolates_to_full_resolution(monkeypatch):
-    """With ``num_prefixes < N`` the model should score only the requested
-    number of anchor prefixes and linearly interpolate between them — the
-    upstream sparse-dense pattern (``num_samples=15``)."""
-    from lerobot.rewards.topreward.modeling_topreward import TOPRewardModel
-
-    _patch_build(monkeypatch)
-    cfg = TOPRewardConfig(device="cpu")
-    model = TOPRewardModel(cfg)
-
-    call_log: list[int] = []
-
-    def fake_log_prob(self, frames, instruction):  # noqa: ARG002
-        call_log.append(int(frames.shape[0]))
-        return float(frames.shape[0])
-
-    monkeypatch.setattr(TOPRewardModel, "_compute_log_prob_reward", fake_log_prob)
-
-    frames = np.zeros((9, 8, 8, 3), dtype=np.uint8)
-    out = model.predict_curves(_make_batch([frames], ["lift the cup"]), num_prefixes=3)
-
-    # 3 anchors at linspace(1, 9, 3) -> [1, 5, 9] -> 3 VLM forwards instead of 9.
-    assert call_log == [1, 5, 9]
-    # Returned curve is full resolution (9 frames) and monotone in [0, 1].
-    assert out["progress"].shape == (1, 9)
-    curve = out["progress"][0].numpy()
-    assert curve[0] == pytest.approx(0.0)
-    assert curve[-1] == pytest.approx(1.0)
-    assert np.all(np.diff(curve) >= 0)
-
-
-@skip_if_package_missing("transformers")
-def test_topreward_predict_curves_rejects_invalid_num_prefixes(monkeypatch):
-    from lerobot.rewards.topreward.modeling_topreward import TOPRewardModel
-
-    _patch_build(monkeypatch)
-    model = TOPRewardModel(TOPRewardConfig(device="cpu"))
-    batch = _make_batch([np.zeros((3, 8, 8, 3), dtype=np.uint8)], ["task"])
-    with pytest.raises(ValueError, match="num_prefixes must be"):
-        model.predict_curves(batch, num_prefixes=0)
-
-
-@skip_if_package_missing("transformers")
-def test_topreward_predict_curves_right_pads_with_nan_for_variable_lengths(monkeypatch):
-    """Trajectories of different lengths in the same batch are right-padded
-    with ``NaN`` so the output is a regular ``(B, T_max)`` tensor."""
-    from lerobot.rewards.topreward.modeling_topreward import TOPRewardModel
-
-    _patch_build(monkeypatch)
-    cfg = TOPRewardConfig(device="cpu")
-    model = TOPRewardModel(cfg)
-    monkeypatch.setattr(
-        TOPRewardModel,
-        "_compute_log_prob_reward",
-        lambda _self, frames, _instr: float(frames.shape[0]),
-    )
-
-    frames_short = np.zeros((2, 8, 8, 3), dtype=np.uint8)
-    frames_long = np.zeros((4, 8, 8, 3), dtype=np.uint8)
-    out = model.predict_curves(_make_batch([frames_short, frames_long], ["a", "b"]))
-
-    assert out["progress"].shape == (2, 4)
-    # Trailing entries for the shorter trajectory are NaN.
-    assert torch.isnan(out["progress"][0, 2:]).all()
-    # The longer trajectory has no NaNs.
-    assert not torch.isnan(out["progress"][1]).any()
-
-
 # ---------------------------------------------------------------------------
 # Save / load — config-only checkpoint
 # ---------------------------------------------------------------------------
@@ -351,10 +190,6 @@ def test_topreward_predict_curves_right_pads_with_nan_for_variable_lengths(monke

@skip_if_package_missing("transformers")
 def test_topreward_save_pretrained_writes_only_config_json(monkeypatch, tmp_path):
-    """A TOPReward "checkpoint" is just ``config.json``. Writing
-    ``model.safetensors`` would only duplicate ~16 GB of Qwen weights for
-    no benefit, so :meth:`_save_pretrained` must skip it entirely.
-    """
    from huggingface_hub.constants import CONFIG_NAME, SAFETENSORS_SINGLE_FILE

    from lerobot.rewards.topreward.modeling_topreward import TOPRewardModel
@@ -371,13 +206,11 @@ def test_topreward_save_pretrained_writes_only_config_json(monkeypatch, tmp_path
    model.save_pretrained(str(tmp_path))

    assert (tmp_path / CONFIG_NAME).exists()
-    # Zero-shot model: no safetensors written by `_save_pretrained`.
    assert not (tmp_path / SAFETENSORS_SINGLE_FILE).exists()


@skip_if_package_missing("transformers")
 def test_topreward_from_pretrained_local_dir_roundtrips_config(monkeypatch, tmp_path):
-    """Save a TOPRewardConfig locally and reload it — user knobs must survive."""
    from lerobot.rewards.topreward.modeling_topreward import TOPRewardModel

    _patch_build(monkeypatch)
@@ -387,7 +220,6 @@ def test_topreward_from_pretrained_local_dir_roundtrips_config(monkeypatch, tmp_
        reduction="sum",
        fps=4.0,
        image_key="observation.images.front",
-        use_video_description=True,
        add_chat_template=True,
        success_threshold=-1.5,
    )
@@ -400,16 +232,12 @@ def test_topreward_from_pretrained_local_dir_roundtrips_config(monkeypatch, tmp_
    assert reloaded.config.reduction == "sum"
    assert reloaded.config.fps == 4.0
    assert reloaded.config.image_key == "observation.images.front"
-    assert reloaded.config.use_video_description is True
    assert reloaded.config.add_chat_template is True
    assert reloaded.config.success_threshold == -1.5


@skip_if_package_missing("transformers")
 def test_topreward_is_not_trainable(monkeypatch):
-    """The whole point of TOPReward is that it is zero-shot.
-    ``is_trainable`` must therefore be ``False`` and ``forward(...)`` must
-    raise the base-class ``NotImplementedError``."""
    from lerobot.rewards.topreward.modeling_topreward import TOPRewardModel

    _patch_build(monkeypatch)
@@ -23,11 +23,11 @@ import torch
 from lerobot.configs import FeatureType, PipelineFeatureType, PolicyFeature
 from lerobot.rewards.topreward.processor_topreward import (
    TOPREWARD_FEATURE_PREFIX,
-    TOPRewardEncoderProcessorStep,
    _expand_tasks,
    _video_to_numpy,
 )
 from lerobot.types import TransitionKey
+from tests.utils import skip_if_package_missing

 # ---------------------------------------------------------------------------
 # _video_to_numpy — pure (T, C, H, W) -> (T, H, W, C) uint8 conversion
@@ -35,7 +35,7 @@ from lerobot.types import TransitionKey


 def test_video_to_numpy_chw_float_is_converted_to_thwc_uint8():
-    video = torch.rand(4, 3, 8, 8)  # (T, C, H, W) floats in [0, 1]
+    video = torch.rand(4, 3, 8, 8)
    array = _video_to_numpy(video, max_frames=None)

    assert array.shape == (4, 8, 8, 3)
@@ -52,7 +52,6 @@ def test_video_to_numpy_already_thwc_uint8_passes_through():


 def test_video_to_numpy_max_frames_tail_crops_recent_frames():
-    """``max_frames`` should keep the **last** K frames (most recent)."""
    video = torch.zeros(10, 3, 4, 4)
    for t in range(10):
        video[t] = t / 9.0
@@ -70,8 +69,6 @@ def test_video_to_numpy_rejects_3d_input():


 def test_video_to_numpy_floats_above_one_pass_through_without_rescaling():
-    """If ``array.max() > 1`` the helper assumes the tensor is already in the
-    uint8 range; values pass through unchanged (but are still clipped to 255)."""
    video = torch.full((1, 3, 2, 2), 5.0)
    array = _video_to_numpy(video, max_frames=None)

@@ -127,50 +124,80 @@ def test_expand_tasks_wrong_type_raises():


 # ---------------------------------------------------------------------------
-# Encoder step — input/output shapes + dataclass surface
+# Encoder step — stubbed AutoProcessor + process_vision_info
 # ---------------------------------------------------------------------------


+def _skip_if_topreward_extras_missing(func):
+    func = skip_if_package_missing("qwen-vl-utils", import_name="qwen_vl_utils")(func)
+    func = skip_if_package_missing("transformers")(func)
+    return func
+
+
+class _FakeTokenizer:
+    eos_token = "<|endoftext|>"
+    pad_token = "<|endoftext|>"
+
+    def __call__(self, *args, **kwargs):
+        return {"input_ids": torch.zeros(1, 10, dtype=torch.long)}
+
+
+class _FakeAutoProcessor:
+    def __init__(self) -> None:
+        self.tokenizer = _FakeTokenizer()
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):  # noqa: ARG003
+        return cls()
+
+    def apply_chat_template(self, messages, **kwargs):  # noqa: ARG002
+        return "fake_prompt_text"
+
+    def __call__(self, text=None, images=None, videos=None, **kwargs):  # noqa: ARG002
+        seq_len = 10
+        return {
+            "input_ids": torch.randint(0, 100, (1, seq_len)),
+            "attention_mask": torch.ones(1, seq_len, dtype=torch.long),
+        }
+
+
+def _build_step(monkeypatch, **overrides):
+    import importlib
+    import sys
+    import types
+
+    from lerobot.rewards.topreward import processor_topreward
+    from lerobot.utils import import_utils
+
+    monkeypatch.setattr(processor_topreward, "AutoProcessor", _FakeAutoProcessor)
+
+    # Stub qwen_vl_utils as a real module object (not MagicMock) so
+    # ``require_package`` / ``find_spec`` don't choke on a missing ``__spec__``.
+    fake_qwen_vl = types.ModuleType("qwen_vl_utils")
+    fake_qwen_vl.process_vision_info = lambda messages: (None, None)  # type: ignore[attr-defined]
+    fake_qwen_vl.__spec__ = importlib.machinery.ModuleSpec("qwen_vl_utils", None)
+    monkeypatch.setitem(sys.modules, "qwen_vl_utils", fake_qwen_vl)
+
+    # Clear the require_package cache so the stub is picked up.
+    import_utils._require_package_cache.pop("qwen_vl_utils", None)
+
+    return processor_topreward.TOPRewardEncoderProcessorStep(**overrides)
+
+
 def _make_transition(observation: dict, complementary: dict | None = None) -> dict:
-    """Build a tiny ``EnvTransition`` dict for the encoder step."""
    transition: dict = {TransitionKey.OBSERVATION: observation}
    if complementary is not None:
        transition[TransitionKey.COMPLEMENTARY_DATA] = complementary
    return transition


-def test_encoder_step_writes_namespaced_frames_and_task():
-    """The encoder step's output is the contract the model reads from. It
-    must populate exactly two namespaced keys: ``frames`` and ``task``."""
-    step = TOPRewardEncoderProcessorStep(
-        image_key="observation.images.top",
-        task_key="task",
-        max_frames=None,
-    )
+@_skip_if_topreward_extras_missing
+def test_encoder_step_emits_input_ids_and_prompt_length(monkeypatch):
+    """The processor must emit Qwen-VL tensors including ``input_ids`` and
+    ``prompt_length`` under the ``observation.topreward.*`` namespace."""
+    step = _build_step(monkeypatch)

-    frames_batch = torch.zeros(2, 4, 3, 8, 8)  # (B=2, T=4, C, H, W)
-    out = step(
-        _make_transition(
-            observation={"observation.images.top": frames_batch},
-            complementary={"task": ["pick", "place"]},
-        )
-    )
-
-    obs_out = out[TransitionKey.OBSERVATION]
-    frames_out = obs_out[f"{TOPREWARD_FEATURE_PREFIX}frames"]
-    tasks_out = obs_out[f"{TOPREWARD_FEATURE_PREFIX}task"]
-
-    assert len(frames_out) == 2
-    assert all(arr.shape == (4, 8, 8, 3) and arr.dtype == np.uint8 for arr in frames_out)
-    assert tasks_out == ["pick", "place"]
-
-
-def test_encoder_step_adds_singleton_time_dim_for_4d_input():
-    """A ``(B, C, H, W)`` observation is the single-frame case; the encoder
-    must unsqueeze the time dim so the model still sees a video."""
-    step = TOPRewardEncoderProcessorStep(image_key="observation.images.top", max_frames=None)
-
-    frames_batch = torch.zeros(1, 3, 8, 8)  # (B=1, C, H, W) — no time dim
+    frames_batch = torch.zeros(1, 4, 3, 8, 8)
    out = step(
        _make_transition(
            observation={"observation.images.top": frames_batch},
@@ -178,76 +205,60 @@ def test_encoder_step_adds_singleton_time_dim_for_4d_input():
        )
    )

-    frames_out = out[TransitionKey.OBSERVATION][f"{TOPREWARD_FEATURE_PREFIX}frames"]
-    assert len(frames_out) == 1
-    assert frames_out[0].shape == (1, 8, 8, 3)  # (T=1, H, W, C)
+    obs_out = out[TransitionKey.OBSERVATION]
+    assert f"{TOPREWARD_FEATURE_PREFIX}input_ids" in obs_out
+    assert f"{TOPREWARD_FEATURE_PREFIX}attention_mask" in obs_out
+    assert f"{TOPREWARD_FEATURE_PREFIX}prompt_length" in obs_out
+
+    prompt_length = obs_out[f"{TOPREWARD_FEATURE_PREFIX}prompt_length"]
+    assert prompt_length.dtype == torch.long
+    assert prompt_length.shape == (1,)


-def test_encoder_step_uses_default_task_when_complementary_is_missing():
-    step = TOPRewardEncoderProcessorStep(
-        image_key="observation.images.top",
-        default_task="perform the task",
-    )
-
-    frames_batch = torch.zeros(1, 2, 3, 4, 4)
-    out = step(_make_transition(observation={"observation.images.top": frames_batch}))
-
-    tasks_out = out[TransitionKey.OBSERVATION][f"{TOPREWARD_FEATURE_PREFIX}task"]
-    assert tasks_out == ["perform the task"]
-
-
-def test_encoder_step_rejects_missing_image_key():
-    step = TOPRewardEncoderProcessorStep(image_key="observation.images.top")
-    with pytest.raises(KeyError, match="image key"):
-        step(_make_transition(observation={}, complementary={"task": "pick"}))
-
-
-def test_encoder_step_rejects_non_dict_observation():
-    step = TOPRewardEncoderProcessorStep()
-    with pytest.raises(ValueError, match="observation dict"):
-        step({TransitionKey.OBSERVATION: torch.zeros(1, 3, 8, 8)})
-
-
-def test_encoder_step_rejects_3d_or_6d_input():
-    """The encoder accepts ``(B,C,H,W)`` or ``(B,T,C,H,W)`` only."""
-    step = TOPRewardEncoderProcessorStep(image_key="observation.images.top")
-    with pytest.raises(ValueError, match=r"\(B,C,H,W\)"):
-        step(
-            _make_transition(
-                observation={"observation.images.top": torch.zeros(8, 8, 3)},
-                complementary={"task": "pick"},
-            )
-        )
-
-
-def test_encoder_step_get_config_roundtrips_user_fields():
-    """``get_config`` must serialise every user-tunable field — these are
-    what the processor pipeline saves under ``preprocessor_config.json``."""
-    step = TOPRewardEncoderProcessorStep(
+@_skip_if_topreward_extras_missing
+def test_encoder_step_get_config_roundtrips_user_fields(monkeypatch):
+    step = _build_step(
+        monkeypatch,
+        vlm_name="Qwen/Qwen3-VL-8B-Instruct",
        image_key="observation.images.cam_top",
        task_key="task",
        default_task="do the thing",
        max_frames=8,
+        fps=4.0,
+        add_chat_template=True,
+        max_length=2048,
    )

-    assert step.get_config() == {
-        "image_key": "observation.images.cam_top",
-        "task_key": "task",
-        "default_task": "do the thing",
-        "max_frames": 8,
-    }
+    cfg = step.get_config()
+    assert cfg["vlm_name"] == "Qwen/Qwen3-VL-8B-Instruct"
+    assert cfg["image_key"] == "observation.images.cam_top"
+    assert cfg["default_task"] == "do the thing"
+    assert cfg["max_frames"] == 8
+    assert cfg["fps"] == 4.0
+    assert cfg["add_chat_template"] is True
+    assert cfg["max_length"] == 2048


-def test_encoder_step_transform_features_is_identity():
-    """The encoder writes plain Python objects (numpy arrays / strings)
-    into ``observation`` at call time but does NOT advertise new typed
-    features at pipeline-build time — the model reads them via the
-    ``TOPREWARD_FEATURE_PREFIX`` namespace, not via the typed feature map.
-    """
-    step = TOPRewardEncoderProcessorStep()
+@_skip_if_topreward_extras_missing
+def test_encoder_step_transform_features_is_identity(monkeypatch):
+    step = _build_step(monkeypatch)
    features = {
        PipelineFeatureType.OBSERVATION: {
            "observation.images.top": PolicyFeature(shape=(3, 224, 224), type=FeatureType.VISUAL),
        }
    }
    assert step.transform_features(features) == features
+
+
+@_skip_if_topreward_extras_missing
+def test_encoder_step_rejects_missing_image_key(monkeypatch):
+    step = _build_step(monkeypatch, image_key="observation.images.top")
+    with pytest.raises(KeyError, match="image key"):
+        step(_make_transition(observation={}, complementary={"task": "pick"}))
+
+
+@_skip_if_topreward_extras_missing
+def test_encoder_step_rejects_non_dict_observation(monkeypatch):
+    step = _build_step(monkeypatch)
+    with pytest.raises(ValueError, match="observation dict"):
+        step({TransitionKey.OBSERVATION: torch.zeros(1, 3, 8, 8)})