From e86f5af5bf30d7cd442d07b862b3fbb82f5c79b2 Mon Sep 17 00:00:00 2001 From: Khalil Meftah Date: Wed, 27 May 2026 14:24:31 +0200 Subject: [PATCH] feat(rewards): add TOPReward reward model (#3629) * feat(rewards): add TOPReward reward model * refactor(rewards): clean up TOPReward processor/model * fix(rewards/topreward): add missing input keys mm_token_type_ids * fix(rewards/topreward): fix pyproject extra typo and simplify processor (#3653) Add lerobot[topreward] extra to all in pyproject.toml, drop the redundant labels arg in scoring, and collapse the dead-branch shape check in the encoder processor. * optimize topreward input processing (#3660) --------- Co-authored-by: Cole <91766445+jcoleharrison@users.noreply.github.com> Co-authored-by: Haoming Song --- docs/source/_toctree.yml | 2 + docs/source/topreward.mdx | 177 +++++++++ pyproject.toml | 2 + src/lerobot/rewards/__init__.py | 2 + src/lerobot/rewards/factory.py | 19 +- src/lerobot/rewards/topreward/__init__.py | 19 + .../rewards/topreward/compute_rabc_weights.py | 353 ++++++++++++++++++ .../topreward/configuration_topreward.py | 146 ++++++++ .../rewards/topreward/modeling_topreward.py | 238 ++++++++++++ .../rewards/topreward/processor_topreward.py | 305 +++++++++++++++ .../lerobot_rewardmodel_modelcard_template.md | 2 + tests/rewards/test_modeling_topreward.py | 296 +++++++++++++++ tests/rewards/test_topreward.py | 80 ++++ tests/rewards/test_topreward_processor.py | 246 ++++++++++++ uv.lock | 7 +- 15 files changed, 1891 insertions(+), 3 deletions(-) create mode 100644 docs/source/topreward.mdx create mode 100644 src/lerobot/rewards/topreward/__init__.py create mode 100644 src/lerobot/rewards/topreward/compute_rabc_weights.py create mode 100644 src/lerobot/rewards/topreward/configuration_topreward.py create mode 100644 src/lerobot/rewards/topreward/modeling_topreward.py create mode 100644 src/lerobot/rewards/topreward/processor_topreward.py create mode 100644 tests/rewards/test_modeling_topreward.py create 
mode 100644 tests/rewards/test_topreward.py create mode 100644 tests/rewards/test_topreward_processor.py diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 412386e2d..527cb7e63 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -73,6 +73,8 @@ - sections: - local: sarm title: SARM + - local: topreward + title: TOPReward title: "Reward Models" - sections: - local: inference diff --git a/docs/source/topreward.mdx b/docs/source/topreward.mdx new file mode 100644 index 000000000..f84fbed49 --- /dev/null +++ b/docs/source/topreward.mdx @@ -0,0 +1,177 @@ +# TOPReward + +TOPReward is a **zero-shot reward model** that extracts token log-probabilities from an off-the-shelf vision-language model (VLM) as a robotic reward signal. Given a video trajectory and a task instruction, it returns the VLM's log-likelihood that the instruction is true — no fine-tuning required. + +**Paper**: [TOPReward: Token Probabilities as Hidden Zero-Shot Rewards for Robotics](https://arxiv.org/abs/2602.19313) +**Project**: [topreward.github.io](https://topreward.github.io/webpage/) +**Original code**: [github.com/TOPReward/TOPReward](https://github.com/TOPReward/TOPReward) +**Default backbone**: [Qwen/Qwen3-VL-8B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-8B-Instruct) + +## Overview + +TOPReward asks a generic VLM how likely a task instruction is, **conditioned on the video** of a robot trying to complete that task. Concretely, given: + +- A trajectory video (a sequence of frames). +- A task instruction (e.g. _"open the drawer"_). + +it builds a chat prompt of the form + +```text +