From 70ad322676225d5642bcb744095eb66f332e407a Mon Sep 17 00:00:00 2001 From: Khalil Meftah Date: Tue, 19 May 2026 18:00:18 +0200 Subject: [PATCH] feat(rewards): add TOPReward reward model --- docs/source/_toctree.yml | 2 + docs/source/topreward.mdx | 191 ++++++ pyproject.toml | 1 + src/lerobot/rewards/__init__.py | 2 + src/lerobot/rewards/factory.py | 19 +- src/lerobot/rewards/topreward/__init__.py | 19 + .../rewards/topreward/compute_rabc_weights.py | 395 ++++++++++++ .../topreward/configuration_topreward.py | 157 +++++ .../rewards/topreward/modeling_topreward.py | 563 ++++++++++++++++++ .../rewards/topreward/processor_topreward.py | 200 +++++++ .../lerobot_rewardmodel_modelcard_template.md | 2 + tests/rewards/test_modeling_topreward.py | 421 +++++++++++++ tests/rewards/test_topreward_processor.py | 253 ++++++++ uv.lock | 8 +- 14 files changed, 2230 insertions(+), 3 deletions(-) create mode 100644 docs/source/topreward.mdx create mode 100644 src/lerobot/rewards/topreward/__init__.py create mode 100644 src/lerobot/rewards/topreward/compute_rabc_weights.py create mode 100644 src/lerobot/rewards/topreward/configuration_topreward.py create mode 100644 src/lerobot/rewards/topreward/modeling_topreward.py create mode 100644 src/lerobot/rewards/topreward/processor_topreward.py create mode 100644 tests/rewards/test_modeling_topreward.py create mode 100644 tests/rewards/test_topreward_processor.py diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 412386e2d..527cb7e63 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -73,6 +73,8 @@ - sections: - local: sarm title: SARM + - local: topreward + title: TOPReward title: "Reward Models" - sections: - local: inference diff --git a/docs/source/topreward.mdx b/docs/source/topreward.mdx new file mode 100644 index 000000000..dc653f096 --- /dev/null +++ b/docs/source/topreward.mdx @@ -0,0 +1,191 @@ +# TOPReward + +TOPReward is a **zero-shot reward model** that extracts token log-probabilities from an off-the-shelf vision-language model (VLM) as a robotic reward signal. Given a video trajectory and a task instruction, it returns the VLM's log-likelihood that the instruction is true — no fine-tuning required. + +**Paper**: [TOPReward: Token Probabilities as Hidden Zero-Shot Rewards for Robotics](https://arxiv.org/abs/2602.19313) +**Project**: [topreward.github.io](https://topreward.github.io/webpage/) +**Original code**: [github.com/TOPReward/TOPReward](https://github.com/TOPReward/TOPReward) +**Default backbone**: [Qwen/Qwen3-VL-8B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-8B-Instruct) + +## Overview + +TOPReward asks a generic VLM how likely a task instruction is, **conditioned on the video** of a robot trying to complete that task. Concretely, given: + +- A trajectory video (a sequence of frames). +- A task instruction (e.g. _"open the drawer"_). + +it builds a chat prompt of the form + +```text +