From 920c6ef5a2e3eb370b917b42d7c26305692cb39a Mon Sep 17 00:00:00 2001
From: pepijn <pepijn@huggingface.co>
Date: Tue, 26 May 2026 04:42:10 +0000
Subject: [PATCH 01/45] docs(annotate): disable phase-0 vocabulary discovery by
 default in run_hf_job

Heterogeneous datasets (different tasks/scenes across episodes) don't
share a single small subtask + memory vocabulary, so the canonical
vocabulary phase narrowed every episode to the wrong target distribution.
Flip the example to free-form generation by default and document the
``--vocabulary.enabled=true`` switch for homogeneous datasets where the
canonical vocabulary still helps the downstream policy.

No pipeline-code changes: ``VocabularyConfig.enabled`` already gates
phase 0 (see ``executor.py:_run_vocabulary_phase`` and
``VocabularyConfig`` docstring) and falls back to free-form generation.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 examples/annotations/run_hf_job.py | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/examples/annotations/run_hf_job.py b/examples/annotations/run_hf_job.py
index f3e497039..c8219d9e4 100644
--- a/examples/annotations/run_hf_job.py
+++ b/examples/annotations/run_hf_job.py
@@ -5,13 +5,16 @@ Spawns one ``h200x2`` job that:
 
   1. installs this branch of ``lerobot`` plus the annotation extras,
   2. boots two vllm servers (one per GPU) with Qwen3.6-35B-A3B-FP8,
-  3. discovers the dataset's canonical subtask + memory vocabulary
-     from the first 3 sample episodes (phase 0),
-  4. runs the plan / interjections / vqa modules across the dataset
-     (subtasks + memory are constrained to the canonical vocabulary),
-  5. uploads the annotated dataset to ``--dest_repo_id`` (when set)
+  3. runs the plan / interjections / vqa modules across the dataset
+     in free-form mode (phase 0 canonical-vocabulary discovery is
+     disabled — each episode generates its own subtasks + memory),
+  4. uploads the annotated dataset to ``--dest_repo_id`` (when set)
      or back to ``--repo_id``.
 
+Re-enable phase 0 with ``--vocabulary.enabled=true`` (optionally
+``--vocabulary.sample_episodes=N``) when the dataset is homogeneous
+enough to share one subtask + memory vocabulary across all episodes.
+
 Usage:
 
     HF_TOKEN=hf_... uv run python examples/annotations/run_hf_job.py
@@ -54,12 +57,14 @@ CMD = (
     "--executor.episode_parallelism=16 "
     "--vlm.chat_template_kwargs='{\"enable_thinking\": false}' "
     "--vlm.camera_key=observation.images.wrist "
-    # Phase 0 — canonical vocabulary discovery from the first N sample
-    # episodes. The VLM picks the right number of subtask + memory
-    # entries itself from what it sees; the resulting
-    # meta/canonical_vocabulary.json constrains every subtask + memory
-    # string to a small repeatable target distribution.
-    "--vocabulary.sample_episodes=3 "
+    # Phase 0 — canonical vocabulary discovery DISABLED by default.
+    # Heterogeneous datasets (different tasks/scenes across episodes)
+    # don't share a single small subtask + memory vocabulary, so each
+    # episode generates its subtasks + memory free-form. Flip to
+    # ``--vocabulary.enabled=true`` (optionally ``--vocabulary.sample_episodes=N``)
+    # for homogeneous datasets where a shared canonical vocabulary
+    # helps the downstream policy.
+    "--vocabulary.enabled=false "
     # Phase 1 — plan module (subtasks + plan + memory + task_aug).
     "--plan.frames_per_second=1.0 "
     "--plan.use_video_url=true "

From 1e7c0d6aa18c44503ae9e0e3af2bc6a16b897ab8 Mon Sep 17 00:00:00 2001
From: pepijn <pepijn@huggingface.co>
Date: Tue, 26 May 2026 05:14:30 +0000
Subject: [PATCH 02/45] annotate(plan): force composite-action subtasks; ban
 ultra-fine splits
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Tighten ``module_1_subtasks.txt`` so the VLM emits one composite
atomic action per subtask instead of decomposing every pick into
``move to X`` / ``grasp X`` / ``lift X``:

- Lock the verb vocabulary to the composite set the low-level
  policy actually learns end-to-end: ``pick up`` (approach + grasp +
  lift), ``put``/``place`` (transport + release), ``push``, ``pull``,
  ``turn``, ``press``, ``open``, ``close``, ``pour``, ``insert``.
  ``go to`` is allowed only as a pure relocation between phases.
- Add an explicit ``Forbidden ultra-fine splits`` block enumerating
  the patterns the VLM was tempted to emit (``move to X``,
  ``reach for X``, ``grasp X``, ``lift X``, ``release X``) and
  instructing it to fold each into its parent composite.
- Rewrite the Good/Bad examples to match the composite contract;
  the previous ``"move to blue cube" / "grasp blue cube" / "lift
  blue cube"`` Good list was actively encouraging the over-
  segmentation pattern this prompt is supposed to prevent.
- Tighten the duration rule: candidates shorter than
  ``min_subtask_seconds`` must be merged into a neighbour rather
  than emitted. Pairs with bumping the runtime floor to 3 s so
  composites have room to land.

Pure prompt change — no code or schema change. Existing canonical-
vocabulary retry path is unaffected (the new verb whitelist lives
in prose, not in the validator).

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../prompts/module_1_subtasks.txt             | 58 +++++++++++++++----
 1 file changed, 46 insertions(+), 12 deletions(-)

diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt
index 12bbcfba2..9314282be 100644
--- a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt
+++ b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt
@@ -8,14 +8,42 @@ the robot performs.
 
 {vocabulary_block}Authoring rules — Hi Robot atom granularity, pi0.7-style short prompts:
 
-- Each subtask = one atomic skill the low-level policy can execute.
-- Write each subtask as an IMPERATIVE COMMAND, starting with a verb:
-  move, reach, pick up, grasp, place, put, push, pull, open, close,
-  turn, press, lift, insert, pour...
+- Each subtask = one COMPOSITE atomic skill the low-level policy can
+  execute end-to-end. A "skill" bundles its own approach motion with
+  its terminal action — do NOT split the approach off as its own
+  subtask. The whole-arm policy already learns to reach as part of
+  every manipulation primitive.
+- Write each subtask as an IMPERATIVE COMMAND, starting with one of
+  these verbs (extend only when none fits):
+    pick up <obj>           — approach + grasp + lift in one subtask
+    put <obj> on/in <loc>   — transport + release in one subtask
+    place <obj> on/in <loc> — synonym of "put"; pick one and stay consistent
+    push <obj>              — contact + linear shove
+    pull <obj>              — contact + linear retract
+    turn <knob/dial/handle> — rotary actuation
+    press <button>          — single-press contact
+    open <drawer/door/lid>  — full open motion
+    close <drawer/door/lid> — full close motion
+    pour <src> into <dst>   — tilt + flow
+    insert <obj> into <slot>— alignment + push-fit
+    go to <loc>             — ONLY when no grasp / actuation follows
+                             (e.g. a pure relocation between phases).
+                             If the next subtask grasps something at
+                             that location, drop "go to ..." and just
+                             write "pick up ..." instead.
+- Forbidden ultra-fine splits — you are NOT allowed to emit these
+  as standalone subtasks; fold them into the parent composite:
+    "move to X"   → fold into "pick up X" (or whatever follows)
+    "reach for X" → fold into "pick up X"
+    "grasp X"     → fold into "pick up X"
+    "lift X"      → fold into "pick up X" (or "put X on Y" if it's
+                    the transport phase of a place)
+    "release X"   → fold into "put X on Y" (or "place X in Y")
 - Keep it SHORT — a verb phrase, not a sentence. Drop articles
   ("the", "a") and adverbs ("carefully", "slowly"). Add a "how"
   detail (which hand, which grasp point) ONLY when it is needed to
-  disambiguate.
+  disambiguate. Every subtask must begin with one of the verbs
+  above (no leading nouns, no "then", no "first").
 - NEVER use third person. Never write "the robot", "the arm", "the
   gripper moves", "it picks up" — the robot is implied. Command it,
   do not describe it.
@@ -23,16 +51,22 @@ the robot performs.
   "cube", every subtask says "cube" — never switch to "block". If it
   says "box", never switch to "bin"/"container". Keep vocabulary
   consistent across the whole episode.
-- Good: "move to blue cube", "grasp blue cube", "lift blue cube",
-  "place blue cube in box", "open drawer", "release yellow cube".
-- Bad: "the robot arm moves towards the blue cube" (third person,
-  too long), "carefully pick up the cube" (adverb, article),
-  "release the yellow block" ("block" when the task said "cube").
+- Good: "pick up blue cube", "put blue cube in box", "open drawer",
+  "turn red knob", "press start button", "go to sink".
+- Bad: "move to blue cube" (approach as its own subtask — forbidden,
+  must be folded into "pick up blue cube"); "the robot arm moves
+  towards the blue cube" (third person, too long); "carefully pick
+  up the cube" (adverb, article); "release the yellow block"
+  ("block" when the task said "cube", and "release" must be folded
+  into a "put"/"place" subtask).
 - Subtasks are non-overlapping and cover the full episode in order.
   Choose the cut points yourself based on what you see in the video
   (gripper open/close events, contact, regrasps, transitions).
-- Each subtask spans at least {min_subtask_seconds} seconds.
-- Do not exceed {max_steps} subtasks total.
+- Each subtask spans at least {min_subtask_seconds} seconds. If a
+  candidate span would be shorter, merge it into its neighbour
+  rather than emitting it.
+- Do not exceed {max_steps} subtasks total. Fewer, larger composites
+  are preferred over many micro-steps.
 - Every subtask's [start_time, end_time] must lie within
   [0.0, {episode_duration}] seconds.
 

From f65f3f7a4a8bc2eb405d692ed297b9f9a3828e20 Mon Sep 17 00:00:00 2001
From: Reece O'Mahoney <66252930+reeceomahoney@users.noreply.github.com>
Date: Tue, 26 May 2026 13:01:19 +0100
Subject: [PATCH 03/45] Fix policy.path in YAML configs (PR #3145 followup)
 (#3597)

PR #3145 added YAML support for policy.path but left two bugs:

1. extract_path_fields_from_config only deleted config_data[field] when
   no sibling overrides existed. With siblings, the dict stayed in place
   and draccus crashed decoding it as PreTrainedConfig (no 'type' key).
   Sibling overrides go into _config_yaml_overrides and are applied later
   by from_pretrained(), so the field can always be removed.

2. wrap() updated config_path_cli to the cleaned temp file path but
   never propagated it to the draccus.parse fallback branch. cli_args
   still contained --config_path=<original>, so draccus read the
   original YAML with path: still present.

Tests passed because they (a) called extract_path_fields_from_config
directly and (b) included type: alongside path: in the YAML, sidestepping
both bugs.

Co-authored-by: Steven Palma <imstevenpmwork@ieee.org>
---
 src/lerobot/configs/parser.py  |  11 +++-
 tests/test_yaml_policy_path.py | 116 +++++++++++++++++++++++++++++++--
 2 files changed, 117 insertions(+), 10 deletions(-)

diff --git a/src/lerobot/configs/parser.py b/src/lerobot/configs/parser.py
index d55fa44aa..46cff2b48 100644
--- a/src/lerobot/configs/parser.py
+++ b/src/lerobot/configs/parser.py
@@ -255,8 +255,7 @@ def extract_path_fields_from_config(config_path: str, path_fields: list[str]) ->
             remaining = config_data[field]
             if remaining:
                 _config_yaml_overrides[field] = _flatten_to_cli_args(remaining)
-            else:
-                del config_data[field]
+            del config_data[field]
             modified = True
 
     if not modified:
@@ -311,7 +310,13 @@ def wrap(config_path: Path | None = None) -> Callable[[F], F]:
                     cli_args = filter_arg("config_path", cli_args)
                     cfg = argtype.from_pretrained(config_path_cli, cli_args=cli_args)
                 else:
-                    cfg = draccus.parse(config_class=argtype, config_path=config_path, args=cli_args)
+                    if config_path_cli:
+                        cli_args = filter_arg("config_path", cli_args)
+                    cfg = draccus.parse(
+                        config_class=argtype,
+                        config_path=config_path_cli or config_path,
+                        args=cli_args,
+                    )
             response = fn(cfg, *args, **kwargs)
             return response
 
diff --git a/tests/test_yaml_policy_path.py b/tests/test_yaml_policy_path.py
index 710a71c9a..8d8f7f2ec 100644
--- a/tests/test_yaml_policy_path.py
+++ b/tests/test_yaml_policy_path.py
@@ -1,10 +1,14 @@
 """Tests for policy.path support in YAML config files (issue #2957)."""
 
 import json
+import sys
 import tempfile
+from dataclasses import dataclass, field
+from unittest.mock import patch
 
 import yaml
 
+from lerobot.configs import parser
 from lerobot.configs.parser import (
     _config_path_args,
     _config_yaml_overrides,
@@ -16,7 +20,8 @@ from lerobot.configs.parser import (
 
 
 def test_extract_path_fields_from_yaml():
-    """Test that policy.path is extracted from a YAML config and removed."""
+    """Test that policy.path is extracted from a YAML config and the policy block
+    is removed entirely (siblings are captured separately as cli_overrides)."""
     config = {
         "dataset": {"repo_id": "lerobot/pusht"},
         "policy": {"type": "smolvla", "path": "lerobot/smolvla_base", "push_to_hub": False},
@@ -26,26 +31,33 @@ def test_extract_path_fields_from_yaml():
         config_path = f.name
 
     _config_path_args.clear()
+    _config_yaml_overrides.clear()
     cleaned_path = extract_path_fields_from_config(config_path, ["policy"])
 
     # Path should be extracted and stored
     assert _config_path_args["policy"] == "lerobot/smolvla_base"
 
-    # Cleaned config should not have the path field
+    # Cleaned config should not have the policy block at all -- draccus must not
+    # try to decode it as PreTrainedConfig; the actual config comes from
+    # from_pretrained(path) with the captured overrides applied on top.
     with open(cleaned_path) as f:
         cleaned = yaml.safe_load(f)
-    assert "path" not in cleaned["policy"]
-    assert cleaned["policy"]["type"] == "smolvla"
-    assert cleaned["policy"]["push_to_hub"] is False
+    assert "policy" not in cleaned
 
     # Original dataset should be untouched
     assert cleaned["dataset"]["repo_id"] == "lerobot/pusht"
 
+    # Sibling overrides (excluding type/path) captured for from_pretrained.
+    overrides = get_yaml_overrides("policy")
+    assert any("push_to_hub=false" in o for o in overrides)
+
     _config_path_args.clear()
+    _config_yaml_overrides.clear()
 
 
 def test_extract_path_fields_from_json():
-    """Test that policy.path is extracted from a JSON config."""
+    """Test that policy.path is extracted from a JSON config and the policy
+    block is removed entirely."""
     config = {
         "policy": {"type": "act", "path": "some/local/path"},
     }
@@ -54,15 +66,17 @@ def test_extract_path_fields_from_json():
         config_path = f.name
 
     _config_path_args.clear()
+    _config_yaml_overrides.clear()
     cleaned_path = extract_path_fields_from_config(config_path, ["policy"])
 
     assert _config_path_args["policy"] == "some/local/path"
 
     with open(cleaned_path) as f:
         cleaned = json.load(f)
-    assert "path" not in cleaned["policy"]
+    assert "policy" not in cleaned
 
     _config_path_args.clear()
+    _config_yaml_overrides.clear()
 
 
 def test_extract_no_path_returns_original():
@@ -216,3 +230,91 @@ def test_flatten_nested_with_bools():
     args = _flatten_to_cli_args(d)
     assert "--optimizer.use_warmup=true" in args
     assert "--optimizer.lr=0.01" in args
+
+
+def test_extract_removes_field_with_siblings_and_no_type():
+    """Regression: when policy.path has siblings but no type:, the entire policy
+    block must still be removed from the cleaned config. Otherwise draccus tries
+    to decode the leftover dict as PreTrainedConfig and crashes on the missing
+    type discriminator.
+    """
+    config = {
+        "dataset": {"repo_id": "lerobot/pusht"},
+        "policy": {
+            "path": "lerobot/smolvla_base",
+            "n_action_steps": 10,
+            "dtype": "bfloat16",
+        },
+    }
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
+        yaml.dump(config, f)
+        config_path = f.name
+
+    _config_path_args.clear()
+    _config_yaml_overrides.clear()
+    cleaned_path = extract_path_fields_from_config(config_path, ["policy"])
+
+    with open(cleaned_path) as f:
+        cleaned = yaml.safe_load(f) or {}
+    assert "policy" not in cleaned, "policy block should be fully removed when path is present"
+    assert cleaned["dataset"]["repo_id"] == "lerobot/pusht"
+    assert _config_path_args["policy"] == "lerobot/smolvla_base"
+    overrides = get_yaml_overrides("policy")
+    assert any("n_action_steps=10" in o for o in overrides)
+    assert any("dtype=bfloat16" in o for o in overrides)
+
+    _config_path_args.clear()
+    _config_yaml_overrides.clear()
+
+
+@dataclass
+class _DummyNested:
+    foo: int = 0
+
+
+@dataclass
+class _DummyConfig:
+    nested: _DummyNested = field(default_factory=_DummyNested)
+    other: str = "default"
+
+    @classmethod
+    def __get_path_fields__(cls):
+        return ["nested"]
+
+
+def test_wrap_uses_cleaned_config_for_draccus_parse():
+    """Regression: wrap() updates config_path_cli to point at the cleaned temp
+    file but must propagate that to the draccus.parse fallback branch. Without
+    the fix, cli_args still contains --config_path=<original> and draccus reads
+    the original YAML with `path:` still in it, crashing on the unknown field.
+    """
+    config = {
+        "nested": {"path": "some/checkpoint", "foo": 42},
+        "other": "set-via-yaml",
+    }
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
+        yaml.dump(config, f)
+        config_path = f.name
+
+    _config_path_args.clear()
+    _config_yaml_overrides.clear()
+
+    captured: dict = {}
+
+    @parser.wrap()
+    def main(cfg: _DummyConfig) -> _DummyConfig:
+        captured["cfg"] = cfg
+        return cfg
+
+    with patch.object(sys, "argv", ["prog", f"--config_path={config_path}"]):
+        main()
+
+    assert captured["cfg"].other == "set-via-yaml"
+    assert _config_path_args["nested"] == "some/checkpoint"
+    # Cleaned config dropped `nested:` entirely; defaults stand for this wrapper
+    # class (a real PreTrainedConfig would now load the checkpoint and apply
+    # the captured yaml_overrides via from_pretrained()).
+    assert captured["cfg"].nested.foo == 0
+
+    _config_path_args.clear()
+    _config_yaml_overrides.clear()

From 5c98e80430d4a747926b45893568e388105a2400 Mon Sep 17 00:00:00 2001
From: Haoming Song <haomingsong24@gmail.com>
Date: Tue, 26 May 2026 20:04:22 +0800
Subject: [PATCH 04/45] fix(gr00t): fix Eagle25VL model and processor crash in
 transformers>=5.4.0, <5.6.0 (#3652)

Co-authored-by: Steven Palma <imstevenpmwork@ieee.org>
---
 .../policies/groot/eagle2_hg_model/modeling_eagle2_5_vl.py  | 1 +
 .../groot/eagle2_hg_model/processing_eagle2_5_vl.py         | 1 -
 src/lerobot/policies/groot/processor_groot.py               | 6 +++++-
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/lerobot/policies/groot/eagle2_hg_model/modeling_eagle2_5_vl.py b/src/lerobot/policies/groot/eagle2_hg_model/modeling_eagle2_5_vl.py
index 5a66cfbce..6e5532ea4 100755
--- a/src/lerobot/policies/groot/eagle2_hg_model/modeling_eagle2_5_vl.py
+++ b/src/lerobot/policies/groot/eagle2_hg_model/modeling_eagle2_5_vl.py
@@ -60,6 +60,7 @@ class Eagle25VLPreTrainedModel(PreTrainedModel):
         "SiglipEncoderLayer",
     ]
     _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn = True
     _supports_flash_attn_2 = True
     _supports_cache_class = True
     _supports_static_cache = True
diff --git a/src/lerobot/policies/groot/eagle2_hg_model/processing_eagle2_5_vl.py b/src/lerobot/policies/groot/eagle2_hg_model/processing_eagle2_5_vl.py
index 7b1f67fef..b36e70c47 100755
--- a/src/lerobot/policies/groot/eagle2_hg_model/processing_eagle2_5_vl.py
+++ b/src/lerobot/policies/groot/eagle2_hg_model/processing_eagle2_5_vl.py
@@ -124,7 +124,6 @@ class Eagle25VLProcessor(ProcessorMixin):
         "videos_kwargs",
         "text_kwargs",
     ]
-    image_processor_class = "AutoImageProcessor"
     tokenizer_class = "AutoTokenizer"
 
     def __init__(
diff --git a/src/lerobot/policies/groot/processor_groot.py b/src/lerobot/policies/groot/processor_groot.py
index 3367de711..6848c7c84 100644
--- a/src/lerobot/policies/groot/processor_groot.py
+++ b/src/lerobot/policies/groot/processor_groot.py
@@ -206,7 +206,11 @@ def _build_eagle_processor(tokenizer_assets_repo: str = DEFAULT_TOKENIZER_ASSETS
             "Vendor files are copied during model creation. Create the policy/model first, "
             "or call ensure_eagle_cache_ready() before building processors."
         )
-    proc = AutoProcessor.from_pretrained(str(cache_dir), trust_remote_code=True, use_fast=True)
+    proc = AutoProcessor.from_pretrained(
+        str(cache_dir),
+        trust_remote_code=True,
+        fix_mistral_regex=False,
+    )
     proc.tokenizer.padding_side = "left"
     return proc
 

From e86f5af5bf30d7cd442d07b862b3fbb82f5c79b2 Mon Sep 17 00:00:00 2001
From: Khalil Meftah <khalil.meftah@huggingface.co>
Date: Wed, 27 May 2026 14:24:31 +0200
Subject: [PATCH 05/45] feat(rewards): add TOPReward reward model (#3629)

* feat(rewards): add TOPReward reward model

* refactor(rewards): clean up TOPReward processor/model

* fix(rewards/topreward): add missing input keys mm_token_type_ids

* fix(rewards/topreward): fix pyproject extra typo and simplify processor (#3653)

Add the lerobot[topreward] extra to the `all` extra in
pyproject.toml, drop the redundant labels arg in scoring, and
collapse the dead-branch shape check in the encoder processor.

* optimize topreward input processing (#3660)

---------

Co-authored-by: Cole <91766445+jcoleharrison@users.noreply.github.com>
Co-authored-by: Haoming Song <haomingsong24@gmail.com>
---
 docs/source/_toctree.yml                      |   2 +
 docs/source/topreward.mdx                     | 177 +++++++++
 pyproject.toml                                |   2 +
 src/lerobot/rewards/__init__.py               |   2 +
 src/lerobot/rewards/factory.py                |  19 +-
 src/lerobot/rewards/topreward/__init__.py     |  19 +
 .../rewards/topreward/compute_rabc_weights.py | 353 ++++++++++++++++++
 .../topreward/configuration_topreward.py      | 146 ++++++++
 .../rewards/topreward/modeling_topreward.py   | 238 ++++++++++++
 .../rewards/topreward/processor_topreward.py  | 305 +++++++++++++++
 .../lerobot_rewardmodel_modelcard_template.md |   2 +
 tests/rewards/test_modeling_topreward.py      | 296 +++++++++++++++
 tests/rewards/test_topreward.py               |  80 ++++
 tests/rewards/test_topreward_processor.py     | 246 ++++++++++++
 uv.lock                                       |   7 +-
 15 files changed, 1891 insertions(+), 3 deletions(-)
 create mode 100644 docs/source/topreward.mdx
 create mode 100644 src/lerobot/rewards/topreward/__init__.py
 create mode 100644 src/lerobot/rewards/topreward/compute_rabc_weights.py
 create mode 100644 src/lerobot/rewards/topreward/configuration_topreward.py
 create mode 100644 src/lerobot/rewards/topreward/modeling_topreward.py
 create mode 100644 src/lerobot/rewards/topreward/processor_topreward.py
 create mode 100644 tests/rewards/test_modeling_topreward.py
 create mode 100644 tests/rewards/test_topreward.py
 create mode 100644 tests/rewards/test_topreward_processor.py

diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index 412386e2d..527cb7e63 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -73,6 +73,8 @@
 - sections:
   - local: sarm
     title: SARM
+  - local: topreward
+    title: TOPReward
   title: "Reward Models"
 - sections:
   - local: inference
diff --git a/docs/source/topreward.mdx b/docs/source/topreward.mdx
new file mode 100644
index 000000000..f84fbed49
--- /dev/null
+++ b/docs/source/topreward.mdx
@@ -0,0 +1,177 @@
+# TOPReward
+
+TOPReward is a **zero-shot reward model** that extracts token log-probabilities from an off-the-shelf vision-language model (VLM) as a robotic reward signal. Given a video trajectory and a task instruction, it returns the VLM's log-likelihood that the instruction is true — no fine-tuning required.
+
+**Paper**: [TOPReward: Token Probabilities as Hidden Zero-Shot Rewards for Robotics](https://arxiv.org/abs/2602.19313)
+**Project**: [topreward.github.io](https://topreward.github.io/webpage/)
+**Original code**: [github.com/TOPReward/TOPReward](https://github.com/TOPReward/TOPReward)
+**Default backbone**: [Qwen/Qwen3-VL-8B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-8B-Instruct)
+
+## Overview
+
+TOPReward asks a generic VLM how likely a task instruction is, **conditioned on the video** of a robot trying to complete that task. Concretely, given:
+
+- A trajectory video (a sequence of frames).
+- A task instruction (e.g. _"open the drawer"_).
+
+it builds a chat prompt of the form
+
+```text
+<video>
+"The above video shows a robot manipulation trajectory that completes the
+ following task: <instruction> Decide whether the above statement is True
+ or not. The answer is: True"
+```
+
+forwards it through the VLM, label-masks everything except the very last token, and reads back the log-probability of that token — by default the literal `"True"` that closes the suffix template. The resulting `log P("True" | video + prompt + instruction)` is the reward.
+
+Because the method only depends on a frozen VLM, TOPReward is **zero-shot**: there are no fine-tuned weights to host. The "model" in LeRobot is a small wrapper around `transformers`' `Qwen3VLForConditionalGeneration` plus the label-masking logic. The processor owns the tokeniser and builds the full chat prompt (EO-1/Robometer pattern).
+
+## What the LeRobot integration covers
+
+- Standard `reward_model.type=topreward` configuration through LeRobot.
+- VLM loading via the `transformers` `Qwen3VLForConditionalGeneration` API.
+- Prompt assembly + tokenisation in the processor (matching upstream `QwenClient.compute_instruction_reward`).
+- `compute_reward()` returns one scalar log-prob per sample.
+- LeRobot reward-model save/load — `save_pretrained` writes only `config.json` (the VLM is identified by `vlm_name`).
+- An offline labeling script that writes a `topreward_progress.parquet` (SARM-compatible schema) for RA-BC and overlay.
+
+The current LeRobot port supports the **Qwen3-VL client only**. Other upstream clients (Gemini, OpenAI, Gemma, Molmo) can be added as follow-up extras.
+
+## Installation Requirements
+
+1. Install LeRobot following the [Installation Guide](./installation).
+2. Install the TOPReward optional extra:
+
+```bash
+pip install -e ".[topreward]"
+```
+
+or, with `uv` from a source checkout:
+
+```bash
+uv sync --extra topreward
+```
+
+This pulls in `transformers`. The first time you run TOPReward, Hugging Face will also download the VLM weights from the Hub (~16 GB for Qwen3-VL-8B-Instruct). A GPU is strongly recommended.
+
+## Model Inputs and Outputs
+
+TOPReward expects:
+
+- A trajectory video or sequence of frames.
+- A natural-language task description.
+
+In LeRobot datasets the preprocessor reads:
+
+| Config field              | Default                     | Meaning                                       |
+| ------------------------- | --------------------------- | --------------------------------------------- |
+| `reward_model.image_key`  | `observation.images.top`    | Camera observation used by TOPReward          |
+| `reward_model.task_key`   | `task`                      | Key in complementary data for the task string |
+| `reward_model.max_frames` | `16`                        | Cap on frames per sample                      |
+| `reward_model.fps`        | `2.0`                       | Metadata passed to the Qwen video processor   |
+| `reward_model.vlm_name`   | `Qwen/Qwen3-VL-8B-Instruct` | Hugging Face Hub id of the underlying VLM     |
+
+The model returns:
+
+- `compute_reward(batch)`: one log-probability per sample. Higher = better task-video alignment. When `success_threshold` is finite, returns the binary thresholded value instead.
+
+## Usage
+
+### Load the reward model directly
+
+```python
+from lerobot.rewards.topreward import TOPRewardConfig, TOPRewardModel
+
+cfg = TOPRewardConfig(
+    vlm_name="Qwen/Qwen3-VL-8B-Instruct",
+    device="cuda",
+)
+reward_model = TOPRewardModel(cfg)
+```
+
+### Use the reward factory
+
+```python
+from lerobot.rewards import make_reward_model, make_reward_model_config, make_reward_pre_post_processors
+
+cfg = make_reward_model_config(
+    "topreward",
+    vlm_name="Qwen/Qwen3-VL-8B-Instruct",
+    device="cuda",
+    image_key="observation.images.top",
+)
+reward_model = make_reward_model(cfg)
+preprocessor, postprocessor = make_reward_pre_post_processors(cfg)
+```
+
+The preprocessor tokenises the full prompt (video + prefix + instruction suffix) and writes the resulting Qwen-VL tensors, together with `prompt_length`, under `observation.topreward.*`. The model reads those tensors, label-masks based on `prompt_length`, and extracts the log-prob reward.
+
+### Offline dataset labeling
+
+Write a `topreward_progress.parquet` for RA-BC training and overlay videos:
+
+```bash
+# Sparse-dense (15 anchors per episode, matches upstream)
+uv run python -m lerobot.rewards.topreward.compute_rabc_weights \
+    --dataset-repo-id lerobot/libero_10_image \
+    --num-samples 15 \
+    --device cuda
+```
+
+Then render the progress overlay for any episode:
+
+```bash
+uv run examples/dataset/create_progress_videos.py \
+    --repo-id lerobot/libero_10_image \
+    --episode 0 \
+    --progress-file topreward_progress.parquet \
+    --gif
+```
+
+## Configuration Notes
+
+### Prompt knobs
+
+The default prompt mirrors the upstream paper:
+
+```text
+prompt_prefix = "The above video shows a robot manipulation trajectory that completes the following task: "
+prompt_suffix_template = "{instruction} Decide whether the above statement is True or not. The answer is: True"
+```
+
+Both are exposed on `TOPRewardConfig` for ablation. The suffix template **must** contain `{instruction}`.
+
+### Chat template
+
+`add_chat_template=True` wraps the full prompt (including instruction) with the tokenizer's chat template before tokenisation. Default is `False`, matching the upstream paper's main experiments.
+
+## Limitations
+
+- The current LeRobot port is **inference-only and zero-shot**; `forward()` is not overridden and `is_trainable` returns `False`.
+- Only the **Qwen3-VL family** is supported; other upstream clients are out of scope.
+- TOPReward inherits the underlying VLM's biases.
+
+## References
+
+- [TOPReward project page](https://topreward.github.io/webpage/)
+- [TOPReward paper](https://arxiv.org/abs/2602.19313)
+- [Original TOPReward code](https://github.com/TOPReward/TOPReward)
+- [Qwen3-VL-8B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-8B-Instruct)
+
+## Citation
+
+```bibtex
+@article{chen2026topreward,
+  title={TOPReward: Token Probabilities as Hidden Zero-Shot Rewards for Robotics},
+  author={Chen, Shirui and Harrison, Cole and Lee, Ying-Chun and Yang, Angela Jin and
+          Ren, Zhongzheng and Ratliff, Lillian J and Duan, Jiafei and Fox, Dieter and
+          Krishna, Ranjay},
+  journal={arXiv preprint arXiv:2602.19313},
+  year={2026}
+}
+```
+
+## License
+
+The original TOPReward codebase is MIT-licensed. The LeRobot port follows the LeRobot Apache 2.0 license; the wrapped Qwen3-VL weights are subject to the original Qwen license.
diff --git a/pyproject.toml b/pyproject.toml
index 5d182648c..264297c5e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -211,6 +211,7 @@ groot = [
     "flash-attn>=2.5.9,<3.0.0 ; sys_platform != 'darwin'"
 ]
 sarm = ["lerobot[transformers-dep]", "pydantic>=2.0.0,<3.0.0", "faker>=33.0.0,<35.0.0", "lerobot[matplotlib-dep]", "lerobot[qwen-vl-utils-dep]"]
+topreward = ["lerobot[transformers-dep]"]
 xvla = ["lerobot[transformers-dep]"]
 eo1 = ["lerobot[transformers-dep]", "lerobot[qwen-vl-utils-dep]"]
 hilserl = ["lerobot[transformers-dep]", "lerobot[dataset]", "gym-hil>=0.1.13,<0.2.0", "lerobot[grpcio-dep]", "lerobot[placo-dep]"]
@@ -288,6 +289,7 @@ all = [
     "lerobot[libero]; sys_platform == 'linux'",
     "lerobot[metaworld]",
     "lerobot[sarm]",
+    "lerobot[topreward]",
     "lerobot[peft]",
     # "lerobot[unitree_g1]", TODO: Unitree requires specific installation instructions for unitree_sdk2
 ]
diff --git a/src/lerobot/rewards/__init__.py b/src/lerobot/rewards/__init__.py
index 203fe2ee1..ae23424e3 100644
--- a/src/lerobot/rewards/__init__.py
+++ b/src/lerobot/rewards/__init__.py
@@ -21,11 +21,13 @@ from .factory import (
 )
 from .pretrained import PreTrainedRewardModel as PreTrainedRewardModel
 from .sarm.configuration_sarm import SARMConfig as SARMConfig
+from .topreward.configuration_topreward import TOPRewardConfig as TOPRewardConfig
 
 __all__ = [
     # Configuration classes
     "RewardClassifierConfig",
     "SARMConfig",
+    "TOPRewardConfig",
     # Base class
     "PreTrainedRewardModel",
     # Factory functions
diff --git a/src/lerobot/rewards/factory.py b/src/lerobot/rewards/factory.py
index c173f44a5..d500cc593 100644
--- a/src/lerobot/rewards/factory.py
+++ b/src/lerobot/rewards/factory.py
@@ -26,6 +26,7 @@ from lerobot.processor import PolicyAction, PolicyProcessorPipeline
 from .classifier.configuration_classifier import RewardClassifierConfig
 from .pretrained import PreTrainedRewardModel
 from .sarm.configuration_sarm import SARMConfig
+from .topreward.configuration_topreward import TOPRewardConfig
 
 
 def get_reward_model_class(name: str) -> type[PreTrainedRewardModel]:
@@ -37,7 +38,7 @@ def get_reward_model_class(name: str) -> type[PreTrainedRewardModel]:
 
     Args:
         name: The name of the reward model. Supported names are "reward_classifier",
-              "sarm".
+              "sarm", "topreward".
 
     Returns:
         The reward model class corresponding to the given name.
@@ -53,6 +54,10 @@ def get_reward_model_class(name: str) -> type[PreTrainedRewardModel]:
         from lerobot.rewards.sarm.modeling_sarm import SARMRewardModel
 
         return SARMRewardModel
+    elif name == "topreward":
+        from lerobot.rewards.topreward.modeling_topreward import TOPRewardModel
+
+        return TOPRewardModel
     else:
         try:
             return _get_reward_model_cls_from_name(name=name)
@@ -69,7 +74,7 @@ def make_reward_model_config(reward_type: str, **kwargs) -> RewardModelConfig:
 
     Args:
         reward_type: The type of the reward model. Supported types include
-                     "reward_classifier", "sarm".
+                     "reward_classifier", "sarm", "topreward".
         **kwargs: Keyword arguments to be passed to the configuration class constructor.
 
     Returns:
@@ -82,6 +87,8 @@ def make_reward_model_config(reward_type: str, **kwargs) -> RewardModelConfig:
         return RewardClassifierConfig(**kwargs)
     elif reward_type == "sarm":
         return SARMConfig(**kwargs)
+    elif reward_type == "topreward":
+        return TOPRewardConfig(**kwargs)
     else:
         try:
             config_cls = RewardModelConfig.get_choice_class(reward_type)
@@ -162,6 +169,14 @@ def make_reward_pre_post_processors(
             dataset_meta=kwargs.get("dataset_meta"),
         )
 
+    elif isinstance(reward_cfg, TOPRewardConfig):
+        from lerobot.rewards.topreward.processor_topreward import make_topreward_pre_post_processors
+
+        return make_topreward_pre_post_processors(
+            config=reward_cfg,
+            dataset_stats=kwargs.get("dataset_stats"),
+        )
+
     else:
         try:
             processors = _make_processors_from_reward_model_config(
diff --git a/src/lerobot/rewards/topreward/__init__.py b/src/lerobot/rewards/topreward/__init__.py
new file mode 100644
index 000000000..9b03ca866
--- /dev/null
+++ b/src/lerobot/rewards/topreward/__init__.py
@@ -0,0 +1,19 @@
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .configuration_topreward import TOPRewardConfig
+from .modeling_topreward import TOPRewardModel
+from .processor_topreward import make_topreward_pre_post_processors
+
+__all__ = ["TOPRewardConfig", "TOPRewardModel", "make_topreward_pre_post_processors"]
diff --git a/src/lerobot/rewards/topreward/compute_rabc_weights.py b/src/lerobot/rewards/topreward/compute_rabc_weights.py
new file mode 100644
index 000000000..a448654e5
--- /dev/null
+++ b/src/lerobot/rewards/topreward/compute_rabc_weights.py
@@ -0,0 +1,353 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Compute per-frame TOPReward progress curves for a LeRobot dataset.
+
+For each episode, scores trajectory prefixes of increasing length using
+the TOPReward reward model, min-max normalises the raw log-prob rewards per episode,
+and writes a parquet file with one row per frame.
+
+The parquet uses the same schema as SARM's :mod:`lerobot.rewards.sarm.compute_rabc_weights`.
+
+Usage:
+    # Sparse-dense mode (15 anchors per episode, matches upstream)
+    python -m lerobot.rewards.topreward.compute_rabc_weights \\
+        --dataset-repo-id lerobot/libero_10_image \\
+        --num-samples 15
+
+    # Use a different VLM backbone
+    python -m lerobot.rewards.topreward.compute_rabc_weights \\
+        --dataset-repo-id lerobot/libero_10_image \\
+        --vlm-name Qwen/Qwen3-VL-4B-Instruct
+"""
+
+from __future__ import annotations
+
+import argparse
+import logging
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import pyarrow as pa
+import pyarrow.parquet as pq
+import torch
+from tqdm import tqdm
+
+from lerobot.datasets import LeRobotDataset
+from lerobot.rewards.topreward.configuration_topreward import TOPRewardConfig
+from lerobot.rewards.topreward.modeling_topreward import TOPRewardModel
+from lerobot.rewards.topreward.processor_topreward import TOPRewardEncoderProcessorStep
+from lerobot.types import TransitionKey
+
+DEFAULT_OUTPUT_FILENAME = "topreward_progress.parquet"
+
+
+def get_reward_model_path_from_parquet(parquet_path: Path) -> str | None:
+    """Return the ``reward_model_path`` schema-metadata entry of a parquet file, or ``None``."""
+    # Best-effort lookup: a missing file, an unreadable footer or an absent key all give ``None``.
+    if not parquet_path.exists():
+        return None
+    try:
+        schema_meta = pq.read_metadata(parquet_path).schema.to_arrow_schema().metadata or {}
+        raw_path = schema_meta.get(b"reward_model_path")
+        return None if raw_path is None else raw_path.decode()
+    except Exception:  # nosec B110
+        return None
+
+
+def _resolve_task(sample: dict[str, Any], default: str) -> str:
+    """Return ``sample["task"]`` when it is a non-empty string, else ``default``."""
+    candidate = sample.get("task")
+    # Anything that is not a non-empty ``str`` (missing key, ``None``, other types) falls back.
+    usable = isinstance(candidate, str) and bool(candidate)
+    return candidate if usable else default
+
+
+def normalize_rewards(rewards: list[float] | np.ndarray) -> np.ndarray:
+    """Rescale raw log-prob rewards to ``[0, 1]`` (float32) with min-max normalisation."""
+    values = np.asarray(rewards, dtype=np.float64)
+    if values.size == 0:  # nothing to scale; keep the (empty) shape
+        return values.astype(np.float32)
+    if values.size == 1:  # a lone reward counts as full progress
+        return np.ones(1, dtype=np.float32)
+    lowest, highest = values.min(), values.max()
+    if highest == lowest:  # constant curve: avoid a 0/0 division
+        return np.ones_like(values, dtype=np.float32)
+    return ((values - lowest) / (highest - lowest)).astype(np.float32)
+
+
+def compute_instruction_rewards_for_prefixes(
+    model: TOPRewardModel,
+    encoder: TOPRewardEncoderProcessorStep,
+    dataset: LeRobotDataset,
+    ep_start: int,
+    num_frames: int,
+    task: str,
+    image_key: str,
+    num_samples: int | None,
+    device: str,
+) -> np.ndarray:
+    """Score growing prefixes of one episode and return one normalised value per frame.
+
+    With ``num_samples=None`` (or >= ``num_frames``) every prefix length is scored; otherwise up to
+    ``num_samples`` evenly spaced prefixes are scored and the normalised rewards interpolated per frame.
+
+    Raises:
+        ValueError: if ``num_samples`` is an integer below 1.
+    """
+    if num_samples is not None and num_samples < 1:
+        raise ValueError(f"num_samples must be >= 1 or None, got {num_samples}")
+    if num_samples is None or num_samples >= num_frames:
+        prefix_lengths = np.arange(1, num_frames + 1, dtype=np.int64)
+    else:
+        prefix_lengths = np.unique(np.linspace(1, num_frames, num_samples).round().astype(np.int64))
+
+    task_key = getattr(encoder, "task_key", "task")  # same key the encoder is configured to read
+    episode_frames = torch.stack([dataset[ep_start + i][image_key] for i in range(num_frames)])
+    rewards: list[float] = []
+    for length in prefix_lengths:
+        transition = {
+            TransitionKey.OBSERVATION: {image_key: episode_frames[: int(length)].unsqueeze(0)},
+            TransitionKey.COMPLEMENTARY_DATA: {task_key: task},
+        }
+        obs = encoder(transition)[TransitionKey.OBSERVATION]
+        batch = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in obs.items()}
+        with torch.no_grad():
+            rewards.append(float(model.compute_reward(batch).item()))
+    normalized_rewards = normalize_rewards(rewards)
+    if prefix_lengths.shape[0] == num_frames:
+        return normalized_rewards
+    return np.interp(
+        np.arange(1, num_frames + 1, dtype=np.float64),
+        prefix_lengths.astype(np.float64),
+        normalized_rewards.astype(np.float64),
+    ).astype(np.float32)
+
+
+def compute_topreward_progress(
+    dataset_repo_id: str,
+    reward_model_path: str | None = None,
+    vlm_name: str | None = None,
+    output_path: str | None = None,
+    device: str = "cuda",
+    num_samples: int | None = None,
+    fps: float | None = None,
+    episodes: list[int] | None = None,
+) -> Path:
+    """Run TOPReward over a dataset and write per-frame progress to a parquet file.
+
+    ``reward_model_path`` (when given) supplies the config; ``vlm_name`` and ``fps`` override it in
+    both code paths. ``num_samples=None`` scores every prefix, ``episodes=None`` processes every
+    episode, and ``output_path=None`` writes ``DEFAULT_OUTPUT_FILENAME`` into the dataset root.
+
+    Returns the path of the written parquet file; raises ``ValueError`` for a non-positive ``fps``.
+    """
+    if reward_model_path is not None:
+        logging.info(f"Loading TOPReward config from: {reward_model_path}")
+        model = TOPRewardModel.from_pretrained(reward_model_path)
+        config = model.config
+        config.device = device
+        if fps is not None:
+            if fps <= 0:  # plain assignment below bypasses the config's ``__post_init__`` check
+                raise ValueError(f"fps must be > 0, got {fps}")
+            config.fps = fps  # honoured here too; the encoder built below reads ``config.fps``
+        if vlm_name is not None and vlm_name != config.vlm_name:
+            logging.info(f"Overriding vlm_name from config: {config.vlm_name} -> {vlm_name}")
+            config.vlm_name = vlm_name
+            model = TOPRewardModel(config)
+    else:
+        config_kwargs: dict[str, Any] = {"device": device}
+        if vlm_name is not None:
+            config_kwargs["vlm_name"] = vlm_name
+        if fps is not None:
+            config_kwargs["fps"] = fps
+        config = TOPRewardConfig(**config_kwargs)
+        logging.info(f"Constructing TOPReward with VLM: {config.vlm_name}")
+        model = TOPRewardModel(config)
+    model.to(device).eval()
+
+    encoder = TOPRewardEncoderProcessorStep(
+        vlm_name=config.vlm_name,
+        image_key=config.image_key,
+        task_key=config.task_key,
+        default_task=config.default_task,
+        max_frames=None,  # no tail-crop: we control prefix length explicitly
+        fps=config.fps,
+        prompt_prefix=config.prompt_prefix,
+        prompt_suffix_template=config.prompt_suffix_template,
+        add_chat_template=config.add_chat_template,
+        max_length=config.max_input_length,
+    )
+
+    logging.info(f"Loading dataset: {dataset_repo_id}")
+    dataset = LeRobotDataset(dataset_repo_id, download_videos=True)
+    logging.info(f"Dataset: {dataset.num_episodes} episodes, {dataset.num_frames} frames")
+    episode_indices = list(range(dataset.num_episodes)) if episodes is None else episodes
+    logging.info(f"Processing {len(episode_indices)} episode(s)")
+
+    all_index: list[int] = []
+    all_episode: list[int] = []
+    all_frame: list[int] = []
+    all_progress: list[float] = []
+    for episode_idx in tqdm(episode_indices, desc="Episodes"):
+        ep = dataset.meta.episodes[episode_idx]
+        ep_start = int(ep["dataset_from_index"])
+        ep_end = int(ep["dataset_to_index"])
+        num_frames = ep_end - ep_start
+        if num_frames <= 0:
+            continue
+        # The episode's task string (if any) is taken from its first frame's sample.
+        task = _resolve_task(dataset[ep_start], default=config.default_task or "perform the task")
+        per_frame = compute_instruction_rewards_for_prefixes(
+            model=model,
+            encoder=encoder,
+            dataset=dataset,
+            ep_start=ep_start,
+            num_frames=num_frames,
+            task=task,
+            image_key=config.image_key,
+            num_samples=num_samples,
+            device=device,
+        )
+        all_index.extend(range(ep_start, ep_end))
+        all_episode.extend([episode_idx] * num_frames)
+        all_frame.extend(range(num_frames))
+        all_progress.extend(float(value) for value in per_frame)
+        if device.startswith("cuda"):
+            torch.cuda.empty_cache()
+
+    table = pa.table(
+        {
+            "index": np.asarray(all_index, dtype=np.int64),
+            "episode_index": np.asarray(all_episode, dtype=np.int64),
+            "frame_index": np.asarray(all_frame, dtype=np.int64),
+            "progress_sparse": np.asarray(all_progress, dtype=np.float32),
+        }
+    )
+    schema_metadata: dict[bytes, bytes] = {b"vlm_name": config.vlm_name.encode()}
+    if reward_model_path is not None:
+        schema_metadata[b"reward_model_path"] = reward_model_path.encode()
+    table = table.replace_schema_metadata(schema_metadata)
+
+    out = Path(dataset.root) / DEFAULT_OUTPUT_FILENAME if output_path is None else Path(output_path)
+    out.parent.mkdir(parents=True, exist_ok=True)
+    pq.write_table(table, out)
+    logging.info(f"Saved {len(table)} frame values to {out}")
+
+    progress_arr = np.asarray(all_progress, dtype=np.float32)
+    if progress_arr.size:
+        logging.info(
+            f"Progress: mean={float(progress_arr.mean()):.4f}, "
+            f"std={float(progress_arr.std()):.4f}, "
+            f"min={float(progress_arr.min()):.4f}, "
+            f"max={float(progress_arr.max()):.4f}"
+        )
+    return out
+
+
+def main():
+    """Command-line entry point.
+
+    Parses the flags, computes the progress parquet, optionally uploads it to the dataset
+    repo on the Hub, and prints the config snippet needed to use it in training.
+    """
+    cli = argparse.ArgumentParser(
+        description="Compute per-frame TOPReward progress curves for RA-BC weighting.",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+    # Sparse-dense mode (matches upstream TOPReward num_samples=15)
+    python -m lerobot.rewards.topreward.compute_rabc_weights \\
+        --dataset-repo-id lerobot/libero_10_image \\
+        --num-samples 15
+
+    # Use a smaller VLM
+    python -m lerobot.rewards.topreward.compute_rabc_weights \\
+        --dataset-repo-id lerobot/libero_10_image \\
+        --vlm-name Qwen/Qwen3-VL-4B-Instruct
+        """,
+    )
+    cli.add_argument(
+        "--dataset-repo-id", type=str, required=True, help="HuggingFace dataset repo id or local path."
+    )
+    cli.add_argument(
+        "--reward-model-path", type=str, default=None, help="Optional TOPReward LeRobot config."
+    )
+    cli.add_argument("--vlm-name", type=str, default=None, help="Override the VLM backbone (HF Hub id).")
+    cli.add_argument("--output-path", type=str, default=None, help="Output parquet path.")
+    cli.add_argument("--device", type=str, default="cuda", help="Device to use (default: cuda).")
+    cli.add_argument(
+        "--num-samples",
+        type=int,
+        default=None,
+        help="Anchor prefix samples per episode. None = dense. 15 matches upstream.",
+    )
+    cli.add_argument(
+        "--episodes",
+        type=int,
+        nargs="+",
+        default=None,
+        help="Process only these episode indices (e.g. --episodes 0 or --episodes 0 5 10).",
+    )
+    cli.add_argument("--fps", type=float, default=None, help="Override TOPRewardConfig.fps.")
+    cli.add_argument(
+        "--push-to-hub", action="store_true", help="Upload to the dataset repo on HuggingFace Hub."
+    )
+    args = cli.parse_args()
+
+    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+
+    output_path = compute_topreward_progress(
+        dataset_repo_id=args.dataset_repo_id,
+        reward_model_path=args.reward_model_path,
+        vlm_name=args.vlm_name,
+        output_path=args.output_path,
+        device=args.device,
+        num_samples=args.num_samples,
+        fps=args.fps,
+        episodes=args.episodes,
+    )
+    print(f"\nTOPReward progress saved to: {output_path}")
+
+    # The training hint points at the local file unless it was uploaded, then at the Hub URI.
+    progress_path = f"{output_path}"
+    if args.push_to_hub:
+        from huggingface_hub import HfApi
+
+        api = HfApi()
+        hub_path = DEFAULT_OUTPUT_FILENAME
+        print(f"\nUploading to Hub: {args.dataset_repo_id}/{hub_path}")
+        api.upload_file(
+            path_or_fileobj=str(output_path),
+            path_in_repo=hub_path,
+            repo_id=args.dataset_repo_id,
+            repo_type="dataset",
+        )
+        print(
+            "Successfully uploaded to: "
+            f"https://huggingface.co/datasets/{args.dataset_repo_id}/blob/main/{hub_path}"
+        )
+        progress_path = f"hf://datasets/{args.dataset_repo_id}/{hub_path}"
+
+    print("\nTo use in training, add to your config:")
+    print("  use_rabc: true")
+    print(f"  rabc_progress_path: {progress_path}")
+    print("  rabc_head_mode: sparse")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/lerobot/rewards/topreward/configuration_topreward.py b/src/lerobot/rewards/topreward/configuration_topreward.py
new file mode 100644
index 000000000..7302734c8
--- /dev/null
+++ b/src/lerobot/rewards/topreward/configuration_topreward.py
@@ -0,0 +1,146 @@
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+
+from lerobot.configs import FeatureType, NormalizationMode, PolicyFeature
+from lerobot.configs.rewards import RewardModelConfig
+from lerobot.utils.constants import OBS_IMAGES
+
+# Default prompt scaffolding from the upstream TOPReward paper / reference
+# implementation (``QwenClient.compute_instruction_reward``). The prompt
+# scores the terminal ``True`` token in ``f"{instruction} ... True"``
+# given the video.
+DEFAULT_PROMPT_PREFIX = (
+    "The above video shows a robot manipulation trajectory that completes the following task: "
+)
+DEFAULT_PROMPT_SUFFIX_TEMPLATE = (
+    "{instruction} Decide whether the above statement is True or not. The answer is: True"
+)
+
+
+@RewardModelConfig.register_subclass("topreward")
+@dataclass
+class TOPRewardConfig(RewardModelConfig):
+    """Configuration for the TOPReward zero-shot reward model.
+
+    TOPReward is **zero-shot**: it has no learnable parameters of its own.
+    The "model" is a generic vision-language model (default
+    ``Qwen/Qwen3-VL-8B-Instruct``) used with a fixed prompt to extract
+    token log-probabilities as a reward signal. There is therefore no
+    fine-tuned checkpoint to host: ``pretrained_path`` is unused at
+    runtime — the model identity is :attr:`vlm_name` (an HF Hub id).
+
+    Args:
+        vlm_name: Hugging Face Hub id of the underlying VLM. Must be a
+            Qwen3-VL family model (the only client implemented in this
+            LeRobot port).
+        torch_dtype: Torch dtype name passed to the VLM loader
+            (``"auto"``, ``"bfloat16"``, ``"float16"``, ...).
+        attn_implementation: ``transformers`` attention implementation
+            (e.g. ``"flash_attention_2"``, ``"sdpa"``). Defaults to
+            ``None`` so the upstream picks the best available.
+        image_key: Observation key that holds the trajectory frames.
+        task_key: Complementary-data key that holds the task instruction.
+        default_task: Fallback instruction when ``task_key`` is absent.
+        max_frames: Cap on the number of frames fed to the VLM per
+            sample. ``None`` = use all frames.
+        fps: Frames-per-second metadata for the Qwen video processor.
+        prompt_prefix: Text shown to the VLM right after the video and
+            before the suffix template.
+        prompt_suffix_template: Suffix appended after ``prompt_prefix``.
+            Must contain ``{instruction}``; the reward is the log-prob of
+            the terminal answer token (``True`` in the default template).
+        add_chat_template: If ``True``, wrap the full prompt with the
+            tokenizer's chat template before tokenisation (matches
+            upstream ``add_chat_template=True``).
+        success_threshold: Optional log-prob threshold. If finite,
+            :meth:`TOPRewardModel.compute_reward` returns
+            ``(reward > success_threshold).float()`` instead of the raw
+            log-prob; the default ``-inf`` leaves rewards unthresholded.
+        max_input_length: Hard limit on the total tokenized input length;
+            samples that exceed it raise a ``ValueError``.
+    """
+
+    # Path to a local LeRobot dir or HF repo that holds a ``config.json``
+    # snapshot of this TOPRewardConfig. The VLM weights themselves are
+    # always identified by ``vlm_name``.
+    pretrained_path: str | None = None
+
+    vlm_name: str = "Qwen/Qwen3-VL-8B-Instruct"
+    torch_dtype: str = "auto"
+    attn_implementation: str | None = None
+
+    image_key: str = OBS_IMAGES + ".top"
+    task_key: str = "task"
+    default_task: str | None = None
+    max_frames: int | None = 16
+    fps: float = 2.0
+
+    prompt_prefix: str = DEFAULT_PROMPT_PREFIX
+    prompt_suffix_template: str = DEFAULT_PROMPT_SUFFIX_TEMPLATE
+    add_chat_template: bool = False
+
+    success_threshold: float = float("-inf")
+    max_input_length: int = 32768
+
+    license: str | None = "mit"  # matches upstream TOPReward
+    tags: list[str] | None = field(
+        default_factory=lambda: ["reward-model", "vision-language", "qwen3-vl", "zero-shot"]
+    )
+
+    input_features: dict[str, PolicyFeature] = field(default_factory=dict)
+    output_features: dict[str, PolicyFeature] = field(default_factory=dict)
+    normalization_mapping: dict[str, NormalizationMode] = field(
+        default_factory=lambda: {
+            "VISUAL": NormalizationMode.IDENTITY,
+            "REWARD": NormalizationMode.IDENTITY,
+        }
+    )
+
+    def __post_init__(self) -> None:
+        super().__post_init__()
+        if self.max_frames is not None and self.max_frames < 1:
+            raise ValueError(f"max_frames must be >= 1, got {self.max_frames}")
+        if self.fps <= 0:
+            raise ValueError(f"fps must be > 0, got {self.fps}")
+        if "{instruction}" not in self.prompt_suffix_template:
+            raise ValueError(
+                "prompt_suffix_template must contain `{instruction}` so the model "
+                "scores the log-likelihood of the task suffix."
+            )
+        if self.max_input_length <= 0:
+            raise ValueError(f"max_input_length must be > 0, got {self.max_input_length}")
+
+        if self.image_key not in self.input_features:  # register a default visual feature for the image key
+            self.input_features[self.image_key] = PolicyFeature(shape=(3, 224, 224), type=FeatureType.VISUAL)
+        self.output_features.setdefault("reward", PolicyFeature(shape=(1,), type=FeatureType.REWARD))
+
+    @property
+    def observation_delta_indices(self) -> list[int] | None:
+        return None
+
+    @property
+    def action_delta_indices(self) -> None:
+        return None
+
+    @property
+    def reward_delta_indices(self) -> None:
+        return None
+
+    def validate_features(self) -> None:
+        if self.image_key not in self.input_features:
+            raise ValueError(f"TOPReward requires image input feature {self.image_key!r}")
diff --git a/src/lerobot/rewards/topreward/modeling_topreward.py b/src/lerobot/rewards/topreward/modeling_topreward.py
new file mode 100644
index 000000000..4958d5449
--- /dev/null
+++ b/src/lerobot/rewards/topreward/modeling_topreward.py
@@ -0,0 +1,238 @@
+# Copyright 2026 Shirui Chen, Cole Harrison, Ying-Chun Lee, Angela Jin Yang,
+# Zhongzheng Ren, Lillian J. Ratliff, Jiafei Duan, Dieter Fox, Ranjay Krishna
+# and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""TOPReward: Token Probabilities as Hidden Zero-Shot Rewards for Robotics.
+
+Paper:         https://arxiv.org/abs/2602.19313
+Project:       https://topreward.github.io/webpage/
+Original code: https://github.com/TOPReward/TOPReward
+Backbone:      https://huggingface.co/Qwen/Qwen3-VL-8B-Instruct  (default)
+
+TOPReward is a **zero-shot** reward model: it has no fine-tuned weights of
+its own. Given a video trajectory and a task instruction, it asks an
+off-the-shelf VLM how likely the instruction is, conditioned on the video,
+and returns that log-likelihood as the reward signal.
+
+Inference recipe:
+
+1. The processor builds a chat-style prompt, tokenises it, and emits
+   ``input_ids``, ``attention_mask``, vision tensors, and ``labels``.
+   The processor label-masks everything except the terminal answer token with
+   ``-100``.
+2. Forward the full token sequence through the VLM.
+3. Read the terminal answer token log-probability from the logits as the
+   scalar reward.
+
+With the default ``prompt_suffix_template``, the only unmasked token is the
+literal ``"True"`` at the end — the reward is
+``log P("True" | video + prompt + instruction)``.
+
+This LeRobot port is **inference-only and not trainable** — :meth:`forward`
+is intentionally inherited from :class:`PreTrainedRewardModel` and raises
+``NotImplementedError``, making :attr:`PreTrainedRewardModel.is_trainable`
+return ``False``.
+
+Because the VLM weights live on the Hugging Face Hub under their canonical
+id (``Qwen/Qwen3-VL-8B-Instruct`` etc.) and TOPReward never modifies them,
+:meth:`_save_pretrained` and :meth:`from_pretrained` are overridden so a
+TOPReward LeRobot "checkpoint" is a single ``config.json`` (the VLM is
+re-fetched from the Hub at load time).
+"""
+
+from __future__ import annotations
+
+import builtins
+import logging
+import os
+from pathlib import Path
+from tempfile import TemporaryDirectory
+from typing import TYPE_CHECKING, Any, TypeVar
+
+import numpy as np
+import torch
+from huggingface_hub import HfApi, hf_hub_download
+from huggingface_hub.constants import CONFIG_NAME
+from huggingface_hub.errors import HfHubHTTPError
+from torch import Tensor
+from torch.nn.functional import cross_entropy
+
+from lerobot.configs.rewards import RewardModelConfig
+from lerobot.rewards.pretrained import PreTrainedRewardModel
+from lerobot.rewards.topreward.configuration_topreward import TOPRewardConfig
+from lerobot.rewards.topreward.processor_topreward import TOPREWARD_FEATURE_PREFIX, TOPREWARD_INPUT_KEYS
+from lerobot.utils.import_utils import _transformers_available, require_package
+
+if TYPE_CHECKING:
+    from lerobot.configs.train import TrainPipelineConfig
+
+if TYPE_CHECKING or _transformers_available:
+    from transformers import Qwen3VLForConditionalGeneration
+else:
+    Qwen3VLForConditionalGeneration = None  # type: ignore[assignment]
+
+logger = logging.getLogger(__name__)
+
+T = TypeVar("T", bound="TOPRewardModel")
+
+
+def _torch_dtype(name: str) -> torch.dtype | str:
+    """Map a dtype name such as ``"bfloat16"`` to its ``torch.dtype``; ``"auto"`` is returned as-is."""
+    if name != "auto":
+        resolved = getattr(torch, name, None)
+        if not isinstance(resolved, torch.dtype):
+            raise ValueError(f"Unknown torch dtype: {name!r}")
+        return resolved
+    return "auto"
+
+
+class TOPRewardModel(PreTrainedRewardModel):
+    """TOPReward zero-shot reward model: scores a video + instruction with an off-the-shelf VLM."""
+
+    name = "topreward"
+    config_class = TOPRewardConfig
+
+    def __init__(self, config: TOPRewardConfig) -> None:
+        """Load the VLM named by ``config.vlm_name``; requires the ``transformers`` package."""
+        require_package("transformers", extra="topreward")
+        super().__init__(config)
+        self.config = config
+        torch_dtype = _torch_dtype(config.torch_dtype)
+        model_kwargs: dict[str, Any] = {"dtype": torch_dtype, "trust_remote_code": True}
+        if config.attn_implementation is not None:
+            model_kwargs["attn_implementation"] = config.attn_implementation
+        self.model = Qwen3VLForConditionalGeneration.from_pretrained(config.vlm_name, **model_kwargs)
+
+    def compute_reward(self, batch: dict[str, Any]) -> Tensor:
+        """Return one reward per sample: the log-prob of the final label token.
+
+        A finite ``config.success_threshold`` turns the reward into a 0/1 indicator.
+        Raises ``KeyError`` for a missing input key and ``ValueError`` when a sample's last label is masked.
+        """
+        inputs: dict[str, Any] = {}
+        for key in TOPREWARD_INPUT_KEYS:
+            batch_key = f"{TOPREWARD_FEATURE_PREFIX}{key}"
+            if batch_key not in batch:
+                raise KeyError(
+                    f"TOPReward batch missing `{batch_key}`. Make sure the "
+                    "TOPRewardEncoderProcessorStep ran before `compute_reward`."
+                )
+            inputs[key] = batch[batch_key]
+
+        device = next(self.model.parameters()).device
+        inputs = {key: value.to(device) if hasattr(value, "to") else value for key, value in inputs.items()}
+        answer_ids = inputs.pop("labels")[:, -1]
+        if bool((answer_ids < 0).any()):  # a masked target would silently score 0.0
+            raise ValueError("Last `labels` position is masked; TOPReward needs the answer token there.")
+        inputs["logits_to_keep"] = 2  # the answer is predicted from the second-to-last position
+
+        self.eval()
+        with torch.no_grad():
+            outputs = self.model(**inputs)
+        rewards = -cross_entropy(outputs.logits[:, -2, :].float(), answer_ids, reduction="none")
+        if np.isfinite(self.config.success_threshold):
+            rewards = (rewards > self.config.success_threshold).float()
+        return rewards.to(self.config.device or "cpu")
+
+    def _save_pretrained(self, save_directory: Path) -> None:
+        """Save ``config.json`` only."""
+        self.config._save_pretrained(save_directory)
+
+    @classmethod
+    def from_pretrained(
+        cls: builtins.type[T],
+        pretrained_name_or_path: str | Path,
+        *,
+        config: RewardModelConfig | None = None,
+        force_download: bool = False,
+        resume_download: bool | None = None,
+        proxies: dict | None = None,
+        token: str | bool | None = None,
+        cache_dir: str | Path | None = None,
+        local_files_only: bool = False,
+        revision: str | None = None,
+        strict: bool = False,  # noqa: ARG003 — accepted for API parity; unused (no safetensors to load)
+        **kwargs: Any,
+    ) -> T:
+        """Load a TOPReward configuration and instantiate the wrapped VLM."""
+        if config is None:
+            config = RewardModelConfig.from_pretrained(
+                pretrained_name_or_path=pretrained_name_or_path,
+                force_download=force_download,
+                resume_download=resume_download,
+                proxies=proxies,
+                token=token,
+                cache_dir=cache_dir,
+                local_files_only=local_files_only,
+                revision=revision,
+                **kwargs,
+            )
+        if not isinstance(config, TOPRewardConfig):
+            raise TypeError(
+                f"Expected a TOPRewardConfig, got {type(config).__name__}. Make sure "
+                f"`pretrained_name_or_path={pretrained_name_or_path!r}` points at a "
+                "TOPReward checkpoint."
+            )
+
+        model_id = str(pretrained_name_or_path)
+        if not os.path.isdir(model_id):
+            try:  # existence check only: the downloaded file is not read here
+                hf_hub_download(
+                    repo_id=model_id,
+                    filename=CONFIG_NAME,
+                    revision=revision,
+                    cache_dir=cache_dir,
+                    force_download=force_download,
+                    proxies=proxies,
+                    resume_download=resume_download,
+                    token=token,
+                    local_files_only=local_files_only,
+                )
+            except HfHubHTTPError as e:
+                raise FileNotFoundError(
+                    f"{CONFIG_NAME} not found on the HuggingFace Hub in {model_id}"
+                ) from e
+
+        instance = cls(config, **kwargs)
+        instance.to(config.device)
+        instance.eval()
+        return instance
+
+    def push_model_to_hub(self, cfg: TrainPipelineConfig):
+        """Push the TOPReward ``config.json`` + model card to the Hub."""
+        api = HfApi()
+        repo_id = api.create_repo(
+            repo_id=self.config.repo_id, private=self.config.private, exist_ok=True
+        ).repo_id
+
+        with TemporaryDirectory(ignore_cleanup_errors=True) as tmp:
+            saved_path = Path(tmp) / repo_id
+            saved_path.mkdir(parents=True, exist_ok=True)
+            self.config._save_pretrained(saved_path)
+            card = self.generate_model_card(
+                cfg.dataset.repo_id, self.config.type, self.config.license, self.config.tags
+            )
+            card.save(str(saved_path / "README.md"))
+            cfg.save_pretrained(saved_path)
+
+            commit_info = api.upload_folder(
+                repo_id=repo_id,
+                repo_type="model",
+                folder_path=saved_path,
+                commit_message="Upload TOPReward config and readme",
+                allow_patterns=["*.json", "*.yaml", "*.md"],
+                ignore_patterns=["*.tmp", "*.log", "*.safetensors"],
+            )
+            logger.info(f"Model pushed to {commit_info.repo_url.url}")
diff --git a/src/lerobot/rewards/topreward/processor_topreward.py b/src/lerobot/rewards/topreward/processor_topreward.py
new file mode 100644
index 000000000..ff0646e49
--- /dev/null
+++ b/src/lerobot/rewards/topreward/processor_topreward.py
@@ -0,0 +1,305 @@
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""TOPReward pre/post processing pipeline."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Any
+
+import torch
+from torch import Tensor
+
+from lerobot.configs import PipelineFeatureType, PolicyFeature
+from lerobot.processor import (
+    AddBatchDimensionProcessorStep,
+    DeviceProcessorStep,
+    PolicyAction,
+    PolicyProcessorPipeline,
+    ProcessorStep,
+    ProcessorStepRegistry,
+    policy_action_to_transition,
+)
+from lerobot.rewards.topreward.configuration_topreward import (
+    DEFAULT_PROMPT_PREFIX,
+    DEFAULT_PROMPT_SUFFIX_TEMPLATE,
+    TOPRewardConfig,
+)
+from lerobot.types import EnvTransition, TransitionKey
+from lerobot.utils.constants import (
+    OBS_IMAGES,
+    OBS_PREFIX,
+    POLICY_POSTPROCESSOR_DEFAULT_NAME,
+    POLICY_PREPROCESSOR_DEFAULT_NAME,
+)
+from lerobot.utils.import_utils import _transformers_available, require_package
+
+if TYPE_CHECKING or _transformers_available:
+    from transformers import AutoProcessor
+else:
+    AutoProcessor = None
+
+TOPREWARD_FEATURE_PREFIX = f"{OBS_PREFIX}topreward."
+# Answer text: stripped from the suffix and re-appended after the generation prompt when templating.
+_TRUE_ANSWER = "True"
+# Unprefixed names of the VLM inputs stored under TOPREWARD_FEATURE_PREFIX (``labels`` is added below).
+TOPREWARD_VLM_INPUT_KEYS = (
+    "input_ids",
+    "attention_mask",
+    "pixel_values_videos",
+    "video_grid_thw",
+    "mm_token_type_ids",
+)
+TOPREWARD_INPUT_KEYS = TOPREWARD_VLM_INPUT_KEYS + ("labels",)
+
+
+def _prepare_video_batch(video: Tensor, *, max_frames: int | None) -> Tensor:
+    """Return the last ``max_frames`` frames as a ``(B, T, C, H, W)`` uint8 tensor for Qwen3-VL.
+
+    Float input is multiplied by 255 before the cast (assumes [0, 1] values — TODO confirm).
+    Raises ``ValueError`` for a bad rank, a bad channel dim, or ``max_frames < 1``.
+    """
+    if video.ndim not in (4, 5):
+        msg = f"Expected TOPReward frames with shape (B,C,H,W) or (B,T,C,H,W); got {tuple(video.shape)}"
+        raise ValueError(msg)
+    if max_frames is not None and max_frames < 1:
+        raise ValueError(f"max_frames must be >= 1, got {max_frames}")  # `-0:` would keep every frame
+    if video.ndim == 4:
+        video = video.unsqueeze(1)  # single image -> one-frame video
+    video = video if max_frames is None else video[:, -max_frames:]
+    if video.shape[-1] in (1, 3):
+        video = video.permute(0, 1, 4, 2, 3)  # channel-last -> channel-first
+    elif video.shape[2] not in (1, 3):
+        raise ValueError(f"Expected channel dim of size 1 or 3, got shape {tuple(video.shape)}")
+    scaled = video * 255.0 if video.is_floating_point() else video
+    return scaled.clamp(0, 255).to(torch.uint8).contiguous()
+
+
+def _expand_tasks(task: Any, *, batch_size: int, default: str | None) -> list[str]:
+    """Return one task string per sample, falling back to ``default`` and broadcasting a lone task.
+
+    Raises ``KeyError`` (no task), ``TypeError`` (wrong type) or ``ValueError`` (wrong length).
+    """
+    chosen = default if task is None else task
+    if chosen is None:
+        raise KeyError("TOPReward expected a task description in complementary data")
+    if isinstance(chosen, str):
+        return [chosen] * batch_size
+    candidates = list(chosen) if isinstance(chosen, tuple) else chosen
+    if not isinstance(candidates, list) or any(not isinstance(item, str) for item in candidates):
+        raise TypeError(f"TOPReward task must be a string or list of strings, got {type(candidates)}")
+    if len(candidates) != batch_size and not (len(candidates) == 1 and batch_size > 1):
+        raise ValueError(f"Expected {batch_size} tasks, got {len(candidates)}")
+    return candidates * batch_size if len(candidates) == 1 and batch_size > 1 else candidates
+
+
+@dataclass
+@ProcessorStepRegistry.register(name="topreward_encoder")
+class TOPRewardEncoderProcessorStep(ProcessorStep):
+    """Encode raw frames + task into Qwen-VL tensors for the TOPReward model.
+
+    Loads a :class:`~transformers.AutoProcessor` matching ``vlm_name`` and
+    builds the full chat prompt including the instruction suffix. The
+    resulting ``input_ids``, ``attention_mask``, vision tensors, and
+    ``labels`` are written under the ``observation.topreward.*`` namespace
+    so the model can score without re-tokenising.
+
+    At call time the step reads:
+
+    - ``observation[image_key]``: ``(B, T, C, H, W)`` or ``(B, C, H, W)`` frames.
+    - ``complementary_data[task_key]``: a string or list of strings.
+
+    and writes ``observation[f"{TOPREWARD_FEATURE_PREFIX}<name>"]`` for the
+    Qwen-VL tensors plus ``labels``.
+    """
+
+    vlm_name: str = "Qwen/Qwen3-VL-8B-Instruct"
+    image_key: str = OBS_IMAGES + ".top"
+    task_key: str = "task"
+    default_task: str | None = None  # used when the transition carries no task
+    max_frames: int | None = 16  # only the last N frames are encoded; None keeps all
+    fps: float = 2.0  # passed to the chat message and the video metadata
+    prompt_prefix: str = DEFAULT_PROMPT_PREFIX
+    prompt_suffix_template: str = DEFAULT_PROMPT_SUFFIX_TEMPLATE
+    add_chat_template: bool = False  # True: answer follows the generation prompt
+    max_length: int = 32768  # longer tokenised inputs raise ValueError
+
+    _processor: Any = field(default=None, init=False, repr=False)
+
+    def __post_init__(self) -> None:
+        require_package("transformers", extra="topreward")
+        # NOTE(review): trust_remote_code=True lets the Hub repo run its own code; keep vlm_name trusted.
+        self._processor = AutoProcessor.from_pretrained(self.vlm_name, trust_remote_code=True)
+
+    def __call__(self, transition: EnvTransition) -> EnvTransition:
+        """Return a copy of ``transition`` whose observation also carries the encoded TOPReward inputs."""
+        # `or {}`: an absent observation must hit the KeyError below, not a TypeError on `in None`.
+        observation = transition.get(TransitionKey.OBSERVATION) or {}
+        complementary = transition.get(TransitionKey.COMPLEMENTARY_DATA) or {}
+        if self.image_key not in observation:
+            raise KeyError(f"TOPReward expected image key {self.image_key!r} in observation")
+
+        frames = observation[self.image_key]
+        # Work on a detached CPU copy of the frames.
+        videos = frames.detach().cpu() if isinstance(frames, Tensor) else torch.as_tensor(frames)
+        videos = _prepare_video_batch(videos, max_frames=self.max_frames)
+
+        batch_size = videos.shape[0]
+        tasks = _expand_tasks(
+            complementary.get(self.task_key), batch_size=batch_size, default=self.default_task
+        )
+        encoded = self._encode_batch(videos, tasks, batch_size)
+
+        # Shallow copies keep the caller's transition and observation dict untouched.
+        new_observation = dict(observation)
+        for key, value in encoded.items():
+            new_observation[f"{TOPREWARD_FEATURE_PREFIX}{key}"] = value
+
+        new_transition = transition.copy()
+        new_transition[TransitionKey.OBSERVATION] = new_observation
+        return new_transition
+
+    def _encode_batch(self, videos: Tensor, tasks: list[str], batch_size: int) -> dict[str, Any]:
+        """Tokenise a batch of (frames, task) pairs into Qwen-VL tensors.
+
+        The loop only builds per-sample chat strings. Tokenisation, padding,
+        video preprocessing, and label construction are batched.
+
+        Returns the processor output with an added ``labels`` tensor that is ``-100``
+        everywhere except the final position, which holds the last input token.
+        Raises ``ValueError`` when the tokenised length exceeds ``max_length``.
+        """
+        num_frames = int(videos.shape[1])
+        # One metadata dict per sample, listing every frame index of the clip.
+        video_metadata = [
+            {
+                "total_num_frames": num_frames,
+                "fps": float(self.fps),
+                "frames_indices": list(range(num_frames)),
+            }
+            for _ in range(batch_size)
+        ]
+        eos_token = self._processor.tokenizer.eos_token
+
+        texts: list[str] = []
+        for i in range(batch_size):
+            instruction_suffix = self.prompt_suffix_template.format(instruction=tasks[i])
+            if self.add_chat_template:
+                # The question goes inside the user message; the answer follows the generation prompt.
+                user_text = f"{self.prompt_prefix}{instruction_suffix.removesuffix(_TRUE_ANSWER).rstrip()}"
+                continuation = _TRUE_ANSWER
+            else:
+                # Only the prefix is templated; the raw suffix is appended after the EOS strip below.
+                user_text = self.prompt_prefix
+                continuation = instruction_suffix
+            messages = [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "video", "video": videos[i], "fps": self.fps},
+                        {"type": "text", "text": user_text},
+                    ],
+                }
+            ]
+            prompt_chat = self._processor.apply_chat_template(
+                messages, tokenize=False, add_generation_prompt=self.add_chat_template
+            )
+            if not self.add_chat_template and eos_token is not None:
+                # Drop only the final EOS marker: `split(...)[0]` would cut at the first one and lose
+                # the video turn whenever the template emits an earlier turn (e.g. a system prompt).
+                prompt_chat = prompt_chat.rsplit(eos_token, 1)[0]
+            texts.append(f"{prompt_chat}{continuation}")
+
+        result = self._processor(
+            text=texts,
+            videos=videos,
+            video_metadata=video_metadata,
+            do_sample_frames=False,
+            padding=True,
+            padding_side="left",
+            return_tensors="pt",
+        )
+        input_ids = result["input_ids"]
+
+        if input_ids.shape[-1] > self.max_length:
+            raise ValueError(
+                f"TOPReward input length {input_ids.shape[-1]} exceeds max_length "
+                f"{self.max_length}; lower `max_frames` or raise `max_length`."
+            )
+
+        # Left padding (requested above) keeps each sample's final token in the last column.
+        labels = torch.full_like(input_ids, -100)
+        labels[:, -1] = input_ids[:, -1]
+        result["labels"] = labels
+        return result
+
+    def transform_features(
+        self, features: dict[PipelineFeatureType, dict[str, PolicyFeature]]
+    ) -> dict[PipelineFeatureType, dict[str, PolicyFeature]]:
+        return features
+
+    def get_config(self) -> dict[str, Any]:
+        return {
+            "vlm_name": self.vlm_name,
+            "image_key": self.image_key,
+            "task_key": self.task_key,
+            "default_task": self.default_task,
+            "max_frames": self.max_frames,
+            "fps": self.fps,
+            "prompt_prefix": self.prompt_prefix,
+            "prompt_suffix_template": self.prompt_suffix_template,
+            "add_chat_template": self.add_chat_template,
+            "max_length": self.max_length,
+        }
+
+
+def make_topreward_pre_post_processors(
+    config: TOPRewardConfig,
+    dataset_stats: dict[str, dict[str, Any]] | None = None,
+) -> tuple[
+    PolicyProcessorPipeline[dict[str, Any], dict[str, Any]],
+    PolicyProcessorPipeline[PolicyAction, PolicyAction],
+]:
+    """Build TOPReward's (preprocessor, postprocessor) pipeline pair from ``config``.
+
+    Preprocessing runs three steps in order: add a batch dimension, encode
+    frames + task with :class:`TOPRewardEncoderProcessorStep`, then move the
+    result to ``config.device`` (``"cpu"`` when unset). The postprocessor has
+    no steps. ``dataset_stats`` is accepted but not used here.
+    """
+    encoder_step = TOPRewardEncoderProcessorStep(
+        vlm_name=config.vlm_name,
+        image_key=config.image_key,
+        task_key=config.task_key,
+        default_task=config.default_task,
+        max_frames=config.max_frames,
+        fps=config.fps,
+        prompt_prefix=config.prompt_prefix,
+        prompt_suffix_template=config.prompt_suffix_template,
+        add_chat_template=config.add_chat_template,
+        max_length=config.max_input_length,
+    )
+    target_device = config.device or "cpu"
+    pre_steps = [AddBatchDimensionProcessorStep(), encoder_step, DeviceProcessorStep(device=target_device)]
+    preprocessor = PolicyProcessorPipeline[dict[str, Any], dict[str, Any]](
+        steps=pre_steps,
+        name=POLICY_PREPROCESSOR_DEFAULT_NAME,
+    )
+    # No steps are registered on the postprocessor.
+    postprocessor = PolicyProcessorPipeline(
+        name=POLICY_POSTPROCESSOR_DEFAULT_NAME,
+        to_transition=policy_action_to_transition,
+    )
+    return preprocessor, postprocessor
diff --git a/src/lerobot/templates/lerobot_rewardmodel_modelcard_template.md b/src/lerobot/templates/lerobot_rewardmodel_modelcard_template.md
index 933bf7586..11df95de5 100644
--- a/src/lerobot/templates/lerobot_rewardmodel_modelcard_template.md
+++ b/src/lerobot/templates/lerobot_rewardmodel_modelcard_template.md
@@ -13,6 +13,8 @@
 A reward classifier is a lightweight neural network that scores observations or trajectories for task success, providing a learned reward signal or offline evaluation when explicit rewards are unavailable.
 {% elif model_name == "sarm" %}
 A Success-Aware Reward Model (SARM) predicts a dense reward signal from observations, typically used downstream for reinforcement learning or human-in-the-loop fine-tuning when task success is not directly observable.
+{% elif model_name == "topreward" %}
+TOPReward is a **zero-shot** reward model that extracts token log-probabilities from an off-the-shelf vision-language model (default Qwen3-VL) as a reward signal. Given a video trajectory and a task instruction, it returns the VLM's log-likelihood of the instruction being true, with no fine-tuning required.
 {% else %}
 _Reward model type not recognized — please update this template._
 {% endif %}
diff --git a/tests/rewards/test_modeling_topreward.py b/tests/rewards/test_modeling_topreward.py
new file mode 100644
index 000000000..0cd185e12
--- /dev/null
+++ b/tests/rewards/test_modeling_topreward.py
@@ -0,0 +1,296 @@
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for the TOPReward reward model."""
+
+from __future__ import annotations
+
+from types import SimpleNamespace
+
+import pytest
+import torch
+
+from lerobot.configs.rewards import RewardModelConfig
+from lerobot.rewards.factory import get_reward_model_class, make_reward_model_config
+from lerobot.rewards.topreward import TOPRewardConfig
+from lerobot.rewards.topreward.processor_topreward import TOPREWARD_FEATURE_PREFIX, TOPREWARD_INPUT_KEYS
+from tests.utils import skip_if_package_missing
+
+
+class _FakeQwenModel(torch.nn.Module):
+    """Offline replacement for ``Qwen3VLForConditionalGeneration``.
+
+    ``forward`` returns a ``SimpleNamespace`` whose ``logits`` strongly favour
+    each sample's final input token, so the log-prob extraction path in
+    ``compute_reward`` can be exercised without downloading real VLM weights.
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+        # Single dummy parameter — presumably so device/dtype lookups on the module work (TODO confirm).
+        self._param = torch.nn.Parameter(torch.zeros(1))
+        self._reward_value: float = -1.5
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):  # noqa: ARG003
+        return cls()
+
+    def forward(  # noqa: ARG002
+        self, input_ids, attention_mask=None, labels=None, logits_to_keep=0, **kwargs
+    ):
+        num_samples, num_tokens = input_ids.shape
+        vocab_size = 1000
+        logits = torch.zeros(num_samples, num_tokens, vocab_size)
+        # The label-masked suffix is the last token. After the causal-LM shift
+        # (logits[:, :-1] vs labels[:, 1:]) the scored position is logits[:, -2, :]
+        # predicting labels[:, -1], so the boost goes to that row.
+        # The boost is ``_reward_value * -10`` (15.0 with the default -1.5); every
+        # other logit stays 0, so the target's log-softmax ends up close to 0.
+        boost = self._reward_value * -10  # high logit -> high log-prob
+        for row, token in enumerate(input_ids[:, -1].tolist()):
+            logits[row, -2, token] = boost
+        if logits_to_keep:
+            logits = logits[:, -logits_to_keep:, :]
+        return SimpleNamespace(logits=logits)
+
+
+def _patch_build(monkeypatch) -> None:
+    """Swap the real Qwen3-VL class for ``_FakeQwenModel`` so model construction stays cheap and offline."""
+    import lerobot.rewards.topreward.modeling_topreward as modeling_topreward
+
+    monkeypatch.setattr(modeling_topreward, "Qwen3VLForConditionalGeneration", _FakeQwenModel)
+
+
+def _make_batch(
+    input_ids: torch.Tensor,
+    attention_mask: torch.Tensor | None = None,
+    labels: torch.Tensor | None = None,
+    *,
+    omit: str | None = None,
+) -> dict[str, torch.Tensor]:
+    """Assemble a ``compute_reward`` input dict keyed with ``TOPREWARD_FEATURE_PREFIX``.
+
+    ``attention_mask`` defaults to all ones, ``labels`` is left out when ``None``,
+    the vision tensors are placeholders, and ``omit`` drops one (unprefixed) key.
+    """
+    num_samples, num_tokens = input_ids.shape
+    if attention_mask is None:
+        attention_mask = torch.ones(num_samples, num_tokens, dtype=torch.long)
+
+    # Unprefixed name -> tensor; ``labels`` goes first when it is provided.
+    fields: dict[str, torch.Tensor] = {} if labels is None else {"labels": labels}
+    fields["input_ids"] = input_ids
+    fields["attention_mask"] = attention_mask
+    fields["pixel_values_videos"] = torch.zeros(num_samples, 1536, dtype=torch.float32)
+    fields["video_grid_thw"] = torch.ones(num_samples, 3, dtype=torch.long)
+    fields["mm_token_type_ids"] = torch.zeros_like(input_ids)
+
+    if omit is not None:
+        fields.pop(omit, None)
+
+    return {f"{TOPREWARD_FEATURE_PREFIX}{name}": value for name, value in fields.items()}
+
+
+def _terminal_labels(input_ids: torch.Tensor) -> torch.Tensor:
+    labels = torch.full_like(input_ids, -100)  # start with every position set to -100
+    labels[:, -1] = input_ids[:, -1]  # only the final token of each row keeps its id
+    return labels
+
+
+# ---------------------------------------------------------------------------
+# Registry + factory
+# ---------------------------------------------------------------------------
+
+
+def test_topreward_config_registered():  # "topreward" is reachable via the registry and the factory
+    assert "topreward" in RewardModelConfig.get_known_choices()
+    assert RewardModelConfig.get_choice_class("topreward") is TOPRewardConfig
+    assert isinstance(make_reward_model_config("topreward", device="cpu"), TOPRewardConfig)
+
+
+def test_topreward_factory_returns_in_tree_class():  # factory lookup must yield the in-tree model class
+    from lerobot.rewards.topreward.modeling_topreward import TOPRewardModel
+
+    assert get_reward_model_class("topreward") is TOPRewardModel
+
+
+# ---------------------------------------------------------------------------
+# Config validation
+# ---------------------------------------------------------------------------
+
+
+def test_topreward_config_rejects_zero_max_frames():  # max_frames=0 must fail config validation
+    with pytest.raises(ValueError, match="max_frames must be >= 1"):
+        TOPRewardConfig(device="cpu", max_frames=0)
+
+
+def test_topreward_config_rejects_non_positive_fps():  # fps=0.0 must fail config validation
+    with pytest.raises(ValueError, match="fps must be > 0"):
+        TOPRewardConfig(device="cpu", fps=0.0)
+
+
+def test_topreward_config_rejects_suffix_without_instruction_placeholder():  # "{instruction}" is required
+    with pytest.raises(ValueError, match=r"\{instruction\}"):
+        TOPRewardConfig(device="cpu", prompt_suffix_template="no placeholder here")
+
+
+# ---------------------------------------------------------------------------
+# compute_reward
+# ---------------------------------------------------------------------------
+
+
+@skip_if_package_missing("transformers")
+def test_topreward_compute_reward_returns_one_scalar_per_sample(monkeypatch):
+    """``compute_reward`` yields one float32 value per sample (shape ``(B,)``)
+    when fed pre-encoded Qwen-VL tensors."""
+    from lerobot.rewards.topreward.modeling_topreward import TOPRewardModel
+
+    _patch_build(monkeypatch)
+    reward_model = TOPRewardModel(TOPRewardConfig(device="cpu"))
+
+    num_samples, num_tokens = 2, 10
+    token_ids = torch.randint(0, 100, (num_samples, num_tokens))
+    batch = _make_batch(
+        token_ids,
+        torch.ones(num_samples, num_tokens, dtype=torch.long),
+        _terminal_labels(token_ids),
+    )
+    rewards = reward_model.compute_reward(batch)
+    assert rewards.shape == (num_samples,)
+    assert rewards.dtype == torch.float32
+
+
+@skip_if_package_missing("transformers")
+def test_topreward_compute_reward_applies_success_threshold(monkeypatch):
+    """A finite ``success_threshold`` turns the reward into a binary 0/1 success signal."""
+    from lerobot.rewards.topreward.modeling_topreward import TOPRewardModel
+
+    _patch_build(monkeypatch)
+    thresholded_cfg = TOPRewardConfig(device="cpu", success_threshold=0.0)
+    reward_model = TOPRewardModel(thresholded_cfg)
+
+    token_ids = torch.randint(0, 100, (2, 10))
+    batch = _make_batch(
+        token_ids,
+        attention_mask=torch.ones_like(token_ids),
+        labels=_terminal_labels(token_ids),
+    )
+    rewards = reward_model.compute_reward(batch)
+    assert rewards.shape == (2,)
+    assert set(rewards.tolist()).issubset({0.0, 1.0})
+
+
+@skip_if_package_missing("transformers")
+def test_topreward_compute_reward_errors_when_inputs_missing(monkeypatch):
+    """A batch without ``observation.topreward.input_ids`` must raise ``KeyError``."""
+    from lerobot.rewards.topreward.modeling_topreward import TOPRewardModel
+
+    _patch_build(monkeypatch)
+    reward_model = TOPRewardModel(TOPRewardConfig(device="cpu"))
+    incomplete = _make_batch(torch.randint(0, 100, (1, 10)), omit="input_ids")
+    with pytest.raises(KeyError, match=r"observation\.topreward\.input_ids"):
+        reward_model.compute_reward(incomplete)
+
+
+@skip_if_package_missing("transformers")
+def test_topreward_compute_reward_errors_when_labels_missing(monkeypatch):
+    """A batch built without ``labels`` must raise ``KeyError`` naming the labels key."""
+    from lerobot.rewards.topreward.modeling_topreward import TOPRewardModel
+
+    _patch_build(monkeypatch)
+    reward_model = TOPRewardModel(TOPRewardConfig(device="cpu"))
+
+    unlabeled = _make_batch(torch.randint(0, 100, (1, 10)), labels=None)
+    with pytest.raises(KeyError, match=r"observation\.topreward\.labels"):
+        reward_model.compute_reward(unlabeled)
+
+
+@skip_if_package_missing("transformers")
+def test_topreward_compute_reward_requires_all_encoder_keys(monkeypatch):
+    """Dropping any encoder key other than ``input_ids`` / ``labels`` must raise ``KeyError``."""
+    from lerobot.rewards.topreward.modeling_topreward import TOPRewardModel
+
+    _patch_build(monkeypatch)
+    reward_model = TOPRewardModel(TOPRewardConfig(device="cpu"))
+
+    token_ids = torch.randint(0, 100, (1, 10))
+    target_labels = _terminal_labels(token_ids)
+    # ``input_ids`` and ``labels`` are excluded from this loop; every other input key is dropped in turn.
+    for missing in set(TOPREWARD_INPUT_KEYS) - {"input_ids", "labels"}:
+        incomplete = _make_batch(token_ids, labels=target_labels, omit=missing)
+        with pytest.raises(KeyError, match=rf"observation\.topreward\.{missing}"):
+            reward_model.compute_reward(incomplete)
+
+
+# ---------------------------------------------------------------------------
+# Save / load — config-only checkpoint
+# ---------------------------------------------------------------------------
+
+
+@skip_if_package_missing("transformers")
+def test_topreward_save_pretrained_writes_only_config_json(monkeypatch, tmp_path):
+    """``save_pretrained`` writes the config file but no single-file safetensors checkpoint."""
+    from huggingface_hub.constants import CONFIG_NAME, SAFETENSORS_SINGLE_FILE
+
+    from lerobot.rewards.topreward.modeling_topreward import TOPRewardModel
+
+    _patch_build(monkeypatch)
+    overrides = {
+        "vlm_name": "Qwen/Qwen3-VL-8B-Instruct",
+        "fps": 4.0,
+        "image_key": "observation.images.front",
+    }
+    TOPRewardModel(TOPRewardConfig(device="cpu", **overrides)).save_pretrained(str(tmp_path))
+
+    config_file, weights_file = tmp_path / CONFIG_NAME, tmp_path / SAFETENSORS_SINGLE_FILE
+    assert config_file.exists()
+    assert not weights_file.exists()
+
+
+@skip_if_package_missing("transformers")
+def test_topreward_from_pretrained_local_dir_roundtrips_config(monkeypatch, tmp_path):
+    """Config values saved to a local directory come back unchanged via ``from_pretrained``."""
+    from lerobot.rewards.topreward.modeling_topreward import TOPRewardModel
+
+    _patch_build(monkeypatch)
+    saved_fields = {
+        "vlm_name": "Qwen/Qwen3-VL-8B-Instruct",
+        "fps": 4.0,
+        "image_key": "observation.images.front",
+        "add_chat_template": True,
+        "success_threshold": -1.5,
+    }
+    TOPRewardModel(TOPRewardConfig(device="cpu", **saved_fields)).save_pretrained(str(tmp_path))
+
+    reloaded = TOPRewardModel.from_pretrained(str(tmp_path))
+
+    assert isinstance(reloaded.config, TOPRewardConfig)
+    for field_name in ("vlm_name", "fps", "image_key", "success_threshold"):
+        assert getattr(reloaded.config, field_name) == saved_fields[field_name]
+
+    # Identity check: the flag must come back as the ``True`` singleton, not merely a truthy value.
+    assert reloaded.config.add_chat_template is True
+
+
+@skip_if_package_missing("transformers")
+def test_topreward_is_not_trainable(monkeypatch):
+    """The model reports ``is_trainable`` as False and ``forward`` raises ``NotImplementedError``."""
+    from lerobot.rewards.topreward.modeling_topreward import TOPRewardModel
+
+    _patch_build(monkeypatch)
+    reward_model = TOPRewardModel(TOPRewardConfig(device="cpu"))
+
+    assert reward_model.is_trainable is False
+    with pytest.raises(NotImplementedError, match="not trainable"):
+        reward_model.forward({"x": torch.zeros(1)})
diff --git a/tests/rewards/test_topreward.py b/tests/rewards/test_topreward.py
new file mode 100644
index 000000000..cbf960751
--- /dev/null
+++ b/tests/rewards/test_topreward.py
@@ -0,0 +1,80 @@
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""End-to-end TOPReward smoke test with the real Qwen3-VL model."""
+
+import os
+
+import pytest
+import torch
+
+pytest.importorskip("transformers")
+
+from lerobot.rewards.topreward.configuration_topreward import TOPRewardConfig  # noqa: E402
+from lerobot.rewards.topreward.modeling_topreward import TOPRewardModel  # noqa: E402
+from lerobot.rewards.topreward.processor_topreward import (  # noqa: E402
+    TOPREWARD_FEATURE_PREFIX,
+    TOPREWARD_INPUT_KEYS,
+    make_topreward_pre_post_processors,
+)
+from tests.utils import require_cuda  # noqa: E402
+
+pytestmark = pytest.mark.skipif(
+    os.environ.get("CI") == "true" or os.environ.get("GITHUB_ACTIONS") == "true",
+    reason="This test requires downloading and loading Qwen3-VL and is not meant for CI",
+)
+
+
+def _make_dummy_topreward_batch(image_key: str, task_key: str) -> dict[str, object]:
+    """Build a one-sample batch: four 64x64 uint8 frames plus a single task string.
+
+    Channels 1 and 2 are constant (96 / 192); channel 0 steps up by 48 per frame (capped at 255).
+    """
+    num_frames, image_size = 4, 64
+    frames = torch.zeros(1, num_frames, 3, image_size, image_size, dtype=torch.uint8)
+    frames[:, :, 1] = 96
+    frames[:, :, 2] = 192
+    for frame_idx, first_channel in enumerate(frames[0, :, 0]):
+        first_channel.fill_(min(frame_idx * 48, 255))
+
+    return {image_key: frames, task_key: ["pick up the red cube"]}
+
+
+@require_cuda
+def test_topreward_full_qwen3vl_preprocessor_to_compute_reward():
+    """Run the real preprocessor and Qwen3-VL model once and expect a single finite float32 reward."""
+    cfg = TOPRewardConfig(
+        vlm_name="Qwen/Qwen3-VL-8B-Instruct", device="cuda", max_frames=4, fps=2.0, max_input_length=4096
+    )
+
+    preprocessor, _postprocessor = make_topreward_pre_post_processors(cfg)
+    raw_batch = _make_dummy_topreward_batch(cfg.image_key, cfg.task_key)
+    encoded_batch = preprocessor(raw_batch)
+    expected_keys = [f"{TOPREWARD_FEATURE_PREFIX}{key}" for key in TOPREWARD_INPUT_KEYS]
+    for expected_key in expected_keys:
+        assert expected_key in encoded_batch
+
+    model = TOPRewardModel(cfg)
+    try:
+        model.to(cfg.device)
+        model.eval()
+        rewards = model.compute_reward(encoded_batch)
+    finally:
+        # Drop the model and release cached CUDA memory even when scoring fails.
+        del model
+        torch.cuda.empty_cache()
+
+    assert rewards.shape == (1,)
+    assert rewards.dtype == torch.float32
+    assert torch.isfinite(rewards).all()
diff --git a/tests/rewards/test_topreward_processor.py b/tests/rewards/test_topreward_processor.py
new file mode 100644
index 000000000..df379276e
--- /dev/null
+++ b/tests/rewards/test_topreward_processor.py
@@ -0,0 +1,246 @@
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for TOPReward's pre-processing helpers and encoder step."""
+
+from __future__ import annotations
+
+import pytest
+import torch
+
+from lerobot.configs import FeatureType, PipelineFeatureType, PolicyFeature
+from lerobot.rewards.topreward.processor_topreward import (
+    TOPREWARD_FEATURE_PREFIX,
+    TOPREWARD_INPUT_KEYS,
+    _expand_tasks,
+    _prepare_video_batch,
+)
+from lerobot.types import TransitionKey
+from tests.utils import skip_if_package_missing
+
+# ---------------------------------------------------------------------------
+# _prepare_video_batch — raw image/video batch -> (B, T, C, H, W) uint8
+# ---------------------------------------------------------------------------
+
+
+def test_prepare_video_batch_batched_chw_float_is_converted_to_uint8():
+    video = torch.rand(2, 4, 3, 8, 8)
+    tensor = _prepare_video_batch(video, max_frames=None)
+
+    assert tensor.shape == (2, 4, 3, 8, 8)
+    assert tensor.dtype == torch.uint8
+    assert tensor.min() >= 0 and tensor.max() <= 255
+
+
+def test_prepare_video_batch_batched_thwc_uint8_is_permuted_to_channel_first():
+    video = torch.randint(0, 256, (2, 3, 8, 8, 3), dtype=torch.uint8)
+    tensor = _prepare_video_batch(video, max_frames=None)
+
+    assert tensor.shape == (2, 3, 3, 8, 8)
+    assert tensor.dtype == torch.uint8
+
+
+def test_prepare_video_batch_max_frames_tail_crops_recent_frames():
+    video = torch.zeros(1, 10, 3, 4, 4)
+    for t in range(10):
+        video[:, t] = t / 9.0
+
+    tensor = _prepare_video_batch(video, max_frames=3)
+
+    assert tensor.shape == (1, 3, 3, 4, 4)
+    assert int(tensor[0, 0, 0, 0, 0]) == int(7 / 9 * 255)
+    assert int(tensor[0, -1, 0, 0, 0]) == 255
+
+
+def test_prepare_video_batch_rejects_3d_input():
+    with pytest.raises(ValueError, match="Expected TOPReward frames"):
+        _prepare_video_batch(torch.zeros(4, 8, 8), max_frames=None)
+
+
+def test_prepare_video_batch_floats_above_one_are_rescaled_and_clipped():
+    video = torch.full((1, 1, 3, 2, 2), 5.0)
+    tensor = _prepare_video_batch(video, max_frames=None)
+
+    assert tensor.shape == (1, 1, 3, 2, 2)
+    assert int(tensor.max()) == 255
+
+
+def test_prepare_video_batch_clips_very_large_floats_to_uint8_max():
+    video = torch.full((1, 1, 3, 2, 2), 300.0)
+    tensor = _prepare_video_batch(video, max_frames=None)
+
+    assert int(tensor.max()) == 255
+
+
+# ---------------------------------------------------------------------------
+# _expand_tasks — string / list / tuple broadcasting to batch size
+# ---------------------------------------------------------------------------
+
+
+def test_expand_tasks_string_is_broadcast_to_batch_size():
+    assert _expand_tasks("pick up", batch_size=3, default=None) == ["pick up", "pick up", "pick up"]
+
+
+def test_expand_tasks_list_of_matching_size_passes_through():
+    assert _expand_tasks(["a", "b", "c"], batch_size=3, default=None) == ["a", "b", "c"]
+
+
+def test_expand_tasks_tuple_is_normalised_to_list():
+    assert _expand_tasks(("a", "b"), batch_size=2, default=None) == ["a", "b"]
+
+
+def test_expand_tasks_single_element_list_is_broadcast():
+    assert _expand_tasks(["only one"], batch_size=3, default=None) == ["only one"] * 3
+
+
+def test_expand_tasks_size_mismatch_raises():
+    with pytest.raises(ValueError, match="Expected 3 tasks"):
+        _expand_tasks(["a", "b"], batch_size=3, default=None)
+
+
+def test_expand_tasks_missing_uses_default():
+    assert _expand_tasks(None, batch_size=2, default="fallback") == ["fallback", "fallback"]
+
+
+def test_expand_tasks_missing_without_default_raises():
+    with pytest.raises(KeyError, match="task description"):
+        _expand_tasks(None, batch_size=1, default=None)
+
+
+def test_expand_tasks_wrong_type_raises():
+    with pytest.raises(TypeError, match="must be a string or list"):
+        _expand_tasks(42, batch_size=1, default=None)
+
+
+# ---------------------------------------------------------------------------
+# Encoder step — stubbed AutoProcessor
+# ---------------------------------------------------------------------------
+
+
+def _skip_if_topreward_extras_missing(func):
+    func = skip_if_package_missing("transformers")(func)
+    return func
+
+
+class _FakeTokenizer:
+    eos_token = "<|endoftext|>"
+    pad_token = "<|endoftext|>"
+
+    def __call__(self, *args, **kwargs):
+        return {"input_ids": torch.zeros(1, 10, dtype=torch.long)}
+
+
+class _FakeAutoProcessor:
+    def __init__(self) -> None:
+        self.tokenizer = _FakeTokenizer()
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):  # noqa: ARG003
+        return cls()
+
+    def apply_chat_template(self, messages, **kwargs):  # noqa: ARG002
+        return "fake_prompt_text"
+
+    def __call__(self, text=None, images=None, videos=None, **kwargs):  # noqa: ARG002
+        seq_len = 10
+        batch_size = len(text) if isinstance(text, list) else 1
+        return {
+            "input_ids": torch.randint(0, 100, (batch_size, seq_len)),
+            "attention_mask": torch.ones(batch_size, seq_len, dtype=torch.long),
+            "pixel_values_videos": torch.zeros(batch_size, 1536, dtype=torch.float32),
+            "video_grid_thw": torch.ones(batch_size, 3, dtype=torch.long),
+            "mm_token_type_ids": torch.zeros(batch_size, seq_len, dtype=torch.long),
+        }
+
+
+def _build_step(monkeypatch, **overrides):
+    from lerobot.rewards.topreward import processor_topreward
+
+    monkeypatch.setattr(processor_topreward, "AutoProcessor", _FakeAutoProcessor)
+    return processor_topreward.TOPRewardEncoderProcessorStep(**overrides)
+
+
+def _make_transition(observation: dict, complementary: dict | None = None) -> dict:
+    transition: dict = {TransitionKey.OBSERVATION: observation}
+    if complementary is not None:
+        transition[TransitionKey.COMPLEMENTARY_DATA] = complementary
+    return transition
+
+
+@_skip_if_topreward_extras_missing
+def test_encoder_step_emits_input_ids_and_labels(monkeypatch):
+    """The processor must emit Qwen-VL tensors including ``input_ids`` and
+    ``labels`` under the ``observation.topreward.*`` namespace."""
+    step = _build_step(monkeypatch)
+
+    frames_batch = torch.zeros(2, 4, 3, 8, 8)
+    out = step(
+        _make_transition(
+            observation={"observation.images.top": frames_batch},
+            complementary={"task": ["pick", "place"]},
+        )
+    )
+
+    obs_out = out[TransitionKey.OBSERVATION]
+    for key in TOPREWARD_INPUT_KEYS:
+        assert f"{TOPREWARD_FEATURE_PREFIX}{key}" in obs_out
+
+    input_ids = obs_out[f"{TOPREWARD_FEATURE_PREFIX}input_ids"]
+    labels = obs_out[f"{TOPREWARD_FEATURE_PREFIX}labels"]
+    assert labels.dtype == torch.long
+    assert labels.shape == (2, 10)
+    assert labels[:, :-1].eq(-100).all()
+    assert labels[:, -1].equal(input_ids[:, -1])
+
+
+@_skip_if_topreward_extras_missing
+def test_encoder_step_get_config_roundtrips_user_fields(monkeypatch):
+    step = _build_step(
+        monkeypatch,
+        vlm_name="Qwen/Qwen3-VL-8B-Instruct",
+        image_key="observation.images.cam_top",
+        task_key="task",
+        default_task="do the thing",
+        max_frames=8,
+        fps=4.0,
+        add_chat_template=True,
+        max_length=2048,
+    )
+
+    cfg = step.get_config()
+    assert cfg["vlm_name"] == "Qwen/Qwen3-VL-8B-Instruct"
+    assert cfg["image_key"] == "observation.images.cam_top"
+    assert cfg["default_task"] == "do the thing"
+    assert cfg["max_frames"] == 8
+    assert cfg["fps"] == 4.0
+    assert cfg["add_chat_template"] is True
+    assert cfg["max_length"] == 2048
+
+
+@_skip_if_topreward_extras_missing
+def test_encoder_step_transform_features_is_identity(monkeypatch):
+    step = _build_step(monkeypatch)
+    features = {
+        PipelineFeatureType.OBSERVATION: {
+            "observation.images.top": PolicyFeature(shape=(3, 224, 224), type=FeatureType.VISUAL),
+        }
+    }
+    assert step.transform_features(features) == features
+
+
+@_skip_if_topreward_extras_missing
+def test_encoder_step_rejects_missing_image_key(monkeypatch):
+    step = _build_step(monkeypatch, image_key="observation.images.top")
+    with pytest.raises(KeyError, match="image key"):
+        step(_make_transition(observation={}, complementary={"task": "pick"}))
diff --git a/uv.lock b/uv.lock
index c5f026517..3eb1dda23 100644
--- a/uv.lock
+++ b/uv.lock
@@ -3009,6 +3009,9 @@ test = [
     { name = "pytest-cov" },
     { name = "pytest-timeout" },
 ]
+topreward = [
+    { name = "transformers" },
+]
 training = [
     { name = "accelerate" },
     { name = "av" },
@@ -3167,6 +3170,7 @@ requires-dist = [
     { name = "lerobot", extras = ["scipy-dep"], marker = "extra == 'wallx'" },
     { name = "lerobot", extras = ["smolvla"], marker = "extra == 'all'" },
     { name = "lerobot", extras = ["test"], marker = "extra == 'all'" },
+    { name = "lerobot", extras = ["topreward"], marker = "extra == 'all'" },
     { name = "lerobot", extras = ["training"], marker = "extra == 'all'" },
     { name = "lerobot", extras = ["transformers-dep"], marker = "extra == 'eo1'" },
     { name = "lerobot", extras = ["transformers-dep"], marker = "extra == 'groot'" },
@@ -3177,6 +3181,7 @@ requires-dist = [
     { name = "lerobot", extras = ["transformers-dep"], marker = "extra == 'pi'" },
     { name = "lerobot", extras = ["transformers-dep"], marker = "extra == 'sarm'" },
     { name = "lerobot", extras = ["transformers-dep"], marker = "extra == 'smolvla'" },
+    { name = "lerobot", extras = ["transformers-dep"], marker = "extra == 'topreward'" },
     { name = "lerobot", extras = ["transformers-dep"], marker = "extra == 'wallx'" },
     { name = "lerobot", extras = ["transformers-dep"], marker = "extra == 'xvla'" },
     { name = "lerobot", extras = ["video-benchmark"], marker = "extra == 'all'" },
@@ -3244,7 +3249,7 @@ requires-dist = [
     { name = "transformers", marker = "extra == 'transformers-dep'", specifier = ">=5.4.0,<5.6.0" },
     { name = "wandb", marker = "extra == 'training'", specifier = ">=0.24.0,<0.25.0" },
 ]
-provides-extras = ["dataset", "training", "hardware", "viz", "core-scripts", "evaluation", "dataset-viz", "av-dep", "pygame-dep", "placo-dep", "transformers-dep", "grpcio-dep", "can-dep", "peft-dep", "scipy-dep", "diffusers-dep", "qwen-vl-utils-dep", "matplotlib-dep", "pyserial-dep", "deepdiff-dep", "pynput-dep", "pyzmq-dep", "motorbridge-dep", "motorbridge-smart-servo-dep", "feetech", "dynamixel", "damiao", "robstride", "openarms", "gamepad", "hopejr", "lekiwi", "unitree-g1", "reachy2", "rebot", "kinematics", "intelrealsense", "phone", "diffusion", "wallx", "pi", "smolvla", "multi-task-dit", "groot", "sarm", "xvla", "eo1", "hilserl", "async", "peft", "dev", "notebook", "test", "video-benchmark", "aloha", "pusht", "libero", "metaworld", "all"]
+provides-extras = ["dataset", "training", "hardware", "viz", "core-scripts", "evaluation", "dataset-viz", "av-dep", "pygame-dep", "placo-dep", "transformers-dep", "grpcio-dep", "can-dep", "peft-dep", "scipy-dep", "diffusers-dep", "qwen-vl-utils-dep", "matplotlib-dep", "pyserial-dep", "deepdiff-dep", "pynput-dep", "pyzmq-dep", "motorbridge-dep", "motorbridge-smart-servo-dep", "feetech", "dynamixel", "damiao", "robstride", "openarms", "gamepad", "hopejr", "lekiwi", "unitree-g1", "reachy2", "rebot", "kinematics", "intelrealsense", "phone", "diffusion", "wallx", "pi", "smolvla", "multi-task-dit", "groot", "sarm", "topreward", "xvla", "eo1", "hilserl", "async", "peft", "dev", "notebook", "test", "video-benchmark", "aloha", "pusht", "libero", "metaworld", "all"]
 
 [[package]]
 name = "librt"

From 24017e960c39a24fe1b6ea6248522460fa5aa4b3 Mon Sep 17 00:00:00 2001
From: Haoquan Fang <71356829+hq-fang@users.noreply.github.com>
Date: Wed, 27 May 2026 09:58:37 -0700
Subject: [PATCH 06/45] Add MolmoAct2 policy (#3604)

* add molmoact2 policy

* add apache headers to molmoact2 files

* simplify molmoact2 package imports

* align molmoact2 feature validation with eo pattern

* remove molmoact2 processor override from factory

* guard molmoact2 transformers imports

* guard molmoact2 processor transformers import

* add scipy dependency to molmoact2 extra

* use a single molmoact2 action queue

* move molmoact2 config logic into config

* fix molmoact2 hf image key resolution

* load molmoact2 without remote code

* lazy import molmoact2 scipy

* format molmoact2 files

* skip molmoact2 tests without optional deps

* fix molmoact2 pre-commit checks

* validate molmoact2 gripper range
---
 docs/source/_toctree.yml                      |    2 +
 docs/source/molmoact2.mdx                     |  433 ++
 docs/source/policy_molmoact2_README.md        |   39 +
 pyproject.toml                                |    7 +-
 src/lerobot/policies/__init__.py              |    2 +
 src/lerobot/policies/factory.py               |   26 +-
 src/lerobot/policies/molmoact2/README.md      |    1 +
 src/lerobot/policies/molmoact2/__init__.py    |   21 +
 .../molmoact2/configuration_molmoact2.py      |  519 ++
 .../policies/molmoact2/hf_model/__init__.py   |   17 +
 .../molmoact2/hf_model/action_tokenizer.py    |  237 +
 .../hf_model/configuration_molmoact2.py       |  553 ++
 .../hf_model/image_processing_molmoact2.py    |  564 ++
 .../policies/molmoact2/hf_model/inference.py  |  748 +++
 .../molmoact2/hf_model/modeling_molmoact2.py  | 4591 +++++++++++++++++
 .../hf_model/processing_molmoact2.py          |  431 ++
 .../hf_model/video_processing_molmoact2.py    |  997 ++++
 .../policies/molmoact2/modeling_molmoact2.py  | 1551 ++++++
 .../policies/molmoact2/processor_molmoact2.py | 1083 ++++
 tests/policies/molmoact2/test_molmoact2.py    | 1397 +++++
 uv.lock                                       |   11 +-
 21 files changed, 13226 insertions(+), 4 deletions(-)
 create mode 100644 docs/source/molmoact2.mdx
 create mode 100644 docs/source/policy_molmoact2_README.md
 create mode 120000 src/lerobot/policies/molmoact2/README.md
 create mode 100644 src/lerobot/policies/molmoact2/__init__.py
 create mode 100644 src/lerobot/policies/molmoact2/configuration_molmoact2.py
 create mode 100644 src/lerobot/policies/molmoact2/hf_model/__init__.py
 create mode 100644 src/lerobot/policies/molmoact2/hf_model/action_tokenizer.py
 create mode 100644 src/lerobot/policies/molmoact2/hf_model/configuration_molmoact2.py
 create mode 100644 src/lerobot/policies/molmoact2/hf_model/image_processing_molmoact2.py
 create mode 100644 src/lerobot/policies/molmoact2/hf_model/inference.py
 create mode 100644 src/lerobot/policies/molmoact2/hf_model/modeling_molmoact2.py
 create mode 100644 src/lerobot/policies/molmoact2/hf_model/processing_molmoact2.py
 create mode 100644 src/lerobot/policies/molmoact2/hf_model/video_processing_molmoact2.py
 create mode 100644 src/lerobot/policies/molmoact2/modeling_molmoact2.py
 create mode 100644 src/lerobot/policies/molmoact2/processor_molmoact2.py
 create mode 100644 tests/policies/molmoact2/test_molmoact2.py

diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index 527cb7e63..1d4d9e770 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -59,6 +59,8 @@
     title: π₀-FAST (Pi0Fast)
   - local: pi05
     title: π₀.₅ (Pi05)
+  - local: molmoact2
+    title: MolmoAct2
   - local: eo1
     title: EO-1
   - local: groot
diff --git a/docs/source/molmoact2.mdx b/docs/source/molmoact2.mdx
new file mode 100644
index 000000000..ddd178acd
--- /dev/null
+++ b/docs/source/molmoact2.mdx
@@ -0,0 +1,433 @@
+# MolmoAct2 Policy
+
+MolmoAct2 is the LeRobot policy implementation of
+[MolmoAct2](https://allenai.org/blog/molmoact2), ported into the LeRobot
+training, evaluation, checkpointing, and dataset interfaces for easier use with
+LeRobot datasets.
+
+This implementation currently supports training and evaluation for the regular
+MolmoAct2 model. MolmoAct2-Think, which supports adaptive depth reasoning, is
+not included in this LeRobot policy yet and is coming soon.
+
+For the original MolmoAct2 training code used for the experiments reported in
+the paper, see [allenai/molmoact2](https://github.com/allenai/molmoact2).
+
+## Installation Requirements
+
+Install LeRobot with the MolmoAct2 optional dependencies:
+
+```bash
+pip install -e ".[molmoact2]"
+```
+
+To run the models in this repository, you need an NVIDIA GPU. The measurements
+below were taken on a single NVIDIA H100 80GB with bf16 model loading, using LIBERO with two RGB cameras. MolmoAct2 rows use `chunk_size=10`, action dim 7
+padded to `expected_max_action_dim=32`, and `num_flow_timesteps=8`. Training measurements use
+`gradient_checkpointing=true` and include the forward pass, backward pass,
+gradient clipping, optimizer step, and optimizer state allocation. Values are
+peak GPU memory sampled with `nvidia-smi`. Leave a few GiB of headroom for
+dataloader workers, CUDA context, and fragmentation.
+
+Multi-GPU training through `accelerate` increases throughput and global batch
+size, but this LeRobot port does not currently expose the original MolmoAct2
+`fsdp_devices` model-parallel training path. The current training script has
+not been tested for multi-node training.
+
+| Mode                                             | Peak Memory, bs=8 | Peak Memory, bs=16 | Peak Memory, bs=32 |
+| ------------------------------------------------ | ----------------: | -----------------: | -----------------: |
+| Inference, continuous, CUDA graph enabled (bs=1) |          12.1 GiB |                  - |                  - |
+| Fine-tuning, action expert only, continuous      |          16.5 GiB |           18.3 GiB |           21.4 GiB |
+| Fine-tuning, LoRA VLM, both action modes         |          20.2 GiB |           26.8 GiB |           41.3 GiB |
+| Fine-tuning, full model, both action modes       |          48.3 GiB |           49.8 GiB |           60.1 GiB |
+
+The repo has been tested with Ubuntu 22.04.
+
+## Usage
+
+To use MolmoAct2 in a LeRobot training config, set:
+
+```python
+policy.type=molmoact2
+```
+
+## Training
+
+MolmoAct2 can be fine-tuned from either the released MolmoAct2 Hugging Face
+checkpoint format or from a checkpoint already saved by LeRobot. Both routes use
+the same LeRobot training loop, dataset transforms, checkpoint saving, and
+logging. The difference is only how the initial policy weights and processor
+state are loaded.
+
+### Training With Original MolmoAct2 Weight
+
+Use `policy.checkpoint_path` when starting from a released MolmoAct2 checkpoint,
+for example `allenai/MolmoAct2` or `allenai/MolmoAct2-LIBERO`. LeRobot will load
+the original HF model files, then build its own policy processor from the
+dataset metadata and the policy options below.
+
+The command below shows full fine-tuning on the merged LIBERO dataset. It uses
+bf16 model loading, 8 flow timesteps, LeRobot dataset statistics, image
+augmentation, and LeRobot's checkpointing/logging path.
+
+```bash
+accelerate launch \
+  --num_processes=8 \
+  --mixed_precision=bf16 \
+  -m lerobot.scripts.lerobot_train \
+  --dataset.repo_id=allenai/MolmoAct2-LIBERO-Dataset \
+  --dataset.root=/path/to/lerobot/data/allenai/MolmoAct2-LIBERO-Dataset \
+  --dataset.video_backend=pyav \
+  --dataset.image_transforms.enable=true \
+  --policy.type=molmoact2 \
+  --policy.checkpoint_path=allenai/MolmoAct2-LIBERO \
+  --policy.device=cuda \
+  --policy.action_mode=both \
+  --policy.chunk_size=10 \
+  --policy.n_action_steps=10 \
+  --policy.setup_type="single franka robotic arm in libero" \
+  --policy.control_mode="delta end-effector pose" \
+  --policy.image_keys='["observation.images.image","observation.images.wrist_image"]' \
+  --policy.model_dtype=bfloat16 \
+  --policy.num_flow_timesteps=8 \
+  --policy.gradient_checkpointing=true \
+  --policy.freeze_embedding=true \
+  --policy.normalize_gripper=false \
+  --policy.enable_knowledge_insulation=false \
+  --policy.push_to_hub=false \
+  --wandb.enable=true \
+  --wandb.entity=<wandb_entity> \
+  --wandb.project=<wandb_project> \
+  --job_name=<job_name> \
+  --output_dir=outputs/<job_name> \
+  --steps=10000 \
+  --batch_size=32 \
+  --num_workers=4 \
+  --log_freq=20 \
+  --eval_freq=-1 \
+  --save_checkpoint=true \
+  --save_freq=2000
+```
+
+### Training With LeRobot MolmoAct2 Weight
+
+Use `policy.path` when starting from a MolmoAct2 checkpoint that was saved by
+LeRobot, either from a local `pretrained_model` directory or from the Hub. This
+restores the saved LeRobot policy config, model weights, processor, and
+normalization statistics. You can still override training-time options such as
+`batch_size`, `steps`, LoRA flags, or `policy.action_mode`.
+
+```bash
+accelerate launch \
+  --num_processes=8 \
+  --mixed_precision=bf16 \
+  -m lerobot.scripts.lerobot_train \
+  --dataset.repo_id=allenai/MolmoAct2-LIBERO-Dataset \
+  --dataset.root=/path/to/lerobot/data/allenai/MolmoAct2-LIBERO-Dataset \
+  --dataset.video_backend=pyav \
+  --dataset.image_transforms.enable=true \
+  --policy.path=/path/to/pretrained_model \
+  --policy.device=cuda \
+  --policy.action_mode=both \
+  --policy.chunk_size=10 \
+  --policy.n_action_steps=10 \
+  --policy.model_dtype=bfloat16 \
+  --policy.num_flow_timesteps=8 \
+  --policy.gradient_checkpointing=true \
+  --wandb.enable=true \
+  --wandb.entity=<wandb_entity> \
+  --wandb.project=<wandb_project> \
+  --job_name=<job_name> \
+  --output_dir=outputs/<job_name> \
+  --steps=10000 \
+  --batch_size=32 \
+  --num_workers=4 \
+  --log_freq=20 \
+  --eval_freq=-1 \
+  --save_checkpoint=true \
+  --save_freq=2000
+```
+
+### Common Practices
+
+For fine-tuning on a comparatively small dataset, such as a single LIBERO suite
+or a real-world dataset with fewer than 200 demonstrations, a global batch size of
+16 to 32 is a good starting point. In these settings, `policy.enable_lora_vlm=true` or `policy.train_action_expert_only=true` is also a practical choice. In both
+cases, we intentionally keep the action expert fully trainable, which we found
+to be crucial for model performance. For larger fine-tuning datasets, larger
+global batch sizes and full fine-tuning are usually preferred.
+
+### Common Policy Options
+
+- `policy.checkpoint_path`: original MolmoAct2 HF checkpoint to initialize from.
+  Use this for released MolmoAct2 weights.
+- `policy.path`: LeRobot checkpoint to initialize from. Use this for checkpoints
+  created by LeRobot training.
+- `policy.action_mode`: training target, one of `continuous`, `discrete`, or
+  `both`. `both` trains the flow-matching action expert and the discrete
+  action-token loss.
+- `policy.train_action_expert_only`: trains only parameters whose names contain
+  `action_expert`. It requires `policy.action_mode=continuous`.
+- `policy.enable_lora_vlm`: enables LoRA on VLM linear layers. Use
+  `policy.enable_lora_action_expert=true` only if LoRA should also cover action
+  expert linear layers. When `policy.enable_lora_action_expert=false`, the
+  action expert base weights remain fully trainable while the VLM is trained
+  through LoRA adapters. When `policy.enable_lora_action_expert=true`, the
+  action expert is also adapter-tuned instead of fully fine-tuned.
+- `policy.enable_knowledge_insulation`: when `true`, detaches action-expert
+  context K/V states before the action loss. The default is `false`.
+- `policy.chunk_size`: action horizon used by the policy. For LIBERO we use
+  `10`. This LeRobot port overrides the loaded checkpoint's
+  `max_action_horizon` with this value.
+- `policy.n_action_steps`: number of actions consumed from each predicted
+  chunk before querying the policy again. For LIBERO, set it to `chunk_size`.
+- `policy.setup_type`: text inserted into the prompt to describe the robot and
+  scene, e.g. `single franka robotic arm in libero`. More examples are listed
+  in the `metadata_by_tag` entries of
+  [`norm_stats.json`](https://huggingface.co/allenai/MolmoAct2/blob/main/norm_stats.json).
+- `policy.control_mode`: text inserted into the prompt to describe the action
+  space, e.g. `delta end-effector pose` or `absolute joint pose`.
+- `policy.image_keys`: ordered LeRobot image observation keys passed to the
+  processor.
+- `policy.model_dtype`: checkpoint/forward dtype, one of `float32`,
+  `bfloat16`, or `float16`. Use `bfloat16` for normal training.
+- `policy.num_flow_timesteps`: number of flow-matching timesteps sampled per
+  example during training. We use `8` for fine-tuning.
+- `policy.num_inference_steps`: optional override for continuous action
+  generation steps at inference time.
+- `policy.gradient_checkpointing`: enables checkpointing in the VLM/action path
+  to reduce activation memory.
+- `policy.freeze_embedding`: freezes input embeddings. The default is `true`.
+- `policy.normalize_gripper`: controls whether gripper dimensions are included
+  in state/action quantile normalization. The default is `false`.
+- `policy.normalize_language`: normalizes task strings before prompt
+  construction. The default is `true`.
+- `policy.mask_action_dim_padding`: masks padded dimensions in the flow loss.
+  Released checkpoints use `policy.expected_max_action_dim=32`.
+- `policy.max_sequence_length`: optional manual sequence cap. Leave unset to
+  infer it from images, state dimension, action dimension, action horizon, and
+  discrete-action mode.
+
+### Learning Rates
+
+MolmoAct2 uses parameter-group learning rates to match the original MolmoAct2
+fine-tuning experiments.
+
+- Full fine-tuning uses `policy.optimizer_lr=1e-5` for the VLM,
+  `policy.optimizer_vit_lr=5e-6` for the vision tower,
+  `policy.optimizer_connector_lr=5e-6` for image connector layers, and
+  `policy.optimizer_action_expert_lr=5e-5` for the action expert.
+- LoRA VLM fine-tuning sets the VLM, vision, and connector LoRA parameter
+  groups to `5e-5` when `policy.enable_lora_vlm=true`. By default,
+  `policy.enable_lora_action_expert=false`, so the action expert is still fully
+  fine-tuned with `policy.optimizer_action_expert_lr`. If
+  `policy.enable_lora_action_expert=true`, the action expert is trained through
+  LoRA adapters instead.
+- Action-expert-only fine-tuning trains only the action expert and uses
+  `policy.optimizer_action_expert_lr=5e-5`.
+
+You can override the full fine-tuning and action-expert learning rates with
+`policy.optimizer_lr`, `policy.optimizer_vit_lr`,
+`policy.optimizer_connector_lr`, and `policy.optimizer_action_expert_lr`.
+Scheduler settings can be changed with `policy.scheduler_warmup_steps`,
+`policy.scheduler_decay_steps`, and `policy.scheduler_decay_lr`.
+
+### Dataset Quantile Statistics
+
+MolmoAct2 defaults to quantile normalization for state and action features. If
+your dataset has not been converted with quantile statistics, you can add them
+with:
+
+```bash
+python src/lerobot/datasets/v30/augment_dataset_quantile_stats.py \
+  --repo-id=your_dataset
+```
+
+Alternatively, train MolmoAct2 with mean/std normalization:
+
+```bash
+--policy.normalization_mapping='{"ACTION": "MEAN_STD", "STATE": "MEAN_STD", "VISUAL": "IDENTITY"}'
+```
+
+## Evaluation
+
+Evaluation also supports both LeRobot-saved checkpoints and original MolmoAct2
+HF checkpoints. For LIBERO replication, keep the EGL rendering environment
+fixed and use `policy.per_episode_seed=true`.
+
+**Important:** We found that `num_steps_wait=10` does not reliably let the
+LIBERO scene stabilize and can degrade measured success. All LIBERO evaluation
+results reported here use `num_steps_wait=50`.
+
+### Evaluation With LeRobot MolmoAct2 Weight
+
+Use `policy.path` for a checkpoint saved by LeRobot. The saved processor and
+normalization statistics are restored together with the model.
+
+```bash
+export MUJOCO_GL=egl
+export PYOPENGL_PLATFORM=egl
+export OMP_NUM_THREADS=1
+export MKL_NUM_THREADS=1
+
+lerobot-eval \
+  --policy.path=allenai/MolmoAct2-LIBERO-LeRobot \
+  --policy.inference_action_mode=continuous \
+  --policy.model_dtype=bfloat16 \
+  --policy.use_amp=true \
+  --policy.enable_inference_cuda_graph=true \
+  --policy.device=cuda \
+  --policy.per_episode_seed=true \
+  --policy.eval_seed=1000 \
+  --env.type=libero \
+  --env.task=libero_10,libero_goal,libero_object,libero_spatial \
+  --env.camera_name_mapping='{"agentview_image":"image","robot0_eye_in_hand_image":"wrist_image"}' \
+  --eval.batch_size=1 \
+  --eval.n_episodes=50 \
+  --seed=1000
+```
+
+### Evaluation With Original MolmoAct2 Weight
+
+You can evaluate a released Hugging Face checkpoint directly without first
+converting it to a LeRobot checkpoint. In this case, set
+`policy.checkpoint_path` to the HF model repo and provide `policy.norm_tag`.
+For LIBERO, `policy.norm_tag=libero` loads the LIBERO action/state
+normalization statistics, action horizon, prompt metadata, and image-key order
+from the checkpoint's `norm_stats.json`.
+
+To fully replicate the MolmoAct2 paper results with released Hugging Face
+checkpoints, we recommend using the v0.5.1-pinned
+[`allenai/lerobot` `molmoact2-hf-inference`](https://github.com/allenai/lerobot/tree/molmoact2-hf-inference)
+branch. That branch matches the original evaluation settings used for the
+reported numbers.
+
+```bash
+export MUJOCO_GL=egl
+export PYOPENGL_PLATFORM=egl
+export OMP_NUM_THREADS=1
+export MKL_NUM_THREADS=1
+
+lerobot-eval \
+  --policy.type=molmoact2 \
+  --policy.checkpoint_path=allenai/MolmoAct2-LIBERO \
+  --policy.norm_tag=libero \
+  --policy.inference_action_mode=continuous \
+  --policy.model_dtype=float32 \
+  --policy.use_amp=false \
+  --policy.enable_inference_cuda_graph=true \
+  --policy.device=cuda \
+  --policy.per_episode_seed=true \
+  --policy.eval_seed=1000 \
+  --env.type=libero \
+  --env.task=libero_goal \
+  --env.camera_name_mapping='{"agentview_image":"image","robot0_eye_in_hand_image":"wrist_image"}' \
+  --eval.batch_size=1 \
+  --eval.n_episodes=50 \
+  --seed=1000
+```
+
+Use `--env.task=libero_10,libero_goal,libero_object,libero_spatial` to run the
+full LIBERO suite. The same command works for other released MolmoAct2
+checkpoints as long as the requested `policy.norm_tag` exists in that
+checkpoint's `norm_stats.json`.
+
+### Common Evaluation Options
+
+- `policy.inference_action_mode`: required for rollout. Use `continuous` for
+  flow-matching inference or `discrete` for action-token inference. It must be
+  compatible with the training-time `policy.action_mode` saved in the
+  checkpoint.
+- `policy.path`: LeRobot checkpoint path or Hub repo. Use this for checkpoints
+  saved by LeRobot.
+- `policy.checkpoint_path`: original MolmoAct2 HF checkpoint path or Hub repo.
+  Use this with `policy.type=molmoact2` and `policy.norm_tag`.
+- `policy.norm_tag`: selects normalization statistics, prompt metadata,
+  image-key order, and action horizon from the original checkpoint's
+  `norm_stats.json`. It is required for direct original-HF checkpoint
+  evaluation.
+- `policy.model_dtype`: model load/forward dtype. Use `bfloat16` for normal
+  GPU evaluation. Use `float32` only when you explicitly want fp32 inference.
+- `policy.use_amp`: runs the policy forward under autocast during eval. For
+  `model_dtype=bfloat16`, keep this enabled.
+- `policy.enable_inference_cuda_graph`: enables the MolmoAct2 inference CUDA
+  graph path for faster repeated continuous-action rollout.
+- `policy.per_episode_seed` and `policy.eval_seed`: make stochastic continuous
+  action generation deterministic per episode for replication.
+- `env.task`: comma-separated LIBERO suites or a single suite. Use
+  `libero_10,libero_goal,libero_object,libero_spatial` for the full benchmark.
+- `env.camera_name_mapping`: maps LIBERO camera names to the image keys expected
+  by the policy processor.
+
+## Performance Results
+
+### LIBERO Benchmark Results
+
+MolmoAct2 has demonstrated strong performance on the LIBERO benchmark suite. To
+validate the LeRobot implementation, we fine-tuned
+[`allenai/MolmoAct2-LIBERO`](https://huggingface.co/allenai/MolmoAct2-LIBERO)
+for an additional 10k steps on the LIBERO dataset with a per-GPU batch size of
+32 on 8 H100 GPUs, then compared its success rates with the original MolmoAct2
+reference results.
+
+The LeRobot fine-tuned checkpoint reported here is available at
+[`allenai/MolmoAct2-LIBERO-LeRobot`](https://huggingface.co/allenai/MolmoAct2-LIBERO-LeRobot)
+and was trained on
+[`allenai/MolmoAct2-LIBERO-Dataset`](https://huggingface.co/datasets/allenai/MolmoAct2-LIBERO-Dataset).
+
+| Benchmark      | LeRobot Implementation | MolmoAct2 Original |
+| -------------- | ---------------------: | -----------------: |
+| LIBERO Spatial |                  98.4% |              97.8% |
+| LIBERO Object  |                 100.0% |             100.0% |
+| LIBERO Goal    |                  98.0% |              97.8% |
+| LIBERO 10      |                  96.6% |              93.2% |
+| Average        |                 98.25% |             97.20% |
+
+These results demonstrate MolmoAct2's strong performance across diverse robotic
+manipulation tasks. To reproduce them, follow the instructions in the LIBERO
+evaluation section.
+
+## Differences From the Original Implementation
+
+This LeRobot port is intended to match MolmoAct2 behavior while using LeRobot's
+dataset, training, evaluation, checkpoint, and logging infrastructure. The main
+differences from the original training repository are:
+
+- The original paper training stack loads the model in fp32 and trains under
+  mixed precision. This LeRobot port usually loads the checkpoint directly in
+  bfloat16 (`policy.model_dtype=bfloat16`) for lower memory use.
+- The original repository uses its own FSDP/model-parallel training path. The
+  LeRobot port uses the standard LeRobot/Accelerate training path and has not
+  been tested for multi-node training.
+- The original repository supports sequence packing. The LeRobot port trains on
+  one LeRobot sample per item and pads to an inferred fixed sequence budget.
+- The LeRobot port follows LeRobot's optimizer, scheduler, checkpoint saving,
+  dataset transforms, image augmentation, and Weights & Biases logging
+  conventions.
+- The original training path supports mixed action horizons by padding to
+  `max_action_horizon` and masking padded horizon slots in the action expert
+  self-attention. This is useful when training across datasets with different
+  control frequencies. The LeRobot port currently targets single-dataset
+  fine-tuning, so `policy.chunk_size` overrides the checkpoint
+  `max_action_horizon` and horizon masking is not implemented yet. Support for
+  this mixed-horizon path is planned.
+
+## Citation
+
+```bibtex
+@misc{fang2026molmoact2actionreasoningmodels,
+      title={MolmoAct2: Action Reasoning Models for Real-world Deployment},
+      author={Haoquan Fang and Jiafei Duan and Donovan Clay and Sam Wang and Shuo Liu and Weikai Huang and Xiang Fan and Wei-Chuan Tsai and Shirui Chen and Yi Ru Wang and Shanli Xing and Jaemin Cho and Jae Sung Park and Ainaz Eftekhar and Peter Sushko and Karen Farley and Angad Wadhwa and Cole Harrison and Winson Han and Ying-Chun Lee and Eli VanderBilt and Rose Hendrix and Suveen Ellawela and Lucas Ngoo and Joyce Chai and Zhongzheng Ren and Ali Farhadi and Dieter Fox and Ranjay Krishna},
+      year={2026},
+      eprint={2605.02881},
+      archivePrefix={arXiv},
+      primaryClass={cs.RO},
+      url={https://arxiv.org/abs/2605.02881},
+}
+```
+
+## License
+
+This model is licensed under Apache 2.0. It is intended for research and
+educational use in accordance with
+[Ai2's Responsible Use Guidelines](https://allenai.org/responsible-use),
+consistent with [allenai/molmoact2](https://github.com/allenai/molmoact2).
diff --git a/docs/source/policy_molmoact2_README.md b/docs/source/policy_molmoact2_README.md
new file mode 100644
index 000000000..df3a6341e
--- /dev/null
+++ b/docs/source/policy_molmoact2_README.md
@@ -0,0 +1,39 @@
+# MolmoAct2
+
+This repository contains the LeRobot policy implementation of
+[MolmoAct2](https://allenai.org/blog/molmoact2), ported into LeRobot for
+training, evaluation, checkpointing, and dataset compatibility.
+
+This implementation currently supports training and evaluation for the regular
+MolmoAct2 model. MolmoAct2-Think, which supports adaptive-depth reasoning, is
+not yet included in this LeRobot policy; support is coming soon.
+
+For the original MolmoAct2 training code used for the experiments reported in
+the paper, see [allenai/molmoact2](https://github.com/allenai/molmoact2).
+
+## LIBERO Evaluation
+
+Important: we found that `num_steps_wait=10` does not reliably let the LIBERO
+scene stabilize and can degrade measured success. All LIBERO evaluation results
+reported for this LeRobot implementation use `num_steps_wait=50`.
+
+## Citation
+
+```bibtex
+@misc{fang2026molmoact2actionreasoningmodels,
+      title={MolmoAct2: Action Reasoning Models for Real-world Deployment},
+      author={Haoquan Fang and Jiafei Duan and Donovan Clay and Sam Wang and Shuo Liu and Weikai Huang and Xiang Fan and Wei-Chuan Tsai and Shirui Chen and Yi Ru Wang and Shanli Xing and Jaemin Cho and Jae Sung Park and Ainaz Eftekhar and Peter Sushko and Karen Farley and Angad Wadhwa and Cole Harrison and Winson Han and Ying-Chun Lee and Eli VanderBilt and Rose Hendrix and Suveen Ellawela and Lucas Ngoo and Joyce Chai and Zhongzheng Ren and Ali Farhadi and Dieter Fox and Ranjay Krishna},
+      year={2026},
+      eprint={2605.02881},
+      archivePrefix={arXiv},
+      primaryClass={cs.RO},
+      url={https://arxiv.org/abs/2605.02881},
+}
+```
+
+## License
+
+This model is licensed under Apache 2.0. It is intended for research and
+educational use in accordance with
+[Ai2's Responsible Use Guidelines](https://allenai.org/responsible-use),
+consistent with [allenai/molmoact2](https://github.com/allenai/molmoact2).
diff --git a/pyproject.toml b/pyproject.toml
index 264297c5e..a6785c564 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -198,6 +198,7 @@ wallx = [
     "lerobot[qwen-vl-utils-dep]",
 ]
 pi = ["lerobot[transformers-dep]", "lerobot[scipy-dep]"]
+molmoact2 = ["lerobot[transformers-dep]", "lerobot[peft-dep]", "lerobot[scipy-dep]"]
 smolvla = ["lerobot[transformers-dep]", "num2words>=0.5.14,<0.6.0", "accelerate>=1.7.0,<2.0.0"]
 multi_task_dit = ["lerobot[transformers-dep]", "lerobot[diffusers-dep]"]
 groot = [
@@ -275,6 +276,7 @@ all = [
     "lerobot[multi_task_dit]",
     "lerobot[wallx]",
     "lerobot[pi]",
+    "lerobot[molmoact2]",
     "lerobot[smolvla]",
     # "lerobot[groot]", TODO(Steven): Gr00t requires specific installation instructions for flash-attn
     "lerobot[xvla]",
@@ -405,8 +407,11 @@ default.extend-ignore-identifiers-re = [
     "ein",
     "thw",
     "inpt",
+    "arange",
+    "is_compileable",
     "ROBOTIS",
-    "OT_VALUE"
+    "OT_VALUE",
+    "VanderBilt"
 ]
 
 # TODO: Uncomment when ready to use
diff --git a/src/lerobot/policies/__init__.py b/src/lerobot/policies/__init__.py
index 3a6b8e5d2..68d23c9ca 100644
--- a/src/lerobot/policies/__init__.py
+++ b/src/lerobot/policies/__init__.py
@@ -20,6 +20,7 @@ from .eo1.configuration_eo1 import EO1Config as EO1Config
 from .factory import get_policy_class, make_policy, make_policy_config, make_pre_post_processors
 from .gaussian_actor.configuration_gaussian_actor import GaussianActorConfig as GaussianActorConfig
 from .groot.configuration_groot import GrootConfig as GrootConfig
+from .molmoact2.configuration_molmoact2 import MolmoAct2Config as MolmoAct2Config
 from .multi_task_dit.configuration_multi_task_dit import MultiTaskDiTConfig as MultiTaskDiTConfig
 from .pi0.configuration_pi0 import PI0Config as PI0Config
 from .pi0_fast.configuration_pi0_fast import PI0FastConfig as PI0FastConfig
@@ -43,6 +44,7 @@ __all__ = [
     "EO1Config",
     "GaussianActorConfig",
     "GrootConfig",
+    "MolmoAct2Config",
     "MultiTaskDiTConfig",
     "PI0Config",
     "PI0FastConfig",
diff --git a/src/lerobot/policies/factory.py b/src/lerobot/policies/factory.py
index 8937bc6ae..05fda05d8 100644
--- a/src/lerobot/policies/factory.py
+++ b/src/lerobot/policies/factory.py
@@ -49,6 +49,7 @@ from .diffusion.configuration_diffusion import DiffusionConfig
 from .eo1.configuration_eo1 import EO1Config
 from .gaussian_actor.configuration_gaussian_actor import GaussianActorConfig
 from .groot.configuration_groot import GrootConfig
+from .molmoact2.configuration_molmoact2 import MolmoAct2Config
 from .multi_task_dit.configuration_multi_task_dit import MultiTaskDiTConfig
 from .pi0.configuration_pi0 import PI0Config
 from .pi05.configuration_pi05 import PI05Config
@@ -88,7 +89,8 @@ def get_policy_class(name: str) -> type[PreTrainedPolicy]:
 
     Args:
         name: The name of the policy. Supported names are "tdmpc", "diffusion", "act",
-            "multi_task_dit", "vqbet", "pi0", "pi05", "gaussian_actor", "smolvla", "wall_x".
+            "multi_task_dit", "vqbet", "pi0", "pi05", "gaussian_actor", "smolvla", "wall_x",
+            "molmoact2".
     Returns:
         The policy class corresponding to the given name.
 
@@ -151,6 +153,10 @@ def get_policy_class(name: str) -> type[PreTrainedPolicy]:
         from .eo1.modeling_eo1 import EO1Policy
 
         return EO1Policy
+    elif name == "molmoact2":
+        from .molmoact2.modeling_molmoact2 import MolmoAct2Policy
+
+        return MolmoAct2Policy
     else:
         try:
             return _get_policy_cls_from_policy_name(name=name)
@@ -168,7 +174,7 @@ def make_policy_config(policy_type: str, **kwargs) -> PreTrainedConfig:
     Args:
         policy_type: The type of the policy. Supported types include "tdmpc",
                      "multi_task_dit", "diffusion", "act", "vqbet", "pi0", "pi05", "gaussian_actor",
-                     "smolvla", "wall_x".
+                     "smolvla", "wall_x", "molmoact2".
         **kwargs: Keyword arguments to be passed to the configuration class constructor.
 
     Returns:
@@ -203,6 +209,8 @@ def make_policy_config(policy_type: str, **kwargs) -> PreTrainedConfig:
         return WallXConfig(**kwargs)
     elif policy_type == "eo1":
         return EO1Config(**kwargs)
+    elif policy_type == "molmoact2":
+        return MolmoAct2Config(**kwargs)
     else:
         try:
             config_cls = PreTrainedConfig.get_choice_class(policy_type)
@@ -231,6 +239,7 @@ class ProcessorConfigKwargs(TypedDict, total=False):
     preprocessor_overrides: dict[str, Any] | None
     postprocessor_overrides: dict[str, Any] | None
     dataset_stats: dict[str, dict[str, torch.Tensor]] | None
+    dataset_meta: Any | None
 
 
 def make_pre_post_processors(
@@ -414,6 +423,15 @@ def make_pre_post_processors(
             dataset_stats=kwargs.get("dataset_stats"),
         )
 
+    elif isinstance(policy_cfg, MolmoAct2Config):
+        from .molmoact2.processor_molmoact2 import make_molmoact2_pre_post_processors
+
+        processors = make_molmoact2_pre_post_processors(
+            config=policy_cfg,
+            dataset_stats=kwargs.get("dataset_stats"),
+            dataset_meta=kwargs.get("dataset_meta"),
+        )
+
     else:
         try:
             processors = _make_processors_from_policy_config(
@@ -499,6 +517,10 @@ def make_policy(
         action_names = ds_meta.features.get(ACTION, {}).get("names")
         if action_names is not None:
             cfg.action_feature_names = list(action_names)
+    if ds_meta is not None:
+        set_dataset_feature_metadata = getattr(cfg, "set_dataset_feature_metadata", None)
+        if callable(set_dataset_feature_metadata):
+            set_dataset_feature_metadata(ds_meta.features)
 
     kwargs["config"] = cfg
 
diff --git a/src/lerobot/policies/molmoact2/README.md b/src/lerobot/policies/molmoact2/README.md
new file mode 120000
index 000000000..ef419516d
--- /dev/null
+++ b/src/lerobot/policies/molmoact2/README.md
@@ -0,0 +1 @@
+../../../../docs/source/policy_molmoact2_README.md
\ No newline at end of file
diff --git a/src/lerobot/policies/molmoact2/__init__.py b/src/lerobot/policies/molmoact2/__init__.py
new file mode 100644
index 000000000..bfef53bb2
--- /dev/null
+++ b/src/lerobot/policies/molmoact2/__init__.py
@@ -0,0 +1,21 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The Allen Institute for Artificial Intelligence and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .configuration_molmoact2 import MolmoAct2Config
+from .modeling_molmoact2 import MolmoAct2Policy
+from .processor_molmoact2 import make_molmoact2_pre_post_processors
+
+__all__ = ["MolmoAct2Config", "MolmoAct2Policy", "make_molmoact2_pre_post_processors"]
diff --git a/src/lerobot/policies/molmoact2/configuration_molmoact2.py b/src/lerobot/policies/molmoact2/configuration_molmoact2.py
new file mode 100644
index 000000000..de2585281
--- /dev/null
+++ b/src/lerobot/policies/molmoact2/configuration_molmoact2.py
@@ -0,0 +1,519 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The Allen Institute for Artificial Intelligence and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import json
+import math
+import os
+from contextlib import suppress
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+from huggingface_hub import snapshot_download
+
+from lerobot.configs import FeatureType, NormalizationMode, PolicyFeature, PreTrainedConfig
+from lerobot.optim import (
+    AdamWConfig,
+    CosineDecayWithWarmupSchedulerConfig,
+    LRSchedulerConfig,
+    OptimizerConfig,
+)
+from lerobot.utils.constants import ACTION, OBS_STATE
+
+from ..rtc.configuration_rtc import RTCConfig
+
+# Token-budget constants consumed by infer_molmoact2_max_sequence_length().
+# Image count substituted when fewer than one image is supplied.
+MOLMOACT2_DEFAULT_NUM_IMAGES = 2
+# Tokens budgeted for each input image.
+MOLMOACT2_IMAGE_TOKENS_PER_IMAGE = 196
+# Prompt budget: a fixed part, a task part (presumably the task text), and a
+# safety margin; the state dimension is added on top of these three.
+MOLMOACT2_FIXED_PROMPT_TOKEN_BUDGET = 80
+MOLMOACT2_TASK_TOKEN_BUDGET = 32
+MOLMOACT2_SEQUENCE_LENGTH_MARGIN = 32
+# The inferred total is rounded up to a multiple of this value.
+MOLMOACT2_SEQUENCE_LENGTH_MULTIPLE = 64
+# Discrete-action budget: tokens added once per action chunk.
+MOLMOACT2_DISCRETE_ACTION_WRAPPER_TOKENS = 4
+# Lower bound on the tokens budgeted for one action step.
+MOLMOACT2_MIN_DISCRETE_ACTION_TOKENS_PER_STEP = 6
+# Per-step budget is ceil(action_dim * this factor), subject to the floor above.
+MOLMOACT2_DISCRETE_ACTION_TOKENS_PER_DIM = 0.95
+
+
+def _hf_token() -> str | None:
+    """Return the Hugging Face access token found in the environment.
+
+    ``HF_TOKEN`` is preferred; when it is unset or empty the value of
+    ``HF_ACCESS_TOKEN`` (possibly ``None``) is returned instead.
+    """
+    return os.environ.get("HF_TOKEN") or os.environ.get("HF_ACCESS_TOKEN")
+
+
+def _resolve_checkpoint_location(
+    checkpoint_path: str,
+    *,
+    revision: str | None = None,
+    force_download: bool = False,
+) -> str:
+    """Resolve ``checkpoint_path`` to a location on the local filesystem.
+
+    If the (``~``-expanded) path exists locally it is returned as a string.
+    Otherwise the value is treated as a Hub model repo id and fetched with
+    ``snapshot_download``.
+
+    Args:
+        checkpoint_path: Local path or Hub repo id.
+        revision: Forwarded to ``snapshot_download``; unused for local paths.
+        force_download: Forwarded to ``snapshot_download``; unused for local paths.
+
+    Returns:
+        The local path string, or whatever ``snapshot_download`` returns.
+
+    Raises:
+        ValueError: If ``checkpoint_path`` is empty, ``None``, or whitespace-only.
+    """
+    # Normalize falsy/whitespace input to "" so one emptiness check covers both.
+    checkpoint_path = str(checkpoint_path or "").strip()
+    if not checkpoint_path:
+        raise ValueError("MolmoAct2 policy requires `checkpoint_path`.")
+    local_path = Path(checkpoint_path).expanduser()
+    if local_path.exists():
+        return str(local_path)
+    return snapshot_download(
+        repo_id=checkpoint_path,
+        repo_type="model",
+        revision=revision,
+        force_download=force_download,
+        # Python sources and bytecode caches are excluded from the download.
+        ignore_patterns=["*.py", "*.pyc", "__pycache__/*"],
+        token=_hf_token(),
+    )
+
+
+def _load_hf_norm_metadata_for_tag(
+    checkpoint_path: str,
+    *,
+    revision: str | None,
+    force_download: bool,
+    norm_tag: str | None,
+) -> dict[str, Any]:
+    """Load the metadata entry for ``norm_tag`` from a checkpoint's norm-stats file.
+
+    The stats file name defaults to ``norm_stats.json`` and may be overridden by
+    a ``norm_stats_filename`` entry in the checkpoint's ``config.json``. The
+    entry is looked up under the file's ``metadata_by_tag`` mapping.
+
+    Args:
+        checkpoint_path: Local path or Hub repo id of the checkpoint.
+        revision: Forwarded to ``_resolve_checkpoint_location``.
+        force_download: Forwarded to ``_resolve_checkpoint_location``.
+        norm_tag: Tag to look up; empty/``None`` short-circuits to ``{}``.
+
+    Returns:
+        The metadata dict for ``norm_tag``, or ``{}`` when no tag was requested
+        (in which case the checkpoint is not resolved or read at all).
+
+    Raises:
+        FileNotFoundError: If the stats file does not exist in the checkpoint.
+        ValueError: If ``metadata_by_tag`` is missing/not a dict, or the tag's
+            entry is missing/not a dict.
+    """
+    norm_tag = str(norm_tag or "").strip()
+    if not norm_tag:
+        return {}
+    checkpoint_location = Path(
+        _resolve_checkpoint_location(
+            checkpoint_path,
+            revision=revision,
+            force_download=force_download,
+        )
+    )
+    norm_stats_filename = "norm_stats.json"
+    config_path = checkpoint_location / "config.json"
+    if config_path.exists():
+        # Best effort: an unreadable or malformed config.json keeps the default
+        # file name instead of failing.
+        # NOTE(review): a config.json whose top level is not an object would
+        # raise AttributeError on `.get`, which is not suppressed here.
+        with suppress(OSError, json.JSONDecodeError):
+            norm_stats_filename = str(
+                json.loads(config_path.read_text()).get("norm_stats_filename") or norm_stats_filename
+            )
+    stats_path = checkpoint_location / norm_stats_filename
+    if not stats_path.exists():
+        raise FileNotFoundError(
+            f"MolmoAct2 HF checkpoint is missing {norm_stats_filename!r}; cannot resolve norm_tag={norm_tag!r}."
+        )
+    # Unlike config.json above, errors while reading/parsing the stats file propagate.
+    payload = json.loads(stats_path.read_text())
+    metadata_by_tag = payload.get("metadata_by_tag")
+    if not isinstance(metadata_by_tag, dict):
+        raise ValueError(f"MolmoAct2 norm stats file {stats_path} has no metadata_by_tag mapping.")
+    metadata = metadata_by_tag.get(norm_tag)
+    if not isinstance(metadata, dict):
+        # List the known tags so a typo in norm_tag is easy to spot.
+        available = sorted(str(tag) for tag in metadata_by_tag)
+        raise ValueError(f"Unknown MolmoAct2 norm_tag={norm_tag!r}. Available tags: {available}.")
+    return metadata
+
+
+@LRSchedulerConfig.register_subclass("molmoact2_cosine_decay_with_warmup")
+@dataclass
+class MolmoAct2CosineDecayWithWarmupSchedulerConfig(CosineDecayWithWarmupSchedulerConfig):
+    """MolmoAct2-local cosine scheduler with optional decay-step auto-match.
+
+    LeRobot's generic cosine scheduler keeps an explicit integer decay length.
+    For MolmoAct2, leaving num_decay_steps unset means "decay across this run's
+    training steps"; build() is the first point where num_training_steps is known.
+    """
+
+    # Re-declared to also allow None; None is replaced by num_training_steps in build().
+    num_decay_steps: int | None
+
+    def build(self, optimizer, num_training_steps: int):
+        """Build the scheduler by delegating to the generic cosine config.
+
+        A fresh ``CosineDecayWithWarmupSchedulerConfig`` is created from this
+        config's fields, with ``num_decay_steps`` set to ``num_training_steps``
+        when it was left as ``None``, and its ``build()`` result is returned.
+        """
+        return CosineDecayWithWarmupSchedulerConfig(
+            peak_lr=self.peak_lr,
+            decay_lr=self.decay_lr,
+            num_warmup_steps=self.num_warmup_steps,
+            num_decay_steps=num_training_steps if self.num_decay_steps is None else self.num_decay_steps,
+        ).build(optimizer, num_training_steps=num_training_steps)
+
+
+def _round_up(value: int, multiple: int) -> int:
+    """Return ``value`` rounded up to the nearest multiple of ``multiple``.
+
+    NOTE(review): the quotient is computed with float division, so extremely
+    large inputs could lose precision; ``multiple == 0`` raises ZeroDivisionError.
+    """
+    return int(math.ceil(value / multiple) * multiple)
+
+
+def infer_molmoact2_max_sequence_length(
+    *,
+    num_images: int,
+    state_dim: int,
+    action_dim: int,
+    action_horizon: int,
+    include_discrete_action: bool,
+) -> int:
+    """Infer the padded text/image sequence cap from MolmoAct2's fixed token layout.
+
+    The budget is image tokens + prompt tokens + (optionally) discrete-action
+    tokens, rounded up to a multiple of ``MOLMOACT2_SEQUENCE_LENGTH_MULTIPLE``.
+
+    Args:
+        num_images: Number of input images; values below 1 fall back to
+            ``MOLMOACT2_DEFAULT_NUM_IMAGES``.
+        state_dim: Added one-for-one to the prompt budget; negative values are
+            treated as 0.
+        action_dim: Action dimensionality; values below 1 are treated as 1.
+        action_horizon: Number of action steps; values below 1 are treated as 1.
+        include_discrete_action: Whether to reserve room for discrete action tokens.
+
+    Returns:
+        The inferred sequence length.
+    """
+    # Out-of-range inputs are clamped to usable values rather than rejected.
+    if num_images < 1:
+        num_images = MOLMOACT2_DEFAULT_NUM_IMAGES
+    if state_dim < 0:
+        state_dim = 0
+    if action_dim < 1:
+        action_dim = 1
+    if action_horizon < 1:
+        action_horizon = 1
+
+    image_tokens = num_images * MOLMOACT2_IMAGE_TOKENS_PER_IMAGE
+    prompt_tokens = (
+        MOLMOACT2_FIXED_PROMPT_TOKEN_BUDGET
+        + MOLMOACT2_TASK_TOKEN_BUDGET
+        + state_dim
+        + MOLMOACT2_SEQUENCE_LENGTH_MARGIN
+    )
+    action_tokens = 0
+    if include_discrete_action:
+        # Per-step budget scales with action_dim but never drops below the floor.
+        action_tokens_per_step = max(
+            MOLMOACT2_MIN_DISCRETE_ACTION_TOKENS_PER_STEP,
+            math.ceil(action_dim * MOLMOACT2_DISCRETE_ACTION_TOKENS_PER_DIM),
+        )
+        # Wrapper tokens are counted once for the whole chunk, not once per step.
+        action_tokens = MOLMOACT2_DISCRETE_ACTION_WRAPPER_TOKENS + action_horizon * action_tokens_per_step
+
+    return _round_up(
+        image_tokens + prompt_tokens + action_tokens,
+        MOLMOACT2_SEQUENCE_LENGTH_MULTIPLE,
+    )
+
+
+@PreTrainedConfig.register_subclass("molmoact2")
+@dataclass
+class MolmoAct2Config(PreTrainedConfig):
+    """MolmoAct2 policy backed by the converted HF checkpoint implementation.
+
+    Registered under the policy type name ``"molmoact2"``. Cross-field
+    constraints on the values below are enforced in ``__post_init__``.
+    """
+
+    # Checkpoint location (local path or Hub repo id) and download options; these
+    # are passed to the norm-tag metadata loader by apply_norm_tag_metadata().
+    checkpoint_path: str = "allenai/MolmoAct2"
+    checkpoint_revision: str | None = None
+    checkpoint_force_download: bool = False
+
+    # chunk_size and n_action_steps must both be >= 1 with
+    # n_action_steps <= chunk_size; chunk_size also sets the length of
+    # action_delta_indices.
+    n_obs_steps: int = 1
+    chunk_size: int = 30
+    n_action_steps: int = 30
+
+    # action_mode: 'continuous', 'discrete', or 'both'.
+    # inference_action_mode: None, 'continuous', or 'discrete'; it must not
+    # contradict a single-mode action_mode.
+    action_mode: str = "both"
+    inference_action_mode: str | None = None
+    discrete_action_tokenizer: str = "allenai/MolmoAct2-FAST-Tokenizer"
+    # Must be >= 1 when set.
+    discrete_generation_max_steps: int | None = None
+    # When non-empty, apply_norm_tag_metadata() loads this tag's metadata from
+    # the checkpoint and overlays it on the config.
+    norm_tag: str | None = None
+
+    # Empty setup_type/control_mode may be filled in from norm-tag metadata.
+    setup_type: str = ""
+    control_mode: str = ""
+    image_keys: list[str] = field(default_factory=list)
+    normalize_language: bool = True
+    add_setup_tokens: bool = True
+    add_control_tokens: bool = True
+    normalize_gripper: bool = False
+    num_state_tokens: int = 256
+    # Leave unset for the default MolmoAct2 sequence budget inferred from the fixed
+    # image/prompt/state/action token layout. Override only for unusual long prompts.
+    max_sequence_length: int | None = None
+
+    # Fixed by released MolmoAct2 checkpoints. We validate this at model load.
+    expected_max_action_dim: int = 32
+
+    # Flow-matching training knobs copied from the original MolmoAct2 training path.
+    num_flow_timesteps: int = 8
+    flow_matching_cutoff: float = 1.0
+    flow_matching_time_offset: float = 0.001
+    flow_matching_time_scale: float = 0.999
+    flow_matching_beta_alpha: float = 1.0
+    flow_matching_beta_beta: float = 1.5
+    num_inference_steps: int | None = None
+    mask_action_dim_padding: bool = True
+    enable_inference_cuda_graph: bool = True
+    # MolmoAct2-local eval option. When enabled, stochastic continuous action
+    # generation uses a rollout-local generator derived from eval_seed.
+    per_episode_seed: bool = False
+    eval_seed: int | None = None
+    rtc_config: RTCConfig | None = None
+
+    # Default is full finetuning with gradients from the action expert flowing into the VLM.
+    # Constraints checked in __post_init__: lora_rank >= 1, lora_alpha >= 1,
+    # lora_dropout in [0, 1], lora_bias in {'none', 'all', 'lora_only'};
+    # enable_lora_action_expert requires enable_lora_vlm; train_action_expert_only
+    # requires action_mode='continuous' and is incompatible with enable_lora_vlm.
+    enable_lora_vlm: bool = False
+    lora_rank: int = 64
+    lora_alpha: int = 16
+    lora_dropout: float = 0.05
+    lora_bias: str = "none"
+    enable_lora_action_expert: bool = False
+    enable_knowledge_insulation: bool = False
+    freeze_embedding: bool = True
+    train_action_expert_only: bool = False
+    gradient_checkpointing: bool = False
+
+    # model_dtype: 'float32', 'bfloat16', or 'float16'.
+    model_dtype: str = "bfloat16"
+    softmax_auxiliary_loss: bool = True
+    softmax_auxiliary_loss_scale: float = 1e-4
+    # One of 'none', 'token', 'root_tokens', 'root_subsegments',
+    # 'root_subsegments_root_tokens'.
+    discrete_loss_token_weighting: str = "root_subsegments_root_tokens"
+
+    # get_optimizer_preset() feeds lr, betas, eps, weight_decay and
+    # grad_clip_norm into AdamWConfig.
+    # NOTE(review): the vit/connector/action_expert learning rates are not read
+    # by get_optimizer_preset(); presumably consumed elsewhere — confirm.
+    optimizer_lr: float = 1e-5
+    optimizer_vit_lr: float = 5e-6
+    optimizer_connector_lr: float = 5e-6
+    optimizer_action_expert_lr: float = 5e-5
+    optimizer_betas: tuple[float, float] = (0.9, 0.95)
+    optimizer_eps: float = 1e-6
+    optimizer_weight_decay: float = 0.0
+    optimizer_grad_clip_norm: float = 1.0
+
+    # scheduler_decay_steps=None makes the scheduler decay over the whole run
+    # (resolved to num_training_steps when the scheduler is built).
+    scheduler_warmup_steps: int = 200
+    scheduler_decay_steps: int | None = None
+    scheduler_decay_lr: float = 1e-6
+
+    normalization_mapping: dict[str, NormalizationMode] = field(
+        default_factory=lambda: {
+            "VISUAL": NormalizationMode.IDENTITY,
+            "STATE": NormalizationMode.QUANTILES,
+            "ACTION": NormalizationMode.QUANTILES,
+        }
+    )
+
+    input_features: dict[str, PolicyFeature] = field(default_factory=dict)
+    output_features: dict[str, PolicyFeature] = field(default_factory=dict)
+    # Filled by set_dataset_feature_metadata() with the dataset's action/state names.
+    dataset_feature_names: dict[str, Any] = field(default_factory=dict)
+
+    def __post_init__(self) -> None:
+        """Validate field values and cross-field constraints after construction.
+
+        Runs the parent ``__post_init__`` first, then checks each constraint in
+        turn and stops at the first violation.
+
+        Raises:
+            ValueError: On the first constraint that is violated.
+        """
+        super().__post_init__()
+        # --- action-mode consistency ---
+        if self.action_mode not in {"continuous", "discrete", "both"}:
+            raise ValueError(
+                f"Unsupported action_mode={self.action_mode!r}. "
+                "Expected one of {'continuous', 'discrete', 'both'}."
+            )
+        if self.inference_action_mode not in {None, "continuous", "discrete"}:
+            raise ValueError(
+                f"Unsupported inference_action_mode={self.inference_action_mode!r}. "
+                "Expected one of {None, 'continuous', 'discrete'}."
+            )
+        if self.inference_action_mode == "continuous" and self.action_mode == "discrete":
+            raise ValueError("MolmoAct2 action_mode='discrete' cannot run continuous inference.")
+        if self.inference_action_mode == "discrete" and self.action_mode == "continuous":
+            raise ValueError("MolmoAct2 action_mode='continuous' cannot run discrete inference.")
+        # --- training-scope / LoRA flag combinations ---
+        if self.train_action_expert_only and self.action_mode != "continuous":
+            raise ValueError("MolmoAct2 train_action_expert_only requires action_mode='continuous'.")
+        if self.train_action_expert_only and self.enable_lora_vlm:
+            raise ValueError("MolmoAct2 train_action_expert_only is incompatible with enable_lora_vlm.")
+        if self.enable_lora_action_expert and not self.enable_lora_vlm:
+            raise ValueError("MolmoAct2 enable_lora_action_expert requires enable_lora_vlm.")
+        # --- action chunking ---
+        if self.chunk_size < 1:
+            raise ValueError(f"chunk_size must be >= 1, got {self.chunk_size}.")
+        if self.n_action_steps < 1:
+            raise ValueError(f"n_action_steps must be >= 1, got {self.n_action_steps}.")
+        if self.n_action_steps > self.chunk_size:
+            raise ValueError(
+                f"n_action_steps ({self.n_action_steps}) cannot exceed chunk_size ({self.chunk_size})."
+            )
+        # The field exists but only the value 32 is accepted.
+        if self.expected_max_action_dim != 32:
+            raise ValueError("MolmoAct2 released checkpoints use expected_max_action_dim=32.")
+        if self.model_dtype not in {"float32", "bfloat16", "float16"}:
+            raise ValueError(
+                f"Unsupported model_dtype={self.model_dtype!r}. Expected 'float32', 'bfloat16', or 'float16'."
+            )
+        # --- LoRA hyperparameters (checked even when LoRA is disabled) ---
+        if self.lora_rank < 1:
+            raise ValueError(f"lora_rank must be >= 1, got {self.lora_rank}.")
+        if self.lora_alpha < 1:
+            raise ValueError(f"lora_alpha must be >= 1, got {self.lora_alpha}.")
+        if not 0 <= self.lora_dropout <= 1:
+            raise ValueError(f"lora_dropout must be in [0, 1], got {self.lora_dropout}.")
+        if self.lora_bias not in {"none", "all", "lora_only"}:
+            raise ValueError(
+                f"Unsupported lora_bias={self.lora_bias!r}. Expected one of 'none', 'all', or 'lora_only'."
+            )
+        # --- discrete-action loss / generation and sequence length ---
+        if self.discrete_loss_token_weighting not in {
+            "none",
+            "token",
+            "root_tokens",
+            "root_subsegments",
+            "root_subsegments_root_tokens",
+        }:
+            raise ValueError(
+                f"Unsupported discrete_loss_token_weighting={self.discrete_loss_token_weighting!r}."
+            )
+        if self.discrete_generation_max_steps is not None and self.discrete_generation_max_steps < 1:
+            raise ValueError(
+                f"discrete_generation_max_steps must be >= 1 or None, got {self.discrete_generation_max_steps}."
+            )
+        if self.max_sequence_length is not None and self.max_sequence_length < 1:
+            raise ValueError(f"max_sequence_length must be >= 1 or None, got {self.max_sequence_length}.")
+
+    def inferred_max_sequence_length(
+        self,
+        *,
+        num_images: int | None = None,
+        state_dim: int | None = None,
+        action_dim: int | None = None,
+        action_horizon: int | None = None,
+        include_discrete_action: bool | None = None,
+    ) -> int:
+        """Return the sequence-length cap for this config.
+
+        An explicit ``max_sequence_length`` always wins and makes every argument
+        irrelevant. Otherwise each argument left as ``None`` is derived from the
+        config and the result comes from ``infer_molmoact2_max_sequence_length``.
+
+        Args:
+            num_images: Defaults to the number of ``image_keys``, else the number
+                of image features, else ``MOLMOACT2_DEFAULT_NUM_IMAGES``.
+            state_dim: Defaults to the first dimension of the robot state
+                feature, or 0 when there is none.
+            action_dim: Defaults to the first dimension of the action feature,
+                or ``expected_max_action_dim`` when there is none.
+            action_horizon: Defaults to ``chunk_size``.
+            include_discrete_action: Defaults to ``True`` when ``action_mode`` is
+                ``'discrete'`` or ``'both'``.
+
+        Returns:
+            The explicit or inferred maximum sequence length.
+        """
+        if self.max_sequence_length is not None:
+            return int(self.max_sequence_length)
+
+        if num_images is None:
+            # First non-zero count wins: explicit keys, then features, then the default.
+            num_images = len(self.image_keys) or len(self.image_features) or MOLMOACT2_DEFAULT_NUM_IMAGES
+        if state_dim is None:
+            state_feature = self.robot_state_feature
+            state_dim = int(state_feature.shape[0]) if state_feature is not None else 0
+        if action_dim is None:
+            action_feature = self.action_feature
+            action_dim = (
+                int(action_feature.shape[0]) if action_feature is not None else self.expected_max_action_dim
+            )
+        if action_horizon is None:
+            action_horizon = self.chunk_size
+        if include_discrete_action is None:
+            include_discrete_action = self.action_mode in {"discrete", "both"}
+
+        return infer_molmoact2_max_sequence_length(
+            num_images=int(num_images),
+            state_dim=int(state_dim),
+            action_dim=int(action_dim),
+            action_horizon=int(action_horizon),
+            include_discrete_action=bool(include_discrete_action),
+        )
+
+    @property
+    def observation_delta_indices(self) -> None:
+        """Always ``None``: this config defines no observation delta indices."""
+        return None
+
+    @property
+    def action_delta_indices(self) -> list[int]:
+        """Return ``[0, 1, ..., chunk_size - 1]``, one index per step of an action chunk."""
+        return list(range(self.chunk_size))
+
+    @property
+    def reward_delta_indices(self) -> None:
+        """Always ``None``: this config defines no reward delta indices."""
+        return None
+
+    def get_optimizer_preset(self) -> OptimizerConfig:
+        """Return an ``AdamWConfig`` built from the ``optimizer_*`` fields.
+
+        Only ``optimizer_lr``, ``optimizer_betas``, ``optimizer_eps``,
+        ``optimizer_weight_decay`` and ``optimizer_grad_clip_norm`` are used here.
+        """
+        return AdamWConfig(
+            lr=self.optimizer_lr,
+            betas=self.optimizer_betas,
+            eps=self.optimizer_eps,
+            weight_decay=self.optimizer_weight_decay,
+            grad_clip_norm=self.optimizer_grad_clip_norm,
+        )
+
+    def get_scheduler_preset(self) -> LRSchedulerConfig | None:
+        """Return the MolmoAct2 cosine-decay-with-warmup scheduler config.
+
+        The peak learning rate is ``optimizer_lr``; ``scheduler_decay_steps`` is
+        passed through unchanged and may be ``None``.
+        """
+        return MolmoAct2CosineDecayWithWarmupSchedulerConfig(
+            peak_lr=self.optimizer_lr,
+            decay_lr=self.scheduler_decay_lr,
+            num_warmup_steps=self.scheduler_warmup_steps,
+            num_decay_steps=self.scheduler_decay_steps,
+        )
+
+    def set_dataset_feature_metadata(self, features: dict[str, Any]) -> None:
+        """Record the dataset's action and state feature names on the config.
+
+        ``dataset_feature_names`` is reset, then repopulated with the ``"names"``
+        entry of the ``ACTION`` and ``OBS_STATE`` features when present.
+
+        Args:
+            features: Mapping from feature key to a feature-description dict. A
+                non-dict value is tolerated and leaves the result empty.
+        """
+        self.dataset_feature_names = {}
+        for key in (ACTION, OBS_STATE):
+            feature = features.get(key) if isinstance(features, dict) else None
+            # Skip features that are missing, not dicts, or carry no names.
+            if isinstance(feature, dict) and feature.get("names") is not None:
+                self.dataset_feature_names[key] = feature["names"]
+
+    def validate_features(self) -> None:
+        """Validate and set up MolmoAct2 input and output features.
+
+        At least one visual input feature is required. A missing state input is
+        filled with a zero-length STATE feature, and a missing action output
+        with an ACTION feature of shape ``(expected_max_action_dim,)``.
+
+        Raises:
+            ValueError: If ``input_features`` contains no visual feature.
+        """
+        image_features = [key for key, feat in self.input_features.items() if feat.type == FeatureType.VISUAL]
+        if not image_features:
+            raise ValueError(
+                "MolmoAct2 policy requires at least one visual input feature. "
+                "No features of type FeatureType.VISUAL found in input_features."
+            )
+
+        # Add a zero-length state placeholder when the dataset provides no state.
+        if OBS_STATE not in self.input_features:
+            state_feature = PolicyFeature(
+                type=FeatureType.STATE,
+                shape=(0,),
+            )
+            self.input_features[OBS_STATE] = state_feature
+
+        # Add a default action output when none is declared.
+        if ACTION not in self.output_features:
+            action_feature = PolicyFeature(
+                type=FeatureType.ACTION,
+                shape=(self.expected_max_action_dim,),
+            )
+            self.output_features[ACTION] = action_feature
+
+    def apply_norm_tag_metadata(self) -> None:
+        """Overlay the checkpoint metadata for ``norm_tag`` onto this config.
+
+        Does nothing when ``norm_tag`` is empty. Otherwise the tag's metadata is
+        loaded from the checkpoint and applied as follows:
+
+        - ``action_horizon`` overwrites ``chunk_size`` when present.
+        - ``n_action_steps`` overwrites ``n_action_steps`` when present.
+        - ``setup_type`` / ``control_mode`` are filled in only when the config
+          value is currently empty.
+
+        NOTE(review): the ``n_action_steps <= chunk_size`` check from
+        ``__post_init__`` is not re-run after these overrides, so metadata that
+        sets only ``action_horizon`` could leave the pair inconsistent — confirm
+        that callers or the metadata guarantee consistency.
+        """
+        if not str(self.norm_tag or "").strip():
+            return
+        metadata = _load_hf_norm_metadata_for_tag(
+            self.checkpoint_path,
+            revision=self.checkpoint_revision,
+            force_download=bool(self.checkpoint_force_download),
+            norm_tag=self.norm_tag,
+        )
+        # Horizon settings from the checkpoint override the configured values.
+        if metadata.get("action_horizon") is not None:
+            self.chunk_size = int(metadata["action_horizon"])
+        if metadata.get("n_action_steps") is not None:
+            self.n_action_steps = int(metadata["n_action_steps"])
+        # Prompt metadata only fills gaps; explicitly configured values are kept.
+        if not self.setup_type and metadata.get("setup_type") is not None:
+            self.setup_type = str(metadata["setup_type"])
+        if not self.control_mode and metadata.get("control_mode") is not None:
+            self.control_mode = str(metadata["control_mode"])
+
+    def saved_policy_action_mode(self) -> str | None:
+        """Return the valid ``action_mode`` saved in ``pretrained_path/config.json``, else ``None``.
+
+        ``None`` also covers a missing/unreadable/invalid file and a non-object JSON payload.
+        """
+        pretrained_path = getattr(self, "pretrained_path", None)
+        if pretrained_path is None:
+            return None
+        try:
+            payload = json.loads((Path(pretrained_path) / "config.json").read_text())
+        except (OSError, ValueError):
+            return None
+        mode = payload.get("action_mode") if isinstance(payload, dict) else None
+        return mode if isinstance(mode, str) and mode in {"continuous", "discrete", "both"} else None
+
+    def training_action_mode(self, saved_policy_action_mode: str | None = None) -> str:
+        return saved_policy_action_mode if saved_policy_action_mode else self.action_mode  # saved mode wins
+
+    def validate_inference_action_mode(self, saved_policy_action_mode: str | None = None) -> None:
+        """Reject an ``inference_action_mode`` the trained mode cannot serve (unset is accepted)."""
+        wanted = self.inference_action_mode
+        trained = None if wanted is None else self.training_action_mode(saved_policy_action_mode)
+        # Only the two cross-mode pairs below are invalid; every other combination passes.
+        if (wanted, trained) == ("continuous", "discrete"):
+            raise ValueError(
+                "MolmoAct2 checkpoint was trained with action_mode='discrete' and cannot run "
+                "continuous inference."
+            )
+        if (wanted, trained) == ("discrete", "continuous"):
+            raise ValueError(
+                "MolmoAct2 checkpoint was trained with action_mode='continuous' and cannot run "
+                "discrete inference. Train with action_mode='both' or action_mode='discrete' first."
+            )
+
+    def validate_checkpoint_action_mode(
+        self, checkpoint_action_mode: str, *, has_action_expert: bool
+    ) -> None:
+        """Raise ``ValueError`` when the checkpoint cannot support the configured ``action_mode``."""
+        mode = self.action_mode
+        if mode == "both" and checkpoint_action_mode != "both":
+            raise ValueError(
+                f"action_mode='both' requires checkpoint action_mode='both', got {checkpoint_action_mode!r}."
+            )
+        if mode == "discrete" and checkpoint_action_mode not in {"discrete", "both"}:
+            raise ValueError(
+                f"action_mode='discrete' requires checkpoint action_mode in {{'discrete', 'both'}}, "
+                f"got {checkpoint_action_mode!r}."
+            )
+        # Any mode with a continuous part additionally needs ``has_action_expert``.
+        if mode in {"continuous", "both"} and not has_action_expert:
+            raise ValueError("Continuous MolmoAct2 training requires an action expert checkpoint.")
+
+    def resolve_inference_action_mode(
+        self,
+        requested_mode: str | None,
+        saved_policy_action_mode: str | None = None,
+    ) -> str:
+        """Return the inference mode to use, falling back to ``inference_action_mode`` when unset."""
+        trained = self.training_action_mode(saved_policy_action_mode)
+        mode = self.inference_action_mode if requested_mode is None else requested_mode
+        if mode is None:
+            raise ValueError(
+                "MolmoAct2 inference requires `inference_action_mode` to be set explicitly "
+                "to either 'continuous' or 'discrete'."
+            )
+        if mode not in {"continuous", "discrete"}:
+            raise ValueError("MolmoAct2 inference_action_mode must be either 'continuous' or 'discrete'.")
+        if (mode, trained) == ("continuous", "discrete"):
+            raise ValueError("MolmoAct2 action_mode='discrete' checkpoint cannot run continuous inference.")
+        if (mode, trained) == ("discrete", "continuous"):
+            raise ValueError("MolmoAct2 action_mode='continuous' checkpoint cannot run discrete inference.")
+        return mode
diff --git a/src/lerobot/policies/molmoact2/hf_model/__init__.py b/src/lerobot/policies/molmoact2/hf_model/__init__.py
new file mode 100644
index 000000000..39b15cb3a
--- /dev/null
+++ b/src/lerobot/policies/molmoact2/hf_model/__init__.py
@@ -0,0 +1,17 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The Allen Institute for Artificial Intelligence and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# ruff: noqa
diff --git a/src/lerobot/policies/molmoact2/hf_model/action_tokenizer.py b/src/lerobot/policies/molmoact2/hf_model/action_tokenizer.py
new file mode 100644
index 000000000..f7dacbce6
--- /dev/null
+++ b/src/lerobot/policies/molmoact2/hf_model/action_tokenizer.py
@@ -0,0 +1,237 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The Allen Institute for Artificial Intelligence and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# ruff: noqa
+
+import logging
+import os
+from pathlib import Path
+from typing import ClassVar
+
+import numpy as np
+from tokenizers import ByteLevelBPETokenizer
+from tokenizers.trainers import BpeTrainer
+from huggingface_hub import snapshot_download
+from transformers import PreTrainedTokenizerFast
+from transformers.processing_utils import ProcessorMixin
+
+
+def _hf_token() -> str | None:
+    return os.getenv("HF_TOKEN") or os.getenv("HF_ACCESS_TOKEN")  # primary name first, then the alternate
+
+
+def _resolve_tokenizer_location(
+    tokenizer_path: str, *, revision: str | None = None, force_download: bool = False
+) -> str:
+    """Return ``tokenizer_path`` if it exists locally, else the path ``snapshot_download`` returns."""
+    source = str(tokenizer_path)
+    candidate = Path(source).expanduser()
+    if candidate.exists():
+        return str(candidate)
+    # Not a local path: use the string as a model repo id; Python sources are excluded.
+    return snapshot_download(
+        repo_id=source,
+        repo_type="model",
+        revision=revision,
+        force_download=force_download,
+        ignore_patterns=["*.py", "*.pyc", "__pycache__/*"],
+        token=_hf_token(),
+    )
+
+
+class UniversalActionProcessor(ProcessorMixin):
+    """Action-chunk tokenizer: DCT over time, scale-and-round quantization, then BPE over the symbols."""
+
+    attributes: ClassVar[list[str]] = ["tokenizer"]
+    tokenizer_class: str = "AutoTokenizer"
+
+    def __init__(
+        self,
+        tokenizer: PreTrainedTokenizerFast,
+        scale: float = 10,
+        vocab_size: int = 1024,
+        min_token: int = 0,
+        *,
+        action_dim: int | None = None,
+        time_horizon: int | None = None,
+    ):
+        self.scale = scale
+        self.vocab_size = vocab_size
+        self.min_token = min_token
+
+        # Action horizon and dimension needed during decoding. These can be specified
+        # in three ways (in order of priority):
+        # 1. passed in as kwargs to decode()
+        # 2. in the constructor
+        # 3. cached from the last __call__() / decode() invocation
+        self.time_horizon = time_horizon
+        self.action_dim = action_dim
+        self.called_time_horizon = time_horizon
+        self.called_action_dim = action_dim
+
+        super().__init__(tokenizer)
+        self.bpe_tokenizer = self.tokenizer
+
+    def __call__(self, action_chunk: np.ndarray) -> list[list[int]]:
+        """Encode ``[batch, timesteps, action_dim]`` (or a single 2-D chunk) into BPE token-id lists."""
+        from scipy.fft import dct
+
+        assert action_chunk.ndim <= 3, "Only 3 dimensions supported: [batch, timesteps, action_dim]"
+        if action_chunk.ndim == 2:
+            action_chunk = action_chunk[None, ...]
+
+        # Cache the time horizon and action dimension for decoding
+        self.called_time_horizon = action_chunk.shape[-2]
+        self.called_action_dim = action_chunk.shape[-1]
+
+        # DCT along the time axis, then scale and round to integer-valued coefficients.
+        dct_coeff = np.around(dct(action_chunk, axis=1, norm="ortho") * self.scale)
+        tokens = []
+        for elem in dct_coeff:
+            token_str = "".join(map(chr, np.maximum(elem.flatten() - self.min_token, 0).astype(int)))
+            tokens.append(self.bpe_tokenizer(token_str)["input_ids"])
+        return tokens
+
+    def decode(
+        self,
+        tokens: list[list[int]],
+        *,
+        time_horizon: int | None = None,
+        action_dim: int | None = None,
+    ) -> np.ndarray:
+        """Decode token-id lists to a stacked array of chunks; undecodable entries become zeros."""
+        from scipy.fft import idct
+
+        self.time_horizon = time_horizon or self.time_horizon or self.called_time_horizon
+        self.action_dim = action_dim or self.action_dim or self.called_action_dim
+
+        # Cache the time horizon and action dimension for the next call
+        self.called_time_horizon = self.time_horizon
+        self.called_action_dim = self.action_dim
+
+        assert self.time_horizon is not None and self.action_dim is not None, (
+            "Tokenizer not initialized, call the processor once or pass in time_horizon and action_dim."
+        )
+
+        decoded_actions = []
+        for token in tokens:
+            try:
+                decoded_tokens = self.bpe_tokenizer.decode(token)
+                decoded_dct_coeff = np.array(list(map(ord, decoded_tokens))) + self.min_token
+                decoded_dct_coeff = decoded_dct_coeff.reshape(-1, self.action_dim)
+                # Raised explicitly (not asserted) so the shape check survives `python -O`.
+                if decoded_dct_coeff.shape != (self.time_horizon, self.action_dim):
+                    raise ValueError(
+                        f"Decoded DCT coefficients have shape {decoded_dct_coeff.shape}, expected ({self.time_horizon}, {self.action_dim})"
+                    )
+            except Exception as e:
+                # Best effort: a chunk that cannot be decoded is replaced by all-zero coefficients.
+                logging.warning("Error decoding tokens %s: %s", token, e)
+                decoded_dct_coeff = np.zeros((self.time_horizon, self.action_dim))
+            decoded_actions.append(idct(decoded_dct_coeff / self.scale, axis=0, norm="ortho"))
+        return np.stack(decoded_actions)
+
+    @classmethod
+    def fit(
+        cls,
+        action_data: list[np.ndarray],
+        scale: float = 10,
+        vocab_size: int = 1024,
+        *,
+        time_horizon: int | None = None,
+        action_dim: int | None = None,
+    ) -> "UniversalActionProcessor":
+        """Train a BPE tokenizer on the quantized DCT coefficients of ``action_data`` and wrap it."""
+        from scipy.fft import dct
+
+        # Run DCT over all inputs
+        dct_tokens = [dct(a, axis=0, norm="ortho").flatten() for a in action_data]
+
+        # Quantize once and find the token range
+        quantized = np.around(np.concatenate(dct_tokens) * scale)
+        max_token, min_token = int(quantized.max()), int(quantized.min())
+        min_vocab_size = max_token - min_token
+
+        assert min_vocab_size <= vocab_size, (
+            f"Vocab size {vocab_size} is too small for the range of tokens {min_vocab_size}"
+        )
+        if min_vocab_size + 100 > vocab_size:
+            logging.warning(
+                f"Initial alphabet size {min_vocab_size} is almost as large as the vocab "
+                f"size {vocab_size}, consider increasing vocab size"
+            )
+
+        # Make token iterator for BPE training
+        def _token_iter():
+            for tokens in dct_tokens:
+                # Shift by min_token so every symbol is a non-negative code point.
+                rounded_tokens = (np.around(tokens * scale) - min_token).astype(int)
+                yield "".join(map(chr, rounded_tokens))
+
+        # Train BPE tokenizer
+        bpe = ByteLevelBPETokenizer()
+
+        # Set up the entire range of possible tokens as the initial alphabet
+        alphabet = [chr(i) for i in range(max_token - min_token + 1)]
+        trainer = BpeTrainer(
+            vocab_size=vocab_size,
+            min_frequency=2,
+            show_progress=True,
+            special_tokens=[],
+            initial_alphabet=alphabet,
+            max_token_length=10000,
+        )
+
+        # Train the inner tokenizer (don't use ByteLevelBPETokenizer.train_from_iterator()
+        # because it doesn't support custom alphabets)
+        bpe._tokenizer.train_from_iterator(_token_iter(), trainer=trainer)
+
+        return cls(
+            PreTrainedTokenizerFast(tokenizer_object=bpe, clean_up_tokenization_spaces=False),
+            scale=scale,
+            vocab_size=vocab_size,
+            min_token=min_token,
+            time_horizon=time_horizon,
+            action_dim=action_dim,
+        )
+
+    @classmethod
+    def from_pretrained_local(
+        cls,
+        pretrained_model_name_or_path: str,
+        *,
+        revision: str | None = None,
+        force_download: bool = False,
+    ) -> "UniversalActionProcessor":
+        """Load the tokenizer from a local path or Hub repo, applying ``processor_config.json`` if present."""
+        import json
+
+        resolved = _resolve_tokenizer_location(
+            pretrained_model_name_or_path, revision=revision, force_download=force_download
+        )
+        location = Path(resolved)
+        config_path = location / "processor_config.json"
+        # Without the config file the fallbacks passed to ``.get`` below apply.
+        processor_config = json.loads(config_path.read_text()) if config_path.exists() else {}
+        tokenizer = PreTrainedTokenizerFast.from_pretrained(str(location))
+        return cls(
+            tokenizer,
+            scale=processor_config.get("scale", 10),
+            vocab_size=processor_config.get("vocab_size", 1024),
+            min_token=processor_config.get("min_token", 0),
+            action_dim=processor_config.get("action_dim"),
+            time_horizon=processor_config.get("time_horizon"),
+        )
diff --git a/src/lerobot/policies/molmoact2/hf_model/configuration_molmoact2.py b/src/lerobot/policies/molmoact2/hf_model/configuration_molmoact2.py
new file mode 100644
index 000000000..29da68c14
--- /dev/null
+++ b/src/lerobot/policies/molmoact2/hf_model/configuration_molmoact2.py
@@ -0,0 +1,553 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The Allen Institute for Artificial Intelligence and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# ruff: noqa
+
+"""
+MolmoAct2 configuration
+"""
+
+from typing import Optional, Any
+
+from transformers import PretrainedConfig
+from transformers.modeling_rope_utils import rope_config_validation
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+
+class MolmoAct2VitConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`MolmoAct2VisionTransformer`].
+    It is used to instantiate a `MolmoAct2VisionTransformer` according to the specified arguments,
+    defining the model architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Example:
+    ```python
+    >>> from transformers import MolmoAct2VitConfig, MolmoAct2VisionTransformer
+
+    >>> # Initializing a MolmoAct2VitConfig
+    >>> configuration = MolmoAct2VitConfig()
+
+    >>> # Initializing a MolmoAct2VisionTransformer (with random weights)
+    >>> model = MolmoAct2VisionTransformer(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "molmoact2"
+    base_config_key = "vit_config"
+
+    def __init__(
+        self,
+        hidden_size: int = 1152,
+        intermediate_size: int = 4304,
+        num_hidden_layers: int = 27,
+        num_attention_heads: int = 16,
+        num_key_value_heads: int = 16,
+        head_dim: int = 72,
+        hidden_act: str = "gelu_pytorch_tanh",
+        layer_norm_eps: float = 1e-6,
+        image_default_input_size: tuple[int, int] = (378, 378),
+        image_patch_size: int = 14,
+        image_num_pos: int = 577,
+        attention_dropout: float = 0.0,
+        residual_dropout: float = 0.0,
+        initializer_range: float = 0.02,
+        float32_attention: bool = True,
+        attn_implementation: str = "eager",
+        **kwargs,
+    ):
+        self.attn_implementation = attn_implementation  # also forwarded to super().__init__ below
+        super().__init__(attn_implementation=attn_implementation, **kwargs)
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.head_dim = head_dim
+        self.hidden_act = hidden_act
+        self.layer_norm_eps = layer_norm_eps
+        self.image_default_input_size = image_default_input_size
+        self.image_patch_size = image_patch_size
+        self.image_num_pos = image_num_pos
+        self.attention_dropout = attention_dropout
+        self.residual_dropout = residual_dropout
+        self.initializer_range = initializer_range
+        self.float32_attention = float32_attention
+
+    @property
+    def image_num_patch(self):  # patch-grid size as (h // patch, w // patch)
+        h, w = self.image_default_input_size
+        return h // self.image_patch_size, w // self.image_patch_size
+
+
+class MolmoAct2AdapterConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of MolmoAct2Adapter. Together with
+    MolmoAct2VitConfig, it is used to instantiate a MolmoAct2VisionBackbone according to the specified
+    arguments, defining the model architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Example:
+
+    ```python
+    >>> from transformers import MolmoAct2VitConfig, MolmoAct2AdapterConfig, MolmoAct2VisionBackbone
+
+    >>> # Initializing a MolmoAct2VitConfig and a MolmoAct2AdapterConfig
+    >>> vit_config = MolmoAct2VitConfig()
+    >>> adapter_config = MolmoAct2AdapterConfig()
+
+    >>> # Initializing a MolmoAct2VisionBackbone (with random weights)
+    >>> model = MolmoAct2VisionBackbone(vit_config, adapter_config)
+
+    >>> # Accessing the model configuration
+    >>> vit_configuration = model.vit_config
+    >>> adapter_configuration = model.adapter_config
+    ```"""
+
+    model_type = "molmoact2"
+    base_config_key = "adapter_config"
+
+    def __init__(
+        self,
+        vit_layers: tuple = (-3, -9),
+        pooling_attention_mask: bool = False,
+        hidden_size: int = 1152,
+        num_attention_heads: int = 16,
+        num_key_value_heads: int = 16,
+        head_dim: int = 72,
+        float32_attention: bool = True,
+        attention_dropout: float = 0.0,
+        residual_dropout: float = 0.0,
+        hidden_act: str = "silu",
+        intermediate_size: int = 18944,
+        text_hidden_size: int = 3584,
+        image_feature_dropout: float = 0.0,
+        initializer_range: float = 0.02,
+        attn_implementation: str = "eager",
+        **kwargs,
+    ):
+        self.attn_implementation = attn_implementation  # also forwarded to super().__init__ below
+        super().__init__(attn_implementation=attn_implementation, **kwargs)
+        self.vit_layers = vit_layers
+        self.pooling_attention_mask = pooling_attention_mask
+        self.hidden_size = hidden_size
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.head_dim = head_dim
+        self.float32_attention = float32_attention
+        self.attention_dropout = attention_dropout
+        self.residual_dropout = residual_dropout
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.text_hidden_size = text_hidden_size
+        self.image_feature_dropout = image_feature_dropout
+        self.initializer_range = initializer_range
+
+
+class MolmoAct2TextConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`MolmoAct2TextModel`]. It is used to instantiate a
+    `MolmoAct2TextModel` according to the specified arguments, defining the model architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Example:
+    ```python
+    >>> from transformers import MolmoAct2TextConfig, MolmoAct2TextModel
+
+    >>> # Initializing a MolmoAct2TextConfig
+    >>> configuration = MolmoAct2TextConfig()
+
+    >>> # Initializing a MolmoAct2TextModel (with random weights)
+    >>> model = MolmoAct2TextModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "molmoact2_text"
+    base_config_key = "text_config"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    base_model_tp_plan = {
+        "blocks.*.self_attn.att_proj": "colwise",
+        "blocks.*.self_attn.attn_out": "rowwise",
+        "blocks.*.mlp.ff_proj": "colwise",
+        "blocks.*.mlp.ff_out": "rowwise",
+    }
+    base_model_pp_plan = {
+        "wte": (["input_ids"], ["inputs_embeds"]),
+        "blocks": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "ln_f": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        hidden_size: int = 3584,
+        num_attention_heads: int = 28,
+        num_key_value_heads: int | None = 4,
+        head_dim: int = 128,
+        vocab_size: int = 152064,
+        additional_vocab_size: int = 128,
+        qkv_bias: bool = True,
+        num_hidden_layers: int = 48,
+        intermediate_size: int = 18944,
+        hidden_act: str = "silu",
+        embedding_dropout: float = 0.0,
+        attention_dropout: float = 0.0,
+        residual_dropout: float = 0.0,
+        max_position_embeddings: int = 4096,
+        rope_theta: float = 1000000.0,
+        rope_scaling: dict[str, Any] | None = None,
+        rope_scaling_layers: list[int] | None = None,
+        use_qk_norm: bool = False,
+        qk_norm_type: str = "olmo",
+        layer_norm_eps: float = 1e-6,
+        norm_after: bool = False,
+        initializer_range: float = 0.02,
+        use_cache: bool = True,
+        tie_word_embeddings: bool = False,
+        attn_implementation: str = "eager",
+        **kwargs,
+    ):
+        self.attn_implementation = attn_implementation  # also forwarded to super().__init__ below
+        super().__init__(
+            tie_word_embeddings=tie_word_embeddings, attn_implementation=attn_implementation, **kwargs
+        )
+        self.hidden_size = hidden_size
+        self.num_attention_heads = num_attention_heads
+        if num_key_value_heads is None:  # fall back to one KV head per attention head
+            num_key_value_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.head_dim = head_dim
+        self.vocab_size = vocab_size
+        self.additional_vocab_size = additional_vocab_size
+        self.qkv_bias = qkv_bias
+        self.num_hidden_layers = num_hidden_layers
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.embedding_dropout = embedding_dropout
+        self.attention_dropout = attention_dropout
+        self.residual_dropout = residual_dropout
+        self.max_position_embeddings = max_position_embeddings
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.rope_scaling_layers = rope_scaling_layers
+        self.use_qk_norm = use_qk_norm
+        self.qk_norm_type = qk_norm_type
+        self.layer_norm_eps = layer_norm_eps
+        self.norm_after = norm_after
+        self.initializer_range = initializer_range
+        self.use_cache = use_cache
+
+        # Validate the correctness of rotary position embeddings parameters
+        rope_config_validation(self)
+
+
+class MolmoAct2ActionExpertConfig(PretrainedConfig):
+    r"""Hyper-parameters of the MolmoAct2 action expert (sizes, dropout, norm and attention switches)."""
+
+    model_type = "molmoact2_action_expert"
+    base_config_key = "action_expert_config"
+
+    def __init__(
+        self,
+        max_action_horizon: int = 32,
+        max_action_dim: int = 32,
+        hidden_size: int = 1024,
+        num_layers: int = 32,
+        num_heads: int = 16,
+        mlp_ratio: float = 8.0 / 3.0,
+        ffn_multiple_of: int = 256,
+        timestep_embed_dim: int = 256,
+        dropout: float = 0.0,
+        attn_dropout: float = 0.0,
+        context_layer_norm: bool = True,
+        qk_norm: bool = True,
+        qk_norm_eps: float = 1e-6,
+        rope: bool = True,
+        causal_attn: bool = False,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.max_action_horizon = max_action_horizon
+        self.max_action_dim = max_action_dim
+        self.hidden_size = hidden_size
+        self.num_layers = num_layers
+        self.num_heads = num_heads
+        self.mlp_ratio = mlp_ratio
+        self.ffn_multiple_of = ffn_multiple_of
+        self.timestep_embed_dim = timestep_embed_dim
+        self.dropout = dropout
+        self.attn_dropout = attn_dropout
+        self.context_layer_norm = context_layer_norm
+        self.qk_norm = qk_norm
+        self.qk_norm_eps = qk_norm_eps
+        self.rope = rope
+        self.causal_attn = causal_attn
+
+    def to_dict(self):
+        """Serialize like the parent class, minus the two action-size fields."""
+        # Derived from the parent MolmoAct2Config on HF export; omitted to keep a single source of truth.
+        serialized = super().to_dict()
+        for derived_key in ("max_action_horizon", "max_action_dim"):
+            serialized.pop(derived_key, None)
+        return serialized
+
+
+class MolmoAct2Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`MolmoAct2ForConditionalGeneration`].
+    It is used to instantiate an MolmoAct2 model according to the specified arguments, defining the model architecture.
+
+    Example:
+
+    ```python
+    >>> from transformers import MolmoAct2Config, MolmoAct2VitConfig, MolmoAct2AdapterConfig, MolmoAct2TextConfig
+
+    >>> # Initializing a MolmoAct2VitConfig
+    >>> vit_config = MolmoAct2VitConfig()
+
+    >>> # Initializing a MolmoAct2AdapterConfig
+    >>> adapter_config = MolmoAct2AdapterConfig()
+
+    >>> # Initializing a MolmoAct2TextConfig
+    >>> text_config = MolmoAct2TextConfig()
+
+    >>> # Initializing a MolmoAct2Config
+    >>> configuration = MolmoAct2Config(
+    >>>     vit_config=vit_config,
+    >>>     adapter_config=adapter_config,
+    >>>     text_config=text_config,
+    >>>     image_start_token_id=151936,
+    >>>     image_end_token_id=151937,
+    >>>     image_patch_id=151938,
+    >>>     image_col_id=151939,
+    >>>     low_res_image_start_token_id=151940,
+    >>>     image_low_res_id=151942,
+    >>>     frame_start_token_id=151943,
+    >>>     frame_end_token_id=151944,
+    >>> )
+
+    >>> # Initializing a model
+    >>> model = MolmoAct2ForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "molmoact2"
+    sub_configs = {
+        "text_config": MolmoAct2TextConfig,
+        "vit_config": MolmoAct2VitConfig,
+        "adapter_config": MolmoAct2AdapterConfig,
+        "action_expert_config": MolmoAct2ActionExpertConfig,
+    }
+
+    def __init__(
+        self,
+        vit_config: MolmoAct2VitConfig = None,
+        adapter_config: MolmoAct2AdapterConfig = None,
+        text_config: MolmoAct2TextConfig = None,
+        action_expert_config: MolmoAct2ActionExpertConfig = None,
+        image_start_token_id: int = None,
+        low_res_image_start_token_id: int = None,
+        image_end_token_id: int = None,
+        image_low_res_id: int = None,
+        image_patch_id: int = None,
+        image_col_id: int = None,
+        frame_start_token_id: int = None,
+        frame_end_token_id: int = None,
+        use_frame_special_tokens: bool = True,
+        initializer_range: float = 0.02,
+        add_action_expert: bool = True,
+        max_action_dim: int = 32,
+        max_action_horizon: int = 30,
+        n_obs_steps: int = 30,
+        action_mode: str = "both",
+        state_format: str = "discrete",
+        flow_matching_num_steps: int = 10,
+        flow_matching_cutoff: float = 1.0,
+        flow_matching_time_offset: float = 0.001,
+        flow_matching_time_scale: float = 0.999,
+        flow_matching_beta_alpha: float = 1.0,
+        flow_matching_beta_beta: float = 1.5,
+        mask_action_dim_padding: bool = True,
+        enable_depth_reasoning: bool = False,
+        depth_mode: int = 2,
+        num_depth_codes: int = 100,
+        action_expert_depth_gate: bool = False,
+        action_expert_depth_gate_per_layer: bool = False,
+        action_expert_depth_gate_init_bias: float = -4.0,
+        action_output_token_id: int = None,
+        action_start_token_id: int = None,
+        action_end_token_id: int = None,
+        action_token_start_id: int = None,
+        num_action_tokens: int = 0,
+        depth_output_token_id: int = None,
+        depth_start_token_id: int = None,
+        depth_end_token_id: int = None,
+        depth_token_start_id: int = None,
+        num_depth_tokens: int = 0,
+        state_start_token_id: int = None,
+        state_end_token_id: int = None,
+        state_token_start_id: int = None,
+        num_state_tokens: int = 0,
+        add_setup_tokens: bool = True,
+        add_control_tokens: bool = True,
+        norm_stats_filename: str = "norm_stats.json",
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        if vit_config is None:
+            self.vit_config = MolmoAct2VitConfig()
+        elif isinstance(vit_config, dict):
+            self.vit_config = MolmoAct2VitConfig(**vit_config)
+        else:
+            self.vit_config = vit_config
+        if adapter_config is None:
+            self.adapter_config = MolmoAct2AdapterConfig()
+        elif isinstance(adapter_config, dict):
+            self.adapter_config = MolmoAct2AdapterConfig(**adapter_config)
+        else:
+            self.adapter_config = adapter_config
+        if text_config is None:
+            self.text_config = MolmoAct2TextConfig()
+        elif isinstance(text_config, dict):
+            self.text_config = MolmoAct2TextConfig(**text_config)
+        else:
+            self.text_config = text_config
+        self.add_action_expert = bool(add_action_expert)
+        if not self.add_action_expert:
+            self.action_expert_config = None
+        elif action_expert_config is None:
+            self.action_expert_config = MolmoAct2ActionExpertConfig(
+                max_action_horizon=max_action_horizon,
+                max_action_dim=max_action_dim,
+                num_layers=self.text_config.num_hidden_layers,
+            )
+        elif isinstance(action_expert_config, dict):
+            self.action_expert_config = MolmoAct2ActionExpertConfig(**action_expert_config)
+        else:
+            self.action_expert_config = action_expert_config
+        if self.add_action_expert:
+            self.action_expert_config.max_action_dim = int(max_action_dim)
+            self.action_expert_config.max_action_horizon = int(max_action_horizon)
+            self._validate_release_action_config(
+                state_format=state_format,
+            )
+        self.image_start_token_id = image_start_token_id
+        self.low_res_image_start_token_id = low_res_image_start_token_id
+        self.image_end_token_id = image_end_token_id
+        self.image_low_res_id = image_low_res_id
+        self.image_high_res_id = image_patch_id
+        self.image_patch_id = image_patch_id
+        self.image_col_id = image_col_id
+        self.frame_start_token_id = frame_start_token_id
+        self.frame_end_token_id = frame_end_token_id
+        self.use_frame_special_tokens = use_frame_special_tokens
+        self.initializer_range = initializer_range
+        self.max_action_dim = max_action_dim
+        self.max_action_horizon = max_action_horizon
+        self.n_obs_steps = n_obs_steps
+        self.action_mode = action_mode
+        self.state_format = state_format
+        self.flow_matching_num_steps = flow_matching_num_steps
+        self.flow_matching_cutoff = flow_matching_cutoff
+        self.flow_matching_time_offset = flow_matching_time_offset
+        self.flow_matching_time_scale = flow_matching_time_scale
+        self.flow_matching_beta_alpha = flow_matching_beta_alpha
+        self.flow_matching_beta_beta = flow_matching_beta_beta
+        self.mask_action_dim_padding = mask_action_dim_padding
+        self.enable_depth_reasoning = enable_depth_reasoning
+        self.depth_mode = depth_mode
+        self.num_depth_codes = num_depth_codes
+        self.action_expert_depth_gate = action_expert_depth_gate
+        self.action_expert_depth_gate_per_layer = action_expert_depth_gate_per_layer
+        self.action_expert_depth_gate_init_bias = action_expert_depth_gate_init_bias
+        self.action_output_token_id = action_output_token_id
+        self.action_start_token_id = action_start_token_id
+        self.action_end_token_id = action_end_token_id
+        self.action_token_start_id = action_token_start_id
+        self.num_action_tokens = num_action_tokens
+        self.depth_output_token_id = depth_output_token_id
+        self.depth_start_token_id = depth_start_token_id
+        self.depth_end_token_id = depth_end_token_id
+        self.depth_token_start_id = depth_token_start_id
+        self.num_depth_tokens = num_depth_tokens
+        self.state_start_token_id = state_start_token_id
+        self.state_end_token_id = state_end_token_id
+        self.state_token_start_id = state_token_start_id
+        self.num_state_tokens = num_state_tokens
+        self.add_setup_tokens = add_setup_tokens
+        self.add_control_tokens = add_control_tokens
+        self.norm_stats_filename = norm_stats_filename
+
+    @staticmethod
+    def _validate_release_action_config(*, state_format: str) -> None:
+        """Raise ``ValueError`` unless ``state_format`` is ``"discrete"``."""
+        # Guard-clause form: the accepted value returns early, anything else is rejected.
+        if state_format == "discrete":
+            return
+        raise ValueError("MolmoAct2 HF export supports only state_format='discrete'.")
+
+    @property
+    def image_num_patch(self):  # forwards `vit_config.image_num_patch`
+        assert self.vit_config is not None  # sanity guard only; `assert` is stripped under `python -O`
+        return self.vit_config.image_num_patch
+
+    @property
+    def num_attention_heads(self):  # forwards `text_config.num_attention_heads`
+        return self.text_config.num_attention_heads
+
+    @property
+    def num_key_value_heads(self):  # forwards `text_config.num_key_value_heads`
+        return self.text_config.num_key_value_heads
+
+    @property
+    def head_dim(self):  # forwards `text_config.head_dim`
+        return self.text_config.head_dim
+
+    @property
+    def num_hidden_layers(self):  # forwards `text_config.num_hidden_layers`
+        return self.text_config.num_hidden_layers
+
+    @property
+    def hidden_size(self):  # forwards `text_config.hidden_size`
+        return self.text_config.hidden_size
+
+    @property
+    def vocab_size(self):  # forwards `text_config.vocab_size`
+        return self.text_config.vocab_size
+
+    @property
+    def max_position_embeddings(self):  # forwards `text_config.max_position_embeddings`
+        return self.text_config.max_position_embeddings
+
+
+MolmoAct2VitConfig.register_for_auto_class()  # transformers hook: register each custom config class with its auto class
+MolmoAct2AdapterConfig.register_for_auto_class()
+MolmoAct2TextConfig.register_for_auto_class()
+MolmoAct2ActionExpertConfig.register_for_auto_class()
+MolmoAct2Config.register_for_auto_class()
diff --git a/src/lerobot/policies/molmoact2/hf_model/image_processing_molmoact2.py b/src/lerobot/policies/molmoact2/hf_model/image_processing_molmoact2.py
new file mode 100644
index 000000000..a172c8477
--- /dev/null
+++ b/src/lerobot/policies/molmoact2/hf_model/image_processing_molmoact2.py
@@ -0,0 +1,564 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The Allen Institute for Artificial Intelligence and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# ruff: noqa
+
+"""Image processor class for MolmoAct2"""
+
+from typing import Optional, Union
+import numpy as np
+import einops
+import torch
+import torchvision.transforms
+
+from transformers.image_utils import (
+    IMAGENET_STANDARD_MEAN,
+    IMAGENET_STANDARD_STD,
+    ImageInput,
+    PILImageResampling,
+    make_flat_list_of_images,
+    valid_images,
+    to_numpy_array,
+)
+from transformers.image_transforms import convert_to_rgb
+from transformers.processing_utils import ImagesKwargs
+from transformers.image_processing_utils import BaseImageProcessor, get_size_dict
+from transformers.utils import logging
+from transformers.feature_extraction_utils import BatchFeature
+from transformers.utils import TensorType, logging
+
+
+logger = logging.get_logger(__name__)
+
+
+def normalize_image(
+    image: np.ndarray,
+    image_mean: list[float],
+    image_std: list[float],
+) -> np.ndarray:
+    """Return a channel-wise normalized copy of a channels-last image; the input is never mutated."""
+    if np.allclose(image_mean, [0.5, 0.5, 0.5]) and np.allclose(image_std, [0.5, 0.5, 0.5]):
+        return image * np.asarray(2.0, dtype=np.float32) - np.asarray(1.0, dtype=np.float32)  # (x-.5)/.5
+    # Out-of-place arithmetic: no caller-array mutation; integer inputs and scalar stats broadcast fine.
+    return (image - np.asarray(image_mean, dtype=np.float32)) / np.asarray(image_std, dtype=np.float32)
+
+
+def resize_image(
+    image: np.ndarray,
+    desired_output_size: list[int],
+    resample: PILImageResampling,
+) -> np.ndarray:
+    """Resize a channels-last image and rescale it to float32 values in ``[0, 1]``.
+
+    Floating-point inputs are treated as already lying in ``[0, 1]``; any other
+    input must be ``uint8`` and is treated as lying in ``[0, 255]``.
+
+    Args:
+        image: Array indexed as ``[height, width, channels]``.
+        desired_output_size: Target size handed to ``torchvision.transforms.Resize``.
+        resample: Interpolation setting handed to ``torchvision.transforms.Resize``.
+
+    Returns:
+        The resized image as a float32 array, again channels-last.
+    """
+    # torchvision resizes channels-first tensors.
+    chw = torch.permute(torch.from_numpy(image), [2, 0, 1])
+    source_dtype = chw.dtype
+    if torch.is_floating_point(chw):
+        low, high = 0.0, 1.0
+    else:
+        assert chw.dtype == torch.uint8, "SigLIP expects float images or uint8 images, but got {}".format(
+            chw.dtype
+        )
+        low, high = 0, 255
+    resizer = torchvision.transforms.Resize(desired_output_size, resample, antialias=False)
+    # Clamp to the valid range and cast back to the input dtype before normalizing.
+    out = torch.clip(resizer(chw), low, high).to(source_dtype)
+
+    # Map the valid range onto [0, 1] in float32 and return to channels-last.
+    out = (out.to(torch.float32) - float(low)) / float(high - low)
+    return torch.permute(out, [1, 2, 0]).numpy()
+
+
+def select_tiling(h, w, patch_size, max_num_crops):
+    """Pick the ``(rows, cols)`` tiling of ``patch_size`` tiles that best fits an ``[h, w]`` image.
+
+    Every tiling with ``rows * cols <= max_num_crops`` is a candidate. If all of them
+    force a downscale, the one that downscales least wins; otherwise the one that
+    needs the least upscaling wins. Ties go to the tiling with fewer tiles, then fewer rows.
+
+    Returns:
+        An ``int32`` array ``[rows, cols]``.
+    """
+    tilings = [
+        (i, j)
+        for i in range(1, max_num_crops + 1)
+        for j in range(1, max_num_crops + 1)
+        if i * j <= max_num_crops
+    ]
+    # sort so argmin and argmax favour smaller tilings in the event of a tie
+    tilings.sort(key=lambda x: (x[0] * x[1], x[0]))
+    candidate_tilings = np.array(tilings, dtype=np.int32)  # [n_resolutions, 2]
+    candidate_resolutions = candidate_tilings * patch_size  # [n_resolutions, 2]
+    original_size = np.array([h, w], dtype=np.float32)  # [2]
+    # A side can be zero when the image is smaller than the margin; the resulting infinite
+    # scale leaves the choice to the other side, or to the smallest tiling.
+    with np.errstate(divide="ignore"):
+        # Scale needed for the image to fit each tiling, limited by the tighter side.
+        required_scale = np.min(candidate_resolutions.astype(np.float32) / original_size, axis=-1)
+    if np.all(required_scale < 1):
+        # Forced to downscale: minimise the amount of downscaling.
+        return candidate_tilings[np.argmax(required_scale)]
+    # Otherwise take the least upscaling, ignoring tilings that would downscale.
+    return candidate_tilings[np.argmin(np.where(required_scale < 1.0, 10e9, required_scale))]
+
+
+def build_resized_image(
+    image: np.ndarray,
+    base_image_input_size: list[int],
+    resample: PILImageResampling,
+    image_mean: list[float],
+    image_std: list[float],
+    image_patch_size: int,
+) -> tuple[np.ndarray, np.ndarray]:
+    """Build the single resized view of ``image`` and the index grid of its patches.
+
+    Returns:
+        The normalized image with a leading axis of length one added when it is 3-D,
+        and a ``[patches_high, patches_wide]`` array numbering the patches row by row.
+    """
+    scaled = normalize_image(resize_image(image, base_image_input_size, resample), image_mean, image_std)
+    if scaled.ndim == 3:
+        scaled = scaled[None]
+    patches_wide = base_image_input_size[1] // image_patch_size
+    patches_high = base_image_input_size[0] // image_patch_size
+    return scaled, np.arange(patches_wide * patches_high).reshape([patches_high, patches_wide])
+
+
+def build_overlapping_crops(
+    image: np.ndarray,
+    max_crops: int,
+    overlap_margins: list[int],
+    base_image_input_size: list[int],
+    resample: PILImageResampling,
+    image_mean: list[float],
+    image_std: list[float],
+    image_patch_size: int,
+) -> tuple[np.ndarray, np.ndarray]:
+    """Decompose an image into a set of overlapping crops
+
+    :param overlap_margins: ``[left, right]`` overlap, in patches, at the leading/trailing crop edges
+    :return crop_arr: [n_crops, h, w, 3] The crops
+    :return patch_idx: [overlap_patch_h, overlap_patch_w] For each patch in the resized image
+                        the crops were extracted from, what patch in `crop_arr` it corresponds to
+    """
+    original_image_h, original_image_w = image.shape[:2]
+    crop_size = base_image_input_size[0]
+    assert base_image_input_size[0] == base_image_input_size[1]
+
+    left_margin, right_margin = overlap_margins
+    total_margin_pixels = image_patch_size * (right_margin + left_margin)  # pixels removed per dim
+    crop_patches = base_image_input_size[0] // image_patch_size  # patches per crop dim
+    crop_window_patches = crop_patches - (right_margin + left_margin)  # usable patches
+    crop_window_size = crop_window_patches * image_patch_size
+    crop_patch_w = base_image_input_size[1] // image_patch_size
+    crop_patch_h = base_image_input_size[0] // image_patch_size
+
+    # Decide how to tile the image, to account for the overlap margins we compute the tiling
+    # as if we had an image without the margins and were using a crop size without the margins
+    tiling = select_tiling(
+        original_image_h - total_margin_pixels,
+        original_image_w - total_margin_pixels,
+        crop_window_size,
+        max_crops,
+    )
+
+    src = resize_image(
+        image,
+        [
+            tiling[0] * crop_window_size + total_margin_pixels,
+            tiling[1] * crop_window_size + total_margin_pixels,
+        ],
+        resample,
+    )
+    src = normalize_image(src, image_mean, image_std)
+
+    # Now we have to split the image into crops, and track what patches came from
+    # where in `patch_idx_arr`
+    n_crops = tiling[0] * tiling[1]
+    crop_arr = np.zeros([n_crops, crop_size, crop_size, 3], dtype=src.dtype)
+    patch_idx_arr = np.zeros([n_crops, crop_patch_h, crop_patch_w], dtype=np.int32)
+    on_crop = 0
+    for i in range(tiling[0]):
+        # Slide over `src` by `crop_window_size` steps, but extract crops of size `crop_size`
+        # which results in overlapping crop windows
+        y0 = i * crop_window_size
+        for j in range(tiling[1]):
+            x0 = j * crop_window_size
+            crop_arr[on_crop] = src[y0 : y0 + crop_size, x0 : x0 + crop_size]
+            patch_idx = np.arange(crop_patch_w * crop_patch_h).reshape(crop_patch_h, crop_patch_w)
+            patch_idx += on_crop * crop_patch_h * crop_patch_w
+
+            # Mask out idx that are in the overlap region. The trailing slices use an explicit
+            # start because `-right_margin:` would cover the whole axis when the margin is 0.
+            if i != 0:
+                patch_idx[:left_margin, :] = -1
+            if j != 0:
+                patch_idx[:, :left_margin] = -1
+            if i != tiling[0] - 1:
+                patch_idx[crop_patch_h - right_margin :, :] = -1
+            if j != tiling[1] - 1:
+                patch_idx[:, crop_patch_w - right_margin :] = -1
+            patch_idx_arr[on_crop] = patch_idx
+            on_crop += 1
+
+    # `patch_idx_arr` is ordered crop-by-crop, here we transpose `patch_idx_arr`
+    # so it is in row-major order over the whole image
+    patch_idx_arr = np.reshape(patch_idx_arr, [tiling[0], tiling[1], crop_patch_h, crop_patch_w])
+    patch_idx_arr = np.transpose(patch_idx_arr, [0, 2, 1, 3])
+    patch_idx_arr = np.reshape(patch_idx_arr, [-1])
+
+    # Now get the parts not in the overlap region, so it should map each patch in `src`
+    # to the correct patch it should come from in `crop_arr`
+    patch_idx_arr = patch_idx_arr[patch_idx_arr >= 0].reshape(
+        src.shape[0] // image_patch_size,
+        src.shape[1] // image_patch_size,
+    )
+    return crop_arr, patch_idx_arr
+
+
+def batch_pixels_to_patches(array: np.ndarray, patch_size: int) -> np.ndarray:
+    """Reshape images of [n_images, h, w(, c)] -> [n_images, n_patches, pixels_per_patch]
+
+    Patches are non-overlapping ``patch_size`` squares listed row by row; a trailing
+    channel axis, when present, is flattened into ``pixels_per_patch``.
+    """
+    if array.ndim == 3:
+        (count, height, width), channels = array.shape, ()
+    else:
+        count, height, width, depth = array.shape
+        channels = (depth,)
+    rows, cols = height // patch_size, width // patch_size
+    # Split each spatial axis into (patch index, offset inside the patch) ...
+    tiles = array.reshape(count, rows, patch_size, cols, patch_size, *channels)
+    # ... then bring the two patch indices together ahead of the in-patch offsets.
+    tiles = np.moveaxis(tiles, 2, 3)
+    pixels_per_patch = patch_size * patch_size * (channels[0] if channels else 1)
+    return tiles.reshape(count, rows * cols, pixels_per_patch)
+
+
+def arange_for_pooling(
+    idx_arr: np.ndarray,
+    pool_h: int,
+    pool_w: int,
+) -> np.ndarray:
+    """Group a 2-D patch-index grid into ``pool_h`` x ``pool_w`` pooling windows.
+
+    The grid is padded with -1 up to multiples of the window size, the extra cell of an odd
+    pad going to the trailing side. Returns ``[h_windows, w_windows, pool_h * pool_w]``.
+    """
+    pad_h, pad_w = -idx_arr.shape[0] % pool_h, -idx_arr.shape[1] % pool_w
+    split = [[pad_h // 2, (pad_h + 1) // 2], [pad_w // 2, (pad_w + 1) // 2]]
+    padded = np.pad(idx_arr, split, mode="constant", constant_values=-1)
+    return einops.rearrange(padded, "(h dh) (w dw) -> h w (dh dw)", dh=pool_h, dw=pool_w)
+
+
+def image_to_patches_and_grids(
+    image: np.ndarray,
+    max_crops: int,
+    overlap_margins: list[int],
+    base_image_input_size: list[int],
+    resample: PILImageResampling,
+    image_mean: list[float],
+    image_std: list[float],
+    image_patch_size: int,
+    image_pooling_w: int,
+    image_pooling_h: int,
+    crop_mode: str = "overlap-and-resize-c2",
+) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """Convert one image into flattened ViT patches plus the tables used to pool them.
+
+    The resized view always comes first in the output; overlapping crops, when requested, follow it.
+
+    :param image: channels-last image array
+    :param max_crops: upper bound on the number of overlapping crops (unused in "resize" mode)
+    :param overlap_margins: ``[left, right]`` crop overlap in patches (unused in "resize" mode)
+    :param base_image_input_size: ``[height, width]`` of the resized view; a bare int means a square
+    :param resample: interpolation used for every resize
+    :param image_mean: per-channel mean used for normalization
+    :param image_std: per-channel standard deviation used for normalization
+    :param image_patch_size: side length, in pixels, of one ViT patch
+    :param image_pooling_w: width, in patches, of one pooling window
+    :param image_pooling_h: height, in patches, of one pooling window
+    :param crop_mode: "resize" emits only the resized view; "overlap-and-resize-c2" and
+                      "overlap-and-resize" also emit overlapping crops
+    :return image_grids: ``[1, 4]`` array ``[resized_h, resized_w, h, w]``, the shape of each
+                         (low-res, high-res) image after pooling; ``h, w`` are 0 in "resize" mode
+    :return crops: the image crops to process with the ViT, each flattened into patches
+    :return pooled_patch_idx: for each pooled token, the indices of the patches in `crops`
+                              to pool for that token, masked with -1
+    :raises ValueError: if ``crop_mode`` is not one of the three supported values
+    """
+    if isinstance(base_image_input_size, int):
+        base_image_input_size = (base_image_input_size, base_image_input_size)
+    if crop_mode not in {"resize", "overlap-and-resize-c2", "overlap-and-resize"}:
+        raise ValueError(f"Unsupported MolmoAct2 image crop_mode {crop_mode!r}.")
+
+    def pooled(index_grid):
+        """Window an index grid for pooling; return (window rows, window cols, flat table)."""
+        windows = arange_for_pooling(index_grid, image_pooling_h, image_pooling_w)
+        rows, cols = windows.shape[:2]
+        return rows, cols, windows.reshape([-1, image_pooling_h * image_pooling_w])
+
+    # The resized view is needed in every mode.
+    resized, resize_idx = build_resized_image(
+        image,
+        base_image_input_size,
+        resample,
+        image_mean,
+        image_std,
+        image_patch_size,
+    )
+    resized_h, resized_w, resize_idx = pooled(resize_idx)
+    if crop_mode == "resize":
+        return (
+            np.stack([np.array([resized_h, resized_w, 0, 0])], 0),
+            batch_pixels_to_patches(resized, image_patch_size),
+            resize_idx,
+        )
+
+    crop_arr, patch_idx_arr = build_overlapping_crops(
+        image,
+        max_crops,
+        overlap_margins,
+        base_image_input_size,
+        resample,
+        image_mean,
+        image_std,
+        image_patch_size,
+    )
+    h, w, pooling_idx = pooled(patch_idx_arr)
+
+    # Global image goes first, so the order of patches in previous crops gets increased
+    crop_patch_h = base_image_input_size[0] // image_patch_size
+    crop_patch_w = base_image_input_size[1] // image_patch_size
+    pooling_idx = np.where(pooling_idx >= 0, pooling_idx + crop_patch_h * crop_patch_w, -1)
+    return (
+        np.stack([np.array([resized_h, resized_w, h, w])], 0),
+        batch_pixels_to_patches(np.concatenate([resized, crop_arr], 0), image_patch_size),
+        np.concatenate([resize_idx, pooling_idx]),
+    )
+
+
+class MolmoAct2ImagesKwargs(ImagesKwargs, total=False):  # MolmoAct2-specific image kwargs; every key is optional (total=False)
+    max_crops: int | None  # maximum number of crops to use per image
+    overlap_margins: list[int] | None  # overlap margins used when cropping
+    crop_mode: str | None  # name of the cropping strategy
+    patch_size: int | None  # spatial patch size of the vision encoder
+    pooling_size: list[int] | None  # pooling size of the vision adapter
+
+
+class MolmoAct2ImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a MolmoAct2 image processor that preprocesses images for the model.
+
+    Args:
+        size (`dict[str, int]` *optional*, defaults to `{"height": 378, "width": 378}`):
+            Size of the image after resizing.
+        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
+            Resampling filter to use when resizing the image.
+        image_mean (`float` or `list[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
+            Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
+        image_std (`float` or `list[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in the image.
+        do_convert_rgb (`bool`, *optional*, defaults to `True`):
+            Whether to convert the image to RGB.
+        max_crops (`int`, *optional*, defaults to `8`):
+            Maximum number of crops to use per image.
+        overlap_margins (`list[int]`, *optional*, defaults to `[4, 4]`):
+            Overlap margins to use.
+        crop_mode (`str`, *optional*, defaults to `"overlap-and-resize-c2"`):
+            One of `"resize"`, `"overlap-and-resize"` or `"overlap-and-resize-c2"`.
+        patch_size (`int`, *optional*, defaults to 14):
+            The spatial patch size of the vision encoder.
+        pooling_size (`list[int]`, *optional*, defaults to `[2, 2]`):
+            The pooling size of the vision adapter.
+    """
+
+    model_input_names = ["pixel_values", "image_token_pooling", "image_grids", "image_num_crops"]
+
+    def __init__(
+        self,
+        size: dict[str, int] | None = None,
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        image_mean: float | list[float] | None = None,
+        image_std: float | list[float] | None = None,
+        do_convert_rgb: bool = True,
+        max_crops: int = 8,
+        overlap_margins: list[int] = [4, 4],
+        crop_mode: str = "overlap-and-resize-c2",
+        patch_size: int = 14,
+        pooling_size: list[int] = [2, 2],
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        size = size if size is not None else {"height": 378, "width": 378}
+        size = get_size_dict(size, default_to_square=True)
+        self.size = size
+
+        self.resample = resample
+        self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
+        self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
+        self.do_convert_rgb = do_convert_rgb
+        # Copy list arguments so instances never share (or mutate) the default list objects.
+        self.max_crops = max_crops
+        self.overlap_margins = overlap_margins if overlap_margins is None else list(overlap_margins)
+        self.crop_mode = crop_mode
+        self.patch_size = patch_size
+        self.pooling_size = pooling_size if pooling_size is None else list(pooling_size)
+
+    def preprocess(
+        self,
+        images: ImageInput,
+        size: dict[str, int] | None = None,
+        resample: PILImageResampling | None = None,
+        image_mean: float | list[float] | None = None,
+        image_std: float | list[float] | None = None,
+        do_convert_rgb: bool | None = None,
+        max_crops: int | None = None,
+        overlap_margins: list[int] | None = None,
+        crop_mode: str | None = None,
+        patch_size: int | None = None,
+        pooling_size: list[int] | None = None,
+        return_tensors: str | TensorType | None = None,
+        **kwargs,
+    ) -> BatchFeature:
+        """
+        Args:
+            images (`ImageInput`):
+                Image to preprocess. If `None`, an empty `BatchFeature` is returned.
+            size (`dict[str, int]`, *optional*, defaults to `self.size`):
+                Size of the image after resizing.
+            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
+                Resampling filter to use when resizing the image. This can be one of the enum `PILImageResampling`.
+            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean to use for normalization.
+            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation to use for normalization.
+            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+                Whether to convert the image to RGB.
+            max_crops (`int`, *optional*, defaults to `self.max_crops`):
+                Maximum number of crops to use per image.
+            overlap_margins (`list[int]`, *optional*, defaults to `self.overlap_margins`):
+                Overlap margins to use.
+            crop_mode (`str`, *optional*, defaults to `self.crop_mode`):
+                One of `"resize"`, `"overlap-and-resize"` or `"overlap-and-resize-c2"`.
+            patch_size (`int`, *optional*, defaults to `self.patch_size`):
+                The spatial patch size of the vision encoder.
+            pooling_size (`list[int]`, *optional*, defaults to `self.pooling_size`):
+                The pooling size of the vision adapter.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                - Unset: Return a list of `np.ndarray`.
+                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+
+        Returns:
+            A `BatchFeature` containing the following keys:
+                - `pixel_values`: The preprocessed images.
+                - `image_token_pooling`: The indices of the patches in `crops` to pool for each token in `image_tokens`.
+                - `image_grids`: The image grids.
+                - `image_num_crops`: The number of crops for each image.
+        """
+        if size is not None:
+            if "height" not in size or "width" not in size:
+                raise ValueError("size must contain 'height' and 'width' keys.")
+        else:
+            size = {**self.size}
+
+        base_image_input_size = [size["height"], size["width"]]
+
+        # Resolve overrides with `is None` so falsy values are honoured: `do_convert_rgb=False`
+        # and `PILImageResampling.NEAREST` (which equals 0) must not fall back to the defaults.
+        resample = self.resample if resample is None else resample
+        image_mean = self.image_mean if image_mean is None else image_mean
+        image_std = self.image_std if image_std is None else image_std
+        do_convert_rgb = self.do_convert_rgb if do_convert_rgb is None else do_convert_rgb
+        max_crops = self.max_crops if max_crops is None else max_crops
+        overlap_margins = self.overlap_margins if overlap_margins is None else overlap_margins
+        crop_mode = self.crop_mode if crop_mode is None else crop_mode
+        patch_size = self.patch_size if patch_size is None else patch_size
+        pooling_size = self.pooling_size if pooling_size is None else pooling_size
+        image_pooling_h, image_pooling_w = pooling_size
+
+        if images is None:
+            # Nothing to process: return an empty batch instead of failing on `None` further down.
+            return BatchFeature({}, tensor_type=return_tensors)
+
+        images = self.fetch_images(images)
+        images = make_flat_list_of_images(images)
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+
+        if do_convert_rgb:
+            images = [convert_to_rgb(image) for image in images]
+
+        # All transformations expect numpy arrays.
+        images = [to_numpy_array(image) for image in images]
+
+        # Per-image results, concatenated after the loop.
+        batch_grids = []
+        batch_crops = []
+        batch_pooled_patches_idx = []
+        batch_num_crops = []
+
+        for image in images:
+            image_grid, crops, pooled_idx = image_to_patches_and_grids(
+                image,
+                max_crops,
+                overlap_margins,
+                base_image_input_size,
+                resample,
+                image_mean,
+                image_std,
+                patch_size,
+                image_pooling_w,
+                image_pooling_h,
+                crop_mode,
+            )
+            batch_grids.append(image_grid)
+            batch_crops.append(crops)
+            batch_pooled_patches_idx.append(pooled_idx)
+            batch_num_crops.append(crops.shape[0])
+
+        # Crops and pooling rows of all images are stacked along the first axis;
+        # `image_num_crops` records how many crops belong to each image.
+        data = {
+            "pixel_values": np.concatenate(batch_crops, 0),
+            "image_token_pooling": np.concatenate(batch_pooled_patches_idx, 0),
+            "image_grids": np.concatenate(batch_grids, 0),
+            "image_num_crops": np.array(batch_num_crops),
+        }
+
+        return BatchFeature(data, tensor_type=return_tensors)
+
+
+MolmoAct2ImageProcessor.register_for_auto_class()  # transformers hook: register this custom processor with its auto class
diff --git a/src/lerobot/policies/molmoact2/hf_model/inference.py b/src/lerobot/policies/molmoact2/hf_model/inference.py
new file mode 100644
index 000000000..2c0243880
--- /dev/null
+++ b/src/lerobot/policies/molmoact2/hf_model/inference.py
@@ -0,0 +1,748 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The Allen Institute for Artificial Intelligence and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# ruff: noqa
+
+"""Inference utilities for MolmoAct2"""
+
+from dataclasses import dataclass
+from typing import Any, Optional, Tuple
+from collections.abc import Iterable, Sequence
+
+import torch
+from torch.nn import functional as F
+from transformers.cache_utils import Cache
+from transformers.configuration_utils import PretrainedConfig
+
+
+@dataclass
+class _ActionFlowInputs:  # everything the action-flow loop reads; cloned into static buffers for capture
+    trajectory: torch.Tensor
+    context: Any  # must expose kv_contexts, cross_mask, self_mask, valid_action and rope_cache
+    modulations: Sequence[Any]  # one entry per step: conditioning, block_modulations, final_modulation
+    action_dim_is_pad: torch.Tensor | None
+
+
+@dataclass
+class _ActionFlowCudaGraph:  # one captured action-flow graph plus the buffers it reads and writes
+    key: tuple[Any, ...]  # built by ``_cuda_graph_key``; a different key triggers a re-capture
+    graph: torch.cuda.CUDAGraph
+    static_inputs: _ActionFlowInputs  # refreshed in place before a replay
+    output: torch.Tensor  # value returned by the captured call; cloned before it is handed out
+
+
+@dataclass
+class _DepthDecodeCudaGraphLayerStage:  # the four outputs of ``_depth_decode_pre_layer`` for one layer
+    residual: torch.Tensor  # layer input, saved before ``attn_norm`` is applied
+    query: torch.Tensor  # query states after rotary embedding
+    key: torch.Tensor  # key states after rotary embedding
+    value: torch.Tensor
+
+
+@dataclass
+class _DepthDecodeCudaGraphPostStage:  # captured post-attention graph for one layer
+    graph: torch.cuda.CUDAGraph
+    attn_context: torch.Tensor  # static input buffer; the eager attention output is copied in before replay
+
+
+@dataclass
+class _DepthDecodeCudaGraph:  # all graphs and static buffers used for single-token depth decoding
+    cache_key: tuple[Any, ...]  # built by ``_depth_decode_key``; a mismatch triggers a rebuild
+    pre_graph: torch.cuda.CUDAGraph  # embeds the token and runs the layer-0 pre-attention stage
+    token_ids: torch.Tensor  # static (1, 1) input-id buffer
+    cos: torch.Tensor  # static rotary buffers, refreshed for the current position
+    sin: torch.Tensor
+    positions: torch.Tensor  # ``arange(max_cache_len)``; sliced to form ``cache_position``
+    stages: Sequence[_DepthDecodeCudaGraphLayerStage]  # one per layer
+    post_graphs: Sequence[_DepthDecodeCudaGraphPostStage]  # one per layer; the last one also applies ``ln_f``
+    output: torch.Tensor  # value returned by the last post graph
+
+
+@dataclass
+class _DepthDecodeCudaGraphSpec:  # values derived once from the transformer config and kept on the manager
+    eligible: bool  # whether the config allows the graph decode path
+    cache_key_prefix: tuple[Any, ...]  # config fields folded into the graph cache key
+    num_hidden_layers: int
+    head_dim: int
+    num_attention_heads: int
+
+
+def _cache_seq_len_int(past_key_values: Cache | None) -> int:
+    """Return the number of cached positions as a plain ``int`` (0 when no cache is given)."""
+    if past_key_values is None:
+        return 0
+    cached = past_key_values.get_seq_length()
+    # The cache may report its length as a tensor; ``.item()`` unwraps it first.
+    return int(cached.item() if torch.is_tensor(cached) else cached)
+
+
+def _cache_max_len_int(past_key_values: Cache | None) -> int:
+    """Return the cache capacity as an ``int``; -1 when there is no cache or no capacity is reported."""
+    max_len = None if past_key_values is None else past_key_values.get_max_cache_shape()
+    if max_len is None:  # some cache classes answer ``None`` instead of -1; ``int(None)`` would raise
+        return -1
+    is_tensor = torch.is_tensor(max_len)
+    return int(max_len.item()) if is_tensor else int(max_len)
+
+
+def _iter_cache_key_values(
+    past_key_values: Cache,
+) -> Iterable[tuple[torch.Tensor | None, torch.Tensor | None]]:
+    """Yield one ``(keys, values)`` pair per layer of ``past_key_values``."""
+    per_layer = getattr(past_key_values, "layers", None)
+    if per_layer is None:
+        # No ``layers`` attribute: iterate the cache itself and index each entry.
+        yield from ((entry[0], entry[1]) for entry in past_key_values)
+    else:
+        yield from ((getattr(lyr, "keys", None), getattr(lyr, "values", None)) for lyr in per_layer)
+
+
+class _DepthDecodeStaticLayerCache:
+    """Key/value store for one layer, backed by buffers sized for ``max_cache_len`` positions."""
+
+    is_compileable = False
+    is_sliding = False
+
+    def __init__(self, max_cache_len: int) -> None:
+        self.max_cache_len = int(max_cache_len)
+        self.cumulative_length = 0  # positions written so far
+        self.keys: torch.Tensor | None = None  # both buffers are created by the first ``update``
+        self.values: torch.Tensor | None = None
+
+    def _allocate(self, key_states: torch.Tensor, value_states: torch.Tensor) -> None:
+        """Create uninitialised buffers that use the batch and head sizes of ``key_states``."""
+        bsz, n_heads = key_states.shape[:2]
+        for name, states in (("keys", key_states), ("values", value_states)):
+            shape = (bsz, n_heads, self.max_cache_len, states.shape[-1])
+            setattr(self, name, torch.empty(shape, dtype=states.dtype, device=states.device))
+
+    def update(
+        self,
+        key_states: torch.Tensor,
+        value_states: torch.Tensor,
+        *args,
+        **kwargs,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Write the new states after the filled prefix and return views of everything written so far.
+
+        Extra arguments are accepted and ignored. Raises ``RuntimeError`` past ``max_cache_len``.
+        """
+        if self.keys is None:
+            self._allocate(key_states, value_states)
+        start = self.cumulative_length
+        end = start + key_states.shape[-2]
+        if end > self.max_cache_len:
+            raise RuntimeError(f"KV cache length {end} exceeds max_cache_len={self.max_cache_len}.")
+        for buffer, states in ((self.keys, key_states), (self.values, value_states)):
+            buffer[:, :, start:end, :].copy_(states)
+        self.cumulative_length = end
+        return self.keys[:, :, :end, :], self.values[:, :, :end, :]
+
+    def get_seq_length(self) -> int:
+        return self.cumulative_length
+
+    def get_max_cache_shape(self) -> int:
+        return -1  # NOTE(review): reports -1 although the buffers are bounded by ``max_cache_len`` — confirm intent
+
+    def reset(self) -> None:
+        self.cumulative_length = 0  # buffers are kept; later updates overwrite them from position 0
+
+
+class _DepthDecodeStaticCache(Cache):
+    """``Cache`` made of one ``_DepthDecodeStaticLayerCache`` per decoder layer of ``config``."""
+
+    def __init__(self, config: PretrainedConfig, max_cache_len: int) -> None:
+        n_layers = config.get_text_config(decoder=True).num_hidden_layers
+        per_layer = [_DepthDecodeStaticLayerCache(max_cache_len=max_cache_len) for _ in range(n_layers)]
+        super().__init__(layers=per_layer)
+
+    def get_seq_length(self, layer_idx: int = 0) -> int:
+        """Return the sequence length recorded by layer ``layer_idx``."""
+        return self.layers[layer_idx].get_seq_length()
+
+    def get_max_cache_shape(self, layer_idx: int = 0) -> int:
+        return self.layers[layer_idx].get_max_cache_shape()
+
+    def reset(self) -> None:
+        """Reset every layer."""
+        for layer in self.layers:
+            layer.reset()
+
+
+class ActionCudaGraphManager:  # captures the action-flow loop as a CUDA graph and replays it on static buffers
+    def __init__(self, model: Any) -> None:
+        self.model = model
+        self.enabled = True
+        self.action_flow_graph: _ActionFlowCudaGraph | None = None  # only the most recent capture is kept
+
+    def set_enabled(self, enabled: bool) -> None:
+        self.enabled = bool(enabled)
+
+    def can_use_action_flow(self, inputs: _ActionFlowInputs) -> bool:
+        action_model = self.model
+        if not self.enabled:
+            return False
+        if action_model.training or action_model._require_action_expert().training:  # eval mode only
+            return False
+        if inputs.trajectory.device.type != "cuda":
+            return False
+
+        def all_on_cuda():  # lazily yields every tensor reachable from ``inputs``
+            yield inputs.trajectory
+            for k, v in inputs.context.kv_contexts:
+                yield k
+                yield v
+            for t in (
+                inputs.context.cross_mask,
+                inputs.context.self_mask,
+                inputs.context.valid_action,
+                inputs.action_dim_is_pad,
+            ):
+                if t is not None:  # these four may be absent
+                    yield t
+            if inputs.context.rope_cache is not None:
+                yield from inputs.context.rope_cache
+            for step in inputs.modulations:
+                yield step.conditioning
+                for block_modulation in step.block_modulations:
+                    yield from block_modulation
+                yield from step.final_modulation
+
+        return all(t.device.type == "cuda" for t in all_on_cuda())  # stops at the first non-CUDA tensor
+
+    def run_action_flow(
+        self,
+        inputs: _ActionFlowInputs,
+        steps: int,
+        run_loop,  # invoked as ``run_loop(static_inputs, steps)``
+    ) -> torch.Tensor:
+        key = _cuda_graph_key(inputs, steps)
+        cache = self.action_flow_graph
+        if cache is None or cache.key != key:  # first call, or a tensor layout / the step count changed
+            static_inputs = _clone_static_inputs(inputs)  # private buffers the captured graph reads from
+            graph, output = _capture_cuda_graph(
+                lambda: run_loop(static_inputs, steps),
+                inputs.trajectory.device,
+                after_warmup=lambda: static_inputs.trajectory.copy_(inputs.trajectory),  # re-seed after warm-up
+            )
+            cache = _ActionFlowCudaGraph(
+                key=key,
+                graph=graph,
+                static_inputs=static_inputs,
+                output=output,
+            )
+            self.action_flow_graph = cache
+        else:
+            _copy_inputs_(cache.static_inputs, inputs)  # NOTE(review): does not refresh ``modulations`` — confirm
+
+        cache.graph.replay()  # runs right after capture as well, so a fresh graph is replayed once too
+        return cache.output.clone()  # clone, so the caller does not hold the graph's own output tensor
+
+
+class DepthDecodeCudaGraphManager:  # runs single-token decode steps through captured CUDA graphs
+    def __init__(self, model: Any) -> None:
+        self.model = model
+        self.backbone = model.model
+        self.enabled = True  # graph path is on by default; see ``set_enabled``
+        self.graph: _DepthDecodeCudaGraph | None = None  # built on first use, rebuilt when the key changes
+        self.graph_spec: _DepthDecodeCudaGraphSpec | None = None  # filled once by ``_depth_decode_spec``
+
+    def set_enabled(self, enabled: bool) -> None:
+        self.enabled = bool(enabled)  # ``can_use`` returns False while this is False
+
+    def make_static_cache(self, max_cache_len: int) -> _DepthDecodeStaticCache:
+        return _DepthDecodeStaticCache(  # the cache type that ``can_use`` requires
+            config=self.model.config.text_config,
+            max_cache_len=max_cache_len,
+        )
+
+    def _depth_decode_spec(self) -> _DepthDecodeCudaGraphSpec:
+        """Return the config-derived spec for graph decoding.
+
+        The spec is built on the first call and stored on ``self.graph_spec`` for reuse.
+        """
+        if self.graph_spec is not None:
+            return self.graph_spec
+        cfg = self.backbone.transformer.config
+        rotary_emb = getattr(self.backbone.transformer, "rotary_emb", None)
+        # All four conditions must hold for the graph decode path to be allowed.
+        eligible = (
+            not cfg.norm_after
+            and cfg.rope_scaling_layers is None
+            and getattr(rotary_emb, "rope_type", None) == "default"
+            and cfg._attn_implementation == "sdpa"
+        )
+        key_prefix = (
+            cfg.hidden_size, cfg.num_attention_heads, cfg.num_key_value_heads, cfg.head_dim,
+            cfg.num_hidden_layers, cfg.use_qk_norm, cfg.qk_norm_type, cfg._attn_implementation,
+        )
+        self.graph_spec = _DepthDecodeCudaGraphSpec(
+            eligible=eligible,
+            cache_key_prefix=key_prefix,
+            num_hidden_layers=cfg.num_hidden_layers,
+            head_dim=cfg.head_dim,
+            num_attention_heads=cfg.num_attention_heads,
+        )
+        return self.graph_spec
+
+    def can_use(
+        self,
+        next_input_ids: torch.Tensor,
+        *,
+        past_key_values: Cache,
+        attention_bias: torch.Tensor,
+    ) -> bool:
+        """Report whether this decode step may take the CUDA-graph path."""
+        modules_ready = self.enabled and not (self.model.training or self.backbone.transformer.training)
+        if not modules_ready or next_input_ids.device.type != "cuda":
+            return False
+        if tuple(next_input_ids.shape) != (1, 1):  # exactly one sequence and one new token
+            return False
+        if not isinstance(past_key_values, _DepthDecodeStaticCache):
+            return False
+        if not (torch.is_tensor(attention_bias) and attention_bias.device == next_input_ids.device):
+            return False
+        return self._depth_decode_spec().eligible
+
+    def _depth_decode_key(
+        self,
+        next_input_ids: torch.Tensor,
+        attention_bias: torch.Tensor,
+    ) -> tuple[Any, ...]:
+        """Build the key that decides whether a previously built decode graph can be reused."""
+        # Device and weight dtype are part of the key because the static buffers are created with them.
+        target = next_input_ids.device
+        prefix = self._depth_decode_spec().cache_key_prefix
+        weight_dtype = self.model.lm_head.weight.dtype
+        # The last axis of the bias is what the graph builder uses as the cache capacity.
+        capacity = attention_bias.shape[-1]
+        return (prefix, target.type, target.index, weight_dtype, capacity)
+
+    def _select_depth_decode_rope(self, cos: torch.Tensor, sin: torch.Tensor, *, past_length: int) -> None:
+        emb = self.backbone.transformer.rotary_emb  # its position caches are sliced at index ``past_length``
+        cos.copy_(emb._pos_cos_cache[0, :, past_length : past_length + 1, :])  # in place: ``cos`` is a static buffer
+        sin.copy_(emb._pos_sin_cache[0, :, past_length : past_length + 1, :])
+
+    def _depth_decode_pre_layer(
+        self,
+        layer_idx: int,
+        hidden_states: torch.Tensor,
+        cos: torch.Tensor,
+        sin: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:  # (residual, query, key, value)
+        block = self.backbone.transformer.blocks[layer_idx]
+        attention = block.self_attn
+        residual = hidden_states  # kept un-normalised; the post stage adds it back
+        hidden_states = block.attn_norm(hidden_states)
+
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, attention.head_dim)  # splits the last axis into (heads, head_dim)
+        qkv = attention.att_proj(hidden_states)  # one projection, split into q/k/v below
+        query_states, key_states, value_states = qkv.split(attention.fused_dims, dim=-1)
+        value_states = value_states.view(hidden_shape)
+
+        apply_qk_norm = attention.q_norm is not None and attention.k_norm is not None
+        norm_after_view = apply_qk_norm and attention.qk_norm_type == "qwen3"  # "qwen3": norm after the head split
+
+        if apply_qk_norm and not norm_after_view:  # every other norm type is applied before the head split
+            query_states = attention.q_norm(query_states)
+            key_states = attention.k_norm(key_states)
+
+        query_states = query_states.view(hidden_shape)
+        key_states = key_states.view(hidden_shape)
+
+        if norm_after_view:
+            query_states = attention.q_norm(query_states)
+            key_states = attention.k_norm(key_states)
+
+        query_states = query_states.transpose(1, 2)  # swap axes 1 and 2 before rotary embedding
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+        query_states, key_states = _apply_rotary_pos_emb(query_states, key_states, cos, sin)  # values get no rotary
+        return residual, query_states, key_states, value_states
+
+    def _depth_decode_pre0(
+        self,
+        token_ids: torch.Tensor,
+        cos: torch.Tensor,
+        sin: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Embed ``token_ids`` and run the pre-attention stage of layer 0 on the embeddings."""
+        return self._depth_decode_pre_layer(0, self.model._embed_base_tokens(token_ids), cos, sin)
+
+    def _depth_decode_post_layer(
+        self,
+        layer_idx: int,
+        residual: torch.Tensor,
+        attn_context: torch.Tensor,
+    ) -> torch.Tensor:
+        """Finish layer ``layer_idx`` once its attention output is available.
+
+        Applies the attention output projection and then the feed-forward sub-block, each
+        followed by dropout and a residual add.
+        """
+        block = self.backbone.transformer.blocks[layer_idx]
+        # Fold the trailing axes of the attention output so it lines up with ``residual``.
+        merged_heads = attn_context.reshape(*residual.shape[:-1], -1).contiguous()
+        after_attn = residual + block.dropout(block.self_attn.attn_out(merged_heads))
+
+        feed_forward = block.mlp(block.ff_norm(after_attn))
+        return after_attn + block.dropout(feed_forward)
+
+    def _depth_decode_post_and_pre_next(
+        self,
+        layer_idx: int,
+        residual: torch.Tensor,
+        attn_context: torch.Tensor,
+        cos: torch.Tensor,
+        sin: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:  # stage outputs for layer ``layer_idx + 1``
+        hidden_states = self._depth_decode_post_layer(layer_idx, residual, attn_context)  # finish this layer
+        return self._depth_decode_pre_layer(layer_idx + 1, hidden_states, cos, sin)  # then start the next one
+
+    def _depth_decode_last_post(
+        self,
+        layer_idx: int,
+        residual: torch.Tensor,
+        attn_context: torch.Tensor,
+    ) -> torch.Tensor:
+        hidden_states = self._depth_decode_post_layer(layer_idx, residual, attn_context)  # finish the last layer
+        return self.backbone.transformer.ln_f(hidden_states)  # no next layer: apply the transformer's ``ln_f``
+
+    def _build_depth_decode_graph(
+        self,
+        next_input_ids: torch.Tensor,
+        *,
+        past_length: int,
+        attention_bias: torch.Tensor,
+    ) -> _DepthDecodeCudaGraph:
+        text_config = self.backbone.transformer.config
+        device = next_input_ids.device
+        dtype = self.model.lm_head.weight.dtype
+        static = self._depth_decode_spec()
+        num_layers = static.num_hidden_layers
+        head_dim = static.head_dim
+        max_cache_len = int(attention_bias.shape[-1])  # last bias axis is taken as the cache capacity
+        max_rope_len = max(int(text_config.max_position_embeddings or 0), max_cache_len)
+        self.backbone.transformer.prepare_rope_cache(device=device, max_seq_len=max_rope_len)
+
+        token_ids = torch.empty((1, 1), device=device, dtype=torch.long)  # static inputs read by the graphs
+        cos = torch.empty((1, 1, head_dim), device=device, dtype=dtype)
+        sin = torch.empty_like(cos)
+        positions = torch.arange(max_cache_len, device=device, dtype=torch.long)
+        context_shape = (1, 1, static.num_attention_heads, head_dim)  # shape of each attention-output buffer
+
+        token_ids.copy_(next_input_ids)  # seed the static buffers with this step's values before capturing
+        self._select_depth_decode_rope(cos, sin, past_length=past_length)
+
+        pre_graph, pre_output = _capture_cuda_graph(  # first graph: embedding + layer-0 pre-attention stage
+            lambda: self._depth_decode_pre0(token_ids, cos, sin),
+            device,
+        )
+        stages = [_DepthDecodeCudaGraphLayerStage(*pre_output)]
+        post_graphs = []
+        for layer_idx in range(num_layers - 1):  # one graph per layer boundary: post(layer) + pre(layer + 1)
+            stage = stages[-1]
+            attn_context = torch.empty(context_shape, device=device, dtype=dtype)  # filled eagerly at run time
+            graph, output = _capture_cuda_graph(
+                lambda layer_idx=layer_idx, stage=stage, attn_context=attn_context: (  # defaults pin this iteration
+                    self._depth_decode_post_and_pre_next(
+                        layer_idx,
+                        stage.residual,
+                        attn_context,
+                        cos,
+                        sin,
+                    )
+                ),
+                device,
+            )
+            post_graphs.append(_DepthDecodeCudaGraphPostStage(graph=graph, attn_context=attn_context))
+            stages.append(_DepthDecodeCudaGraphLayerStage(*output))
+
+        last_stage = stages[-1]
+        last_attn_context = torch.empty(context_shape, device=device, dtype=dtype)
+        last_graph, last_output = _capture_cuda_graph(  # final graph: post stage of the last layer + ``ln_f``
+            lambda: self._depth_decode_last_post(
+                num_layers - 1,
+                last_stage.residual,
+                last_attn_context,
+            ),
+            device,
+        )
+        post_graphs.append(_DepthDecodeCudaGraphPostStage(graph=last_graph, attn_context=last_attn_context))
+        return _DepthDecodeCudaGraph(
+            cache_key=self._depth_decode_key(next_input_ids, attention_bias),
+            pre_graph=pre_graph,
+            token_ids=token_ids,
+            cos=cos,
+            sin=sin,
+            positions=positions,
+            stages=tuple(stages),  # ``num_layers`` entries
+            post_graphs=tuple(post_graphs),  # ``num_layers`` entries, index-aligned with ``stages``
+            output=last_output,
+        )
+
+    def _get_depth_decode_graph(
+        self,
+        next_input_ids: torch.Tensor,
+        *,
+        past_length: int,
+        attention_bias: torch.Tensor,
+    ) -> _DepthDecodeCudaGraph:
+        key = self._depth_decode_key(next_input_ids, attention_bias)
+        decode_graph = self.graph
+        if decode_graph is None or decode_graph.cache_key != key:  # nothing built yet, or built for another key
+            decode_graph = self._build_depth_decode_graph(  # the builder also seeds token ids and rope buffers
+                next_input_ids,
+                past_length=past_length,
+                attention_bias=attention_bias,
+            )
+            self.graph = decode_graph  # only one graph set is kept
+        else:
+            decode_graph.token_ids.copy_(next_input_ids)  # reuse: refresh the static inputs in place
+            self._select_depth_decode_rope(decode_graph.cos, decode_graph.sin, past_length=past_length)
+        return decode_graph
+
+    def _run_depth_decode_attention_core(
+        self,
+        layer_idx: int,
+        stage: _DepthDecodeCudaGraphLayerStage,
+        *,
+        past_key_values: Cache,
+        attention_bias: torch.Tensor,
+        cache_position: torch.Tensor,
+        cos: torch.Tensor,
+        sin: torch.Tensor,
+    ) -> torch.Tensor:
+        attention = self.backbone.transformer.blocks[layer_idx].self_attn
+        cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+        key_states, value_states = past_key_values.update(  # store this token's k/v; get the cached states back
+            stage.key,
+            stage.value,
+            layer_idx,
+            cache_kwargs,
+        )
+        key_states = _repeat_kv(key_states, attention.num_key_value_groups)  # repeat each k/v head per group
+        value_states = _repeat_kv(value_states, attention.num_key_value_groups)
+        attn_output = F.scaled_dot_product_attention(
+            stage.query,
+            key_states,
+            value_states,
+            attn_mask=attention_bias,
+            dropout_p=0.0,
+            is_causal=False,  # masking comes only from ``attention_bias``
+        )
+        return attn_output.transpose(1, 2)  # swap axes 1 and 2 back before the post stage reshapes it
+
+    def run(
+        self,
+        next_input_ids: torch.Tensor,
+        *,
+        past_key_values: Cache,
+        attention_bias: torch.Tensor,
+        past_length: int,
+    ) -> tuple[torch.Tensor, Cache]:  # (output of the last graph, the same cache object that was passed in)
+        end = past_length + 1
+        decode_graph = self._get_depth_decode_graph(
+            next_input_ids,
+            past_length=past_length,
+            attention_bias=attention_bias,
+        )
+        cache_position = decode_graph.positions[past_length:end]
+        attention_bias_q = attention_bias[:, :, past_length:end, :end]  # bias row of the new token, columns < end
+
+        decode_graph.pre_graph.replay()
+
+        for layer_idx, post_graph in enumerate(decode_graph.post_graphs):
+            attn_context = self._run_depth_decode_attention_core(  # cache update + attention run outside the graphs
+                layer_idx,
+                decode_graph.stages[layer_idx],
+                past_key_values=past_key_values,
+                attention_bias=attention_bias_q,
+                cache_position=cache_position,
+                cos=decode_graph.cos,
+                sin=decode_graph.sin,
+            )
+            post_graph.attn_context.copy_(attn_context)  # hand the eager result to the graph's input buffer
+            post_graph.graph.replay()
+
+        return decode_graph.output, past_key_values  # NOTE: ``output`` is returned as-is, not cloned
+
+
+def _cuda_graph_tensor_signature(
+    tensor: torch.Tensor | None,
+) -> tuple[Any, ...] | None:
+    """Summarise a tensor as ``(shape, stride, dtype, device)``; ``None`` stays ``None``.
+
+    The dtype and device are rendered with ``str``; tensor values play no part.
+    """
+    if tensor is None:
+        return None
+    layout = (tuple(tensor.shape), tuple(tensor.stride()))
+    return (*layout, str(tensor.dtype), str(tensor.device))
+
+
+def _cuda_graph_context_signature(context: Any) -> tuple[Any, ...]:
+    """Collect the signature of every tensor held by ``context``, in a fixed field order."""
+    sig = _cuda_graph_tensor_signature
+    kv_sigs = tuple((sig(k), sig(v)) for k, v in context.kv_contexts)
+    mask_sigs = (sig(context.cross_mask), sig(context.self_mask), sig(context.valid_action))
+    rope = context.rope_cache
+    rope_sigs = None if rope is None else tuple(sig(t) for t in rope)
+    # Flat layout: (kv pairs, cross mask, self mask, valid action, rope cache).
+    return (kv_sigs, *mask_sigs, rope_sigs)
+
+
+def _cuda_graph_modulation_signature(modulations: Sequence[Any]) -> tuple[Any, ...]:
+    """Collect tensor signatures as one ``(conditioning, blocks, final)`` entry per step."""
+    sig = _cuda_graph_tensor_signature
+
+    def step_signature(step: Any) -> tuple[Any, ...]:
+        cond = sig(step.conditioning)
+        blocks = tuple(tuple(sig(t) for t in group) for group in step.block_modulations)
+        return (cond, blocks, tuple(sig(t) for t in step.final_modulation))
+
+    return tuple(step_signature(step) for step in modulations)
+
+
+def _cuda_graph_key(inputs: _ActionFlowInputs, steps: int) -> tuple[Any, ...]:
+    """Combine the signatures of all action-flow inputs with the step count into one cache key."""
+    trajectory_sig = _cuda_graph_tensor_signature(inputs.trajectory)
+    context_sig = _cuda_graph_context_signature(inputs.context)
+    modulation_sig = _cuda_graph_modulation_signature(inputs.modulations)
+    pad_sig = _cuda_graph_tensor_signature(inputs.action_dim_is_pad)
+    # Only shapes, strides, dtypes and devices enter the key, never tensor values,
+    # so inputs with the same layout map to the same key.
+    return (trajectory_sig, context_sig, modulation_sig, pad_sig, int(steps))
+
+
+def _clone_static_tensor(tensor: torch.Tensor | None) -> torch.Tensor | None:
+    """Copy ``tensor`` into a new buffer with the same shape, strides, dtype and device.
+
+    ``None`` is passed through unchanged.
+    """
+    if tensor is None:
+        return None
+    layout = (tuple(tensor.shape), tuple(tensor.stride()))
+    buffer = torch.empty_strided(*layout, device=tensor.device, dtype=tensor.dtype)
+    # ``copy_`` returns the destination tensor, i.e. the freshly filled buffer.
+    return buffer.copy_(tensor)
+
+
+def _clone_static_context(context: Any) -> Any:
+    """Rebuild ``context`` (same class) with every tensor field cloned into its own buffer."""
+    clone = _clone_static_tensor
+    rope = context.rope_cache
+    return context.__class__(
+        kv_contexts=tuple((clone(k), clone(v)) for k, v in context.kv_contexts),
+        cross_mask=clone(context.cross_mask),
+        self_mask=clone(context.self_mask),
+        valid_action=clone(context.valid_action),
+        rope_cache=None if rope is None else tuple(clone(t) for t in rope),
+    )
+
+
+def _clone_static_modulations(modulations: Sequence[Any]) -> Sequence[Any]:
+    """Return a tuple of per-step modulation objects whose tensors are all fresh clones."""
+    clone = _clone_static_tensor
+    def clone_step(step: Any) -> Any:
+        # Rebuild with the step's own class so the clone carries the same fields.
+        return step.__class__(
+            conditioning=clone(step.conditioning),
+            block_modulations=tuple(tuple(clone(t) for t in group) for group in step.block_modulations),
+            final_modulation=tuple(clone(t) for t in step.final_modulation),
+        )
+
+    return tuple(clone_step(step) for step in modulations)
+
+
+def _clone_static_inputs(inputs: _ActionFlowInputs) -> _ActionFlowInputs:
+    """Copy every tensor of ``inputs`` into fresh buffers and return them as new inputs."""
+    trajectory = _clone_static_tensor(inputs.trajectory)
+    context = _clone_static_context(inputs.context)
+    modulations = _clone_static_modulations(inputs.modulations)
+    pad_mask = _clone_static_tensor(inputs.action_dim_is_pad)
+    return _ActionFlowInputs(trajectory, context, modulations, pad_mask)
+
+
+def _copy_context_(dst: Any, src: Any) -> None:
+    """Copy every non-``None`` tensor of ``src`` into the matching buffer of ``dst``, in place."""
+    for (dst_k, dst_v), (src_k, src_v) in zip(dst.kv_contexts, src.kv_contexts):
+        dst_k.copy_(src_k)
+        dst_v.copy_(src_v)
+    # The three optional tensors are handled alike: skipped when ``src`` holds ``None``.
+    for field in ("cross_mask", "self_mask", "valid_action"):
+        value = getattr(src, field)
+        if value is not None:
+            getattr(dst, field).copy_(value)
+    if src.rope_cache is not None:
+        for dst_tensor, src_tensor in zip(dst.rope_cache, src.rope_cache):
+            dst_tensor.copy_(src_tensor)
+
+
+def _copy_inputs_(dst: _ActionFlowInputs, src: _ActionFlowInputs) -> None:
+    dst.trajectory.copy_(src.trajectory)  # in place: ``dst`` holds the static buffers a captured graph reads
+    _copy_context_(dst.context, src.context)  # NOTE(review): ``modulations`` are never refreshed here — confirm
+    if src.action_dim_is_pad is not None:
+        dst.action_dim_is_pad.copy_(src.action_dim_is_pad)
+
+
+def _rotate_half(x: torch.Tensor) -> torch.Tensor:
+    """Split the last axis at its midpoint into ``(a, b)`` and return ``(-b, a)`` concatenated."""
+    midpoint = x.shape[-1] // 2
+    return torch.cat((-x[..., midpoint:], x[..., :midpoint]), dim=-1)
+
+
+def _apply_rotary_pos_emb(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    unsqueeze_dim: int = 1,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Rotate ``q`` and ``k``: each becomes ``x * cos + _rotate_half(x) * sin``."""
+    # Both tables gain a singleton axis at ``unsqueeze_dim`` so they broadcast against q and k.
+    cos_b = cos.unsqueeze(unsqueeze_dim)
+    sin_b = sin.unsqueeze(unsqueeze_dim)
+    return (q * cos_b) + (_rotate_half(q) * sin_b), (k * cos_b) + (_rotate_half(k) * sin_b)
+
+
+def _repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """Repeat every head of a 4-D tensor ``n_rep`` times along axis 1, keeping copies adjacent."""
+    b, h, s, d = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    return hidden_states.unsqueeze(2).expand(-1, -1, n_rep, -1, -1).reshape(b, h * n_rep, s, d)
+
+
+def _capture_cuda_graph(
+    fn,
+    device: torch.device,
+    *,
+    after_warmup=None,
+) -> tuple[torch.cuda.CUDAGraph, Any]:  # (captured graph, whatever ``fn`` returned during capture)
+    warmup_stream = torch.cuda.Stream(device=device)
+    warmup_stream.wait_stream(torch.cuda.current_stream(device))  # side stream starts after pending work
+    with torch.cuda.stream(warmup_stream):
+        fn()  # one eager warm-up call on the side stream; its result is discarded
+    torch.cuda.current_stream(device).wait_stream(warmup_stream)  # rejoin before touching buffers again
+    if after_warmup is not None:
+        after_warmup()  # caller hook, e.g. to restore a buffer the warm-up call modified
+
+    graph = torch.cuda.CUDAGraph()
+    with torch.cuda.graph(graph):
+        output = fn()  # second call is the one recorded into ``graph``
+    return graph, output
diff --git a/src/lerobot/policies/molmoact2/hf_model/modeling_molmoact2.py b/src/lerobot/policies/molmoact2/hf_model/modeling_molmoact2.py
new file mode 100644
index 000000000..4c36b04c8
--- /dev/null
+++ b/src/lerobot/policies/molmoact2/hf_model/modeling_molmoact2.py
@@ -0,0 +1,4591 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The Allen Institute for Artificial Intelligence and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# ruff: noqa
+
+"""Modeling code for MolmoAct2"""
+
+import json
+import math
+import os
+import re
+from copy import deepcopy
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple, Union
+from collections.abc import Callable, Mapping, Sequence
+
+import numpy as np
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import functional as F
+from torch.nn.attention import SDPBackend, sdpa_kernel
+from transformers.activations import ACT2FN
+from transformers.cache_utils import Cache, DynamicCache
+from transformers.configuration_utils import PretrainedConfig
+from transformers.generation import GenerationMixin
+from transformers.masking_utils import create_causal_mask, create_masks_for_generate
+from transformers.modeling_flash_attention_utils import (
+    FlashAttentionKwargs,
+    _flash_attention_forward,
+    flash_attn_supports_top_left_mask,
+)
+from transformers.modeling_layers import GradientCheckpointingLayer
+from transformers.modeling_outputs import (
+    BaseModelOutputWithPast,
+)
+from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from transformers.processing_utils import Unpack
+from transformers.utils import (
+    ModelOutput,
+    TransformersKwargs,
+    can_return_tuple,
+    logging,
+)
+
+from .configuration_molmoact2 import (
+    MolmoAct2ActionExpertConfig,
+    MolmoAct2AdapterConfig,
+    MolmoAct2Config,
+    MolmoAct2TextConfig,
+    MolmoAct2VitConfig,
+)
+from .inference import (
+    ActionCudaGraphManager,
+    DepthDecodeCudaGraphManager,
+    _ActionFlowInputs,
+    _cache_max_len_int,
+    _cache_seq_len_int,
+    _iter_cache_key_values,
+)
+
+logger = logging.get_logger(__name__)
+
+
+ACTION_START_TOKEN = "<action_start>"  # nosec B105
+ACTION_END_TOKEN = "<action_end>"  # nosec B105
+ACTION_OUTPUT_TOKEN = "<action_output>"  # nosec B105
+STATE_START_TOKEN = "<state_start>"  # nosec B105
+STATE_END_TOKEN = "<state_end>"  # nosec B105
+STATE_TOKEN_PREFIX = "<state_"  # nosec B105
+DEPTH_START_TOKEN = "<depth_start>"  # nosec B105
+DEPTH_END_TOKEN = "<depth_end>"  # nosec B105
+DEPTH_OUTPUT_TOKEN = "<depth_output>"  # nosec B105
+DEPTH_TOKEN_PREFIX = "<depth_"  # nosec B105
+SETUP_START_TOKEN = "<setup_start>"  # nosec B105
+SETUP_END_TOKEN = "<setup_end>"  # nosec B105
+CONTROL_START_TOKEN = "<control_start>"  # nosec B105
+CONTROL_END_TOKEN = "<control_end>"  # nosec B105
+
+_QUESTION_TRAILING_SENTENCE_PUNCTUATION = ".,!?;:,…"  # NOTE(review): "," is listed twice — was another mark intended?
+_QUESTION_TRAILING_CLOSERS = "\"'”’)]}"
+_QUESTION_SURROUNDING_DELIMITERS = "\"'`“”‘’[](){}"
+_QUESTION_PREFIX_PATTERNS = tuple(  # compiled once at import; both patterns are anchored at the start
+    re.compile(pattern, flags=re.IGNORECASE)
+    for pattern in (
+        r"^(?:task|instruction|language[_ ]instruction|goal)\s*[:\-]\s*",  # e.g. "Task: " or "goal - "
+        r"^(?:the\s+task\s+is\s+to|your\s+task\s+is\s+to)\s+",  # e.g. "The task is to "
+    )
+)
+
+_DEPTH_REASONING_PATCH_SIZE = 32
+_DEPTH_REASONING_THRESHOLD = 0.996
+
+
+def _modulate(x: torch.Tensor, shift: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
+    return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)  # shift/scale gain an axis at dim 1 to broadcast
+
+
+def _round_up_multiple(value: int, multiple_of: int) -> int:
+    """Round ``value`` up to a multiple of ``multiple_of``; a non-positive multiple returns it unchanged."""
+    # Integer ceil-division stays exact for large ints, unlike ``math.ceil(value / multiple_of)``.
+    return value if multiple_of <= 0 else int(-(-value // multiple_of) * multiple_of)
+
+
+def _init_linear(linear: nn.Linear, *, zero: bool = False, scale: float = 1.0) -> None:
+    """Initialise ``linear`` in place: zero or Xavier-uniform weight (optionally scaled), zero bias."""
+    weight_init = nn.init.zeros_ if zero else nn.init.xavier_uniform_
+    weight_init(linear.weight)
+    # Scaling only applies to the Xavier branch and is skipped for the neutral factor 1.0.
+    if not zero and scale != 1.0:
+        with torch.no_grad():
+            linear.weight.mul_(scale)
+    if linear.bias is not None:
+        nn.init.zeros_(linear.bias)
+
+
+@dataclass
+class ActionExpertContext:  # bundle of context tensors; the three masks may be ``None``
+    kv_contexts: Sequence[tuple[torch.Tensor, torch.Tensor]]  # presumably (key, value) pairs — TODO confirm
+    cross_mask: torch.Tensor | None
+    self_mask: torch.Tensor | None
+    valid_action: torch.Tensor | None
+    rope_cache: tuple[torch.Tensor, torch.Tensor] | None = None  # optional pair of tensors; the only defaulted field
+
+
+@dataclass
+class ActionExpertStepModulation:  # modulation tensors for one step
+    conditioning: torch.Tensor
+    block_modulations: Sequence[tuple[torch.Tensor, ...]]  # one tuple of tensors per entry (presumably per block)
+    final_modulation: tuple[torch.Tensor, torch.Tensor]  # presumably the (shift, scale) pair for the final layer — confirm
+
+
+class ActionExpertRMSNorm(nn.Module):
+    def __init__(
+        self,
+        size: int,
+        *,
+        eps: float = 1e-6,
+        elementwise_affine: bool = False,
+        device=None,
+    ) -> None:
+        super().__init__()
+        self.size = size
+        self.eps = eps
+        if elementwise_affine:
+            self.weight = nn.Parameter(torch.ones(size, device=device))
+        else:
+            self.register_parameter("weight", None)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        with torch.autocast(enabled=False, device_type=x.device.type):
+            dtype = x.dtype
+            x_float = x.to(torch.float32)
+            variance = x_float.pow(2).mean(dim=-1, keepdim=True)
+            out = x_float * torch.rsqrt(variance + self.eps)
+            out = out.to(dtype)
+        if self.weight is not None:
+            out = out * self.weight
+        return out
+
+    def reset_parameters(self) -> None:
+        if self.weight is not None:
+            nn.init.ones_(self.weight)
+
+
+class ActionExpertRotaryEmbedding(nn.Module):
+    def __init__(self, head_dim: int, base: float = 10000.0) -> None:
+        super().__init__()
+        if head_dim % 2 != 0:
+            raise ValueError("RoPE requires an even head_dim.")
+        self.head_dim = head_dim
+        self.base = base
+
+    def build_cache(
+        self,
+        *,
+        seq_len: int,
+        device: torch.device,
+        dtype: torch.dtype,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        half_dim = self.head_dim // 2
+        inv_freq = 1.0 / (
+            self.base ** (torch.arange(0, half_dim, device=device, dtype=torch.float32) / max(half_dim, 1))
+        )
+        positions = torch.arange(seq_len, device=device, dtype=torch.float32)
+        freqs = torch.outer(positions, inv_freq)
+        cos = freqs.cos().to(dtype=dtype).view(1, 1, seq_len, half_dim)
+        sin = freqs.sin().to(dtype=dtype).view(1, 1, seq_len, half_dim)
+        return cos, sin
+
+    def forward(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        *,
+        rope_cache: tuple[torch.Tensor, torch.Tensor] | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        if rope_cache is None:
+            rope_cache = self.build_cache(seq_len=q.shape[-2], device=q.device, dtype=q.dtype)
+        cos, sin = rope_cache
+        half_dim = self.head_dim // 2
+
+        def _apply(x: torch.Tensor) -> torch.Tensor:
+            x1, x2 = x[..., :half_dim], x[..., half_dim:]
+            return torch.cat([x1 * cos - x2 * sin, x1 * sin + x2 * cos], dim=-1)
+
+        return _apply(q), _apply(k)
+
+
+class ActionExpertSelfAttention(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        *,
+        attn_dropout: float = 0.0,
+        proj_dropout: float = 0.0,
+        qk_norm: bool = True,
+        qk_norm_eps: float = 1e-6,
+        use_rope: bool = True,
+    ) -> None:
+        super().__init__()
+        if hidden_size % num_heads != 0:
+            raise ValueError("hidden_size must be divisible by num_heads")
+        self.hidden_size = hidden_size
+        self.num_heads = num_heads
+        self.head_dim = hidden_size // num_heads
+        self.attn_dropout = attn_dropout
+        self.q_norm = ActionExpertRMSNorm(self.head_dim, eps=qk_norm_eps) if qk_norm else None
+        self.k_norm = ActionExpertRMSNorm(self.head_dim, eps=qk_norm_eps) if qk_norm else None
+        self.rope = ActionExpertRotaryEmbedding(self.head_dim) if use_rope else None
+        self.qkv = nn.Linear(hidden_size, hidden_size * 3)
+        self.out_proj = nn.Linear(hidden_size, hidden_size)
+        self.out_drop = nn.Dropout(proj_dropout)
+
+    def _apply_qk_norm(self, q: torch.Tensor, k: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        if self.q_norm is None or self.k_norm is None:
+            return q, k
+        return self.q_norm(q), self.k_norm(k)
+
+    def _attention(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        *,
+        attn_mask: torch.Tensor | None = None,
+        is_causal: bool = False,
+    ) -> torch.Tensor:
+        dropout_p = self.attn_dropout if self.training else 0.0
+        out = F.scaled_dot_product_attention(
+            q.transpose(1, 2),
+            k.transpose(1, 2),
+            v.transpose(1, 2),
+            attn_mask=attn_mask,
+            dropout_p=dropout_p,
+            is_causal=is_causal,
+        )
+        return out.transpose(1, 2).contiguous()
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        *,
+        attn_mask: torch.Tensor | None = None,
+        is_causal: bool = False,
+        rope_cache: tuple[torch.Tensor, torch.Tensor] | None = None,
+    ) -> torch.Tensor:
+        bsz, seq_len, _ = x.shape
+        qkv = self.qkv(x).view(bsz, seq_len, 3, self.num_heads, self.head_dim)
+        q = qkv[:, :, 0].transpose(1, 2)
+        k = qkv[:, :, 1].transpose(1, 2)
+        v = qkv[:, :, 2].contiguous()
+        q, k = self._apply_qk_norm(q, k)
+        if self.rope is not None:
+            q, k = self.rope(q, k, rope_cache=rope_cache)
+        q = q.transpose(1, 2)
+        k = k.transpose(1, 2)
+        out = self._attention(q, k, v, attn_mask=attn_mask, is_causal=is_causal)
+        out = out.reshape(bsz, seq_len, self.hidden_size)
+        return self.out_drop(self.out_proj(out))
+
+
+class ActionExpertCrossAttention(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        *,
+        attn_dropout: float = 0.0,
+        proj_dropout: float = 0.0,
+        qk_norm: bool = True,
+        qk_norm_eps: float = 1e-6,
+    ) -> None:
+        super().__init__()
+        if hidden_size % num_heads != 0:
+            raise ValueError("hidden_size must be divisible by num_heads")
+        self.hidden_size = hidden_size
+        self.num_heads = num_heads
+        self.head_dim = hidden_size // num_heads
+        self.attn_dropout = attn_dropout
+        self.q_norm = ActionExpertRMSNorm(self.head_dim, eps=qk_norm_eps) if qk_norm else None
+        self.k_norm = ActionExpertRMSNorm(self.head_dim, eps=qk_norm_eps) if qk_norm else None
+        self.q_proj = nn.Linear(hidden_size, hidden_size)
+        self.out_proj = nn.Linear(hidden_size, hidden_size)
+        self.out_drop = nn.Dropout(proj_dropout)
+
+    def _as_heads(self, x: torch.Tensor) -> torch.Tensor:
+        if x.dim() == 4:
+            if x.shape[2] == self.num_heads:
+                return x
+            if x.shape[1] == self.num_heads:
+                return x.transpose(1, 2).contiguous()
+            raise ValueError(f"Unexpected cross-attention KV shape {tuple(x.shape)}")
+        if x.dim() != 3:
+            raise ValueError(f"Expected 3D/4D cross-attention KV, got {tuple(x.shape)}")
+        bsz, seq_len, _ = x.shape
+        return x.view(bsz, seq_len, self.num_heads, self.head_dim)
+
+    def _attention(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        *,
+        attn_mask: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        dropout_p = self.attn_dropout if self.training else 0.0
+        out = F.scaled_dot_product_attention(
+            q.transpose(1, 2),
+            k.transpose(1, 2),
+            v.transpose(1, 2),
+            attn_mask=attn_mask,
+            dropout_p=dropout_p,
+            is_causal=False,
+        )
+        return out.transpose(1, 2).contiguous()
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        *,
+        kv_k: torch.Tensor,
+        kv_v: torch.Tensor,
+        attn_mask: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        bsz, tgt_len, _ = x.shape
+        q = self.q_proj(x).view(bsz, tgt_len, self.num_heads, self.head_dim)
+        k = self._as_heads(kv_k)
+        v = self._as_heads(kv_v)
+        q = q.transpose(1, 2)
+        k = k.transpose(1, 2)
+        if self.q_norm is not None:
+            q = self.q_norm(q)
+        q = q.transpose(1, 2)
+        k = k.transpose(1, 2)
+        out = self._attention(q, k, v, attn_mask=attn_mask)
+        out = out.reshape(bsz, tgt_len, self.hidden_size)
+        return self.out_drop(self.out_proj(out))
+
+
+class ActionExpertMLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        *,
+        mlp_ratio: float,
+        multiple_of: int,
+        dropout: float = 0.0,
+    ) -> None:
+        super().__init__()
+        inner_dim = _round_up_multiple(int(hidden_size * mlp_ratio), multiple_of)
+        self.up_proj = nn.Linear(hidden_size, inner_dim)
+        self.gate_proj = nn.Linear(hidden_size, inner_dim)
+        self.down_proj = nn.Linear(inner_dim, hidden_size)
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = F.silu(self.gate_proj(x)) * self.up_proj(x)
+        x = self.dropout(x)
+        x = self.down_proj(x)
+        return self.dropout(x)
+
+
+class ActionExpertModulation(nn.Module):
+    def __init__(self, hidden_size: int, num_chunks: int) -> None:
+        super().__init__()
+        self.act = nn.SiLU()
+        self.linear = nn.Linear(hidden_size, num_chunks * hidden_size)
+
+    def forward(self, conditioning: torch.Tensor) -> torch.Tensor:
+        return self.linear(self.act(conditioning))
+
+
+class ActionExpertBlock(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        *,
+        mlp_ratio: float,
+        ffn_multiple_of: int,
+        attn_dropout: float = 0.0,
+        dropout: float = 0.0,
+        qk_norm: bool = True,
+        qk_norm_eps: float = 1e-6,
+        rope: bool = True,
+    ) -> None:
+        super().__init__()
+        self.self_norm = ActionExpertRMSNorm(hidden_size, eps=1e-6)
+        self.cross_norm = ActionExpertRMSNorm(hidden_size, eps=1e-6)
+        self.ff_norm = ActionExpertRMSNorm(hidden_size, eps=1e-6)
+        self.self_attn = ActionExpertSelfAttention(
+            hidden_size,
+            num_heads,
+            attn_dropout=attn_dropout,
+            proj_dropout=dropout,
+            qk_norm=qk_norm,
+            qk_norm_eps=qk_norm_eps,
+            use_rope=rope,
+        )
+        self.cross_attn = ActionExpertCrossAttention(
+            hidden_size,
+            num_heads,
+            attn_dropout=attn_dropout,
+            proj_dropout=dropout,
+            qk_norm=qk_norm,
+            qk_norm_eps=qk_norm_eps,
+        )
+        self.mlp = ActionExpertMLP(
+            hidden_size,
+            mlp_ratio=mlp_ratio,
+            multiple_of=ffn_multiple_of,
+            dropout=dropout,
+        )
+        self.modulation = ActionExpertModulation(hidden_size, 9)
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        conditioning: torch.Tensor,
+        *,
+        cross_kv: tuple[torch.Tensor, torch.Tensor],
+        self_attn_mask: torch.Tensor | None = None,
+        attn_mask: torch.Tensor | None = None,
+        is_causal: bool = False,
+        modulation: tuple[torch.Tensor, ...] | None = None,
+        rope_cache: tuple[torch.Tensor, torch.Tensor] | None = None,
+    ) -> torch.Tensor:
+        if modulation is None:
+            modulation = self.modulation(conditioning).chunk(9, dim=1)
+        (
+            shift_msa,
+            scale_msa,
+            gate_msa,
+            shift_mca,
+            scale_mca,
+            gate_mca,
+            shift_mlp,
+            scale_mlp,
+            gate_mlp,
+        ) = modulation
+        x = x + gate_msa.unsqueeze(1) * self.self_attn(
+            _modulate(self.self_norm(x), shift_msa, scale_msa),
+            attn_mask=self_attn_mask,
+            is_causal=is_causal,
+            rope_cache=rope_cache,
+        )
+        x = x + gate_mca.unsqueeze(1) * self.cross_attn(
+            _modulate(self.cross_norm(x), shift_mca, scale_mca),
+            kv_k=cross_kv[0],
+            kv_v=cross_kv[1],
+            attn_mask=attn_mask,
+        )
+        x = x + gate_mlp.unsqueeze(1) * self.mlp(_modulate(self.ff_norm(x), shift_mlp, scale_mlp))
+        return x
+
+
+class ActionExpertFinalLayer(nn.Module):
+    def __init__(self, hidden_size: int, output_dim: int) -> None:
+        super().__init__()
+        self.norm = ActionExpertRMSNorm(hidden_size, eps=1e-6)
+        self.modulation = ActionExpertModulation(hidden_size, 2)
+        self.linear = nn.Linear(hidden_size, output_dim)
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        conditioning: torch.Tensor,
+        *,
+        modulation: tuple[torch.Tensor, torch.Tensor] | None = None,
+    ) -> torch.Tensor:
+        if modulation is None:
+            modulation = self.modulation(conditioning).chunk(2, dim=1)
+        shift, scale = modulation
+        return self.linear(_modulate(self.norm(x), shift, scale))
+
+
+class SinusoidalTimeEmbedding(nn.Module):
+    def __init__(self, dim: int):
+        super().__init__()
+        self.dim = dim
+
+    def forward(self, timesteps: torch.Tensor) -> torch.Tensor:
+        if timesteps.dim() > 1:
+            timesteps = timesteps.view(timesteps.shape[0], -1)[:, 0]
+        half_dim = self.dim // 2
+        freq = torch.exp(
+            torch.arange(half_dim, device=timesteps.device, dtype=timesteps.dtype)
+            * (-math.log(10000.0) / max(half_dim - 1, 1))
+        )
+        args = timesteps[:, None] * freq[None, :]
+        emb = torch.cat([torch.sin(args), torch.cos(args)], dim=-1)
+        if self.dim % 2 == 1:
+            emb = F.pad(emb, (0, 1))
+        return emb
+
+
+class ActionExpert(nn.Module):
+    """Modern MolmoAct2 action expert embedded in the local LeRobot implementation."""
+
+    def __init__(
+        self,
+        config: MolmoAct2ActionExpertConfig,
+        *,
+        llm_dim: int,
+        llm_kv_dim: int,
+        llm_num_layers: int,
+        device=None,
+    ):
+        super().__init__()
+        if config.num_layers != llm_num_layers:
+            raise ValueError(
+                "MolmoAct2 HF action expert supports only per-layer conditioning with one "
+                f"action block per LLM layer (action={config.num_layers}, llm={llm_num_layers})."
+            )
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.llm_dim = llm_dim
+        self.llm_kv_dim = llm_kv_dim
+        self.action_head_dim = config.hidden_size // config.num_heads
+
+        self.time_embed = nn.Sequential(
+            SinusoidalTimeEmbedding(config.timestep_embed_dim),
+            nn.Linear(config.timestep_embed_dim, config.hidden_size, device=device),
+            nn.SiLU(),
+            nn.Linear(config.hidden_size, config.hidden_size, device=device),
+        )
+        self.action_embed = nn.Linear(config.max_action_dim, config.hidden_size, device=device)
+        self.context_k_proj = nn.Linear(self.llm_kv_dim, config.hidden_size, bias=False, device=device)
+        self.context_v_proj = nn.Linear(self.llm_kv_dim, config.hidden_size, bias=False, device=device)
+        self.context_norm = (
+            ActionExpertRMSNorm(config.hidden_size, eps=1e-6) if config.context_layer_norm else nn.Identity()
+        )
+        self._modulation_cache_key: tuple[Any, ...] | None = None
+        self._modulation_cache_value: Sequence[ActionExpertStepModulation] | None = None
+        self.blocks = nn.ModuleList(
+            [
+                ActionExpertBlock(
+                    config.hidden_size,
+                    config.num_heads,
+                    mlp_ratio=config.mlp_ratio,
+                    ffn_multiple_of=config.ffn_multiple_of,
+                    attn_dropout=config.attn_dropout,
+                    dropout=config.dropout,
+                    qk_norm=config.qk_norm,
+                    qk_norm_eps=config.qk_norm_eps,
+                    rope=config.rope,
+                )
+                for _ in range(config.num_layers)
+            ]
+        )
+        self.final_layer = ActionExpertFinalLayer(config.hidden_size, config.max_action_dim)
+        self.reset_parameters()
+
+    def reset_parameters(self) -> None:
+        for module in self.time_embed.modules():
+            if isinstance(module, nn.Linear):
+                _init_linear(module)
+        _init_linear(self.action_embed)
+        _init_linear(self.context_k_proj)
+        _init_linear(self.context_v_proj)
+        if isinstance(self.context_norm, ActionExpertRMSNorm):
+            self.context_norm.reset_parameters()
+        residual_scale = (2 * max(self.config.num_layers, 1)) ** -0.5
+        for block in self.blocks:
+            _init_linear(block.self_attn.qkv)
+            _init_linear(block.self_attn.out_proj, scale=residual_scale)
+            _init_linear(block.cross_attn.q_proj)
+            _init_linear(block.cross_attn.out_proj, scale=residual_scale)
+            _init_linear(block.mlp.up_proj)
+            _init_linear(block.mlp.gate_proj)
+            _init_linear(block.mlp.down_proj, scale=residual_scale)
+            _init_linear(block.modulation.linear, zero=True)
+            block.self_norm.reset_parameters()
+            block.cross_norm.reset_parameters()
+            block.ff_norm.reset_parameters()
+            if block.self_attn.q_norm is not None:
+                block.self_attn.q_norm.reset_parameters()
+            if block.self_attn.k_norm is not None:
+                block.self_attn.k_norm.reset_parameters()
+            if block.cross_attn.q_norm is not None:
+                block.cross_attn.q_norm.reset_parameters()
+            if block.cross_attn.k_norm is not None:
+                block.cross_attn.k_norm.reset_parameters()
+        self.final_layer.norm.reset_parameters()
+        _init_linear(self.final_layer.modulation.linear, zero=True)
+        _init_linear(self.final_layer.linear, zero=True)
+
+    def _reshape_hidden_to_heads(self, x: torch.Tensor) -> torch.Tensor:
+        return x.view(x.shape[0], x.shape[1], self.config.num_heads, self.action_head_dim)
+
+    def _time_conditioning(self, timesteps: torch.Tensor) -> torch.Tensor:
+        conditioning = self.time_embed[0](timesteps)
+        first_linear = self.time_embed[1]
+        if isinstance(first_linear, nn.Linear):
+            conditioning = conditioning.to(dtype=first_linear.weight.dtype)
+        for module in list(self.time_embed.children())[1:]:
+            conditioning = module(conditioning)
+        return conditioning
+
+    def _project_kv_tensor(self, x: torch.Tensor, proj: nn.Linear) -> torch.Tensor:
+        flat = self.context_norm(proj(x))
+        return self._reshape_hidden_to_heads(flat)
+
+    def _prepare_kv_context(
+        self,
+        encoder_kv_states: Sequence[tuple[torch.Tensor, torch.Tensor]],
+    ) -> Sequence[tuple[torch.Tensor, torch.Tensor]]:
+        if len(encoder_kv_states) != len(self.blocks):
+            raise ValueError(
+                f"Expected {len(self.blocks)} KV layers for per-layer conditioning, "
+                f"got {len(encoder_kv_states)}."
+            )
+        kv_contexts = []
+        for block, (k_in, v_in) in zip(self.blocks, encoder_kv_states):
+            k_ctx = self._project_kv_tensor(k_in, self.context_k_proj)
+            v_ctx = self._project_kv_tensor(v_in, self.context_v_proj)
+            k_norm = block.cross_attn.k_norm
+            if k_norm is not None:
+                k_ctx = k_norm(k_ctx.transpose(1, 2)).transpose(1, 2)
+            kv_contexts.append((k_ctx, v_ctx))
+        return kv_contexts
+
+    @staticmethod
+    def _build_cross_attention_mask(
+        encoder_attention_mask: torch.Tensor | None,
+        batch_size: int,
+        dtype: torch.dtype,
+    ) -> torch.Tensor | None:
+        if encoder_attention_mask is None:
+            return None
+        mask = encoder_attention_mask[:, None, None, :].to(dtype=dtype)
+        return (1.0 - mask) * torch.finfo(dtype).min
+
+    def _build_self_attention_mask(
+        self,
+        action_attention_mask: torch.Tensor | None,
+        seq_len: int,
+        device: torch.device,
+        dtype: torch.dtype,
+    ) -> torch.Tensor | None:
+        mask = None
+        if action_attention_mask is not None:
+            valid = action_attention_mask.to(device=device, dtype=torch.bool)
+            key_mask = (~valid)[:, None, None, :].to(dtype=dtype)
+            mask = key_mask * torch.finfo(dtype).min
+        if self.config.causal_attn:
+            causal = torch.ones(seq_len, seq_len, device=device, dtype=torch.bool).triu(diagonal=1)
+            causal = causal.unsqueeze(0).unsqueeze(0).to(dtype=dtype) * torch.finfo(dtype).min
+            mask = causal if mask is None else mask + causal
+        return mask
+
+    def prepare_context(
+        self,
+        *,
+        encoder_kv_states: Sequence[tuple[torch.Tensor, torch.Tensor]],
+        encoder_attention_mask: torch.Tensor | None = None,
+        action_attention_mask: torch.Tensor | None = None,
+        state_embeddings: torch.Tensor | None = None,
+        batch_size: int,
+        seq_len: int,
+        device: torch.device,
+        dtype: torch.dtype,
+    ) -> ActionExpertContext:
+        if state_embeddings is not None:
+            raise ValueError(
+                "MolmoAct2 HF action expert supports only discrete state tokens. "
+                "Continuous state embeddings are not supported."
+            )
+        valid_action = None
+        if action_attention_mask is not None:
+            valid_action = action_attention_mask.to(device=device, dtype=dtype).unsqueeze(-1)
+        rope_cache = None
+        if len(self.blocks) > 0 and self.blocks[0].self_attn.rope is not None:
+            rope_cache = self.blocks[0].self_attn.rope.build_cache(
+                seq_len=seq_len,
+                device=device,
+                dtype=dtype,
+            )
+        kv_contexts = self._prepare_kv_context(encoder_kv_states)
+        cross_mask = self._build_cross_attention_mask(
+            encoder_attention_mask,
+            batch_size,
+            dtype,
+        )
+        self_mask = self._build_self_attention_mask(action_attention_mask, seq_len, device, dtype)
+        return ActionExpertContext(
+            kv_contexts=kv_contexts,
+            cross_mask=cross_mask,
+            self_mask=self_mask,
+            valid_action=valid_action,
+            rope_cache=rope_cache,
+        )
+
+    def prepare_modulation_cache(
+        self,
+        timesteps: Sequence[torch.Tensor],
+    ) -> Sequence[ActionExpertStepModulation]:
+        cache = []
+        for idx, step_t in enumerate(timesteps):
+            conditioning = self._time_conditioning(step_t)
+            block_modulations = []
+            for block in self.blocks:
+                block_modulations.append(tuple(block.modulation(conditioning).chunk(9, dim=1)))
+            final_modulation = tuple(self.final_layer.modulation(conditioning).chunk(2, dim=1))
+            cache.append(
+                ActionExpertStepModulation(
+                    conditioning=conditioning,
+                    block_modulations=block_modulations,
+                    final_modulation=final_modulation,
+                )
+            )
+        return cache
+
+    def get_or_prepare_modulation_cache(
+        self,
+        timesteps: Sequence[torch.Tensor],
+        *,
+        cache_key: tuple[Any, ...] | None = None,
+    ) -> Sequence[ActionExpertStepModulation]:
+        if self.training or cache_key is None:
+            return self.prepare_modulation_cache(timesteps)
+        if self._modulation_cache_key == cache_key and self._modulation_cache_value is not None:
+            return self._modulation_cache_value
+        cached = self.prepare_modulation_cache(timesteps)
+        self._modulation_cache_key = cache_key
+        self._modulation_cache_value = cached
+        return cached
+
+    def forward_with_context(
+        self,
+        actions: torch.Tensor,
+        timesteps: torch.Tensor,
+        *,
+        context: ActionExpertContext,
+        modulation: ActionExpertStepModulation | None = None,
+    ) -> torch.Tensor:
+        bsz, seq_len, _ = actions.shape
+        if seq_len > self.config.max_action_horizon:
+            raise ValueError(
+                f"Action sequence length {seq_len} exceeds configured max_action_horizon={self.config.max_action_horizon}"
+            )
+        if modulation is None:
+            conditioning = self._time_conditioning(timesteps)
+            block_modulations: Sequence[tuple[torch.Tensor, ...] | None] = [None] * len(self.blocks)
+            final_modulation = None
+        else:
+            conditioning = modulation.conditioning
+            block_modulations = modulation.block_modulations
+            final_modulation = modulation.final_modulation
+        x = self.action_embed(actions)
+        if context.valid_action is not None:
+            x = x * context.valid_action
+        for idx, (block, kv_context, block_modulation) in enumerate(
+            zip(self.blocks, context.kv_contexts, block_modulations)
+        ):
+            x = block(
+                x,
+                conditioning,
+                cross_kv=kv_context,
+                self_attn_mask=context.self_mask,
+                attn_mask=context.cross_mask,
+                is_causal=self.config.causal_attn,
+                modulation=block_modulation,
+                rope_cache=context.rope_cache,
+            )
+            if context.valid_action is not None:
+                x = x * context.valid_action
+        out = self.final_layer(x, conditioning, modulation=final_modulation)
+        if context.valid_action is not None:
+            out = out * context.valid_action
+        return out
+
+    def forward(
+        self,
+        actions: torch.Tensor,
+        timesteps: torch.Tensor,
+        *,
+        encoder_kv_states: Sequence[tuple[torch.Tensor, torch.Tensor]],
+        encoder_attention_mask: torch.Tensor | None = None,
+        action_attention_mask: torch.Tensor | None = None,
+        state_embeddings: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        bsz, seq_len, _ = actions.shape
+        context = self.prepare_context(
+            encoder_kv_states=encoder_kv_states,
+            encoder_attention_mask=encoder_attention_mask,
+            action_attention_mask=action_attention_mask,
+            state_embeddings=state_embeddings,
+            batch_size=bsz,
+            seq_len=seq_len,
+            device=actions.device,
+            dtype=actions.dtype,
+        )
+        return self.forward_with_context(actions, timesteps, context=context)
+
+
+def _to_numpy(value: Any) -> np.ndarray:
+    if isinstance(value, np.ndarray):
+        return value
+    if torch.is_tensor(value):
+        return value.detach().cpu().numpy()
+    return np.asarray(value)
+
+
+def _to_array(value: Any) -> np.ndarray | None:
+    if value is None:
+        return None
+    if torch.is_tensor(value):
+        tensor = value.detach()
+        if tensor.dtype in (torch.bfloat16, torch.float16):
+            tensor = tensor.float()
+        return tensor.cpu().numpy().astype(np.float32, copy=False)
+    return np.asarray(value, dtype=np.float32)
+
+
+def _to_mask(value: Any, fallback_like: np.ndarray | None) -> np.ndarray | None:
+    if value is None:
+        return None
+    mask = np.asarray(value, dtype=np.bool_)
+    if fallback_like is not None and mask.shape != fallback_like.shape:
+        mask = np.broadcast_to(mask, fallback_like.shape)
+    return mask
+
+
+def _feature_dim_from_stats(stats: Mapping[str, Any] | None) -> int | None:
+    if not isinstance(stats, Mapping):
+        return None
+    for key in (
+        "mean",
+        "std",
+        "min",
+        "max",
+        "q01",
+        "q99",
+        "q10",
+        "q90",
+        "mask",
+        "names",
+    ):
+        value = stats.get(key)
+        if value is None:
+            continue
+        arr = np.asarray(value)
+        if arr.shape:
+            return int(arr.shape[-1])
+        if isinstance(value, Sequence) and not isinstance(value, (str, bytes)):
+            return int(len(value))
+    return None
+
+
+class _FeatureNormalizer:
+    def __init__(
+        self,
+        *,
+        mode: str,
+        mean: np.ndarray | None = None,
+        std: np.ndarray | None = None,
+        min_val: np.ndarray | None = None,
+        max_val: np.ndarray | None = None,
+        q_low: np.ndarray | None = None,
+        q_high: np.ndarray | None = None,
+        mask: np.ndarray | None = None,
+        zero_mask: np.ndarray | None = None,
+    ):
+        self.mode = mode
+        self.mean = mean
+        self.std = std
+        self.min_val = min_val
+        self.max_val = max_val
+        self.q_low = q_low
+        self.q_high = q_high
+        self.mask = mask
+        self.zero_mask = zero_mask
+
+    @classmethod
+    def from_stats(cls, stats: Mapping[str, Any] | None, mode: str) -> Optional["_FeatureNormalizer"]:
+        if stats is None:
+            return None
+        raw_mask = stats.get("mask") if isinstance(stats, Mapping) else None
+        if mode == "none":
+            fallback = None
+            for key in (
+                "mean",
+                "std",
+                "min",
+                "max",
+                "q01",
+                "q99",
+                "q10",
+                "q90",
+                "mask",
+            ):
+                fallback = _to_array(stats.get(key))
+                if fallback is not None:
+                    break
+            return cls(mode=mode, mask=_to_mask(raw_mask, fallback))
+        if mode == "mean_std":
+            mean = _to_array(stats.get("mean"))
+            std = _to_array(stats.get("std"))
+            if mean is None or std is None:
+                raise ValueError("norm_mode='mean_std' requires mean and std stats.")
+            return cls(mode=mode, mean=mean, std=std, mask=_to_mask(raw_mask, mean))
+        if mode == "min_max":
+            min_val = _to_array(stats.get("min"))
+            max_val = _to_array(stats.get("max"))
+            if min_val is None or max_val is None:
+                raise ValueError("norm_mode='min_max' requires min and max stats.")
+            return cls(
+                mode=mode,
+                min_val=min_val,
+                max_val=max_val,
+                mask=_to_mask(raw_mask, min_val),
+                zero_mask=(min_val == max_val),
+            )
+        if mode in {"q01_q99", "q10_q90"}:
+            low_key, high_key = ("q01", "q99") if mode == "q01_q99" else ("q10", "q90")
+            q_low = _to_array(stats.get(low_key))
+            q_high = _to_array(stats.get(high_key))
+            if q_low is None or q_high is None:
+                raise ValueError(f"norm_mode={mode!r} requires {low_key} and {high_key} stats.")
+            min_val = _to_array(stats.get("min"))
+            max_val = _to_array(stats.get("max"))
+            fallback = min_val if min_val is not None else q_low
+            zero_mask = None if min_val is None or max_val is None else (min_val == max_val)
+            return cls(
+                mode=mode,
+                min_val=min_val,
+                max_val=max_val,
+                q_low=q_low,
+                q_high=q_high,
+                mask=_to_mask(raw_mask, fallback),
+                zero_mask=zero_mask,
+            )
+        raise ValueError(f"Unsupported robot normalization mode {mode!r}.")
+
+    def normalize(self, x: Any) -> Any:
+        arr = _to_array(x)
+        if arr is None:
+            return None
+        eps = 1e-6
+        if self.mode == "none":
+            normed = arr
+        elif self.mode == "mean_std":
+            normed = (arr - self.mean) / np.maximum(self.std, eps)
+        elif self.mode == "min_max":
+            normed = 2.0 * (arr - self.min_val) / np.maximum(self.max_val - self.min_val, eps) - 1.0
+        elif self.mode in {"q01_q99", "q10_q90"}:
+            normed = 2.0 * (arr - self.q_low) / np.maximum(self.q_high - self.q_low, eps) - 1.0
+        else:
+            normed = arr
+        if self.mode in {"min_max", "q01_q99", "q10_q90"}:
+            normed = np.clip(normed, -1.0, 1.0)
+        if self.mask is not None:
+            normed = np.where(self.mask, normed, arr)
+        if self.zero_mask is not None:
+            normed = np.where(self.zero_mask, 0.0, normed)
+        if torch.is_tensor(x):
+            return torch.as_tensor(normed, device=x.device, dtype=x.dtype)
+        return normed
+
+    def unnormalize(self, x: Any) -> Any:
+        arr = _to_array(x)
+        if arr is None:
+            return None
+        if self.mode in {"min_max", "q01_q99", "q10_q90"}:
+            arr = np.clip(arr, -1.0, 1.0)
+        if self.mode == "none":
+            out = arr
+        elif self.mode == "mean_std":
+            out = arr * self.std + self.mean
+        elif self.mode == "min_max":
+            out = (arr + 1.0) * (self.max_val - self.min_val) / 2.0 + self.min_val
+        elif self.mode in {"q01_q99", "q10_q90"}:
+            out = (arr + 1.0) * (self.q_high - self.q_low) / 2.0 + self.q_low
+        else:
+            out = arr
+        if self.mask is not None:
+            out = np.where(self.mask, out, arr)
+        if torch.is_tensor(x):
+            return torch.as_tensor(out, device=x.device, dtype=x.dtype)
+        return out
+
+
+class _RobotStats:
+    def __init__(self, payload: Mapping[str, Any]):
+        self.norm_mode = str(payload.get("norm_mode", "min_max"))
+        self.metadata_by_tag: dict[str, dict[str, Any]] = {
+            str(tag): dict(metadata or {})
+            for tag, metadata in dict(payload.get("metadata_by_tag") or {}).items()
+        }
+        self.action_normalizers = {}
+        self.state_normalizers = {}
+        for tag, metadata in self.metadata_by_tag.items():
+            if metadata.get("action_stats") is not None:
+                self.action_normalizers[tag] = _FeatureNormalizer.from_stats(
+                    metadata.get("action_stats"),
+                    self.norm_mode,
+                )
+            if metadata.get("state_stats") is not None:
+                self.state_normalizers[tag] = _FeatureNormalizer.from_stats(
+                    metadata.get("state_stats"),
+                    self.norm_mode,
+                )
+
+    def validate_tag(self, norm_tag: str | None) -> str:
+        tag = str(norm_tag or "").strip()
+        if not tag:
+            raise ValueError("MolmoAct2 `predict_action` requires `norm_tag`.")
+        if tag not in self.metadata_by_tag:
+            allowed = ", ".join(sorted(self.metadata_by_tag))
+            raise ValueError(f"Unknown MolmoAct2 normalization tag {tag!r}. Allowed tags: {allowed}.")
+        return tag
+
+    def get_metadata(self, norm_tag: str | None) -> dict[str, Any]:
+        if norm_tag is None:
+            return {}
+        return dict(self.metadata_by_tag.get(str(norm_tag), {}) or {})
+
+    def normalize_state(self, state: Any, norm_tag: str) -> Any:
+        normalizer = self.state_normalizers.get(str(norm_tag))
+        return state if normalizer is None else normalizer.normalize(state)
+
+    def unnormalize_action(self, action: Any, norm_tag: str) -> Any:
+        normalizer = self.action_normalizers.get(str(norm_tag))
+        return action if normalizer is None else normalizer.unnormalize(action)
+
+    def get_action_dim(self, norm_tag: str) -> int | None:
+        metadata = self.get_metadata(norm_tag)
+        stats = metadata.get("action_stats")
+        dim = _feature_dim_from_stats(stats)
+        return dim
+
+    def get_state_dim(self, norm_tag: str) -> int | None:
+        metadata = self.get_metadata(norm_tag)
+        return _feature_dim_from_stats(metadata.get("state_stats"))
+
+    def get_action_horizon(self, norm_tag: str) -> int | None:
+        return self._get_positive_int(norm_tag, "action_horizon")
+
+    def get_n_action_steps(self, norm_tag: str) -> int | None:
+        return self._get_positive_int(norm_tag, "n_action_steps")
+
+    def _get_positive_int(self, norm_tag: str, key: str) -> int | None:
+        value = self.get_metadata(norm_tag).get(key)
+        if value is None:
+            return None
+        value = int(value)
+        if value < 1:
+            raise ValueError(f"Robot metadata for norm_tag={norm_tag!r} must define {key} >= 1.")
+        return value
+
+
+def _normalize_image_for_cache(image: Any) -> np.ndarray:
+    arr = np.asarray(image)
+    if arr.ndim == 2:
+        arr = np.stack([arr] * 3, axis=-1)
+    if arr.ndim == 3 and arr.shape[0] in {1, 3, 4} and arr.shape[-1] not in {1, 3, 4}:
+        arr = np.moveaxis(arr, 0, -1)
+    if arr.ndim == 3 and arr.shape[-1] == 1:
+        arr = np.repeat(arr, 3, axis=-1)
+    if arr.dtype in (np.float32, np.float64):
+        if arr.size > 0 and float(arr.max()) <= 1.0:
+            arr = arr * 255.0
+        arr = np.clip(arr, 0, 255).astype(np.uint8)
+    elif arr.dtype != np.uint8:
+        arr = np.clip(arr, 0, 255).astype(np.uint8)
+    return arr
+
+
+def _extract_first_image(images: Any) -> np.ndarray | None:
+    if images is None:
+        return None
+    if isinstance(images, (list, tuple)):
+        if not images:
+            return None
+        return _normalize_image_for_cache(images[0])
+    arr = _to_numpy(images)
+    if arr.ndim == 4:
+        return _normalize_image_for_cache(arr[0])
+    return _normalize_image_for_cache(arr)
+
+
+def _resize_depth_reasoning_image(image: np.ndarray, target_size: int) -> np.ndarray:
+    from PIL import Image
+
+    if image.shape[0] == target_size and image.shape[1] == target_size:
+        return image
+    pil_image = Image.fromarray(np.asarray(image, dtype=np.uint8))
+    return np.asarray(pil_image.resize((target_size, target_size), Image.BILINEAR))
+
+
+def _compute_depth_update_mask(
+    current_image: np.ndarray,
+    previous_image: np.ndarray,
+    *,
+    num_depth_codes: int,
+) -> np.ndarray:
+    grid_side = int(math.isqrt(int(num_depth_codes)))
+    if grid_side * grid_side != int(num_depth_codes):
+        raise ValueError(
+            f"enable_adaptive_depth=True requires a square depth grid, got num_depth_codes={int(num_depth_codes)}."
+        )
+    target_size = grid_side * _DEPTH_REASONING_PATCH_SIZE
+    current_resized = _resize_depth_reasoning_image(current_image, target_size).astype(np.float32)
+    previous_resized = _resize_depth_reasoning_image(previous_image, target_size).astype(np.float32)
+    current_patches = (
+        current_resized.reshape(
+            grid_side,
+            _DEPTH_REASONING_PATCH_SIZE,
+            grid_side,
+            _DEPTH_REASONING_PATCH_SIZE,
+            3,
+        )
+        .transpose(0, 2, 1, 3, 4)
+        .reshape(grid_side, grid_side, -1)
+    )
+    previous_patches = (
+        previous_resized.reshape(
+            grid_side,
+            _DEPTH_REASONING_PATCH_SIZE,
+            grid_side,
+            _DEPTH_REASONING_PATCH_SIZE,
+            3,
+        )
+        .transpose(0, 2, 1, 3, 4)
+        .reshape(grid_side, grid_side, -1)
+    )
+    dot = np.sum(current_patches * previous_patches, axis=-1)
+    norm_current = np.linalg.norm(current_patches, axis=-1)
+    norm_previous = np.linalg.norm(previous_patches, axis=-1)
+    denom = norm_current * norm_previous
+    similarity = np.where(denom < 1e-8, 1.0, dot / (denom + 1e-12))
+    return np.asarray(similarity < _DEPTH_REASONING_THRESHOLD, dtype=np.bool_).reshape(-1)
+
+
+def _build_depth_update_spans(
+    update_mask: Sequence[bool],
+) -> list[tuple[int, int, bool]]:
+    flat_mask = np.asarray(update_mask, dtype=np.bool_).reshape(-1)
+    if flat_mask.size == 0:
+        return []
+    spans: list[tuple[int, int, bool]] = []
+    start = 0
+    current_value = bool(flat_mask[0])
+    for idx in range(1, int(flat_mask.shape[0])):
+        next_value = bool(flat_mask[idx])
+        if next_value == current_value:
+            continue
+        spans.append((start, idx, current_value))
+        start = idx
+        current_value = next_value
+    spans.append((start, int(flat_mask.shape[0]), current_value))
+    return spans
+
+
+def _wrap_setup_text(setup_type: str, add_setup_tokens: bool = False) -> str:
+    setup_type = str(setup_type or "")
+    if setup_type.startswith(SETUP_START_TOKEN) and setup_type.endswith(SETUP_END_TOKEN):
+        return setup_type
+    if not setup_type or not add_setup_tokens:
+        return setup_type
+    return f"{SETUP_START_TOKEN}{setup_type}{SETUP_END_TOKEN}"
+
+
+def _wrap_control_text(control_mode: str, add_control_tokens: bool = False) -> str:
+    control_mode = str(control_mode or "")
+    if control_mode.startswith(CONTROL_START_TOKEN) and control_mode.endswith(CONTROL_END_TOKEN):
+        return control_mode
+    if not control_mode or not add_control_tokens:
+        return control_mode
+    return f"{CONTROL_START_TOKEN}{control_mode}{CONTROL_END_TOKEN}"
+
+
+def _discretize_normalized_state(state: np.ndarray, num_state_tokens: int) -> np.ndarray:
+    arr = np.asarray(state, dtype=np.float32)
+    arr = np.nan_to_num(arr, nan=0.0, posinf=1.0, neginf=-1.0)
+    arr = np.clip(arr, -1.0, 1.0)
+    scaled = (arr + 1.0) / 2.0 * float(num_state_tokens - 1)
+    return np.clip(np.rint(scaled).astype(np.int64), 0, int(num_state_tokens) - 1)
+
+
+def _build_discrete_state_string(state: np.ndarray | None, num_state_tokens: int) -> str:
+    if state is None:
+        return ""
+    token_ids = _discretize_normalized_state(state, num_state_tokens).reshape(-1)
+    return f"{STATE_START_TOKEN}{''.join(f'{STATE_TOKEN_PREFIX}{int(token_id)}>' for token_id in token_ids)}{STATE_END_TOKEN}"
+
+
+def _normalize_question_text(text: str) -> str:
+    normalized = re.sub(r"\s+", " ", text).strip()
+    if not normalized:
+        return ""
+    previous = None
+    while normalized and normalized != previous:
+        previous = normalized
+        normalized = normalized.strip().strip(_QUESTION_SURROUNDING_DELIMITERS).strip()
+        for pattern in _QUESTION_PREFIX_PATTERNS:
+            normalized = pattern.sub("", normalized, count=1).strip()
+        normalized = normalized.rstrip(_QUESTION_TRAILING_SENTENCE_PUNCTUATION).rstrip()
+        normalized = normalized.rstrip(_QUESTION_TRAILING_CLOSERS).rstrip()
+        normalized = normalized.rstrip(_QUESTION_TRAILING_SENTENCE_PUNCTUATION).rstrip()
+    sentence_chunks = [chunk.strip() for chunk in re.split(r"[.!?]+", normalized) if chunk.strip()]
+    if len(sentence_chunks) > 1:
+        normalized = "; ".join(sentence_chunks)
+    normalized = normalized.lower()
+    return normalized
+
+
+def _build_robot_text(
+    *,
+    task: str,
+    style: str,
+    discrete_state_string: str,
+    setup_type: str,
+    control_mode: str,
+    add_setup_tokens: bool,
+    add_control_tokens: bool,
+    num_images: int,
+) -> str:
+    setup_text = _wrap_setup_text(setup_type, add_setup_tokens=add_setup_tokens)
+    control_text = _wrap_control_text(control_mode, add_control_tokens=add_control_tokens)
+    state_clause = (
+        f" The current state of the robot is {discrete_state_string}." if discrete_state_string else ""
+    )
+    if style == "robot_depth_action":
+        prompt = (
+            f"The task is to {task}. The setup is {setup_text}.{state_clause} "
+            f"The expected control mode is {control_text}. Given these, first predict the depth map of the main image "
+            "and then predict the action the robot should take to complete the task?"
+        )
+        trigger = f"{DEPTH_OUTPUT_TOKEN}{ACTION_OUTPUT_TOKEN}"
+    else:
+        prompt = (
+            f"The task is to {task}. The setup is {setup_text}.{state_clause} "
+            f"The expected control mode is {control_text}. Given these, what action should the robot take to complete the task?"
+        )
+        trigger = ACTION_OUTPUT_TOKEN
+    if num_images <= 0:
+        image_prefix = ""
+    elif num_images == 1:
+        image_prefix = "<|image|>"
+    else:
+        image_prefix = "".join(f"Image {idx + 1}<|image|>" for idx in range(num_images))
+    return f"{image_prefix}<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n{trigger}"
+
+
+def _flatten_generated_token_ids(token_ids: torch.Tensor) -> list[int]:
+    if token_ids.ndim == 3:
+        return [int(x) for x in token_ids[0, 0].detach().cpu().tolist()]
+    if token_ids.ndim == 2:
+        return [int(x) for x in token_ids[0].detach().cpu().tolist()]
+    if token_ids.ndim == 1:
+        return [int(x) for x in token_ids.detach().cpu().tolist()]
+    raise ValueError(f"Unexpected generated token tensor shape {tuple(token_ids.shape)}")
+
+
+def _extract_discrete_token_bins(
+    generated_ids: list[int],
+    start_token_id: int,
+    end_token_id: int,
+    token_id_to_bin: dict[int, int],
+) -> list[int]:
+    start_idx = None
+    end_idx = None
+    for idx, token_id in enumerate(generated_ids):
+        if token_id == start_token_id:
+            start_idx = idx
+            break
+    if start_idx is not None:
+        for idx in range(start_idx + 1, len(generated_ids)):
+            if generated_ids[idx] == end_token_id:
+                end_idx = idx
+                break
+    span_start = 0 if start_idx is None else start_idx + 1
+    span_end = len(generated_ids) if end_idx is None else end_idx
+    return [
+        int(token_id_to_bin[token_id])
+        for token_id in generated_ids[span_start:span_end]
+        if token_id in token_id_to_bin
+    ]
+
+
+# Output container for action prediction; every field is optional and
+# defaults to None.
+@dataclass
+class MolmoAct2ActionOutput(ModelOutput):
+    # Presumably the predicted actions — shape not visible here, TODO confirm.
+    actions: torch.FloatTensor | None = None
+    # Presumably the raw token ids produced during generation — TODO confirm.
+    generated_token_ids: torch.LongTensor | None = None
+    # Presumably the decoded depth bin indices — TODO confirm.
+    depth_bins: torch.LongTensor | None = None
+    # Free-form cache dict; its schema is not defined in this block.
+    depth_cache: dict[str, Any] | None = None
+
+
+# Internal record bundling depth-prefix tensors with decoder state.
+# NOTE(review): field meanings below are inferred from names and types only;
+# confirm against the code that constructs and consumes this record.
+@dataclass
+class _DepthPrefix:
+    token_ids: torch.Tensor
+    depth_bins: torch.Tensor
+    full_input_ids: torch.Tensor
+    attention_mask: torch.Tensor | None
+    # One (key, value) tensor pair per entry.
+    encoder_kv_states: Sequence[tuple[torch.Tensor, torch.Tensor]]
+    next_output: Any
+    past_key_values: Cache | None
+
+
+@dataclass
+class MolmoAct2CausalLMOutputWithPast(ModelOutput):
+    """
+    Base class for MolmoAct2 causal language model (or autoregressive) outputs.
+
+    Args:
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+            Language modeling loss (for next-token prediction).
+        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
+
+            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+            `past_key_values` input) to speed up sequential decoding.
+        hidden_states (`tuple[torch.FloatTensor]`, *optional*):
+            Presumably the per-layer hidden states; not documented in the original, confirm against the model code.
+        attentions (`tuple[torch.FloatTensor]`, *optional*):
+            Presumably the per-layer attention weights; not documented in the original, confirm against the model code.
+        image_hidden_states (`torch.FloatTensor`, *optional*):
+            A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
+            image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
+    """
+
+    loss: torch.FloatTensor | None = None
+    logits: torch.FloatTensor | None = None
+    past_key_values: Cache | None = None
+    hidden_states: tuple[torch.FloatTensor] | None = None
+    attentions: tuple[torch.FloatTensor] | None = None
+    image_hidden_states: torch.FloatTensor | None = None
+
+
+@dataclass
+class MolmoAct2ModelOutputWithPast(BaseModelOutputWithPast):
+    """
+    Base class for MolmoAct2 outputs, with hidden states and attentions.
+
+    Extends `BaseModelOutputWithPast` with `image_hidden_states`; the other
+    fields are redeclared here with `None` defaults.
+
+    Args:
+        image_hidden_states (`torch.FloatTensor`, *optional*):
+            A `torch.FloatTensor` of size `(batch_num_patches, hidden_size)`.
+            image_hidden_states of the model produced by the vision backbone
+    """
+
+    last_hidden_state: torch.FloatTensor | None = None
+    past_key_values: Cache | None = None
+    hidden_states: tuple[torch.FloatTensor] | None = None
+    attentions: tuple[torch.FloatTensor] | None = None
+    image_hidden_states: torch.FloatTensor | None = None
+
+
+class ViTMLP(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        hidden_dim: int,
+        hidden_act: str,
+        device: str | torch.device = None,
+    ):
+        super().__init__()
+        self.w1 = nn.Linear(dim, hidden_dim, bias=True, device=device)
+        self.act = ACT2FN[hidden_act]
+        self.w2 = nn.Linear(hidden_dim, dim, bias=True, device=device)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.w2(self.act(self.w1(x)))
+
+
+class ViTMultiHeadDotProductAttention(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_key_value_heads: int,
+        head_dim: int,
+        use_bias: bool = True,
+        input_dim: int | None = None,
+        float32_attention: bool = True,
+        attention_dropout: float = 0.0,
+        residual_dropout: float = 0.0,
+        device: str | torch.device = None,
+        attn_implementation: str = "eager",
+    ):
+        super().__init__()
+
+        self.hidden_size = hidden_size
+        self.num_heads = num_heads
+        self.head_dim = head_dim
+        self.num_key_value_heads = num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.attn_implementation = attn_implementation
+        self.is_causal = False
+
+        input_dim = input_dim or hidden_size
+
+        self.wq = nn.Linear(
+            input_dim,
+            self.num_heads * self.head_dim,
+            bias=use_bias,
+            device=device,
+        )
+        self.wk = nn.Linear(
+            input_dim,
+            self.num_key_value_heads * self.head_dim,
+            bias=use_bias,
+            device=device,
+        )
+        self.wv = nn.Linear(
+            input_dim,
+            self.num_key_value_heads * self.head_dim,
+            bias=use_bias,
+            device=device,
+        )
+        self.wo = nn.Linear(
+            self.num_heads * self.head_dim,
+            self.hidden_size,
+        )
+        self.float32_attention = float32_attention
+        self.attention_dropout = attention_dropout
+        self.residual_dropout = nn.Dropout(residual_dropout)
+        self.sdpa_backend_list = [
+            SDPBackend.FLASH_ATTENTION,
+            SDPBackend.CUDNN_ATTENTION,
+            SDPBackend.EFFICIENT_ATTENTION,
+            SDPBackend.MATH,
+        ]
+
+    def _split_heads(self, hidden_states, num_heads) -> torch.Tensor:
+        return hidden_states.reshape(hidden_states.shape[:2] + (num_heads, self.head_dim))
+
+    def _merge_heads(self, hidden_states) -> torch.Tensor:
+        return hidden_states.reshape(hidden_states.shape[:2] + (self.hidden_size,))
+
+    def forward(
+        self,
+        inputs_q: torch.Tensor,
+        inputs_kv: torch.Tensor | None = None,
+        attn_mask: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        if inputs_kv is not None:
+            inputs_k = inputs_kv
+            inputs_v = inputs_kv
+        else:
+            inputs_k = inputs_q
+            inputs_v = inputs_q
+
+        xq, xk, xv = self.wq(inputs_q), self.wk(inputs_k), self.wv(inputs_v)
+
+        xq = self._split_heads(xq, self.num_heads)
+        xk = self._split_heads(xk, self.num_key_value_heads)
+        xv = self._split_heads(xv, self.num_key_value_heads)
+
+        if self.num_heads != self.num_key_value_heads:
+            xk = xk.repeat_interleave(self.num_key_value_groups, dim=2, output_size=self.num_heads)
+            xv = xv.repeat_interleave(self.num_key_value_groups, dim=2, output_size=self.num_heads)
+
+        og_dtype = xq.dtype
+
+        if self.float32_attention:
+            xq = xq.to(torch.float)
+            xk = xk.to(torch.float)
+
+        dropout_p = 0.0 if not self.training else self.attention_dropout
+
+        if self.attn_implementation == "eager":
+            attn_weights = torch.einsum("...qhd,...khd->...hqk", xq / math.sqrt(xq.size(-1)), xk)
+            attn_weights = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(xq.dtype)
+            attn_weights = F.dropout(attn_weights, p=dropout_p, training=self.training)
+            attn_output = torch.einsum("...hqk,...khd->...qhd", attn_weights.to(xv.dtype), xv)
+
+        elif self.attn_implementation == "sdpa":
+            if self.float32_attention:
+                xv = xv.to(torch.float32)
+
+            query = xq.transpose(1, 2).contiguous()
+            key = xk.transpose(1, 2).contiguous()
+            value = xv.transpose(1, 2).contiguous()
+            if inputs_kv is not None:
+                with sdpa_kernel(self.sdpa_backend_list):
+                    attn_output = F.scaled_dot_product_attention(
+                        query,
+                        key,
+                        value,
+                        attn_mask=attn_mask,
+                        is_causal=False,
+                        dropout_p=dropout_p,
+                    ).transpose(1, 2)
+            else:
+                attn_output = F.scaled_dot_product_attention(
+                    query,
+                    key,
+                    value,
+                    attn_mask=attn_mask,
+                    is_causal=False,
+                    dropout_p=dropout_p,
+                ).transpose(1, 2)
+
+        elif self.attn_implementation == "flash_attention_2":
+            if xq.dtype == torch.float32:
+                if torch.is_autocast_enabled():
+                    target_dtype = torch.get_autocast_gpu_dtype()
+                else:
+                    target_dtype = self.wq.weight.dtype
+            attn_output = _flash_attention_forward(
+                xq,
+                xk,
+                xv,
+                attention_mask=attn_mask,
+                query_length=inputs_q.shape[1],
+                is_causal=False,
+                dropout=dropout_p,
+                softmax_scale=xq.shape[-1] ** -0.5,
+                use_top_left_mask=flash_attn_supports_top_left_mask(),
+                target_dtype=target_dtype,
+                implementation=self.attn_implementation,
+            )
+        else:
+            raise ValueError(f"Attention implementation {self.attn_implementation} not supported")
+
+        attn_output = attn_output.to(og_dtype)
+        attn_output = self._merge_heads(attn_output)
+        attn_output = self.wo(attn_output)
+        attn_output = self.residual_dropout(attn_output)
+
+        return attn_output
+
+
+class MolmoAct2VisionBlock(nn.Module):
+    def __init__(self, config: MolmoAct2VitConfig, device: str | torch.device = None):
+        super().__init__()
+        self.attention = ViTMultiHeadDotProductAttention(
+            hidden_size=config.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_key_value_heads=config.num_key_value_heads,
+            head_dim=config.head_dim,
+            float32_attention=config.float32_attention,
+            attention_dropout=config.attention_dropout,
+            residual_dropout=config.residual_dropout,
+            device=device,
+            attn_implementation=config._attn_implementation,
+        )
+        self.feed_forward = ViTMLP(
+            config.hidden_size,
+            config.intermediate_size,
+            config.hidden_act,
+            device=device,
+        )
+        self.attention_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps, device=device)
+        self.ffn_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps, device=device)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = x + self.attention(self.attention_norm(x))
+        x = x + self.feed_forward(self.ffn_norm(x))
+        return x
+
+
+class MolmoAct2VisionBlockCollection(nn.Module):
+    def __init__(self, config: MolmoAct2VitConfig, device: str | torch.device = None):
+        super().__init__()
+        self.config = config
+        self.resblocks = nn.ModuleList(
+            [MolmoAct2VisionBlock(config, device) for _ in range(config.num_hidden_layers)]
+        )
+
+    def forward(self, x: torch.Tensor) -> list[torch.Tensor]:
+        hidden_states = []
+        for r in self.resblocks:
+            x = r(x)
+            hidden_states.append(x)
+        return hidden_states
+
+
+class MolmoAct2VisionTransformer(nn.Module):
+    def __init__(self, config: MolmoAct2VitConfig, device: str | torch.device = None):
+        super().__init__()
+        self.config = config
+
+        # positional embeddings
+        self.scale = config.hidden_size**-0.5
+        self.num_prefix_tokens: int = 0  # no class embeddings
+        self.positional_embedding = nn.Parameter(
+            torch.zeros(config.image_num_pos, config.hidden_size, device=device),
+        )
+
+        image_patch_size = config.image_patch_size
+        self.patch_embedding = nn.Linear(
+            image_patch_size * image_patch_size * 3,
+            config.hidden_size,
+            bias=True,
+            device=device,
+        )
+
+        self.transformer = MolmoAct2VisionBlockCollection(config, device)
+
+    def add_pos_emb(self, x: torch.Tensor, patch_num: int) -> torch.Tensor:
+        pos_emb = self.positional_embedding
+
+        pos_emb = pos_emb.reshape(
+            (
+                int(math.sqrt(pos_emb.shape[0])),
+                int(math.sqrt(pos_emb.shape[0])),
+                pos_emb.shape[1],
+            )
+        )
+
+        (patch_num_0, patch_num_1) = patch_num
+
+        if pos_emb.shape[0] != patch_num_0 or pos_emb.shape[1] != patch_num_1:
+            # Derived from https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
+            # antialias: default True in jax.image.resize
+            pos_emb = pos_emb.unsqueeze(0).permute(0, 3, 1, 2)
+            pos_emb = F.interpolate(
+                pos_emb,
+                size=(patch_num_0, patch_num_1),
+                mode="bicubic",
+                align_corners=False,
+                antialias=True,
+            )
+            pos_emb = pos_emb.permute(0, 2, 3, 1).squeeze(0)
+
+        pos_emb = pos_emb.reshape(-1, pos_emb.shape[-1])
+        x = x + pos_emb[None, :, :].to(x.dtype)
+        return x
+
+    def forward(self, x: torch.Tensor, patch_num: int = None) -> list[torch.Tensor]:
+        """
+        : param x: (batch_size, num_patch, n_pixels)
+        """
+        if patch_num is None:
+            patch_num = self.config.image_num_patch
+
+        B, N, D = x.shape
+
+        x = self.patch_embedding(x)
+
+        # class embeddings and positional embeddings
+        x = self.add_pos_emb(x, patch_num)
+
+        hidden_states = self.transformer(x)
+        return hidden_states
+
+
+class ImageProjectorMLP(nn.Module):
+    def __init__(
+        self,
+        input_dim: int,
+        hidden_dim: int,
+        output_dim: int,
+        hidden_act: str,
+        device: str | torch.device = None,
+    ):
+        super().__init__()
+        self.w1 = nn.Linear(input_dim, hidden_dim, bias=False, device=device)
+        self.w2 = nn.Linear(hidden_dim, output_dim, bias=False, device=device)
+        self.w3 = nn.Linear(input_dim, hidden_dim, bias=False, device=device)
+        self.act = ACT2FN[hidden_act]
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.w2(self.act(self.w1(x)) * self.w3(x))
+
+
+class MolmoAct2VisionBackbone(nn.Module):
+    def __init__(self, vit_config: MolmoAct2VitConfig, adapter_config: MolmoAct2AdapterConfig):
+        super().__init__()
+        self.vit_config = vit_config
+        self.adapter_config = adapter_config
+
+        self.vit_layers = []
+        for layer in adapter_config.vit_layers:
+            if layer >= 0:
+                self.vit_layers.append(layer)
+            else:
+                self.vit_layers.append(layer + vit_config.num_hidden_layers)
+
+        last_layer_needed = max(self.vit_layers) + 1
+        if last_layer_needed < vit_config.num_hidden_layers:
+            new_vit_config = deepcopy(vit_config)
+            new_vit_config.num_hidden_layers = last_layer_needed
+            self.image_vit = MolmoAct2VisionTransformer(new_vit_config)
+        else:
+            self.image_vit = MolmoAct2VisionTransformer(vit_config)
+
+        self.num_prefix_tokens: int = self.image_vit.num_prefix_tokens
+
+        pool_dim = vit_config.hidden_size * len(adapter_config.vit_layers)
+        self.image_pooling_2d = ViTMultiHeadDotProductAttention(
+            hidden_size=adapter_config.hidden_size,
+            num_heads=adapter_config.num_attention_heads,
+            num_key_value_heads=adapter_config.num_key_value_heads,
+            head_dim=adapter_config.head_dim,
+            input_dim=pool_dim,
+            float32_attention=adapter_config.float32_attention,
+            attention_dropout=adapter_config.attention_dropout,
+            residual_dropout=adapter_config.residual_dropout,
+            attn_implementation=adapter_config._attn_implementation,
+        )
+        self.image_projector = ImageProjectorMLP(
+            adapter_config.hidden_size,
+            adapter_config.intermediate_size,
+            adapter_config.text_hidden_size,
+            adapter_config.hidden_act,
+        )
+        self.image_feature_dropout = nn.Dropout(adapter_config.image_feature_dropout)
+        self.gradient_checkpointing = False
+
+    def encode_image(self, images: torch.Tensor) -> torch.Tensor:
+        """
+        : param images: (batch_size, num_crops, num_patch, n_pixels)
+        """
+        batch_size, num_crops, num_patches, patch_dim = images.shape
+        images = images.view(batch_size * num_crops, num_patches, patch_dim)
+
+        x = self.image_vit.patch_embedding(images)
+        x = self.image_vit.add_pos_emb(x, self.image_vit.config.image_num_patch)
+
+        needed_layers = {int(layer) for layer in self.vit_layers}
+        selected_features: dict[int, torch.Tensor] = {}
+        use_checkpoint = bool(self.gradient_checkpointing and self.training and torch.is_grad_enabled())
+        for layer_idx, block in enumerate(self.image_vit.transformer.resblocks):
+            if use_checkpoint:
+                x = torch.utils.checkpoint.checkpoint(block, x, use_reentrant=False)
+            else:
+                x = block(x)
+            if layer_idx in needed_layers:
+                selected_features[layer_idx] = x
+
+        missing = needed_layers - set(selected_features)
+        if missing:
+            raise RuntimeError(
+                f"MolmoAct2 vision backbone did not produce requested layers: {sorted(missing)}."
+            )
+
+        image_features = torch.cat([selected_features[int(layer)] for layer in self.vit_layers], dim=-1)
+
+        if self.num_prefix_tokens > 0:
+            image_features = image_features[:, 1:]
+        image_features = image_features.view(batch_size, num_crops, num_patches, -1)
+        return image_features
+
+    @property
+    def dtype(self) -> torch.dtype:
+        return self.image_vit.patch_embedding.weight.dtype
+
+    @property
+    def device(self) -> torch.device:
+        return self.image_vit.patch_embedding.weight.device
+
+    def forward(
+        self,
+        images: torch.Tensor,
+        pooled_patches_idx: torch.Tensor,
+    ) -> torch.Tensor:
+        """Encode ``images``, pool the patches selected by ``pooled_patches_idx``, then project them.
+
+        Negative indices are padding; returns one row per pooled token with at least one valid patch.
+        """
+        batch_size = images.shape[0]
+        images = images.to(device=self.device)
+        if images.dtype == torch.uint8:
+            images = images.to(dtype=torch.float32) / 255.0 * 2.0 - 1.0
+        elif torch.is_floating_point(images):
+            # Native MolmoAct2 eval keeps resized SigLIP pixels as uint8 and normalizes
+            # on device. Canonicalize HF processor floats to that exact grid.
+            images = torch.round(((images.to(dtype=torch.float32) + 1.0) * 0.5) * 255.0)
+            images = torch.clamp(images, 0.0, 255.0) / 255.0 * 2.0 - 1.0
+        images = images.to(dtype=self.dtype)
+        image_features = self.encode_image(images)  # (batch_size, num_crops, num_patch, feature_dim)
+        image_features = self.image_feature_dropout(image_features)
+        dim = image_features.shape[-1]
+        valid = pooled_patches_idx >= 0
+        valid_token = torch.any(valid, -1)
+
+        # Use `pooled_patches_idx` to arrange the features for image pooling
+        batch_idx = torch.arange(
+            pooled_patches_idx.shape[0],
+            dtype=torch.long,
+            device=pooled_patches_idx.device,
+        )
+        batch_idx = torch.tile(
+            batch_idx.view(batch_size, 1, 1),
+            [1, pooled_patches_idx.shape[1], pooled_patches_idx.shape[2]],
+        )
+
+        # Now [batch, num_high_res_features, pool_dim, dim]
+        to_pool = image_features.reshape(batch_size, -1, dim)[batch_idx, torch.clip(pooled_patches_idx, 0)]
+        to_pool = to_pool * valid.to(self.dtype)[:, :, :, None]  # zero the slots gathered for padding
+        to_pool = to_pool.reshape([-1, pooled_patches_idx.shape[-1], dim])
+        if self.adapter_config.pooling_attention_mask:
+            attn_mask = valid.reshape([-1, 1, 1, valid.shape[-1]])
+            denom = valid.view(-1, to_pool.shape[-2]).float().sum(-1)
+            denom = torch.where(denom == 0, 1, denom)  # avoid 0/0 for fully padded pooled tokens
+            query = to_pool.sum(-2, keepdim=True) / denom[:, None, None].to(to_pool.dtype)
+        else:
+            attn_mask = None
+            query = to_pool.mean(-2, keepdim=True)
+        pooled_features = self.image_pooling_2d(query, to_pool, attn_mask=attn_mask)
+        pooled_features = pooled_features.reshape([batch_size, -1, pooled_features.shape[-1]])
+
+        # MLP layer to map the feature.
+        pooled_features = self.image_projector(pooled_features)
+        return pooled_features.view(-1, pooled_features.shape[-1])[valid_token.flatten()]
+
+
+# Copied from transformers.models.llama.modeling_llama.rotate_half
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
+class MolmoAct2RotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(
+        self,
+        config: MolmoAct2TextConfig,
+        device: str | torch.device = None,
+        rope_type: str | None = None,
+    ):
+        super().__init__()
+        if rope_type is not None:
+            self.rope_type = rope_type
+        elif hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+            # BC: "rope_type" was originally "type"
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        if self.rope_type == "default":
+            self.rope_init_fn = self._default_rope_init
+        else:
+            self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", inv_freq, persistent=True)
+        self.original_inv_freq = self.inv_freq
+        self.register_buffer("_pos_sin_cache", torch.empty(0), persistent=False)
+        self.register_buffer("_pos_cos_cache", torch.empty(0), persistent=False)
+
+    @staticmethod
+    def _default_rope_init(
+        config: MolmoAct2TextConfig, device: str | torch.device = None, **_
+    ) -> tuple[torch.Tensor, float]:
+        inv_freq = 1.0 / (
+            config.rope_theta
+            ** (torch.arange(0, config.head_dim, 2, dtype=torch.float32, device=device) / config.head_dim)
+        )
+        return inv_freq, 1.0
+
+    def _target_cache_seq_len(self, x: torch.Tensor, position_ids: torch.Tensor | None) -> int:
+        if self.config.max_position_embeddings:
+            return int(self.config.max_position_embeddings)
+        if position_ids is not None:
+            return int(position_ids.max().item()) + 1
+        return int(x.shape[-2])
+
+    def _rope_cache_ready(self, device: torch.device, seq_len: int) -> bool:
+        return (
+            self._pos_sin_cache.numel() > 0
+            and self._pos_sin_cache.device == device
+            and self._pos_cos_cache.device == device
+            and self._pos_sin_cache.shape[-2] >= seq_len
+            and self._pos_cos_cache.shape[-2] >= seq_len
+        )
+
+    def _refresh_inv_freq_if_needed(self, device: torch.device) -> None:
+        device = torch.device(device)
+        expected = int(self.config.head_dim) // 2
+        needs_refresh = (
+            self.inv_freq is None
+            or self._pos_sin_cache.numel() == 0
+            or self.inv_freq.device.type == "meta"
+            or self.inv_freq.device != device
+            or self.inv_freq.numel() != expected
+        )
+        if not needs_refresh:
+            inv_freq_cpu = self.inv_freq.detach()
+            needs_refresh = (
+                not bool(torch.isfinite(inv_freq_cpu).all().item())
+                or bool((inv_freq_cpu <= 0).any().item())
+                or not bool(torch.isclose(inv_freq_cpu[0].cpu(), torch.tensor(1.0)).item())
+            )
+        if needs_refresh:
+            inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+            self.register_buffer("inv_freq", inv_freq, persistent=True)
+            self.original_inv_freq = self.inv_freq
+            self._pos_sin_cache = torch.empty(0, device=device)
+            self._pos_cos_cache = torch.empty(0, device=device)
+
+    def _build_rope_cache(self, device: torch.device, seq_len: int) -> None:
+        device_type = device.type if device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):
+            seq = torch.arange(seq_len, device=device, dtype=torch.float)
+            freqs = torch.einsum("i,j->ij", seq, self.inv_freq.to(device=device, dtype=torch.float))
+            emb = torch.cat((freqs, freqs), dim=-1)
+            self._pos_sin_cache = emb.sin()[None, None, :, :] * self.attention_scaling
+            self._pos_cos_cache = emb.cos()[None, None, :, :] * self.attention_scaling
+
+    @torch.no_grad()
+    def prepare_rope_cache(
+        self,
+        *,
+        device: str | torch.device,
+        max_seq_len: int | None = None,
+    ) -> None:
+        if self.rope_type != "default":
+            return
+        device = torch.device(device)
+        seq_len = int(max_seq_len or self.config.max_position_embeddings or 0)
+        if seq_len <= 0:
+            raise ValueError("RoPE cache preparation requires a positive max sequence length.")
+        if self._rope_cache_ready(device, seq_len):
+            return
+        self._refresh_inv_freq_if_needed(device)
+        self._build_rope_cache(device, seq_len)
+
+    def _select_rope_cache(
+        self,
+        x: torch.Tensor,
+        position_ids: torch.Tensor | None,
+        seq_len: int,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        pos_sin = self._pos_sin_cache[:, :, :seq_len, :]
+        pos_cos = self._pos_cos_cache[:, :, :seq_len, :]
+        if position_ids is None:
+            sin = pos_sin[0, 0, : x.shape[-2], :]
+            cos = pos_cos[0, 0, : x.shape[-2], :]
+        else:
+            sin = pos_sin[0, 0][position_ids].view(position_ids.shape + (pos_sin.shape[-1],))
+            cos = pos_cos[0, 0][position_ids].view(position_ids.shape + (pos_cos.shape[-1],))
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        seq_len = self._target_cache_seq_len(x, position_ids)
+        if not self._rope_cache_ready(x.device, seq_len):
+            self._refresh_inv_freq_if_needed(x.device)
+            self._build_rope_cache(x.device, seq_len)
+        return self._select_rope_cache(x, position_ids, seq_len)
+
+
+class MolmoAct2RMSNorm(nn.Module):
+    """RMS normalization over the last dimension with a learned per-feature scale (no bias)."""
+
+    def __init__(self, size: int, eps: float = 1e-6, device: str | torch.device | None = None):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(size, device=device))
+        self.eps = eps
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Normalize ``x`` by its root-mean-square in float32, cast back, then apply ``weight``."""
+        # Older torch releases reject device_type="mps" in torch.autocast; _build_rope_cache in
+        # this file applies the same CPU fallback.
+        device_type = x.device.type if x.device.type != "mps" else "cpu"
+        with torch.autocast(enabled=False, device_type=device_type):
+            og_dtype = x.dtype
+            x = x.to(torch.float32)
+            variance = x.pow(2).mean(-1, keepdim=True)
+            x = x * torch.rsqrt(variance + self.eps)
+            x = x.to(og_dtype)
+        return self.weight * x
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.eps}"
+
+
+# Copied from transformers.models.llama.modeling_llama.repeat_kv
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: torch.Tensor | None,
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs,
+) -> tuple[torch.Tensor, torch.Tensor | None]:
+    key_states = repeat_kv(key, module.num_key_value_groups)
+    value_states = repeat_kv(value, module.num_key_value_groups)
+
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+        attn_weights = attn_weights + causal_mask
+
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+
+    return attn_output, attn_weights
+
+
+class MolmoAct2Attention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper.
+
+    Q, K and V come from one fused ``att_proj`` projection; the optional RMS q/k norms run before RoPE.
+    """
+
+    def __init__(self, config: MolmoAct2TextConfig, layer_idx: int) -> None:
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.num_heads = config.num_attention_heads
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.head_dim = config.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.is_causal = True
+
+        self.fused_dims = (
+            config.num_attention_heads * config.head_dim,
+            config.head_dim * config.num_key_value_heads,
+            config.head_dim * config.num_key_value_heads,
+        )
+        self.att_proj = nn.Linear(config.hidden_size, sum(self.fused_dims), bias=config.qkv_bias)
+
+        # Layer norms.
+        self.k_norm: MolmoAct2RMSNorm | None = None
+        self.q_norm: MolmoAct2RMSNorm | None = None
+        self.qk_norm_type: str | None = None
+        if config.use_qk_norm:
+            # "qwen3" normalizes each head on its own; any other type normalizes the flattened heads.
+            per_head = config.qk_norm_type == "qwen3"
+            k_norm_size = config.head_dim if per_head else config.num_key_value_heads * config.head_dim
+            q_norm_size = config.head_dim if per_head else config.num_attention_heads * config.head_dim
+            self.k_norm = MolmoAct2RMSNorm(k_norm_size, eps=config.layer_norm_eps)
+            self.q_norm = MolmoAct2RMSNorm(q_norm_size, eps=config.layer_norm_eps)
+            self.qk_norm_type = config.qk_norm_type
+
+        self.attention_dropout = config.attention_dropout
+
+        self.attn_out = nn.Linear(
+            config.head_dim * config.num_attention_heads,
+            config.hidden_size,
+            bias=False,
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: torch.Tensor | None,
+        past_key_values: Cache | None = None,
+        cache_position: torch.LongTensor | None = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, ...]:
+        """Run attention over ``hidden_states`` and return ``(attn_output, attn_weights)``.
+
+        ``attn_weights`` is ``None`` on the SDPA path. When ``collect_layer_kv_states`` is passed in
+        ``kwargs``, the key/value states used for attention (before GQA repetition) are appended.
+        """
+        collect_layer_kv_states = bool(kwargs.pop("collect_layer_kv_states", False))
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+
+        qkv = self.att_proj(hidden_states)
+        query_states, key_states, value_states = qkv.split(self.fused_dims, dim=-1)
+        value_states = value_states.view(hidden_shape)
+
+        # Optionally apply layer norm to keys and queries.
+        if self.q_norm is not None and self.k_norm is not None and self.qk_norm_type != "qwen3":
+            query_states = self.q_norm(query_states)
+            key_states = self.k_norm(key_states)
+
+        query_states = query_states.view(hidden_shape)
+        key_states = key_states.view(hidden_shape)
+        if self.q_norm is not None and self.k_norm is not None and self.qk_norm_type == "qwen3":
+            query_states = self.q_norm(query_states)
+            key_states = self.k_norm(key_states)
+        query_states = query_states.transpose(1, 2)
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(
+                key_states, value_states, self.layer_idx, cache_kwargs
+            )
+
+        collected_key_states = key_states
+        collected_value_states = value_states
+
+        dropout_p = 0.0 if not self.training else self.attention_dropout
+        if self.config._attn_implementation == "sdpa" and (
+            attention_mask is None or torch.is_tensor(attention_mask)
+        ):
+            key_states = repeat_kv(key_states, self.num_key_value_groups)
+            value_states = repeat_kv(value_states, self.num_key_value_groups)
+            # SDPA aligns its is_causal mask to the top-left corner, so one query over a longer
+            # key cache would only see the first key; single-token steps must run unmasked instead.
+            attn_output = F.scaled_dot_product_attention(
+                query_states,
+                key_states,
+                value_states,
+                attn_mask=attention_mask,
+                dropout_p=dropout_p,
+                is_causal=attention_mask is None and query_states.shape[-2] > 1,
+            )
+            attn_output = attn_output.transpose(1, 2).contiguous()
+            attn_weights = None
+        else:
+            attention_interface: Callable = eager_attention_forward
+            if self.config._attn_implementation != "eager":
+                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+            attn_output, attn_weights = attention_interface(
+                self,
+                query_states,
+                key_states,
+                value_states,
+                attention_mask,
+                dropout=dropout_p,
+                scaling=self.scaling,
+                **kwargs,
+            )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.attn_out(attn_output)
+        if collect_layer_kv_states:
+            return attn_output, attn_weights, collected_key_states, collected_value_states
+        return attn_output, attn_weights
+
+
+class LanguageModelMLP(nn.Module):
+    def __init__(
+        self,
+        input_dim: int,
+        intermediate_size: int,
+        hidden_act: str,
+        device: str | torch.device = None,
+    ):
+        super().__init__()
+        self.ff_proj = nn.Linear(input_dim, intermediate_size * 2, bias=False, device=device)
+        self.ff_out = nn.Linear(intermediate_size, input_dim, bias=False, device=device)
+        self.act = ACT2FN[hidden_act]
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.ff_proj(x)
+        x, gate = x.chunk(2, dim=-1)
+        x = self.act(gate) * x
+        x = self.ff_out(x)
+        return x
+
+
+class MolmoAct2DecoderLayer(GradientCheckpointingLayer):
+    def __init__(
+        self,
+        config: MolmoAct2TextConfig,
+        layer_idx: int | None = None,
+        device: str | torch.device = None,
+    ):
+        super().__init__()
+        self.config = config
+
+        self.self_attn = MolmoAct2Attention(config, layer_idx)
+        self.attn_norm = MolmoAct2RMSNorm(config.hidden_size, eps=config.layer_norm_eps, device=device)
+        self.dropout = nn.Dropout(config.residual_dropout)
+        self.mlp = LanguageModelMLP(
+            config.hidden_size,
+            config.intermediate_size,
+            config.hidden_act,
+            device=device,
+        )
+        self.ff_norm = MolmoAct2RMSNorm(config.hidden_size, eps=config.layer_norm_eps, device=device)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: torch.Tensor | None = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: Cache | None = None,
+        output_attentions: bool | None = False,
+        use_cache: bool | None = False,
+        cache_position: torch.LongTensor | None = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.FloatTensor, tuple[torch.FloatTensor, torch.FloatTensor] | None]:
+        collect_layer_kv_states = bool(kwargs.pop("collect_layer_kv_states", False))
+
+        residual = hidden_states
+        hidden_states = self.attn_norm(hidden_states)
+
+        # Self Attention
+        attention_outputs = self.self_attn(
+            hidden_states=hidden_states,
+            position_embeddings=position_embeddings,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            collect_layer_kv_states=collect_layer_kv_states,
+            **kwargs,
+        )
+        hidden_states = attention_outputs[0]
+        self_attn_weights = attention_outputs[1]
+
+        hidden_states = residual + self.dropout(hidden_states)
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.ff_norm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+
+        hidden_states = residual + self.dropout(hidden_states)
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights,)
+        if collect_layer_kv_states:
+            outputs += (attention_outputs[2], attention_outputs[3])
+
+        return outputs
+
+
+class MolmoAct2PostNormDecoderLayer(MolmoAct2DecoderLayer):
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: torch.Tensor | None = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: Cache | None = None,
+        output_attentions: bool | None = False,
+        use_cache: bool | None = False,
+        cache_position: torch.LongTensor | None = None,
+        **kwargs,
+    ) -> tuple[torch.FloatTensor, tuple[torch.FloatTensor, torch.FloatTensor] | None]:
+        collect_layer_kv_states = bool(kwargs.pop("collect_layer_kv_states", False))
+
+        residual = hidden_states
+
+        # Self Attention
+        attention_outputs = self.self_attn(
+            hidden_states=hidden_states,
+            position_embeddings=position_embeddings,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            collect_layer_kv_states=collect_layer_kv_states,
+            **kwargs,
+        )
+        hidden_states = attention_outputs[0]
+        self_attn_weights = attention_outputs[1]
+        hidden_states = self.attn_norm(hidden_states)
+
+        hidden_states = residual + self.dropout(hidden_states)
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.ff_norm(hidden_states)
+
+        hidden_states = residual + self.dropout(hidden_states)
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights,)
+        if collect_layer_kv_states:
+            outputs += (attention_outputs[2], attention_outputs[3])
+
+        return outputs
+
+
+class MolmoAct2Embedding(nn.Module):
+    def __init__(
+        self,
+        num_embeddings: int,
+        num_new_embeddings: int,
+        features: int,
+        device: str | torch.device = None,
+    ):
+        super().__init__()
+        self.embedding = nn.Parameter(
+            torch.zeros(num_embeddings, features, device=device),
+        )
+        self.new_embedding = nn.Parameter(
+            torch.zeros(num_new_embeddings, features, device=device),
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return F.embedding(x, torch.cat([self.embedding, self.new_embedding], dim=0))
+
+
+class MolmoAct2PreTrainedModel(PreTrainedModel):
+    config: MolmoAct2Config
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = [
+        "MolmoAct2DecoderLayer",
+        "MolmoAct2PostNormDecoderLayer",
+        "MolmoAct2VisionBlock",
+        "ViTMultiHeadDotProductAttention",
+    ]
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn = True
+    _supports_sdpa = True
+
+    _can_compile_fullgraph = True
+    _supports_attention_backend = True
+    _can_record_outputs = {
+        "hidden_states": MolmoAct2DecoderLayer,
+        "attentions": MolmoAct2Attention,
+    }
+
+    def _init_weights(self, module):
+        std = self.config.initializer_range
+        if isinstance(module, (nn.Linear,)):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, MolmoAct2Embedding):
+            module.embedding.data.normal_(mean=0.0, std=std)
+            module.new_embedding.data.normal_(mean=0.0, std=std)
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, MolmoAct2RMSNorm):
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, nn.LayerNorm):
+            module.weight.data.fill_(1.0)
+            if module.bias is not None:
+                module.bias.data.zero_()
+
+
+class MolmoAct2TextModel(MolmoAct2PreTrainedModel):
+    config: MolmoAct2TextConfig
+    _no_split_modules = ["MolmoAct2DecoderLayer", "MolmoAct2PostNormDecoderLayer"]
+
+    def __init__(self, config: MolmoAct2TextConfig):
+        super().__init__(config)
+        if config.additional_vocab_size is not None:
+            self.wte = MolmoAct2Embedding(
+                config.vocab_size,
+                config.additional_vocab_size,
+                config.hidden_size,
+            )
+        else:
+            self.wte = nn.Embedding(config.vocab_size, config.hidden_size)
+        self.emb_drop = nn.Dropout(config.embedding_dropout)
+        decoder_layer = MolmoAct2PostNormDecoderLayer if config.norm_after else MolmoAct2DecoderLayer
+        self.blocks = nn.ModuleList(
+            [decoder_layer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.ln_f = MolmoAct2RMSNorm(config.hidden_size, eps=config.layer_norm_eps)
+        if config.rope_scaling_layers is not None:
+            self.rotary_embs = nn.ModuleDict(
+                {
+                    "default": MolmoAct2RotaryEmbedding(config, rope_type="default"),
+                    "scaling": MolmoAct2RotaryEmbedding(config),
+                }
+            )
+        else:
+            self.rotary_emb = MolmoAct2RotaryEmbedding(config)
+        self.gradient_checkpointing = False
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @torch.no_grad()
+    def prepare_rope_cache(
+        self,
+        *,
+        device: str | torch.device,
+        max_seq_len: int | None = None,
+    ) -> None:
+        if self.config.rope_scaling_layers is not None:
+            for rotary_emb in self.rotary_embs.values():
+                rotary_emb.prepare_rope_cache(device=device, max_seq_len=max_seq_len)
+            return
+        self.rotary_emb.prepare_rope_cache(device=device, max_seq_len=max_seq_len)
+
+    def get_input_embeddings(self) -> torch.nn.Module:
+        return self.wte
+
+    def set_input_embeddings(self, value: torch.nn.Module) -> None:
+        self.wte = value
+
+    @can_return_tuple
+    def forward(
+        self,
+        input_ids: torch.LongTensor | None = None,
+        attention_mask: torch.Tensor | None = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: Cache | None = None,
+        inputs_embeds: torch.FloatTensor | None = None,
+        use_cache: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        cache_position: torch.LongTensor | None = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
+        output_attentions = (
+            output_attentions if output_attentions is not None else self.config.output_attentions
+        )
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        collect_layer_kv_states = bool(kwargs.pop("collect_layer_kv_states", False))
+        if collect_layer_kv_states and past_key_values is not None:
+            raise ValueError("collect_layer_kv_states cannot be used with past_key_values.")
+        if collect_layer_kv_states:
+            use_cache = False
+
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if self.gradient_checkpointing and self.training and use_cache:
+            logger.warning_once(
+                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
+            )
+            use_cache = False
+
+        if inputs_embeds is None:
+            input_ids = input_ids * (input_ids != -1).to(input_ids.dtype)
+            inputs_embeds = self.wte(input_ids)
+
+        # torch.jit.trace() doesn't support cache objects in the output
+        if use_cache and past_key_values is None and not torch.jit.is_tracing():
+            past_key_values = DynamicCache(config=self.config)
+
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens,
+                past_seen_tokens + inputs_embeds.shape[1],
+                device=inputs_embeds.device,
+            )
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        # It may already have been prepared by e.g. `generate`
+        if torch.is_tensor(attention_mask) and attention_mask.ndim == 4:
+            causal_mask_mapping = attention_mask
+        elif not isinstance(causal_mask_mapping := attention_mask, dict):
+            # Prepare mask arguments
+            mask_kwargs = {
+                "config": self.config,
+                "input_embeds": inputs_embeds,
+                "attention_mask": attention_mask,
+                "cache_position": cache_position,
+                "past_key_values": past_key_values,
+                "position_ids": position_ids,
+            }
+
+            # Create the mask
+            causal_mask_mapping = create_causal_mask(**mask_kwargs)
+
+        hidden_states = inputs_embeds
+
+        # create position embeddings to be shared across the decoder layers
+        if self.config.rope_scaling_layers is not None:
+            position_embeddings_mapping = {
+                "default": self.rotary_embs["default"](hidden_states, position_ids),
+                "scaling": self.rotary_embs["scaling"](hidden_states, position_ids),
+            }
+        else:
+            position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        collected_kv_states = [] if collect_layer_kv_states else None
+
+        for layer_idx, decoder_block in enumerate(self.blocks[: self.config.num_hidden_layers]):
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            if self.config.rope_scaling_layers is not None:
+                position_embeddings_i = (
+                    position_embeddings_mapping["scaling"]
+                    if layer_idx in self.config.rope_scaling_layers
+                    else position_embeddings_mapping["default"]
+                )
+            else:
+                position_embeddings_i = position_embeddings
+
+            layer_outputs = decoder_block(
+                hidden_states,
+                attention_mask=causal_mask_mapping,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings_i,
+                collect_layer_kv_states=collect_layer_kv_states,
+                **kwargs,
+            )
+
+            hidden_states = layer_outputs[0]
+
+            output_idx = 1
+            if output_attentions:
+                all_self_attns += (layer_outputs[output_idx],)
+                output_idx += 1
+            if collect_layer_kv_states:
+                collected_kv_states.append((layer_outputs[output_idx], layer_outputs[output_idx + 1]))
+
+        hidden_states = self.ln_f(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=tuple(collected_kv_states) if collect_layer_kv_states else past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+
+
+# Adapted from transformers.models.gemma3.modeling_gemma3
+def token_type_ids_mask_function(
+    token_type_ids: torch.Tensor | None = None,
+) -> Callable | None:
+    """
+    This function adds the correct offsets to the `q_idx` and `kv_idx` as the torch API can only accept lengths,
+    not start and end indices.
+    """
+    # Do not return an additional mask in this case
+    if token_type_ids is None:
+        return None
+
+    def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool:
+        # If it's 1 for both query and key/value, we are in an image block
+        # NOTE: static cache shape goes beyond input seq length, while token_type_ids.shape[1] == input seq length
+        # Since vmap doesn't support `if statement` we workaround it with `torch.where`
+        safe_idx = torch.where(kv_idx < token_type_ids.shape[1], kv_idx, 0)
+        token_type_ids_at_kv_idx = token_type_ids[batch_idx, safe_idx]
+        token_type_ids_at_kv_idx = torch.where(kv_idx < token_type_ids.shape[1], token_type_ids_at_kv_idx, 0)
+
+        is_image_block = (token_type_ids[batch_idx, q_idx] == 1) & (token_type_ids_at_kv_idx == 1)
+
+        # This is bidirectional attention whenever we are dealing with image tokens
+        return is_image_block & is_image_block
+
+    return inner_mask
+
+
+class MolmoAct2Model(MolmoAct2PreTrainedModel):
+    base_model_prefix = ""
+    _checkpoint_conversion_mapping = {}
+    # Reference: fix gemma3 grad acc #37208
+    accepts_loss_kwargs = False
+    config: MolmoAct2Config
+
+    def __init__(self, config: MolmoAct2Config):
+        super().__init__(config)
+        self.transformer: MolmoAct2TextModel = MolmoAct2TextModel(config.text_config)
+        self.vision_backbone: MolmoAct2VisionBackbone | None = None
+        if config.vit_config is not None and config.adapter_config is not None:
+            self.vision_backbone = MolmoAct2VisionBackbone(config.vit_config, config.adapter_config)
+        llm_kv_dim = config.text_config.num_key_value_heads * config.text_config.head_dim
+        if config.add_action_expert:
+            self.action_expert = ActionExpert(
+                config.action_expert_config,
+                llm_dim=config.hidden_size,
+                llm_kv_dim=llm_kv_dim,
+                llm_num_layers=config.num_hidden_layers,
+            )
+        else:
+            self.action_expert = None
+        if config.add_action_expert and config.action_expert_depth_gate:
+            if config.action_expert_depth_gate_per_layer:
+                self.action_expert_depth_gate = nn.ModuleList(
+                    nn.Linear(llm_kv_dim, 1) for _ in range(config.action_expert_config.num_layers)
+                )
+            else:
+                self.action_expert_depth_gate = nn.Linear(llm_kv_dim, 1)
+            self.reset_action_expert_depth_gate_parameters()
+        else:
+            self.action_expert_depth_gate = None
+        self._depth_gate_token_ids = self._resolve_depth_gate_token_ids()
+        self.action_cuda_graph_manager: ActionCudaGraphManager | None = None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self) -> torch.nn.Module:
+        return self.transformer.wte
+
+    def set_input_embeddings(self, value: torch.nn.Module) -> None:
+        self.transformer.wte = value
+
+    def set_decoder(self, decoder):
+        self.transformer = decoder
+
+    def get_decoder(self):
+        return self.transformer
+
+    @property
+    def device(self) -> torch.device:
+        return self.transformer.ln_f.weight.device
+
+    def reset_action_expert_depth_gate_parameters(self) -> None:
+        if self.action_expert_depth_gate is None:
+            return
+        gates = (
+            self.action_expert_depth_gate
+            if isinstance(self.action_expert_depth_gate, nn.ModuleList)
+            else [self.action_expert_depth_gate]
+        )
+        for gate in gates:
+            nn.init.zeros_(gate.weight)
+            nn.init.constant_(gate.bias, float(self.config.action_expert_depth_gate_init_bias))
+
+    def _resolve_depth_gate_token_ids(self) -> tuple[int, ...]:
+        if not self.config.action_expert_depth_gate:
+            return ()
+        token_ids = []
+        for token_id in (
+            self.config.depth_output_token_id,
+            self.config.depth_start_token_id,
+            self.config.depth_end_token_id,
+        ):
+            if token_id is not None:
+                token_ids.append(int(token_id))
+        if self.config.depth_token_start_id is not None and int(self.config.num_depth_tokens or 0) > 0:
+            start = int(self.config.depth_token_start_id)
+            token_ids.extend(range(start, start + int(self.config.num_depth_tokens)))
+        return tuple(dict.fromkeys(token_ids))
+
+    def _require_action_expert(self) -> ActionExpert:
+        if self.action_expert is None:
+            raise RuntimeError("This MolmoAct2 checkpoint does not include an action expert.")
+        return self.action_expert
+
+    def _cache_to_sequence(self, cache: torch.Tensor) -> torch.Tensor:
+        if cache.dim() != 4:
+            raise ValueError(f"Expected KV cache tensor with 4 dims, got shape {tuple(cache.shape)}")
+        head_candidates = {
+            self.config.text_config.num_key_value_heads,
+            self.config.text_config.num_attention_heads,
+        }
+        if cache.shape[1] in head_candidates:
+            bsz, n_heads, seq_len, head_dim = cache.shape
+            return cache.permute(0, 2, 1, 3).reshape(bsz, seq_len, n_heads * head_dim)
+        if cache.shape[2] in head_candidates:
+            bsz, seq_len, n_heads, head_dim = cache.shape
+            return cache.reshape(bsz, seq_len, n_heads * head_dim)
+        if cache.shape[1] <= cache.shape[2]:
+            bsz, n_heads, seq_len, head_dim = cache.shape
+            return cache.permute(0, 2, 1, 3).reshape(bsz, seq_len, n_heads * head_dim)
+        bsz, seq_len, n_heads, head_dim = cache.shape
+        return cache.reshape(bsz, seq_len, n_heads * head_dim)
+
+    def _extract_kv_states(self, past_key_values: Cache) -> Sequence[tuple[torch.Tensor, torch.Tensor]]:
+        if past_key_values is None:
+            raise RuntimeError("Action generation requires past_key_values from the VLM forward pass.")
+        seq_len = _cache_seq_len_int(past_key_values)
+        kv_states = []
+        for key, value in _iter_cache_key_values(past_key_values):
+            if key is None or value is None:
+                continue
+            if key.shape[-2] > seq_len:
+                key = key[..., :seq_len, :]
+                value = value[..., :seq_len, :]
+            kv_states.append((self._cache_to_sequence(key), self._cache_to_sequence(value)))
+        if len(kv_states) != self.config.action_expert_config.num_layers:
+            raise RuntimeError(
+                f"Expected {self.config.action_expert_config.num_layers} KV layers, got {len(kv_states)}."
+            )
+        return kv_states
+
+    @staticmethod
+    def _mask_discrete_output_span(
+        row_ids: torch.Tensor,
+        row_mask: torch.Tensor,
+        start_id: int | None,
+        end_id: int | None,
+    ) -> None:
+        if start_id is None or end_id is None:
+            return
+        start_positions = (row_ids == start_id).nonzero(as_tuple=False).flatten().tolist()
+        if not start_positions:
+            return
+        end_positions = (row_ids == end_id).nonzero(as_tuple=False).flatten().tolist()
+        end_ptr = 0
+        for start_pos in start_positions:
+            while end_ptr < len(end_positions) and end_positions[end_ptr] < start_pos:
+                end_ptr += 1
+            if end_ptr >= len(end_positions):
+                row_mask[start_pos:] = False
+                break
+            end_pos = end_positions[end_ptr]
+            row_mask[start_pos : end_pos + 1] = False
+            end_ptr += 1
+
+    def _get_encoder_attention_mask(
+        self,
+        input_ids: torch.Tensor | None,
+        attention_mask: torch.Tensor | None,
+    ) -> torch.Tensor | None:
+        if attention_mask is not None:
+            mask = attention_mask.to(dtype=torch.bool).clone()
+        elif input_ids is not None:
+            mask = input_ids != -1
+        else:
+            return None
+        if self.config.action_mode != "both" or input_ids is None:
+            return mask
+        eos_id = getattr(self.config, "eos_token_id", None)
+        if eos_id is not None:
+            mask &= input_ids != int(eos_id)
+        for batch_idx in range(input_ids.shape[0]):
+            self._mask_discrete_output_span(
+                input_ids[batch_idx],
+                mask[batch_idx],
+                self.config.action_start_token_id,
+                self.config.action_end_token_id,
+            )
+        return mask
+
+    def _get_depth_token_mask(
+        self,
+        input_ids: torch.Tensor | None,
+        encoder_attention_mask: torch.Tensor | None,
+    ) -> torch.Tensor | None:
+        if not self.config.action_expert_depth_gate or input_ids is None or not self._depth_gate_token_ids:
+            return None
+        depth_token_ids = torch.as_tensor(
+            self._depth_gate_token_ids,
+            device=input_ids.device,
+            dtype=input_ids.dtype,
+        )
+        depth_mask = (input_ids.unsqueeze(-1) == depth_token_ids).any(dim=-1)
+        if encoder_attention_mask is not None:
+            depth_mask = depth_mask & encoder_attention_mask.to(device=input_ids.device, dtype=torch.bool)
+        return depth_mask
+
+    @staticmethod
+    def _depth_gate_from_source(
+        gate_head: nn.Linear,
+        *,
+        source: torch.Tensor,
+        depth_mask: torch.Tensor,
+        encoder_attention_mask: torch.Tensor | None,
+    ) -> torch.Tensor:
+        if source.ndim == 4:
+            source = source.reshape(source.shape[0], source.shape[1], -1)
+        if source.ndim != 3:
+            raise ValueError(f"Depth gate expected a 3D sequence tensor, got {tuple(source.shape)}.")
+        if encoder_attention_mask is not None:
+            valid_mask = encoder_attention_mask.to(device=source.device, dtype=torch.bool)
+        else:
+            valid_mask = torch.ones(depth_mask.shape, device=source.device, dtype=torch.bool)
+        depth_mask = depth_mask.to(device=source.device, dtype=torch.bool)
+        pool_mask = valid_mask & ~depth_mask
+        has_pool = pool_mask.any(dim=-1, keepdim=True)
+        pool_mask = torch.where(has_pool, pool_mask, valid_mask)
+        weights = pool_mask.to(dtype=source.dtype).unsqueeze(-1)
+        pooled = (source * weights).sum(dim=1) / weights.sum(dim=1).clamp_min(1.0)
+        gate_logits = gate_head(pooled.to(dtype=gate_head.weight.dtype))
+        return torch.sigmoid(gate_logits).to(dtype=source.dtype)
+
+    def _depth_gate_from_condition(
+        self,
+        *,
+        input_ids: torch.Tensor | None,
+        encoder_attention_mask: torch.Tensor | None,
+        layer_kv_states: Sequence[tuple[torch.Tensor, torch.Tensor]] | None,
+    ) -> tuple[torch.Tensor | Sequence[torch.Tensor] | None, torch.Tensor | None]:
+        gate_head = self.action_expert_depth_gate
+        if gate_head is None:
+            return None, None
+        depth_mask = self._get_depth_token_mask(input_ids, encoder_attention_mask)
+        if depth_mask is None or layer_kv_states is None:
+            return None, depth_mask
+        sources = [value for _, value in layer_kv_states]
+        if isinstance(gate_head, nn.ModuleList):
+            if len(gate_head) != len(sources):
+                raise ValueError(
+                    f"Depth gate layer count mismatch: gates={len(gate_head)}, sources={len(sources)}."
+                )
+            gates = [
+                self._depth_gate_from_source(
+                    gate,
+                    source=source,
+                    depth_mask=depth_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                )
+                for gate, source in zip(gate_head, sources)
+            ]
+            return gates, depth_mask
+        gate = self._depth_gate_from_source(
+            gate_head,
+            source=sources[-1],
+            depth_mask=depth_mask,
+            encoder_attention_mask=encoder_attention_mask,
+        )
+        return gate, depth_mask
+
+    @staticmethod
+    def _depth_gate_for_layer(
+        gate: torch.Tensor | Sequence[torch.Tensor],
+        layer_idx: int,
+        *,
+        num_layers: int,
+    ) -> torch.Tensor:
+        if isinstance(gate, torch.Tensor):
+            return gate
+        if len(gate) != num_layers:
+            raise ValueError(f"Depth gate layer count mismatch: gates={len(gate)}, layers={num_layers}.")
+        return gate[layer_idx]
+
+    def _apply_depth_gate_to_layer_kv_states(
+        self,
+        layer_kv_states: Sequence[tuple[torch.Tensor, torch.Tensor]] | None,
+        depth_mask: torch.Tensor | None,
+        gate: torch.Tensor | Sequence[torch.Tensor] | None,
+    ) -> Sequence[tuple[torch.Tensor, torch.Tensor]] | None:
+        if layer_kv_states is None or depth_mask is None or gate is None:
+            return layer_kv_states
+        gated_kv = []
+        for layer_idx, (key, value) in enumerate(layer_kv_states):
+            layer_gate = self._depth_gate_for_layer(gate, layer_idx, num_layers=len(layer_kv_states))
+            mask = depth_mask.to(device=key.device, dtype=torch.bool)
+            view_shape = [mask.shape[0], mask.shape[1]] + [1] * (key.ndim - 2)
+            scale = torch.ones(view_shape, device=key.device, dtype=key.dtype)
+            gate_view = layer_gate.to(device=key.device, dtype=key.dtype).view(
+                layer_gate.shape[0],
+                *([1] * (key.ndim - 1)),
+            )
+            scale = torch.where(mask.view(view_shape), gate_view, scale)
+            gated_kv.append((key * scale, value * scale))
+        return gated_kv
+
+    @staticmethod
+    def _action_dim_valid_mask(
+        target: torch.Tensor,
+        action_dim_is_pad: torch.Tensor | None,
+    ) -> torch.Tensor | None:
+        if action_dim_is_pad is None:
+            return None
+        mask = ~action_dim_is_pad.to(device=target.device, dtype=torch.bool)
+        if mask.ndim == 1:
+            mask = mask.unsqueeze(0)
+        if mask.shape[-1] != target.shape[-1]:
+            raise ValueError(
+                f"action_dim_is_pad width {mask.shape[-1]} does not match target width {target.shape[-1]}."
+            )
+        if mask.shape[0] == 1 and target.shape[0] != 1:
+            mask = mask.expand(target.shape[0], -1)
+        if mask.shape[0] != target.shape[0]:
+            raise ValueError(
+                f"action_dim_is_pad batch {mask.shape[0]} does not match target batch {target.shape[0]}."
+            )
+        while mask.ndim < target.ndim:
+            mask = mask.unsqueeze(1)
+        return mask
+
+    @classmethod
+    def _mask_action_dim_tensor(
+        cls,
+        tensor: torch.Tensor,
+        *,
+        action_dim_is_pad: torch.Tensor | None,
+        enabled: bool,
+    ) -> torch.Tensor:
+        if not enabled:
+            return tensor
+        valid_mask = cls._action_dim_valid_mask(tensor, action_dim_is_pad)
+        if valid_mask is None:
+            return tensor
+        return tensor.masked_fill(~valid_mask, 0)
+
+    def _run_action_flow_loop(self, inputs: _ActionFlowInputs, steps: int) -> torch.Tensor:
+        action_expert = self._require_action_expert()
+        dt = 1.0 / steps
+        trajectory = inputs.trajectory
+        action_dim_is_pad = inputs.action_dim_is_pad
+        mask_enabled = self.config.mask_action_dim_padding
+        for idx in range(steps):
+            velocity = action_expert.forward_with_context(
+                trajectory,
+                inputs.modulations[idx].conditioning,
+                context=inputs.context,
+                modulation=inputs.modulations[idx],
+            )
+            velocity = self._mask_action_dim_tensor(
+                velocity,
+                action_dim_is_pad=action_dim_is_pad,
+                enabled=mask_enabled,
+            )
+            trajectory = trajectory + dt * velocity
+            trajectory = self._mask_action_dim_tensor(
+                trajectory,
+                action_dim_is_pad=action_dim_is_pad,
+                enabled=mask_enabled,
+            )
+        return trajectory
+
+    def _resolve_action_horizon(self, action_horizon: int | None = None) -> int:
+        max_action_horizon = int(self.config.max_action_horizon or 1)
+        resolved = max_action_horizon if action_horizon is None else int(action_horizon)
+        if resolved < 1:
+            raise ValueError(f"action_horizon must be >= 1, got {resolved}.")
+        if resolved > max_action_horizon:
+            raise ValueError(
+                f"Requested action_horizon={resolved} exceeds checkpoint max_action_horizon={max_action_horizon}."
+            )
+        return resolved
+
+    @torch.no_grad()
+    def generate_actions_from_inputs(
+        self,
+        *,
+        input_ids: torch.LongTensor,
+        pixel_values: torch.Tensor | None = None,
+        image_token_pooling: torch.Tensor | None = None,
+        image_grids: torch.Tensor | None = None,
+        image_num_crops: torch.Tensor | None = None,
+        pixel_values_videos: torch.Tensor | None = None,
+        video_token_pooling: torch.Tensor | None = None,
+        video_grids: torch.Tensor | None = None,
+        attention_mask: torch.Tensor | None = None,
+        token_type_ids: torch.LongTensor | None = None,
+        states: torch.Tensor | None = None,
+        action_dim_is_pad: torch.Tensor | None = None,
+        action_horizon: int | None = None,
+        num_steps: int | None = None,
+        generator: torch.Generator | None = None,
+        encoder_kv_states: Sequence[tuple[torch.Tensor, torch.Tensor]] | None = None,
+        encoder_attention_mask: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        action_expert = self._require_action_expert()
+        if encoder_kv_states is None:
+            outputs = self(
+                input_ids=input_ids,
+                pixel_values=pixel_values,
+                image_token_pooling=image_token_pooling,
+                image_grids=image_grids,
+                image_num_crops=image_num_crops,
+                pixel_values_videos=pixel_values_videos,
+                video_token_pooling=video_token_pooling,
+                video_grids=video_grids,
+                attention_mask=attention_mask,
+                token_type_ids=token_type_ids,
+                use_cache=True,
+            )
+            encoder_kv_states = self._extract_kv_states(outputs.past_key_values)
+            encoder_attention_mask = self._get_encoder_attention_mask(input_ids, attention_mask)
+        elif encoder_attention_mask is None:
+            encoder_attention_mask = self._get_encoder_attention_mask(input_ids, attention_mask)
+
+        depth_gate, depth_mask = self._depth_gate_from_condition(
+            input_ids=input_ids,
+            encoder_attention_mask=encoder_attention_mask,
+            layer_kv_states=encoder_kv_states,
+        )
+        encoder_kv_states = self._apply_depth_gate_to_layer_kv_states(
+            encoder_kv_states,
+            depth_mask,
+            depth_gate,
+        )
+        steps = int(num_steps or self.config.flow_matching_num_steps)
+        if steps <= 0:
+            raise ValueError(f"num_steps must be >= 1, got {steps}.")
+        source_tensor = encoder_kv_states[0][0]
+        batch_size = source_tensor.shape[0]
+        device = source_tensor.device
+        action_horizon = self._resolve_action_horizon(action_horizon)
+        trajectory_dtype = action_expert.action_embed.weight.dtype
+        trajectory = torch.randn(
+            (batch_size, action_horizon, self.config.max_action_dim),
+            device=device,
+            dtype=trajectory_dtype,
+            generator=generator,
+        )
+        trajectory = self._mask_action_dim_tensor(
+            trajectory,
+            action_dim_is_pad=action_dim_is_pad,
+            enabled=self.config.mask_action_dim_padding,
+        )
+        action_context = action_expert.prepare_context(
+            encoder_kv_states=encoder_kv_states,
+            encoder_attention_mask=encoder_attention_mask,
+            state_embeddings=states,
+            batch_size=batch_size,
+            seq_len=trajectory.shape[1],
+            device=device,
+            dtype=trajectory.dtype,
+        )
+        flow_timesteps = [
+            torch.full((batch_size,), idx / steps, device=device, dtype=torch.float32) for idx in range(steps)
+        ]
+        modulation_cache = action_expert.get_or_prepare_modulation_cache(
+            flow_timesteps,
+            cache_key=(steps, batch_size, device, trajectory.dtype),
+        )
+        flow_inputs = _ActionFlowInputs(
+            trajectory=trajectory,
+            context=action_context,
+            modulations=modulation_cache,
+            action_dim_is_pad=action_dim_is_pad,
+        )
+        action_cuda_graph_manager = self.action_cuda_graph_manager
+        if action_cuda_graph_manager is not None and action_cuda_graph_manager.can_use_action_flow(
+            flow_inputs
+        ):
+            trajectory = action_cuda_graph_manager.run_action_flow(
+                flow_inputs, steps, self._run_action_flow_loop
+            )
+        else:
+            trajectory = self._run_action_flow_loop(flow_inputs, steps)
+        return trajectory
+
+    def build_batched_images(
+        self,
+        input_ids: torch.LongTensor,
+        pixel_values: torch.Tensor,
+        image_token_pooling: torch.Tensor,
+        image_grids: torch.Tensor,
+        image_num_crops: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Regroup flat, image-major vision inputs into per-example padded batches.
+
+        Args:
+            input_ids: `[N, seq]` token ids; image end tokens are counted per row to find how many
+                images belong to each example.
+            pixel_values: `[n_crops, n_patches, pixels_per_patch]`, crops of all images concatenated.
+            image_token_pooling: 2-D `[total_pooled_patches, dim]` index tensor; non-negative
+                entries are patch indices local to their image, negative entries are left as is.
+            image_grids: One row per image; the product of the first two columns plus the product
+                of the remaining columns gives that image's pooled patch count
+                (presumably four columns, two grids per image; TODO confirm).
+            image_num_crops: `[num_images]` number of crops per image.
+
+        Returns:
+            `(images, new_token_pooling)`: `images` is `[N, M, n_patches, pixels_per_patch]` with
+            `M` the largest per-example crop count, and `new_token_pooling` is `[N, P, dim]` with
+            `P` the largest per-example pooled patch count. Unused slots are filled with -1, and
+            non-negative pooling indices are shifted by the patch offset of their image within
+            the example.
+
+        Raises:
+            ValueError: If the number of image end tokens is neither equal to nor twice the number
+                of rows in `image_grids`.
+            AssertionError: If the crop or pooled-patch totals are inconsistent with the inputs.
+        """
+        # 1) Count the number of images in each example
+        raw_counts = (input_ids == self.config.image_end_token_id).sum(1)  # [N]
+        total_images = int(image_grids.size(0))
+        total_end_tokens = int(raw_counts.sum().item())
+        # NOTE(review): the `2 * total_images` case presumably covers prompts that emit two end
+        # tokens per image; confirm against the processor.
+        if total_images <= 0:
+            counts = raw_counts.new_zeros(raw_counts.shape)
+        elif total_end_tokens == total_images:
+            counts = raw_counts
+        elif total_end_tokens == 2 * total_images:
+            counts = raw_counts // 2
+        else:
+            raise ValueError(
+                "Could not infer image counts from image end tokens: "
+                f"end_tokens={total_end_tokens}, image_grids={total_images}."
+            )
+        N = counts.size(0)
+        device = input_ids.device
+
+        # Total number of images in the batch
+        num_images = total_images
+
+        # Sanity check
+        assert image_grids.size(0) == num_images, (
+            f"Expected {num_images} image grids, but got {image_grids.size(0)}"
+        )
+        assert image_num_crops.size(0) == num_images, (
+            f"Expected {num_images} image num crops, but got {image_num_crops.size(0)}"
+        )
+
+        # 1-1) Compute per-image pooled patch count from image grids
+        with torch.no_grad():
+            first_prod = image_grids[:, :2].prod(dim=1)  # [num_images]
+            second_prod = image_grids[:, 2:].prod(dim=1)  # [num_images]
+            num_pooled_patches_per_image = (first_prod + second_prod).to(
+                image_num_crops.dtype
+            )  # [num_images]
+
+        # pixel_values: [n_crops, n_patches, pixels_per_patch]
+        n_crops, n_patches, pixels_per_patch = pixel_values.shape
+
+        # 2) Map each image index → example index
+        # Example: if counts = [2, 1, 3], then this becomes [0,0,1,2,2,2]
+        example_ids_for_image = torch.arange(N, device=device).repeat_interleave(counts)  # [num_images]
+        assert example_ids_for_image.numel() == num_images
+
+        # 2-1) Compute crops_per_example by summing per-image crop counts
+        crops_per_example = torch.zeros(N, dtype=image_num_crops.dtype, device=image_num_crops.device)
+        crops_per_example.index_add_(0, example_ids_for_image, image_num_crops)  # [N]
+
+        # 2-2) Per-image number of patches = (crops per image) * n_patches
+        patches_per_image = image_num_crops * n_patches  # [num_images]
+
+        # 2-3) Compute per-example per-image patch offsets
+        counts_list = counts.tolist()
+        index_offset_per_example_list = []
+        offset_img = 0
+        for c in counts_list:
+            per_img_patches = patches_per_image[offset_img : offset_img + c]  # [c]
+            # Offsets: [0, img0_total_patches, img0+img1_total_patches, ...]
+            index_offset = [0] + per_img_patches.cumsum(0).tolist()[:-1]
+            index_offset_per_example_list.append(index_offset)
+            offset_img += c
+
+        # 2-4) Compute num_pooled_patches_per_example
+        num_pooled_patches_per_example = torch.zeros(
+            N,
+            dtype=num_pooled_patches_per_image.dtype,
+            device=num_pooled_patches_per_image.device,
+        )
+        num_pooled_patches_per_example.index_add_(0, example_ids_for_image, num_pooled_patches_per_image)
+
+        # Sanity checks
+        total_crops = int(crops_per_example.sum().item())
+        assert total_crops == n_crops, f"Expected {total_crops} crops, but got {n_crops}"
+
+        total_num_pooled_patches = int(num_pooled_patches_per_example.sum().item())
+        assert total_num_pooled_patches == image_token_pooling.size(0), (
+            f"Expected {total_num_pooled_patches} pooled patches, but got {image_token_pooling.size(0)}"
+        )
+
+        # 3) Build images tensor filled with -1
+        M = int(crops_per_example.max().item())
+        images = torch.full(
+            (N, M, n_patches, pixels_per_patch),
+            fill_value=-1,
+            dtype=pixel_values.dtype,
+            device=pixel_values.device,
+        )
+
+        # 4) Fill images with per-example slices from pixel_values
+        offset_crop = 0
+        for i in range(N):
+            num = int(crops_per_example[i].item())
+            cur = pixel_values[offset_crop : offset_crop + num]  # [num, n_patches, pixels_per_patch]
+            images[i, :num] = cur
+            offset_crop += num
+
+        # Sanity check
+        assert offset_crop == n_crops
+
+        # 5) Build new_token_pooling tensor filled with -1
+        P = int(num_pooled_patches_per_example.max().item())
+        _, dim = image_token_pooling.shape
+        new_token_pooling = torch.full(
+            (N, P, dim),
+            fill_value=-1,
+            dtype=image_token_pooling.dtype,
+            device=image_token_pooling.device,
+        )
+
+        # 6) Fill token_pooling with per-example slices, adding per-image patch offsets
+        patch_offset = 0
+        img_offset = 0
+
+        for i, c in enumerate(counts_list):
+            num_patches = int(num_pooled_patches_per_example[i].item())
+
+            # Subsequence of pooled tokens belonging to this example
+            # (cloned so the caller's image_token_pooling is not modified by the offsetting below)
+            cur = image_token_pooling[patch_offset : patch_offset + num_patches].clone()  # [num_patches, dim]
+
+            index_offset_per_example = index_offset_per_example_list[i]  # length = c
+            per_img_pooled = num_pooled_patches_per_image[img_offset : img_offset + c]  # [c]
+
+            assert len(index_offset_per_example) == per_img_pooled.numel()
+
+            # Apply per-image offsets to the (ragged) subsequence
+            offset = 0
+            for j in range(c):
+                index_offset = int(index_offset_per_example[j])
+                n = int(per_img_pooled[j].item())
+                cur_slice = cur[offset : offset + n]
+
+                # Apply offset across all columns; negative entries are kept unchanged
+                cur[offset : offset + n] = torch.where(
+                    cur_slice >= 0,
+                    cur_slice + index_offset,
+                    cur_slice,
+                )
+                offset += n
+
+            new_token_pooling[i, :num_patches] = cur
+
+            patch_offset += num_patches
+            img_offset += c
+
+        # Final sanity checks
+        assert patch_offset == total_num_pooled_patches
+        assert img_offset == num_images
+
+        return images, new_token_pooling
+
+    def build_batched_videos(
+        self,
+        input_ids: torch.LongTensor,
+        pixel_values_videos: torch.Tensor,
+        video_token_pooling: torch.Tensor,
+        video_grids: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        # Regroup flat, batch-concatenated video tensors into per-example padded tensors.
+        #
+        # Args:
+        #   input_ids: token ids, one row per example (the per-row `.any(dim=1)` needs a 2-D layout).
+        #   pixel_values_videos: [n_frames, n_patches, pixels_per_patch], frames of all videos concatenated.
+        #   video_token_pooling: [total_pooled_patches, dim], pooling rows of all videos concatenated.
+        #   video_grids: one row per video; column 0 is the frame count and the product of a row is
+        #       that video's number of pooled patches.
+        # Returns:
+        #   (videos, new_token_pooling) with shapes [N, M, n_patches, pixels_per_patch] and [N, P, dim],
+        #   where M / P are the per-example maxima; unused slots hold -1.
+        # Raises:
+        #   AssertionError when the flat inputs disagree with the counts derived from `input_ids`.
+        #
+        # 1) Count the number of videos in each example
+        if self.config.use_frame_special_tokens:
+            end_token_id = self.config.frame_end_token_id
+        else:
+            end_token_id = self.config.image_end_token_id
+        # `.any(dim=1)` yields 0 or 1 per example, i.e. at most one video is counted per row.
+        counts = (input_ids == end_token_id).any(dim=1).long()  # [N]
+        N = counts.size(0)
+        device = input_ids.device
+
+        # Total number of videos in the batch
+        num_videos = int(counts.sum().item())
+
+        # Sanity check
+        assert video_grids.size(0) == num_videos, (
+            f"Expected {num_videos} videos, but got {video_grids.size(0)}"
+        )
+
+        video_num_frames = video_grids[:, 0]  # [num_videos]
+        num_pooled_patches_per_video = video_grids.prod(dim=1)  # [num_videos]
+
+        # pixel_values_videos: [n_frames, n_patches, pixels_per_patch]
+        n_frames, n_patches, pixels_per_patch = pixel_values_videos.shape
+
+        # 2) Map each video index -> example index
+        # Example: counts is 0/1 per example here, so counts = [1, 0, 1] becomes [0, 2]
+        example_ids_for_video = torch.arange(N, device=device).repeat_interleave(counts)  # [num_videos]
+        assert example_ids_for_video.numel() == num_videos
+
+        # 2-1) Compute frames_per_example by summing per-video frame counts
+        frames_per_example = torch.zeros(
+            N,
+            dtype=video_num_frames.dtype,
+            device=device,
+        )
+        frames_per_example.index_add_(0, example_ids_for_video, video_num_frames)  # [N]
+
+        # 2-2) Compute num_pooled_patches_per_example
+        num_pooled_patches_per_example = torch.zeros(
+            N,
+            dtype=num_pooled_patches_per_video.dtype,
+            device=num_pooled_patches_per_video.device,
+        )
+        num_pooled_patches_per_example.index_add_(
+            0,
+            example_ids_for_video,
+            num_pooled_patches_per_video,
+        )
+
+        # Sanity checks
+        total_frames = int(frames_per_example.sum().item())
+        assert total_frames == n_frames, f"Expected {total_frames} frames, but got {n_frames}"
+
+        total_num_pooled_patches = int(num_pooled_patches_per_example.sum().item())
+        assert total_num_pooled_patches == video_token_pooling.size(0), (
+            f"Expected {total_num_pooled_patches} pooled patches, but got {video_token_pooling.size(0)}"
+        )
+
+        # 3) Build videos tensor filled with -1
+        # M is the largest per-example frame count; shorter examples keep -1 padding.
+        M = int(frames_per_example.max().item())
+        videos = torch.full(
+            (N, M, n_patches, pixels_per_patch),
+            fill_value=-1,
+            dtype=pixel_values_videos.dtype,
+            device=device,
+        )
+
+        # 4) Fill videos with per-examples slices from pixel_values_videos
+        # Frames are consumed in batch order, so the running offset walks the flat tensor once.
+        offset_frame = 0
+        for i in range(N):
+            num = int(frames_per_example[i].item())
+            cur = pixel_values_videos[offset_frame : offset_frame + num]  # [num, n_patches, pixels_per_patch]
+            videos[i, :num] = cur
+            offset_frame += num
+
+        # Sanity check
+        assert offset_frame == n_frames
+
+        # 5) Build new token_pooling tensor filled with -1
+        # P is the largest per-example pooled-patch count.
+        P = int(num_pooled_patches_per_example.max().item())
+        _, dim = video_token_pooling.shape
+        new_token_pooling = torch.full(
+            (N, P, dim),
+            fill_value=-1,
+            dtype=video_token_pooling.dtype,
+            device=video_token_pooling.device,
+        )
+
+        # 6) Fill new token_pooling with per-examples slices from video_token_pooling
+        # Unlike the image path, the pooling values are copied unchanged (no index offset is added here).
+        patch_offset = 0
+        for i in range(N):
+            num_patches = int(num_pooled_patches_per_example[i].item())
+            cur = video_token_pooling[patch_offset : patch_offset + num_patches]  # [num_patches, dim]
+            new_token_pooling[i, :num_patches] = cur
+            patch_offset += num_patches
+
+        # Final sanity checks
+        assert patch_offset == total_num_pooled_patches
+
+        return videos, new_token_pooling
+
+    def merge_visual_inputs(
+        self,
+        input_ids: torch.LongTensor | None = None,
+        pixel_values: torch.Tensor | None = None,
+        image_token_pooling: torch.Tensor | None = None,
+        image_grids: torch.Tensor | None = None,
+        image_num_crops: torch.Tensor | None = None,
+        pixel_values_videos: torch.Tensor | None = None,
+        video_token_pooling: torch.Tensor | None = None,
+        video_grids: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor | None, torch.Tensor | None]:
+        if pixel_values is not None and pixel_values_videos is not None:
+            raise ValueError("pixel_values and pixel_values_videos are provided at the same time")
+        elif pixel_values is not None:
+            assert input_ids is not None
+            images, token_pooling = self.build_batched_images(
+                input_ids=input_ids,
+                pixel_values=pixel_values,
+                image_token_pooling=image_token_pooling,
+                image_grids=image_grids,
+                image_num_crops=image_num_crops,
+            )
+        elif pixel_values_videos is not None:
+            assert input_ids is not None
+            images, token_pooling = self.build_batched_videos(
+                input_ids=input_ids,
+                pixel_values_videos=pixel_values_videos,
+                video_token_pooling=video_token_pooling,
+                video_grids=video_grids,
+            )
+        else:
+            images, token_pooling = None, None
+        return images, token_pooling
+
+    def build_input_embeddings(
+        self,
+        input_ids: torch.LongTensor,
+        images: torch.FloatTensor | None = None,  # image inputs
+        token_pooling: torch.LongTensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        # Embed `input_ids` and, when `images` is given, add the vision-backbone features onto
+        # the embeddings at the positions whose id equals `config.image_patch_id`.
+        #
+        # Returns (embeddings after `emb_drop`, image_features or None when no images are given).
+        # Raises RuntimeError when the number of image-patch positions differs from the number
+        # of feature rows returned by the vision backbone.
+        #
+        # Get embeddings of input.
+        # shape: (batch_size, seq_len, d_model)
+        # Multiplying by the 0/1 mask turns every id equal to -1 into 0 before the lookup.
+        input_ids = input_ids * (input_ids != -1).to(input_ids.dtype)
+        x = self.transformer.wte(input_ids)
+
+        image_features: torch.FloatTensor | None = None
+        if images is not None:
+            image_features = self.vision_backbone(images, token_pooling).to(x.device)
+            # Flat boolean selector over batch * seq_len positions.
+            is_image_patch = input_ids.reshape(-1) == self.config.image_patch_id
+            if is_image_patch.sum() != len(image_features):
+                raise RuntimeError(
+                    f"Expected {int(is_image_patch.sum())} image patch embeddings, got {len(image_features)}."
+                )
+            # Work on a cloned flat copy so the indexed assignment does not write into `x` itself.
+            flat_x = x.reshape(-1, x.shape[-1]).clone()
+            # Features are added to (not substituted for) the token embeddings.
+            flat_x[is_image_patch] = flat_x[is_image_patch] + image_features
+            x = flat_x.reshape_as(x)
+
+        # shape: (batch_size, seq_len, d_model)
+        x = self.transformer.emb_drop(x)  # type: ignore
+
+        return x, image_features
+
+    def _build_native_attention_bias(
+        self,
+        *,
+        inputs_embeds: torch.Tensor,
+        attention_mask: torch.Tensor | None,
+        token_type_ids: torch.Tensor | None,
+        past_key_values: Cache | None,
+    ) -> torch.Tensor:
+        # Build an additive attention bias in `inputs_embeds.dtype`: 0 where attention is allowed
+        # and `finfo(dtype).min` elsewhere. The result broadcasts to
+        # [batch, 1, seq_len, attention_mask_len].
+        #
+        # A 4-D `attention_mask` is treated as already built and is only moved to the right device.
+        # Raises ValueError for masks that are neither None, 2-D nor 4-D.
+        if attention_mask is not None and attention_mask.ndim == 4:
+            return attention_mask.to(device=inputs_embeds.device)
+        batch_size, seq_len = inputs_embeds.shape[:2]
+        past_length = _cache_seq_len_int(past_key_values)
+        current_length = past_length + int(seq_len)
+        max_cache_len = _cache_max_len_int(past_key_values)
+        # Key axis length: the cache's maximum length when it reports one (> 0), otherwise the
+        # number of tokens seen so far including this call.
+        attention_mask_len = max_cache_len if max_cache_len > 0 else current_length
+        device = inputs_embeds.device
+
+        if attention_mask is None:
+            # No padding mask: every position below `current_length` is valid.
+            positions = torch.arange(attention_mask_len, device=device)
+            valid_mask = positions.unsqueeze(0) < current_length
+            valid_mask = valid_mask.expand(batch_size, -1)
+        elif attention_mask.ndim == 2:
+            # Copy the caller's mask into a buffer of the key-axis length; anything not covered
+            # by the caller's mask stays False.
+            valid_mask = torch.zeros((batch_size, attention_mask_len), device=device, dtype=torch.bool)
+            source_mask = attention_mask.to(device=device, dtype=torch.bool)
+            copy_len = min(int(source_mask.shape[-1]), attention_mask_len)
+            if copy_len > 0:
+                valid_mask[:, :copy_len] = source_mask[:, :copy_len]
+            # Positions past the tokens seen so far are never valid.
+            if attention_mask_len > current_length:
+                valid_mask[:, current_length:] = False
+        else:
+            raise ValueError(f"Unsupported attention_mask shape for MolmoAct2: {tuple(attention_mask.shape)}")
+
+        valid_mask = valid_mask[:, None, None, :]
+        # Lower-triangular mask restricted to the query rows of this call: [1, 1, seq_len, attention_mask_len].
+        causal_mask = torch.tril(
+            torch.ones(attention_mask_len, attention_mask_len, device=device, dtype=torch.bool)
+        )[None, None, past_length:current_length, :attention_mask_len]
+
+        # Only on the first call (empty cache): positions whose `token_type_ids` are non-zero may
+        # also attend to each other regardless of order.
+        # NOTE(review): the indexing assumes token_type_ids has the same sequence length as
+        # inputs_embeds - confirm against callers.
+        if token_type_ids is not None and past_length == 0:
+            causal_mask = causal_mask.expand(batch_size, -1, -1, -1).clone()
+            image_mask = token_type_ids.to(device=device, dtype=torch.bool)
+            can_attend_back = image_mask[:, :, None] & image_mask[:, None, :]
+            image_len = min(int(token_type_ids.shape[1]), attention_mask_len)
+            causal_mask[:, :, :, :image_len] = (
+                causal_mask[:, :, :, :image_len] | can_attend_back[:, None, :, :image_len]
+            )
+
+        allowed = valid_mask & causal_mask
+        # Scalar (0-dim) tensors broadcast inside torch.where.
+        return torch.where(
+            allowed,
+            torch.zeros((), device=device, dtype=inputs_embeds.dtype),
+            torch.full(
+                (),
+                torch.finfo(inputs_embeds.dtype).min,
+                device=device,
+                dtype=inputs_embeds.dtype,
+            ),
+        )
+
+    @can_return_tuple
+    def forward(
+        self,
+        input_ids: torch.LongTensor | None = None,
+        pixel_values: torch.FloatTensor | None = None,
+        image_token_pooling: torch.Tensor | None = None,
+        image_grids: torch.Tensor | None = None,
+        image_num_crops: torch.Tensor | None = None,
+        pixel_values_videos: torch.Tensor | None = None,
+        video_token_pooling: torch.Tensor | None = None,
+        video_grids: torch.Tensor | None = None,
+        attention_mask: torch.Tensor | None = None,
+        position_ids: torch.Tensor | None = None,
+        past_key_values: Cache | None = None,
+        token_type_ids: torch.LongTensor | None = None,
+        inputs_embeds: torch.FloatTensor | None = None,
+        use_cache: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        cache_position: torch.LongTensor | None = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple | MolmoAct2ModelOutputWithPast:
+        # Run the backbone: batch the visual inputs, build the input embeddings (unless
+        # `inputs_embeds` is given), build the attention bias and call `self.transformer`.
+        #
+        # Raises ValueError when not exactly one of `input_ids` / `inputs_embeds` is given, or
+        # when visual inputs are combined with `inputs_embeds`.
+        #
+        # Unset flags fall back to the values stored on the config.
+        output_attentions = (
+            output_attentions if output_attentions is not None else self.config.output_attentions
+        )
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        # True when both are given or both are missing.
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        images, token_pooling = self.merge_visual_inputs(
+            input_ids=input_ids,
+            pixel_values=pixel_values,
+            image_token_pooling=image_token_pooling,
+            image_grids=image_grids,
+            image_num_crops=image_num_crops,
+            pixel_values_videos=pixel_values_videos,
+            video_token_pooling=video_token_pooling,
+            video_grids=video_grids,
+        )
+
+        if images is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both images and inputs_embeds at the same time.")
+
+        if inputs_embeds is None:
+            inputs_embeds, image_features = self.build_input_embeddings(
+                input_ids,
+                images,
+                token_pooling,
+            )
+
+        # Default positions continue right after whatever the cache already holds.
+        if cache_position is None:
+            past_seen_tokens = _cache_seq_len_int(past_key_values)
+            cache_position = torch.arange(
+                past_seen_tokens,
+                past_seen_tokens + inputs_embeds.shape[1],
+                device=inputs_embeds.device,
+            )
+
+        # A dict `attention_mask` is forwarded untouched; anything else is converted to an
+        # additive bias by `_build_native_attention_bias`.
+        if isinstance(attention_mask, dict):
+            causal_mask_mapping = attention_mask
+        else:
+            causal_mask_mapping = self._build_native_attention_bias(
+                inputs_embeds=inputs_embeds,
+                attention_mask=attention_mask,
+                token_type_ids=token_type_ids,
+                past_key_values=past_key_values,
+            )
+
+        outputs = self.transformer(
+            attention_mask=causal_mask_mapping,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        # `image_features` is only bound when embeddings were built above; the `images is not None`
+        # guard keeps it from being read otherwise (images together with inputs_embeds raised earlier).
+        return MolmoAct2ModelOutputWithPast(
+            last_hidden_state=outputs.last_hidden_state,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=image_features if images is not None else None,
+        )
+
+
+class MolmoAct2ForConditionalGeneration(MolmoAct2PreTrainedModel, GenerationMixin):
+    _checkpoint_conversion_mapping = {}
+    _tied_weights_keys = []  # Weights are not tied
+    # Reference: fix gemma3 grad acc #37208
+    accepts_loss_kwargs = False
+    config: MolmoAct2Config
+
+    def __init__(self, config: MolmoAct2Config):
+        # Build the backbone model, the LM head and the two CUDA-graph managers, then run
+        # `post_init()`.
+        super().__init__(config)
+
+        self.model = MolmoAct2Model(config)
+        # Untied, bias-free projection from hidden states to vocabulary logits.
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        self.vocab_size = config.vocab_size
+        # The action manager is attached to the inner model; the depth-decode manager wraps
+        # this outer module.
+        self.model.action_cuda_graph_manager = ActionCudaGraphManager(self.model)
+        self.depth_decode_cuda_graph_manager = DepthDecodeCudaGraphManager(self)
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self) -> torch.nn.Module:
+        return self.model.transformer.wte
+
+    def set_input_embeddings(self, value: torch.nn.Module) -> None:
+        self.model.transformer.wte = value
+
+    def set_decoder(self, decoder):
+        self.model.set_decoder(decoder)
+
+    def get_decoder(self):
+        return self.model.get_decoder()
+
+    # Make modules available through conditional class for BC
+    @property
+    def language_model(self) -> torch.nn.Module:
+        return self.model.transformer
+
+    @property
+    def vision_backbone(self) -> torch.nn.Module:
+        return self.model.vision_backbone
+
+    def _get_robot_stats(self) -> _RobotStats:
+        stats = getattr(self, "_molmoact2_robot_stats", None)
+        if stats is not None:
+            return stats
+        filename = getattr(self.config, "norm_stats_filename", "norm_stats.json")
+        base_dir = getattr(self.config, "_name_or_path", None) or getattr(self, "name_or_path", None)
+        if not base_dir:
+            raise ValueError(
+                "MolmoAct2 normalization stats are not loaded and config._name_or_path is empty; "
+                "load the model from a converted HF directory containing norm_stats.json."
+            )
+        stats_path = os.path.join(str(base_dir), filename)
+        if not os.path.isfile(stats_path):
+            try:
+                from huggingface_hub import hf_hub_download
+
+                stats_path = hf_hub_download(str(base_dir), filename, repo_type="model")
+            except Exception as exc:
+                raise FileNotFoundError(
+                    f"MolmoAct2 normalization stats file is missing: {stats_path}. "
+                    "Converted checkpoints must include norm_stats.json."
+                ) from exc
+        with open(stats_path, encoding="utf-8") as f:
+            payload = json.load(f)
+        stats = _RobotStats(payload)
+        self._molmoact2_robot_stats = stats
+        return stats
+
+    @staticmethod
+    def _move_inputs_to_device(inputs: Mapping[str, Any], device: torch.device) -> dict[str, Any]:
+        out = {}
+        for key, value in inputs.items():
+            out[key] = value.to(device) if torch.is_tensor(value) else value
+        return out
+
+    @staticmethod
+    def _drop_trivial_attention_mask(inputs: Mapping[str, Any]) -> dict[str, Any]:
+        out = dict(inputs)
+        attention_mask = out.get("attention_mask")
+        if torch.is_tensor(attention_mask) and bool(attention_mask.to(dtype=torch.bool).all().item()):
+            out.pop("attention_mask", None)
+        return out
+
+    @staticmethod
+    def _count_images(images: Any) -> int:
+        if images is None:
+            return 0
+        if isinstance(images, (list, tuple)):
+            return len(images)
+        arr = np.asarray(images) if not torch.is_tensor(images) else images
+        if getattr(arr, "ndim", 0) == 4:
+            return int(arr.shape[0])
+        return 1
+
+    @staticmethod
+    def _build_action_dim_is_pad(
+        *,
+        action_dim: int,
+        max_action_dim: int,
+        batch_size: int,
+        device: torch.device,
+    ) -> torch.Tensor | None:
+        if int(action_dim) > int(max_action_dim):
+            raise ValueError(
+                f"Requested action_dim {int(action_dim)} exceeds checkpoint max_action_dim {int(max_action_dim)}."
+            )
+        if int(action_dim) == int(max_action_dim):
+            return None
+        mask = torch.ones((int(batch_size), int(max_action_dim)), device=device, dtype=torch.bool)
+        mask[:, : int(action_dim)] = False
+        return mask
+
+    @staticmethod
+    def _slice_action_dim(actions: torch.Tensor, action_dim: int) -> torch.Tensor:
+        if actions.shape[-1] < int(action_dim):
+            raise ValueError(
+                f"Requested action_dim {int(action_dim)} but chunk only has width {actions.shape[-1]}."
+            )
+        return actions[..., : int(action_dim)]
+
+    @staticmethod
+    def _slice_action_chunk(
+        actions: torch.Tensor, n_obs_steps: int, n_action_steps: int | None
+    ) -> torch.Tensor:
+        if n_action_steps is None:
+            return actions
+        start = int(n_obs_steps) - 1
+        end = start + int(n_action_steps)
+        if end > actions.shape[1]:
+            raise ValueError(f"Requested actions up to {end} but model produced horizon {actions.shape[1]}.")
+        return actions[:, start:end]
+
+    def _depth_token_id_to_bin(self) -> dict[int, int]:
+        if self.config.depth_token_start_id is None or int(self.config.num_depth_tokens or 0) <= 0:
+            return {}
+        start = int(self.config.depth_token_start_id)
+        return {start + idx: idx for idx in range(int(self.config.num_depth_tokens))}
+
+    def _action_token_id_to_bin(self) -> dict[int, int]:
+        if self.config.action_token_start_id is None or int(self.config.num_action_tokens or 0) <= 0:
+            return {}
+        start = int(self.config.action_token_start_id)
+        return {start + idx: idx for idx in range(int(self.config.num_action_tokens))}
+
+    def _require_eos_token_id(self) -> int:
+        eos_token_id = getattr(self.config, "eos_token_id", None)
+        if eos_token_id is None and getattr(self, "generation_config", None) is not None:
+            eos_token_id = getattr(self.generation_config, "eos_token_id", None)
+        if isinstance(eos_token_id, (list, tuple)):
+            eos_token_id = eos_token_id[0] if eos_token_id else None
+        if eos_token_id is None:
+            raise RuntimeError(
+                "Discrete action generation requires `eos_token_id` in the converted HF config."
+            )
+        return int(eos_token_id)
+
+    def _decode_depth_bins_from_token_ids(self, token_ids: torch.Tensor) -> torch.Tensor:
+        if self.config.depth_start_token_id is None or self.config.depth_end_token_id is None:
+            raise RuntimeError("Depth generation requires <depth_start>/<depth_end> token IDs.")
+        token_id_to_bin = self._depth_token_id_to_bin()
+        if not token_id_to_bin:
+            raise RuntimeError("Depth generation requires indexed depth tokens in the converted config.")
+        depth_token_bins = _extract_discrete_token_bins(
+            _flatten_generated_token_ids(token_ids),
+            int(self.config.depth_start_token_id),
+            int(self.config.depth_end_token_id),
+            token_id_to_bin,
+        )
+        if not depth_token_bins:
+            raise RuntimeError("Model generated no decodable depth tokens between <depth_start>/<depth_end>.")
+        return torch.as_tensor([depth_token_bins], device=self.device, dtype=torch.long)
+
+    def _consume_generation_tokens(
+        self,
+        token_ids: torch.Tensor,
+        *,
+        past_key_values: Cache | None,
+        attention_mask: torch.Tensor | None,
+    ) -> tuple[MolmoAct2CausalLMOutputWithPast, torch.Tensor | None]:
+        if token_ids.ndim == 1:
+            next_input_ids = token_ids.unsqueeze(1)
+        elif token_ids.ndim == 2:
+            next_input_ids = token_ids
+        else:
+            raise ValueError(f"Expected token_ids to have rank 1 or 2, got {tuple(token_ids.shape)}.")
+        next_attention_mask = attention_mask
+        if next_attention_mask is not None:
+            past_length = _cache_seq_len_int(past_key_values)
+            required_len = int(past_length) + int(next_input_ids.shape[1])
+            if int(next_attention_mask.shape[-1]) < required_len:
+                pad_len = required_len - int(next_attention_mask.shape[-1])
+                next_attention_mask = torch.cat(
+                    (
+                        next_attention_mask,
+                        next_attention_mask.new_ones((next_input_ids.shape[0], pad_len)),
+                    ),
+                    dim=-1,
+                )
+        past_length = _cache_seq_len_int(past_key_values)
+        output = self(
+            input_ids=next_input_ids,
+            attention_mask=next_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=True,
+            cache_position=(
+                torch.arange(
+                    past_length,
+                    past_length + int(next_input_ids.shape[1]),
+                    device=next_input_ids.device,
+                )
+                if past_key_values is not None
+                else None
+            ),
+        )
+        return output, next_attention_mask
+
+    def _make_depth_decode_attention_bias(
+        self, inputs: Mapping[str, Any], past_key_values: Cache
+    ) -> torch.Tensor:
+        # Precompute an additive causal attention bias of shape
+        # [batch, 1, max_cache_len, max_cache_len] in the LM head's dtype: 0 where attention is
+        # allowed, `finfo(dtype).min` elsewhere.
+        #
+        # Raises RuntimeError when the first cache layer does not report a positive `max_cache_len`.
+        layers = getattr(past_key_values, "layers", None)
+        max_cache_len = int(getattr(layers[0], "max_cache_len", 0)) if layers else 0
+        if max_cache_len <= 0:
+            raise RuntimeError("Depth decode fast path requires a cache with a fixed maximum length.")
+        input_ids = inputs["input_ids"]
+        batch_size = int(input_ids.shape[0])
+        device = input_ids.device
+        dtype = self.lm_head.weight.dtype
+
+        positions = torch.arange(max_cache_len, device=device, dtype=torch.long)
+        # Every key position starts out valid; only the span covered by the caller's
+        # attention mask is overwritten, so positions beyond it stay valid.
+        valid_mask = torch.ones((batch_size, max_cache_len), device=device, dtype=torch.bool)
+        attention_mask = inputs.get("attention_mask")
+        if attention_mask is not None:
+            source_mask = attention_mask.to(device=device, dtype=torch.bool)
+            copy_len = min(int(source_mask.shape[-1]), max_cache_len)
+            if copy_len > 0:
+                valid_mask[:, :copy_len] = source_mask[:, :copy_len]
+        # causal_mask[q, k] is True when key position k <= query position q.
+        causal_mask = positions[None, :] <= positions[:, None]
+        allowed = causal_mask.unsqueeze(0) & valid_mask[:, None, :]
+        attention_bias = torch.where(
+            allowed[:, None, :, :],
+            torch.zeros((), device=device, dtype=dtype),
+            torch.full((), torch.finfo(dtype).min, device=device, dtype=dtype),
+        )
+        return attention_bias
+
+    def _embed_base_tokens(self, input_ids: torch.Tensor) -> torch.Tensor:
+        # Skips MolmoAct2Embedding's per-call cat([base, new]); safe only for IDs
+        # below text_config.vocab_size. This includes released depth/action tokens.
+        wte = self.model.transformer.wte
+        base_embedding = getattr(wte, "embedding", None)
+        if base_embedding is None:
+            return wte(input_ids)
+        return F.embedding(input_ids, base_embedding)
+
+    def _run_ar_decode_step(
+        self,
+        token_ids: torch.Tensor,
+        *,
+        past_key_values: Cache,
+        attention_bias: torch.Tensor,
+    ) -> tuple[torch.Tensor, Cache]:
+        # Run one autoregressive decode step directly on the inner transformer.
+        #
+        # `token_ids` may be rank 1 (a length-1 sequence axis is added) or rank 2.
+        # `attention_bias` is the full precomputed bias; the eager path slices out the rows/columns
+        # for this step.
+        # Returns (hidden state of the last position, cache) on the eager path, or whatever the
+        # CUDA-graph manager's `run` returns when it accepts the inputs.
+        # Raises ValueError for any other rank of `token_ids`.
+        if token_ids.ndim == 1:
+            next_input_ids = token_ids.unsqueeze(1)
+        elif token_ids.ndim == 2:
+            next_input_ids = token_ids
+        else:
+            raise ValueError(f"Expected token_ids to have rank 1 or 2, got {tuple(token_ids.shape)}.")
+        past_length = _cache_seq_len_int(past_key_values)
+        end = past_length + int(next_input_ids.shape[1])
+        # Fast path: delegate to the CUDA-graph manager whenever it reports it can handle these inputs.
+        if self.depth_decode_cuda_graph_manager.can_use(
+            next_input_ids,
+            past_key_values=past_key_values,
+            attention_bias=attention_bias,
+        ):
+            return self.depth_decode_cuda_graph_manager.run(
+                next_input_ids,
+                past_key_values=past_key_values,
+                attention_bias=attention_bias,
+                past_length=past_length,
+            )
+        # Eager path: query rows for the new tokens, key columns up to the new end position.
+        cache_position = torch.arange(past_length, end, device=next_input_ids.device, dtype=torch.long)
+        attention_bias = attention_bias[:, :, past_length:end, :end]
+        inputs_embeds = self._embed_base_tokens(next_input_ids)
+        outputs = self.model.transformer(
+            attention_mask=attention_bias,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=True,
+            output_attentions=False,
+            output_hidden_states=False,
+            cache_position=cache_position,
+        )
+        # Only the final position's hidden state is returned (sequence axis kept with length 1).
+        return outputs.last_hidden_state[:, -1:, :], outputs.past_key_values
+
+    def _run_depth_decode_step(
+        self,
+        token_ids: torch.Tensor,
+        *,
+        past_key_values: Cache,
+        attention_bias: torch.Tensor,
+    ) -> tuple[torch.Tensor, Cache]:
+        return self._run_ar_decode_step(
+            token_ids,
+            past_key_values=past_key_values,
+            attention_bias=attention_bias,
+        )
+
+    def _project_depth_logits(self, last_hidden: torch.Tensor) -> torch.Tensor:
+        start = int(self.config.depth_token_start_id)
+        end_id = start + int(self.config.num_depth_tokens)
+        return F.linear(last_hidden, self.lm_head.weight[start:end_id])
+
+    def _max_depth_decode_steps(self) -> int:
+        return max(
+            int(self.config.num_depth_codes or 0) + 8,
+            self.model._resolve_action_horizon() * 16,
+            1,
+        )
+
+    def _make_ar_decode_static_cache(self, inputs: Mapping[str, Any], max_steps: int) -> Cache:
+        prompt_len = inputs["input_ids"].shape[1]
+        return self.depth_decode_cuda_graph_manager.make_static_cache(
+            max_cache_len=prompt_len + max(1, int(max_steps)),
+        )
+
+    def _make_depth_static_cache(self, inputs: Mapping[str, Any]) -> Cache:
+        prompt_len = inputs["input_ids"].shape[1]
+        action_horizon = self.model._resolve_action_horizon()
+        max_end_steps = max(8, action_horizon)
+        action_token_budget = max(1, action_horizon * 16)
+        return self.depth_decode_cuda_graph_manager.make_static_cache(
+            max_cache_len=prompt_len + self._max_depth_decode_steps() + max_end_steps + action_token_budget,
+        )
+
+    def _continue_discrete_generation_from_output(
+        self,
+        initial_output: MolmoAct2CausalLMOutputWithPast,
+        *,
+        past_key_values: Cache | None,
+        attention_mask: torch.Tensor | None,
+        end_token_id: int,
+        max_steps: int,
+        attention_bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        # Greedily (argmax) continue decoding from `initial_output` until `end_token_id` appears.
+        #
+        # With `attention_bias=None` each token is fed through the full model via
+        # `_consume_generation_tokens`; otherwise `_run_ar_decode_step` is used and the logits
+        # are obtained by applying `lm_head` to its hidden state.
+        #
+        # Returns the generated ids stacked along dim 1 (one column per step), including the end
+        # token. Raises RuntimeError when nothing was generated (e.g. `max_steps <= 0`) or when
+        # the end token was not produced within `max_steps` steps.
+        generated_tokens: list[torch.Tensor] = []
+        current_output = initial_output
+        current_past_key_values = past_key_values
+        current_attention_mask = attention_mask
+        hit_end = False
+        for _ in range(int(max_steps)):
+            next_token = torch.argmax(current_output.logits[:, -1, :], dim=-1)
+            generated_tokens.append(next_token)
+            # Stop only once every batch row emitted the end token in this same step; the end
+            # token itself is not fed back into the model.
+            if bool((next_token == int(end_token_id)).all()):
+                hit_end = True
+                break
+            if attention_bias is None:
+                current_output, current_attention_mask = self._consume_generation_tokens(
+                    next_token,
+                    past_key_values=current_past_key_values,
+                    attention_mask=current_attention_mask,
+                )
+                current_past_key_values = current_output.past_key_values
+            else:
+                last_hidden, current_past_key_values = self._run_ar_decode_step(
+                    next_token,
+                    past_key_values=current_past_key_values,
+                    attention_bias=attention_bias,
+                )
+                # Wrap the logits so the next iteration can read `.logits` uniformly.
+                current_output = MolmoAct2CausalLMOutputWithPast(
+                    logits=self.lm_head(last_hidden),
+                    past_key_values=current_past_key_values,
+                )
+        if not generated_tokens:
+            raise RuntimeError("Discrete continuation generated no tokens.")
+        if not hit_end:
+            raise RuntimeError(
+                f"Discrete continuation did not emit end token {int(end_token_id)} within {int(max_steps)} steps."
+            )
+        return torch.stack(generated_tokens, dim=1)
+
+    def _generate_depth_prefix(
+        self,
+        inputs: Mapping[str, Any],
+        *,
+        latest_first_image: np.ndarray | None,
+        depth_cache: Mapping[str, Any] | None,
+        enable_adaptive_depth: bool,
+    ) -> _DepthPrefix:
+        if self.config.depth_start_token_id is None or self.config.depth_end_token_id is None:
+            raise RuntimeError("Depth reasoning requires single-token <depth_start>/<depth_end>.")
+        if self.config.depth_token_start_id is None or int(self.config.num_depth_tokens or 0) <= 0:
+            raise RuntimeError("Depth reasoning requires indexed depth tokens.")
+        batch_size = int(inputs["input_ids"].shape[0])
+        if batch_size != 1 and enable_adaptive_depth:
+            raise ValueError("enable_adaptive_depth=True currently supports batch size 1.")
+        static_cache = self._make_depth_static_cache(inputs)
+        output = self(**inputs, use_cache=True, past_key_values=static_cache)
+        current_output = output
+        current_past_key_values = output.past_key_values
+        current_attention_mask = inputs.get("attention_mask")
+        generated_tokens: list[torch.Tensor] = []
+
+        if not enable_adaptive_depth:
+            hit_depth_end = False
+            max_steps = self._max_depth_decode_steps()
+            for _ in range(max_steps):
+                next_token = torch.argmax(current_output.logits[:, -1, :], dim=-1)
+                generated_tokens.append(next_token)
+                current_output, current_attention_mask = self._consume_generation_tokens(
+                    next_token,
+                    past_key_values=current_past_key_values,
+                    attention_mask=current_attention_mask,
+                )
+                current_past_key_values = current_output.past_key_values
+                if bool((next_token == int(self.config.depth_end_token_id)).all()):
+                    hit_depth_end = True
+                    break
+            if not generated_tokens:
+                raise RuntimeError("Depth generation produced no tokens.")
+            if not hit_depth_end:
+                raise RuntimeError(f"Depth generation did not emit <depth_end> within {max_steps} steps.")
+            depth_token_ids = torch.stack(generated_tokens, dim=1)
+            full_input_ids = torch.cat([inputs["input_ids"], depth_token_ids], dim=1)
+            full_attention_mask = None
+            if current_attention_mask is not None:
+                full_attention_mask = current_attention_mask[:, : full_input_ids.shape[1]]
+            encoder_kv_states = self.model._extract_kv_states(current_past_key_values)
+            return _DepthPrefix(
+                token_ids=depth_token_ids,
+                depth_bins=self._decode_depth_bins_from_token_ids(depth_token_ids),
+                full_input_ids=full_input_ids,
+                attention_mask=full_attention_mask,
+                encoder_kv_states=encoder_kv_states,
+                next_output=current_output,
+                past_key_values=current_past_key_values,
+            )
+
+        depth_start = torch.full(
+            (batch_size,),
+            int(self.config.depth_start_token_id),
+            device=self.device,
+            dtype=torch.long,
+        )
+        code_token_ids = torch.arange(
+            int(self.config.depth_token_start_id),
+            int(self.config.depth_token_start_id) + int(self.config.num_depth_tokens),
+            device=self.device,
+            dtype=torch.long,
+        )
+        depth_attention_bias = self._make_depth_decode_attention_bias(inputs, current_past_key_values)
+        generated_tokens.append(depth_start)
+        last_hidden, current_past_key_values = self._run_depth_decode_step(
+            depth_start,
+            past_key_values=current_past_key_values,
+            attention_bias=depth_attention_bias,
+        )
+        previous_image = None
+        previous_bins = None
+        if depth_cache is not None:
+            previous_image = depth_cache.get("image")
+            previous_bins = depth_cache.get("depth_bins")
+        selective = (
+            bool(enable_adaptive_depth)
+            and latest_first_image is not None
+            and previous_image is not None
+            and previous_bins is not None
+        )
+        update_mask = None
+        previous_buffer_t = None
+        if selective:
+            previous_buffer = np.asarray(previous_bins, dtype=np.int64).reshape(-1)
+            if previous_buffer.shape[0] == int(self.config.num_depth_codes):
+                update_mask = _compute_depth_update_mask(
+                    latest_first_image,
+                    _normalize_image_for_cache(previous_image),
+                    num_depth_codes=int(self.config.num_depth_codes),
+                )
+                previous_buffer_t = (
+                    torch.from_numpy(previous_buffer)
+                    .to(
+                        device=self.device,
+                        dtype=torch.long,
+                    )
+                    .unsqueeze(0)
+                )
+            else:
+                selective = False
+
+        depth_bins = torch.zeros(
+            (batch_size, int(self.config.num_depth_codes)),
+            device=self.device,
+            dtype=torch.long,
+        )
+        num_depth_codes = int(self.config.num_depth_codes)
+        if not selective or update_mask is None or previous_buffer_t is None:
+            for depth_idx in range(num_depth_codes):
+                depth_logits = self._project_depth_logits(last_hidden)
+                predicted_bins = depth_logits.squeeze(1).argmax(dim=-1)
+                depth_bins[:, depth_idx] = predicted_bins
+                chosen_token_ids = code_token_ids[predicted_bins]
+                generated_tokens.append(chosen_token_ids)
+                last_hidden, current_past_key_values = self._run_depth_decode_step(
+                    chosen_token_ids,
+                    past_key_values=current_past_key_values,
+                    attention_bias=depth_attention_bias,
+                )
+        else:
+            for start_idx, end_idx, should_generate in _build_depth_update_spans(update_mask):
+                if should_generate:
+                    for depth_idx in range(start_idx, end_idx):
+                        depth_logits = self._project_depth_logits(last_hidden)
+                        predicted_bins = depth_logits.squeeze(1).argmax(dim=-1)
+                        depth_bins[:, depth_idx] = predicted_bins
+                        chosen_token_ids = code_token_ids[predicted_bins]
+                        generated_tokens.append(chosen_token_ids)
+                        last_hidden, current_past_key_values = self._run_depth_decode_step(
+                            chosen_token_ids,
+                            past_key_values=current_past_key_values,
+                            attention_bias=depth_attention_bias,
+                        )
+                    continue
+                replay_bins = previous_buffer_t[:, start_idx:end_idx].expand(batch_size, -1)
+                depth_bins[:, start_idx:end_idx] = replay_bins
+                replay_token_ids = code_token_ids[replay_bins]
+                generated_tokens.extend(replay_token_ids.unbind(dim=1))
+                last_hidden, current_past_key_values = self._run_depth_decode_step(
+                    replay_token_ids,
+                    past_key_values=current_past_key_values,
+                    attention_bias=depth_attention_bias,
+                )
+        hit_depth_end = False
+        max_depth_end_steps = max(8, self.model._resolve_action_horizon())
+        full_logits = self.lm_head(last_hidden)
+        for _ in range(max_depth_end_steps):
+            next_token = full_logits.squeeze(1).argmax(dim=-1)
+            generated_tokens.append(next_token)
+            last_hidden, current_past_key_values = self._run_depth_decode_step(
+                next_token,
+                past_key_values=current_past_key_values,
+                attention_bias=depth_attention_bias,
+            )
+            full_logits = self.lm_head(last_hidden)
+            if bool((next_token == int(self.config.depth_end_token_id)).all()):
+                hit_depth_end = True
+                break
+        if not hit_depth_end:
+            raise RuntimeError(
+                f"Depth generation did not emit <depth_end> within {max_depth_end_steps} steps "
+                "after adaptive depth tokens."
+            )
+
+        depth_token_ids = torch.stack(generated_tokens, dim=1)
+        full_input_ids = torch.cat([inputs["input_ids"], depth_token_ids], dim=1)
+        attention_mask = inputs.get("attention_mask")
+        if attention_mask is not None:
+            full_attention_mask = torch.cat(
+                (attention_mask, attention_mask.new_ones(depth_token_ids.shape)),
+                dim=-1,
+            )[:, : full_input_ids.shape[1]]
+        else:
+            full_attention_mask = None
+        current_output = MolmoAct2CausalLMOutputWithPast(
+            logits=full_logits,
+            past_key_values=current_past_key_values,
+        )
+        encoder_kv_states = self.model._extract_kv_states(current_past_key_values)
+        return _DepthPrefix(
+            token_ids=depth_token_ids,
+            depth_bins=depth_bins,
+            full_input_ids=full_input_ids,
+            attention_mask=full_attention_mask,
+            encoder_kv_states=encoder_kv_states,
+            next_output=current_output,
+            past_key_values=current_past_key_values,
+        )
+
+    def _decode_discrete_action_chunk(
+        self,
+        generated_token_ids: torch.Tensor,
+        *,
+        action_tokenizer: Any,
+        action_dim: int,
+        action_horizon: int,
+    ) -> torch.Tensor:
+        """Decode generated token ids into a rank-3 float32 action tensor.
+
+        The generated ids are flattened, reduced to the action bins found between the
+        ``<action_start>`` and ``<action_end>`` token ids, and handed to
+        ``action_tokenizer.decode``. The decoded array is then brought to rank 3 by
+        adding leading axes (rank 1 and 2) or by collapsing everything in front of
+        the last two axes into a single leading axis of size 1 (rank > 3).
+
+        Args:
+            generated_token_ids: Token ids produced by generation.
+            action_tokenizer: Object exposing ``decode``; must not be ``None``.
+            action_dim: Passed to ``decode`` as ``action_dim``.
+            action_horizon: Passed to ``decode`` as ``time_horizon``.
+
+        Returns:
+            A float32 tensor on ``self.device`` with three dimensions.
+
+        Raises:
+            ValueError: If ``action_tokenizer`` is ``None``.
+            RuntimeError: If the action delimiter ids or the token-to-bin table are
+                missing, if no action bins could be extracted, or if the decoded
+                array is a scalar.
+        """
+        if action_tokenizer is None:
+            raise ValueError("inference_action_mode='discrete' requires an `action_tokenizer` input.")
+
+        cfg = self.config
+        delimiters_missing = cfg.action_start_token_id is None or cfg.action_end_token_id is None
+        if delimiters_missing:
+            raise RuntimeError("Discrete action generation requires <action_start>/<action_end> token IDs.")
+
+        bin_lookup = self._action_token_id_to_bin()
+        if not bin_lookup:
+            raise RuntimeError(
+                "Discrete action generation requires indexed action tokens in the converted config."
+            )
+
+        flat_token_ids = _flatten_generated_token_ids(generated_token_ids)
+        action_bins = _extract_discrete_token_bins(
+            flat_token_ids,
+            int(cfg.action_start_token_id),
+            int(cfg.action_end_token_id),
+            bin_lookup,
+        )
+        if not action_bins:
+            raise RuntimeError(
+                "Model generated no decodable action tokens between <action_start>/<action_end>."
+            )
+
+        # The keyword form is tried first; any TypeError raised while building or
+        # making that call falls back to the positional-only form.
+        try:
+            decoded = action_tokenizer.decode(
+                [action_bins],
+                time_horizon=int(action_horizon),
+                action_dim=int(action_dim),
+            )
+        except TypeError:
+            decoded = action_tokenizer.decode([action_bins])
+
+        chunk = np.asarray(decoded, dtype=np.float32)
+        rank = chunk.ndim
+        if rank == 0:
+            # A scalar is the only rank that cannot be brought to three dimensions.
+            raise RuntimeError(f"Decoded action chunk has unexpected shape {chunk.shape}.")
+        if rank == 1:
+            chunk = chunk[np.newaxis, np.newaxis, :]
+        elif rank == 2:
+            chunk = chunk[np.newaxis, :, :]
+        elif rank > 3:
+            chunk = chunk.reshape((1, *chunk.shape[-2:]))
+        return torch.as_tensor(chunk, device=self.device, dtype=torch.float32)
+
+    @torch.no_grad()
+    def predict_action(
+        self,
+        *,
+        processor: Any,
+        images: Any,
+        task: str,
+        state: Any,
+        norm_tag: str,
+        inference_action_mode: str | None = None,
+        enable_depth_reasoning: bool = False,
+        enable_adaptive_depth: bool = True,
+        depth_cache: Mapping[str, Any] | None = None,
+        action_tokenizer: Any = None,
+        num_steps: int | None = None,
+        n_action_steps: int | None = None,
+        generator: torch.Generator | None = None,
+        normalize_language: bool = True,
+        enable_cuda_graph: bool = True,
+        return_dict: bool = True,
+    ) -> MolmoAct2ActionOutput | torch.Tensor:
+        """Predict robot actions from images, a task string and the robot state.
+
+        The call validates the requested mode against the checkpoint config, builds
+        a text prompt that embeds the normalized state as a discrete state string,
+        runs ``processor`` on the prompt and images, and then produces actions in
+        one of two ways:
+
+        * ``"continuous"``: ``self.model.generate_actions_from_inputs`` is called,
+          on the prompt alone or on the prompt plus a generated depth prefix.
+        * ``"discrete"``: action tokens are generated with the EOS token id as the
+          end token and a budget of ``max(1, 16 * action_horizon)`` steps, then
+          decoded by ``_decode_discrete_action_chunk``.
+
+        The result is sliced to the action dimension and to the resolved number of
+        action steps, un-normalized with the stats for ``norm_tag``, and converted
+        to a float32 tensor on ``self.device``.
+
+        Args:
+            processor: Called as ``processor(text=..., images=..., return_tensors="pt")``.
+            images: Image input(s) forwarded to ``processor``. With depth reasoning,
+                ``_extract_first_image(images)`` also feeds the depth cache.
+            task: Task instruction; a falsy value becomes ``""``.
+            state: Robot state (required). It is normalized with the stats for
+                ``norm_tag`` before being discretized into the prompt.
+            norm_tag: Tag selecting the robot stats and metadata; passed through
+                ``stats.validate_tag`` first.
+            inference_action_mode: ``"continuous"`` or ``"discrete"``; must be given.
+            enable_depth_reasoning: Generate a depth prefix before the actions. Only
+                allowed when ``config.enable_depth_reasoning`` is truthy.
+            enable_adaptive_depth: Forwarded to ``_generate_depth_prefix``.
+            depth_cache: Cache from a previous call, forwarded to
+                ``_generate_depth_prefix``. It is returned unchanged unless a depth
+                prefix was generated and a first image was available.
+            action_tokenizer: Required in discrete mode; used to decode the bins.
+            num_steps: Forwarded to ``generate_actions_from_inputs`` (continuous only).
+            n_action_steps: Number of action steps to keep. Defaults to
+                ``stats.get_n_action_steps(norm_tag)`` and then to the action horizon.
+            generator: Forwarded to ``generate_actions_from_inputs`` (continuous only).
+            normalize_language: Apply ``_normalize_question_text`` to the task text.
+            enable_cuda_graph: Passed to ``set_enabled`` on the action and depth-decode
+                CUDA graph managers. In discrete mode without depth reasoning it also
+                selects the static-cache prefill path.
+            return_dict: Return a ``MolmoAct2ActionOutput`` when true, otherwise only
+                the actions tensor.
+
+        Returns:
+            ``MolmoAct2ActionOutput`` with ``actions``, ``generated_token_ids``,
+            ``depth_bins`` and ``depth_cache``, or just the actions tensor.
+
+        Raises:
+            ValueError: For a missing ``state``, a missing or unknown
+                ``inference_action_mode``, a mode the checkpoint's ``action_mode`` does
+                not allow, a missing ``action_tokenizer`` in discrete mode, depth
+                reasoning on a checkpoint without it, or an out-of-range horizon /
+                ``n_action_steps``.
+            RuntimeError: If continuous mode is requested without an action expert,
+                or the config has no indexed state tokens.
+        """
+        # ---- Argument validation against the checkpoint configuration ----
+        if state is None:
+            raise ValueError("MolmoAct2 `predict_action` requires `state` for discrete state prompting.")
+        if inference_action_mode is None:
+            raise ValueError(
+                "`inference_action_mode` must be provided explicitly as either 'continuous' or 'discrete'."
+            )
+        inference_action_mode = str(inference_action_mode)
+        if inference_action_mode not in {"continuous", "discrete"}:
+            raise ValueError("inference_action_mode must be either 'continuous' or 'discrete'.")
+        if inference_action_mode == "continuous" and not bool(self.config.add_action_expert):
+            raise RuntimeError(
+                "inference_action_mode='continuous' requires an action expert, but this checkpoint "
+                "was converted with add_action_expert=False."
+            )
+        if inference_action_mode == "continuous" and self.config.action_mode not in {
+            "continuous",
+            "both",
+        }:
+            raise ValueError(
+                "inference_action_mode='continuous' requires checkpoint action_mode in "
+                f"{{'continuous', 'both'}}, got {self.config.action_mode!r}."
+            )
+        if inference_action_mode == "discrete":
+            if action_tokenizer is None:
+                raise ValueError("inference_action_mode='discrete' requires an `action_tokenizer` input.")
+            if self.config.action_mode not in {"discrete", "both"}:
+                raise ValueError(
+                    "inference_action_mode='discrete' requires checkpoint action_mode in "
+                    f"{{'discrete', 'both'}}, got {self.config.action_mode!r}."
+                )
+        if enable_depth_reasoning and not bool(self.config.enable_depth_reasoning):
+            raise ValueError("this model was not trained with `--enable_depth_reasoning`.")
+
+        # ---- Prompt construction: normalized state is embedded as a discrete string ----
+        stats = self._get_robot_stats()
+        norm_tag = stats.validate_tag(norm_tag)
+        metadata = stats.get_metadata(norm_tag)
+        normalized_state = np.asarray(stats.normalize_state(state, norm_tag), dtype=np.float32)
+        num_state_tokens = int(self.config.num_state_tokens or 0)
+        if num_state_tokens <= 0:
+            raise RuntimeError(
+                "Discrete state prompting requires indexed state tokens in the converted config."
+            )
+        discrete_state_string = _build_discrete_state_string(normalized_state, num_state_tokens)
+        style = "robot_depth_action" if enable_depth_reasoning else "robot_action"
+        task_text = str(task or "")
+        if normalize_language:
+            task_text = _normalize_question_text(task_text)
+        text = _build_robot_text(
+            task=task_text,
+            style=style,
+            discrete_state_string=discrete_state_string,
+            setup_type=str(metadata.get("setup_type", "") or ""),
+            control_mode=str(metadata.get("control_mode", "") or ""),
+            add_setup_tokens=bool(self.config.add_setup_tokens),
+            add_control_tokens=bool(self.config.add_control_tokens),
+            num_images=self._count_images(images),
+        )
+        inputs = processor(text=text, images=images, return_tensors="pt")
+        inputs = self._move_inputs_to_device(inputs, self.device)
+        inputs = self._drop_trivial_attention_mask(inputs)
+
+        # ---- Resolve action dimension, horizon and number of steps to return ----
+        # Per-tag stats take precedence; the checkpoint config is the fallback.
+        action_dim = stats.get_action_dim(norm_tag)
+        if action_dim is None:
+            action_dim = int(self.config.max_action_dim)
+        action_dim = int(action_dim)
+        max_action_horizon = self.model._resolve_action_horizon()
+        # `or` means a falsy tag horizon (None or 0) falls back to the checkpoint maximum.
+        action_horizon = stats.get_action_horizon(norm_tag) or max_action_horizon
+        if int(action_horizon) > max_action_horizon:
+            raise ValueError(
+                f"Tag action_horizon={int(action_horizon)} exceeds checkpoint max_action_horizon={max_action_horizon}."
+            )
+        generation_horizon = int(action_horizon)
+        # Precedence for n_action_steps: explicit argument, then tag stats, then the horizon.
+        resolved_n_action_steps = n_action_steps
+        if resolved_n_action_steps is None:
+            resolved_n_action_steps = stats.get_n_action_steps(norm_tag)
+        if resolved_n_action_steps is None:
+            resolved_n_action_steps = int(action_horizon)
+        resolved_n_action_steps = int(resolved_n_action_steps)
+        if resolved_n_action_steps < 1:
+            raise ValueError(f"n_action_steps must be >= 1, got {resolved_n_action_steps}.")
+        if resolved_n_action_steps > int(action_horizon):
+            raise ValueError(
+                f"Requested n_action_steps={resolved_n_action_steps} exceeds tag action_horizon={int(action_horizon)}."
+            )
+        batch_size = int(inputs["input_ids"].shape[0])
+        action_dim_is_pad = self._build_action_dim_is_pad(
+            action_dim=action_dim,
+            max_action_dim=int(self.config.max_action_dim),
+            batch_size=batch_size,
+            device=self.device,
+        )
+        # Both CUDA graph managers follow the caller's switch for this call.
+        self.model.action_cuda_graph_manager.set_enabled(enable_cuda_graph)
+        self.depth_decode_cuda_graph_manager.set_enabled(enable_cuda_graph)
+
+        generated_token_ids = None
+        depth_bins = None
+        # Returned unchanged unless a depth prefix is generated and a first image exists.
+        updated_depth_cache = depth_cache
+        if inference_action_mode == "continuous":
+            if enable_depth_reasoning:
+                # Generate the depth tokens first, then generate actions on prompt + depth
+                # tokens, passing along the KV states extracted for that prefix.
+                latest_first_image = _extract_first_image(images)
+                depth_prefix = self._generate_depth_prefix(
+                    inputs,
+                    latest_first_image=latest_first_image,
+                    depth_cache=depth_cache,
+                    enable_adaptive_depth=bool(enable_adaptive_depth),
+                )
+                generated_token_ids = depth_prefix.token_ids
+                depth_bins = depth_prefix.depth_bins
+                actions = self.model.generate_actions_from_inputs(
+                    input_ids=depth_prefix.full_input_ids,
+                    attention_mask=depth_prefix.attention_mask,
+                    action_dim_is_pad=action_dim_is_pad,
+                    action_horizon=generation_horizon,
+                    num_steps=num_steps,
+                    generator=generator,
+                    encoder_kv_states=depth_prefix.encoder_kv_states,
+                    encoder_attention_mask=self.model._get_encoder_attention_mask(
+                        depth_prefix.full_input_ids,
+                        depth_prefix.attention_mask,
+                    ),
+                )
+                if latest_first_image is not None:
+                    # Cache the image and the flattened int64 depth bins for the next call.
+                    updated_depth_cache = {
+                        "image": latest_first_image,
+                        "depth_bins": depth_bins.detach().cpu().reshape(-1).numpy().astype(np.int64),
+                    }
+            else:
+                actions = self.model.generate_actions_from_inputs(
+                    **inputs,
+                    action_dim_is_pad=action_dim_is_pad,
+                    action_horizon=generation_horizon,
+                    num_steps=num_steps,
+                    generator=generator,
+                )
+        else:
+            if enable_depth_reasoning:
+                # Discrete mode with depth: continue token generation from the state left
+                # behind by the depth prefix, then concatenate depth and action tokens.
+                latest_first_image = _extract_first_image(images)
+                depth_prefix = self._generate_depth_prefix(
+                    inputs,
+                    latest_first_image=latest_first_image,
+                    depth_cache=depth_cache,
+                    enable_adaptive_depth=bool(enable_adaptive_depth),
+                )
+                action_token_ids = self._continue_discrete_generation_from_output(
+                    depth_prefix.next_output,
+                    past_key_values=depth_prefix.past_key_values,
+                    attention_mask=depth_prefix.attention_mask,
+                    end_token_id=self._require_eos_token_id(),
+                    max_steps=max(1, int(generation_horizon * 16)),
+                )
+                generated_token_ids = torch.cat([depth_prefix.token_ids, action_token_ids], dim=1)
+                depth_bins = depth_prefix.depth_bins
+                if latest_first_image is not None:
+                    updated_depth_cache = {
+                        "image": latest_first_image,
+                        "depth_bins": depth_bins.detach().cpu().reshape(-1).numpy().astype(np.int64),
+                    }
+            else:
+                # Decode budget of 16 tokens per horizon step, at least 1.
+                # NOTE(review): the factor 16 is not explained in this block - confirm its origin.
+                max_action_decode_steps = max(1, int(generation_horizon * 16))
+                action_attention_bias = None
+                if enable_cuda_graph:
+                    # With CUDA graphs the prefill writes into a static cache created for the
+                    # decode budget, and the decode loop receives a matching attention bias.
+                    action_static_cache = self._make_ar_decode_static_cache(
+                        inputs,
+                        max_steps=max_action_decode_steps,
+                    )
+                    action_attention_bias = self._make_depth_decode_attention_bias(
+                        inputs,
+                        action_static_cache,
+                    )
+                    prefill_output = self(
+                        **inputs,
+                        use_cache=True,
+                        past_key_values=action_static_cache,
+                    )
+                else:
+                    prefill_output = self(**inputs, use_cache=True)
+                action_token_ids = self._continue_discrete_generation_from_output(
+                    prefill_output,
+                    past_key_values=prefill_output.past_key_values,
+                    attention_mask=inputs.get("attention_mask"),
+                    end_token_id=self._require_eos_token_id(),
+                    max_steps=max_action_decode_steps,
+                    attention_bias=action_attention_bias,
+                )
+                generated_token_ids = action_token_ids
+            # Both discrete paths end by decoding the generated tokens into an action chunk.
+            actions = self._decode_discrete_action_chunk(
+                generated_token_ids,
+                action_tokenizer=action_tokenizer,
+                action_dim=action_dim,
+                action_horizon=generation_horizon,
+            )
+
+        # ---- Post-processing: slice, un-normalize, and coerce to a float32 tensor ----
+        actions = self._slice_action_dim(actions, action_dim)
+        actions = self._slice_action_chunk(actions, int(self.config.n_obs_steps), resolved_n_action_steps)
+        actions = stats.unnormalize_action(actions, norm_tag)
+        if not torch.is_tensor(actions):
+            actions = torch.as_tensor(actions, device=self.device, dtype=torch.float32)
+        else:
+            actions = actions.to(device=self.device, dtype=torch.float32)
+        output = MolmoAct2ActionOutput(
+            actions=actions,
+            generated_token_ids=generated_token_ids,
+            depth_bins=depth_bins,
+            depth_cache=updated_depth_cache,
+        )
+        if return_dict:
+            return output
+        return actions
+
+    @can_return_tuple
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        pixel_values: torch.Tensor | None = None,
+        image_token_pooling: torch.Tensor | None = None,
+        image_grids: torch.Tensor | None = None,
+        image_num_crops: torch.Tensor | None = None,
+        pixel_values_videos: torch.Tensor | None = None,
+        video_token_pooling: torch.Tensor | None = None,
+        video_grids: torch.Tensor | None = None,
+        attention_mask: torch.Tensor | None = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: list[torch.FloatTensor] | None = None,
+        token_type_ids: torch.LongTensor | None = None,
+        inputs_embeds: torch.FloatTensor | None = None,
+        labels: torch.LongTensor | None = None,
+        use_cache: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        cache_position: torch.LongTensor | None = None,
+        logits_to_keep: int | torch.Tensor = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple | MolmoAct2CausalLMOutputWithPast:
+        r"""
+        Run the backbone and project its last hidden state to logits with `lm_head`.
+
+        Every argument except `labels` and `logits_to_keep` is forwarded by keyword to
+        `self.model`. `logits_to_keep` selects which positions on the second axis of the
+        hidden states are projected: an `int` keeps the last `logits_to_keep` positions
+        (`0` keeps all of them), any other value is used directly as the index. When
+        `labels` is given, the loss is computed with `self.loss_function`; otherwise the
+        returned `loss` is `None`.
+
+        Example:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from lerobot.policies.molmoact2.hf_model.modeling_molmoact2 import MolmoAct2ForConditionalGeneration
+        >>> from lerobot.policies.molmoact2.processor_molmoact2 import _load_local_molmoact2_processor
+
+        >>> model = MolmoAct2ForConditionalGeneration.from_pretrained("...")
+        >>> processor = _load_local_molmoact2_processor("...")
+
+        >>> prompt = "What's the content of the image?"
+        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> messages = [{"role": "user", "content": [{"type": "text", "text": prompt}, {"type": "image", "image": image}]}]
+
+        >>> inputs = processor.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt", return_dict=True)
+
+        >>> # Generate
+        >>> generated_ids = model.generate(**inputs, max_new_tokens=15)
+        >>> generated_tokens = generated_ids[:, inputs['input_ids'].size(1):]
+        >>> processor.post_process_image_text_to_text(generated_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "The image shows a bustling street scene in what appears to be a Chinatown area. There's ..."
+        ```"""
+        outputs = self.model(
+            input_ids=input_ids,
+            pixel_values=pixel_values,
+            image_token_pooling=image_token_pooling,
+            image_grids=image_grids,
+            image_num_crops=image_num_crops,
+            pixel_values_videos=pixel_values_videos,
+            video_token_pooling=video_token_pooling,
+            video_grids=video_grids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = outputs.last_hidden_state
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        # For the default 0, slice(-0, None) is slice(0, None), i.e. every position is kept.
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        # The loss is only computed when the caller supplies labels.
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.vocab_size)
+
+        return MolmoAct2CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=outputs.image_hidden_states,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids: torch.LongTensor,
+        past_key_values: list[torch.FloatTensor] | None = None,
+        inputs_embeds: torch.FloatTensor | None = None,
+        pixel_values: torch.FloatTensor | None = None,
+        image_token_pooling: torch.Tensor | None = None,
+        image_grids: torch.Tensor | None = None,
+        image_num_crops: torch.Tensor | None = None,
+        pixel_values_videos: torch.Tensor | None = None,
+        video_token_pooling: torch.Tensor | None = None,
+        video_grids: torch.Tensor | None = None,
+        attention_mask: torch.Tensor | None = None,
+        token_type_ids: torch.LongTensor | None = None,
+        cache_position: torch.LongTensor | None = None,
+        logits_to_keep: int | torch.Tensor | None = None,
+        **kwargs,
+    ):
+        """Build the per-step model inputs, attaching visual tensors only at prefill.
+
+        The text-side inputs come from the parent implementation. The image and
+        video tensors are added when there is no cache at all, or when the cache
+        exposes ``get_seq_length`` and reports a length of 0. A cache object
+        without ``get_seq_length`` never gets the visual tensors.
+
+        Returns:
+            The mapping returned by the parent implementation, possibly extended
+            with the seven visual input entries.
+        """
+        text_side_kwargs = {
+            "past_key_values": past_key_values,
+            "inputs_embeds": inputs_embeds,
+            "attention_mask": attention_mask,
+            "cache_position": cache_position,
+            "logits_to_keep": logits_to_keep,
+            "token_type_ids": token_type_ids,
+        }
+        model_inputs = super().prepare_inputs_for_generation(input_ids, **text_side_kwargs, **kwargs)
+
+        # Decide whether this call is the first forward pass over the prompt.
+        if past_key_values is None:
+            needs_visual_inputs = True
+        elif hasattr(past_key_values, "get_seq_length"):
+            needs_visual_inputs = int(past_key_values.get_seq_length()) == 0
+        else:
+            needs_visual_inputs = False
+        if not needs_visual_inputs:
+            return model_inputs
+
+        visual_inputs = (
+            ("pixel_values", pixel_values),
+            ("image_token_pooling", image_token_pooling),
+            ("image_grids", image_grids),
+            ("image_num_crops", image_num_crops),
+            ("pixel_values_videos", pixel_values_videos),
+            ("video_token_pooling", video_token_pooling),
+            ("video_grids", video_grids),
+        )
+        for input_name, input_value in visual_inputs:
+            model_inputs[input_name] = input_value
+        return model_inputs
+
+    # Adapted from transformers.models.gemma3.modeling_gemma3
+    @staticmethod
+    def create_masks_for_generate(
+        config: PretrainedConfig,
+        input_embeds: torch.Tensor,
+        attention_mask: torch.Tensor | None,
+        cache_position: torch.Tensor,
+        past_key_values: Cache | None,
+        position_ids: torch.Tensor | None,
+        token_type_ids: torch.Tensor | None = None,
+        **kwargs,
+    ) -> dict:
+        """Assemble the arguments for the module-level ``create_masks_for_generate``.
+
+        The text sub-config from ``config.get_text_config()`` is used. When
+        ``token_type_ids`` is given and the step covers more than one input
+        position, an extra ``or_mask_function`` built from the token type ids
+        (moved to the device of ``cache_position``) is passed as well. Extra
+        keyword arguments are accepted and ignored.
+        """
+        mask_kwargs = dict(
+            config=config.get_text_config(),
+            input_embeds=input_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+        if token_type_ids is not None:
+            # The token-type mask is skipped when exactly one position is fed in.
+            if input_embeds.shape[1] != 1:
+                type_ids_on_device = token_type_ids.to(cache_position.device)
+                # Passed as an `or` mask function, i.e. combined with the base mask by OR.
+                mask_kwargs["or_mask_function"] = token_type_ids_mask_function(type_ids_on_device)
+
+        return create_masks_for_generate(**mask_kwargs)
diff --git a/src/lerobot/policies/molmoact2/hf_model/processing_molmoact2.py b/src/lerobot/policies/molmoact2/hf_model/processing_molmoact2.py
new file mode 100644
index 000000000..7b8775faa
--- /dev/null
+++ b/src/lerobot/policies/molmoact2/hf_model/processing_molmoact2.py
@@ -0,0 +1,431 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The Allen Institute for Artificial Intelligence and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# ruff: noqa
+
+"""
+Processor class for MolmoAct2.
+"""
+
+from typing import Optional, Union
+import dataclasses
+
+import numpy as np
+
+from transformers.image_utils import ImageInput
+from transformers.video_utils import VideoInput
+from transformers.processing_utils import (
+    Unpack,
+    ProcessingKwargs,
+    ProcessorMixin,
+)
+from transformers.feature_extraction_utils import BatchFeature
+from transformers.tokenization_utils_base import TextInput, PreTokenizedInput
+from transformers.utils import logging
+
+from transformers import AutoTokenizer
+from .image_processing_molmoact2 import MolmoAct2ImagesKwargs, MolmoAct2ImageProcessor
+from .video_processing_molmoact2 import MolmoAct2VideoProcessorKwargs, MolmoAct2VideoProcessor
+
+
+logger = logging.get_logger(__name__)
+
+
+# Special tokens, these should be present in any tokenizer we use since the preprocessor uses them
+IMAGE_PATCH_TOKEN = "<im_patch>"  # Where to insert high-res tokens
+IMAGE_LOW_RES_TOKEN = "<im_low>"  # Where to insert low-res tokens
+IM_START_TOKEN = "<im_start>"
+LOW_RES_IMAGE_START_TOKEN = "<low_res_im_start>"
+FRAME_START_TOKEN = "<frame_start>"
+IM_END_TOKEN = "<im_end>"
+FRAME_END_TOKEN = "<frame_end>"
+IM_COL_TOKEN = "<im_col>"  # Appended after each patch row when column tokens are enabled
+IMAGE_PROMPT = "<|image|>"  # Placeholder in the prompt that is replaced by the expanded image tokens
+VIDEO_PROMPT = "<|video|>"  # Placeholder in the prompt that is replaced by the expanded video string
+
+IMAGE_TOKENS = [  # Tokens whose ids the processor flags in `token_type_ids`
+    IMAGE_PATCH_TOKEN,
+    IM_COL_TOKEN,
+    IM_START_TOKEN,
+    LOW_RES_IMAGE_START_TOKEN,
+    FRAME_START_TOKEN,
+    IM_END_TOKEN,
+    FRAME_END_TOKEN,
+    IMAGE_LOW_RES_TOKEN,
+]
+
+
+class MolmoAct2ProcessorKwargs(ProcessingKwargs, total=False):
+    """Typed kwargs for `MolmoAct2Processor.__call__`, with per-modality defaults declared in `_defaults`."""
+
+    images_kwargs: MolmoAct2ImagesKwargs
+    videos_kwargs: MolmoAct2VideoProcessorKwargs
+    _defaults = {
+        "text_kwargs": {
+            "padding": False,
+            "return_mm_token_type_ids": True,  # popped in `__call__`; enables the `token_type_ids` output
+        },
+        "videos_kwargs": {"return_metadata": True},  # `__call__` reads `video_metadata` for the frame timestamps
+    }
+
+
+class MolmoAct2Processor(ProcessorMixin):  # expands image/video placeholders into special tokens, then tokenizes
+    attributes = ["image_processor", "video_processor", "tokenizer"]
+    optional_attributes = [
+        "chat_template",
+        "time_mode",
+        "image_use_col_tokens",
+        "use_single_crop_col_tokens",
+        "use_single_crop_start_token",
+        "video_use_col_tokens",
+        "use_frame_special_tokens",
+    ]
+    image_processor_class = "AutoImageProcessor"
+    video_processor_class = "AutoVideoProcessor"
+    tokenizer_class = "AutoTokenizer"
+
+    def __init__(
+        self,
+        image_processor: MolmoAct2ImageProcessor = None,
+        video_processor: MolmoAct2VideoProcessor = None,
+        tokenizer: AutoTokenizer = None,
+        chat_template: str | None = None,
+        image_use_col_tokens: bool | None = True,  # append IM_COL_TOKEN after each image patch row
+        use_single_crop_col_tokens: bool | None = None,  # None -> follow `image_use_col_tokens`
+        use_single_crop_start_token: bool | None = True,  # two-block layout: open the resized crop with LOW_RES_IMAGE_START_TOKEN
+        video_use_col_tokens: bool | None = False,  # append IM_COL_TOKEN after each video patch row
+        use_frame_special_tokens: bool | None = True,  # wrap frames in FRAME_START/FRAME_END instead of IM_START/IM_END
+        **kwargs,
+    ) -> None:
+        super().__init__(
+            image_processor,
+            video_processor,
+            tokenizer,
+            chat_template=chat_template,
+        )
+        self.image_use_col_tokens = image_use_col_tokens
+        self.use_single_crop_col_tokens = use_single_crop_col_tokens
+        self.use_single_crop_start_token = use_single_crop_start_token
+        self.video_use_col_tokens = video_use_col_tokens
+        self.use_frame_special_tokens = use_frame_special_tokens
+
+        self.image_placeholder_token = IMAGE_PROMPT
+        self.video_placeholder_token = VIDEO_PROMPT
+        self.image_token_ids = [tokenizer.convert_tokens_to_ids(token) for token in IMAGE_TOKENS]
+
+    def get_image_tokens(self, image_grid: np.ndarray):  # -> 1-D array of token strings for one image
+        resized_h, resized_w, height, width = image_grid
+        if int(height) == 0 or int(width) == 0:  # no (height, width) grid: emit only the resized-crop block
+            per_row = np.full(resized_w, IMAGE_PATCH_TOKEN)
+            use_single_crop_col_tokens = (
+                self.image_use_col_tokens
+                if self.use_single_crop_col_tokens is None
+                else self.use_single_crop_col_tokens
+            )
+            if use_single_crop_col_tokens:
+                per_row = np.concatenate([per_row, [IM_COL_TOKEN]], 0)
+            joint = [
+                [IM_START_TOKEN],
+                np.tile(per_row, [resized_h]),
+                [IM_END_TOKEN],
+            ]
+            return np.concatenate(joint)
+        per_row = np.full(width, IMAGE_PATCH_TOKEN)  # block for the (height, width) grid
+        if self.image_use_col_tokens:
+            per_row = np.concatenate([per_row, [IM_COL_TOKEN]], 0)
+        joint = [
+            [IM_START_TOKEN],
+            np.tile(per_row, [height]),
+            [IM_END_TOKEN],
+        ]
+        per_row = np.full(resized_w, IMAGE_PATCH_TOKEN)  # block for the (resized_h, resized_w) crop
+        use_single_crop_col_tokens = (
+            self.image_use_col_tokens
+            if self.use_single_crop_col_tokens is None
+            else self.use_single_crop_col_tokens
+        )
+        image_start_token = LOW_RES_IMAGE_START_TOKEN if self.use_single_crop_start_token else IM_START_TOKEN
+        if use_single_crop_col_tokens:
+            per_row = np.concatenate([per_row, [IM_COL_TOKEN]], 0)
+        joint = [
+            [image_start_token],
+            np.tile(per_row, [resized_h]),
+            [IM_END_TOKEN],
+        ] + joint  # the resized-crop block comes first, followed by the (height, width) block
+
+        return np.concatenate(joint)
+
+    def get_video_string(  # -> one string: per frame, a "<time> " prefix followed by that frame's tokens
+        self,
+        video_grid: np.ndarray,
+        timestamps: np.ndarray,
+    ):
+        if self.use_frame_special_tokens:
+            start_token = FRAME_START_TOKEN
+            end_token = FRAME_END_TOKEN
+        else:
+            start_token = IM_START_TOKEN
+            end_token = IM_END_TOKEN
+
+        _, h, w = video_grid  # the frame count is not needed: the loop is driven by `timestamps`
+        video_string: str = ""
+        for frame_idx, frame_time in enumerate(timestamps):
+            # `per-frame-compact` time mode
+            prev_space = " " if frame_idx > 0 else ""
+            frame_prefix = prev_space + f"{frame_time:.1f} "  # explicit whitespace before/after image tokens
+
+            video_string += frame_prefix
+            per_row = np.full(w, IMAGE_PATCH_TOKEN)
+            if self.video_use_col_tokens:
+                per_row = np.concatenate([per_row, [IM_COL_TOKEN]], 0)
+            extra_tokens = np.tile(per_row, [h])
+            video_tokens = [
+                [start_token],
+                extra_tokens,
+                [end_token],
+            ]
+            video_string += "".join(np.concatenate(video_tokens, 0))
+
+        return video_string
+
+    def insert_bos(
+        self,
+        input_ids: np.ndarray,
+        attention_mask: np.ndarray,
+        bos_token_id: int,
+        pad_token_id: int,
+    ):
+        """Make every sequence start with BOS, inserting it before the first attended token when it is missing.
+        Args:
+            input_ids: [B, S] (or 1-D [S]) array with left padding
+            attention_mask: [B, S] array (0 for pad, 1 for valid)
+            bos_token_id: token id to insert
+            pad_token_id: fill value for the unattended positions of the widened array
+        Returns:
+            input_ids_out: [B, S] or [B, S+1] array with bos inserted if needed
+            attention_mask_out: same shape as input_ids_out
+        """
+
+        need_to_expand = len(input_ids.shape) == 1  # 1-D input is processed as a batch of one, then squeezed back
+        if need_to_expand:
+            input_ids = input_ids[None, :]
+            attention_mask = attention_mask[None, :]
+
+        B, S = input_ids.shape
+
+        # Handle zero-length sequence
+        if S == 0:
+            new_input_ids = np.full((B, 1), bos_token_id, dtype=input_ids.dtype)
+            new_attention_mask = np.ones((B, 1), dtype=attention_mask.dtype)
+            if need_to_expand:
+                new_input_ids = new_input_ids[0]
+                new_attention_mask = new_attention_mask[0]
+            return new_input_ids, new_attention_mask
+
+        first_valid_index = (attention_mask == 1).argmax(axis=-1)  # [B]
+        bos_already_present = np.all(input_ids[np.arange(B), first_valid_index] == bos_token_id)  # every row must match
+
+        if bos_already_present:
+            if need_to_expand:
+                input_ids = input_ids[0]
+                attention_mask = attention_mask[0]
+            return input_ids, attention_mask
+        else:
+            new_input_ids = np.full((B, S + 1), pad_token_id, dtype=input_ids.dtype)
+            new_attention_mask = np.zeros((B, S + 1), dtype=attention_mask.dtype)
+
+            src_idx = np.tile(np.arange(S), (B, 1))  # [B, S]
+            valid_mask = src_idx >= first_valid_index[:, None]  # [B, S]
+            tgt_idx = src_idx + 1  # shift right
+            batch_idx = np.tile(np.arange(B)[:, None], (1, S))  # [B, S]
+
+            # flatten valid_positions
+            flat_vals = input_ids[valid_mask]
+            flat_batch = batch_idx[valid_mask]
+            flat_tgt = tgt_idx[valid_mask]
+
+            new_input_ids[flat_batch, flat_tgt] = flat_vals
+            new_attention_mask[flat_batch, flat_tgt] = 1
+
+            insert_pos = first_valid_index  # the slot freed by the right shift
+            new_input_ids[np.arange(B), insert_pos] = bos_token_id
+            new_attention_mask[np.arange(B), insert_pos] = 1
+
+            if need_to_expand:
+                new_input_ids = new_input_ids[0]
+                new_attention_mask = new_attention_mask[0]
+
+            return new_input_ids, new_attention_mask
+
+    def __call__(
+        self,
+        text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] = None,
+        images: ImageInput = None,
+        videos: VideoInput = None,
+        **kwargs: Unpack[MolmoAct2ProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        Expand image/video placeholders in `text`, tokenize, ensure a leading BOS, and merge all modality outputs.
+        Args:
+            text (`str`, `list[str]`, `list[list[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. Both channels-first and channels-last formats are supported.
+            videos (`dict[str, Any]` or `list[dict[str, Any]]`):
+                The video or batch of videos to be prepared. Each video can be a dictionary with the following keys:
+                - `"frames"`: `np.ndarray` of shape (T, H, W, 3)
+                - `"timestamps"`: `np.ndarray` of shape (T,)
+                - `"sampled_fps"`: `float` (optional)
+                - `"sampling_augmentation"`: `str` (optional)
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+
+        Returns:
+            `BatchFeature`: A [`BatchFeature`] with the following fields:
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not `None`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+            - **image_token_pooling** -- Indices of the patches in `image_grids` to pool for each token in `image_tokens`.
+              Returned when `images` is not `None`.
+            - **image_grids** -- Grids of images. Returned when `images` is not `None`.
+            - **image_num_crops** -- Number of crops for each image. Returned when `images` is not `None`.
+            - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
+            - **video_token_pooling** -- Indices of the patches in `video_grids` to pool for each token in `video_tokens`.
+              Returned when `videos` is not `None`.
+            - **video_grids** -- Grids of videos. Returned when `videos` is not `None`.
+        """
+
+        output_kwargs = self._merge_kwargs(
+            MolmoAct2ProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+
+        if images is not None:
+            image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
+            image_grids = image_inputs["image_grids"]
+        else:
+            image_inputs = {}
+            image_grids = None
+
+        if videos is not None:
+            videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"])
+            video_grids = videos_inputs["video_grids"]
+            # If user has not requested video metadata, pop it
+            if "return_metadata" not in kwargs:
+                video_metadata = videos_inputs.pop("video_metadata")
+            else:
+                video_metadata = videos_inputs["video_metadata"]
+        else:
+            videos_inputs = {}
+            video_grids = None
+
+        if not isinstance(text, list):
+            text = [text]
+
+        text = text.copy()  # below lines change text in-place
+
+        if image_grids is not None:
+            index = 0  # running offset into `image_grids` across the batch
+            for i in range(len(text)):
+                num_images = text[i].count(self.image_placeholder_token)
+                image_grids_i = image_grids[index : index + num_images]
+                for image_grid in image_grids_i:
+                    image_tokens = self.get_image_tokens(image_grid)
+                    image_string = "".join(image_tokens)
+                    text[i] = text[i].replace(self.image_placeholder_token, image_string, 1)
+                index += num_images
+
+        if video_grids is not None:
+            index = 0  # running offset into `video_grids` / `video_metadata` across the batch
+            for i in range(len(text)):
+                num_videos = text[i].count(self.video_placeholder_token)
+                assert num_videos in {0, 1}, "At most one video is supported for now"
+                video_grids_i = video_grids[index : index + num_videos]
+                metadata_i = video_metadata[index : index + num_videos]
+                for video_grid, metadata in zip(video_grids_i, metadata_i):
+                    video_string = self.get_video_string(
+                        video_grid,
+                        metadata.timestamps,
+                    )
+                    text[i] = text[i].replace(self.video_placeholder_token, video_string, 1)
+                index += num_videos
+
+        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
+        return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False)
+        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
+
+        input_ids = text_inputs["input_ids"]
+        attention_mask = text_inputs["attention_mask"]
+
+        input_ids = np.array(input_ids)
+        attention_mask = np.array(attention_mask)
+
+        # Fall back to EOS only when no BOS is defined; a plain `or` would also discard a valid BOS id of 0
+        tok = self.tokenizer
+        bos = tok.eos_token_id if tok.bos_token_id is None else tok.bos_token_id
+        input_ids, attention_mask = self.insert_bos(input_ids, attention_mask, bos, tok.pad_token_id)
+
+        if return_mm_token_type_ids:
+            image_tokens = np.array(self.image_token_ids).astype(input_ids.dtype)
+            token_type_ids = np.any(input_ids[:, :, None] == image_tokens[None, None, :], axis=-1)  # True on IMAGE_TOKENS ids
+            text_inputs["token_type_ids"] = token_type_ids.tolist()
+
+        text_inputs["input_ids"] = input_ids.tolist()
+        text_inputs["attention_mask"] = attention_mask.tolist()
+
+        return BatchFeature(
+            data={**text_inputs, **image_inputs, **videos_inputs},
+            tensor_type=return_tensors,
+        )
+
+    def post_process_image_text_to_text(
+        self, generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False, **kwargs
+    ):
+        """
+        Post-process the output of the model to decode the text.
+
+        Args:
+            generated_outputs (`torch.Tensor` or `np.ndarray`):
+                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
+                or `(sequence_length,)`.
+            skip_special_tokens (`bool`, *optional*, defaults to `True`):
+                Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
+            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
+                Whether or not to clean up the tokenization spaces. Argument passed to the tokenizer's `batch_decode` method.
+            **kwargs:
+                Additional arguments to be passed to the tokenizer's `batch_decode method`.
+
+        Returns:
+            `list[str]`: The decoded text.
+        """
+        return self.tokenizer.batch_decode(
+            generated_outputs,
+            skip_special_tokens=skip_special_tokens,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            **kwargs,
+        )
+
+
+MolmoAct2Processor.register_for_auto_class()  # NOTE(review): presumably enables auto-class loading of this custom processor; confirm in ProcessorMixin docs
diff --git a/src/lerobot/policies/molmoact2/hf_model/video_processing_molmoact2.py b/src/lerobot/policies/molmoact2/hf_model/video_processing_molmoact2.py
new file mode 100644
index 000000000..644d5a691
--- /dev/null
+++ b/src/lerobot/policies/molmoact2/hf_model/video_processing_molmoact2.py
@@ -0,0 +1,997 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The Allen Institute for Artificial Intelligence and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# ruff: noqa
+
+"""Video processor class for MolmoAct2"""
+
+from functools import partial
+import os
+import warnings
+from contextlib import redirect_stdout
+from io import BytesIO
+from urllib.parse import urlparse
+from typing import Optional, Union
+from collections.abc import Callable
+
+import numpy as np
+import requests
+import einops
+import torch
+import torchvision.transforms
+
+from transformers.image_utils import (
+    IMAGENET_STANDARD_MEAN,
+    IMAGENET_STANDARD_STD,
+    ImageInput,
+    PILImageResampling,
+    SizeDict,
+    validate_kwargs,
+)
+from transformers.video_utils import (
+    VideoInput,
+    is_valid_video,
+    make_batched_videos,
+    make_batched_metadata,
+    VideoMetadata,
+)
+from transformers.processing_utils import Unpack, VideosKwargs
+from transformers.video_processing_utils import BaseVideoProcessor
+from transformers.utils import logging
+from transformers.feature_extraction_utils import BatchFeature
+from transformers.utils import (
+    is_av_available,
+    is_decord_available,
+    is_torchcodec_available,
+    is_yt_dlp_available,
+    TensorType,
+    logging,
+    to_numpy,
+)
+
+
+logger = logging.get_logger(__name__)
+
+MAX_VIDEO_FPS = 8  # default `max_fps` cap used by `get_candidate_target_fps`
+
+
+def normalize_image(
+    image: np.ndarray,
+    image_mean: list[float],
+    image_std: list[float],
+) -> np.ndarray:
+    """Return `(image - mean) / std` over the last axis as a new array; the input is never modified in place."""
+    if np.allclose(image_mean, [0.5, 0.5, 0.5]) and np.allclose(image_std, [0.5, 0.5, 0.5]):
+        return image * np.asarray(2.0, dtype=np.float32) - np.asarray(1.0, dtype=np.float32)  # same as (x - 0.5) / 0.5
+    mean, std = (np.array(v, dtype=np.float32)[None, None, :] for v in (image_mean, image_std))
+    return (image - mean) / std
+
+
+def resize_image(
+    image: np.ndarray,
+    desired_output_size: list[int],
+    resample: PILImageResampling,
+) -> np.ndarray:  # float32 result rescaled to [0, 1]; the channels-last layout of the input is restored
+    if len(image.shape) == 3:  # 3-D input is one image: move channels first for torchvision
+        is_video = False
+        image = torch.permute(torch.from_numpy(image), [2, 0, 1])
+    else:  # anything else is handled as a 4-D frame stack with channels last
+        is_video = True
+        image = torch.permute(torch.from_numpy(image), [0, 3, 1, 2])
+    dtype = image.dtype
+    if torch.is_floating_point(image):
+        in_min = 0.0
+        in_max = 1.0
+        resized = torchvision.transforms.Resize(
+            desired_output_size,
+            resample,
+            antialias=False,
+        )(image)
+        resized = torch.clip(resized, 0.0, 1.0).to(dtype)  # clamp back into the [0, 1] range
+    else:
+        assert image.dtype == torch.uint8, "SigLIP expects float images or uint8 images, but got {}".format(
+            image.dtype
+        )
+        in_min = 0.0
+        in_max = 255.0
+        resized = torchvision.transforms.Resize(
+            desired_output_size,
+            resample,
+            antialias=False,
+        )(image)
+        resized = torch.clip(resized, 0, 255).to(dtype)  # clamp back into the uint8 range
+
+    resized = resized.to(torch.float32)
+    resized = (resized - in_min) / (in_max - in_min)  # map [in_min, in_max] to [0, 1] (identity for float input)
+
+    if is_video:  # undo the channels-first permutation
+        resized = torch.permute(resized, [0, 2, 3, 1]).numpy()
+    else:
+        resized = torch.permute(resized, [1, 2, 0]).numpy()
+
+    return resized
+
+
+def build_resized_image(
+    image: np.ndarray,
+    base_image_input_size: list[int],
+    resample: PILImageResampling,
+    image_mean: list[float],
+    image_std: list[float],
+    image_patch_size: int,
+) -> tuple[np.ndarray, np.ndarray]:  # (resized + normalized pixels, 2-D grid of patch indices)
+    resized = resize_image(
+        image,
+        base_image_input_size,
+        resample,
+    )
+    resized = normalize_image(resized, image_mean, image_std)
+    if len(resized.shape) == 3:  # a single image gets a leading axis of length 1
+        resized = np.expand_dims(resized, 0)
+    crop_patch_w = base_image_input_size[1] // image_patch_size  # base size is indexed as [height, width]
+    crop_patch_h = base_image_input_size[0] // image_patch_size
+    resize_idx = np.arange(crop_patch_w * crop_patch_h).reshape([crop_patch_h, crop_patch_w])  # row-major patch ids
+    return resized, resize_idx
+
+
+def batch_pixels_to_patches(array: np.ndarray, patch_size: int) -> np.ndarray:
+    """Split [n, h, w, c] (or channel-less [n, h, w]) images into flattened patches: [n, n_patches, pixels_per_patch]"""
+    if len(array.shape) == 3:  # no channel axis
+        n_crops, h, w = array.shape
+        h_patches = h // patch_size
+        w_patches = w // patch_size
+        array = np.reshape(array, [n_crops, h_patches, patch_size, w_patches, patch_size])
+        array = np.transpose(array, [0, 1, 3, 2, 4])  # bring the two patch-grid axes together
+        array = np.reshape(array, [n_crops, h_patches * w_patches, patch_size * patch_size])
+        return array
+    else:  # trailing channel axis is flattened into each patch
+        n_crops, h, w, c = array.shape
+        h_patches = h // patch_size
+        w_patches = w // patch_size
+        array = np.reshape(array, [n_crops, h_patches, patch_size, w_patches, patch_size, c])
+        array = np.transpose(array, [0, 1, 3, 2, 4, 5])  # bring the two patch-grid axes together
+        array = np.reshape(array, [n_crops, h_patches * w_patches, patch_size * patch_size * c])
+        return array
+
+
+def arange_for_pooling(
+    idx_arr: np.ndarray,
+    pool_h: int,
+    pool_w: int,
+) -> np.ndarray:  # shape (rows / pool_h, cols / pool_w, pool_h * pool_w) after padding
+    h_pad = pool_h * ((idx_arr.shape[0] + pool_h - 1) // pool_h) - idx_arr.shape[0]  # rows needed to reach a multiple of pool_h
+    w_pad = pool_w * ((idx_arr.shape[1] + pool_w - 1) // pool_w) - idx_arr.shape[1]  # cols needed to reach a multiple of pool_w
+    idx_arr = np.pad(
+        idx_arr,
+        [[h_pad // 2, (h_pad + 1) // 2], [w_pad // 2, (w_pad + 1) // 2]],  # split padding; an odd remainder goes at the end
+        mode="constant",
+        constant_values=-1,  # padded cells are marked with -1
+    )
+    return einops.rearrange(idx_arr, "(h dh) (w dw) -> h w (dh dw)", dh=pool_h, dw=pool_w)
+
+
+def image_to_patches_and_grids(
+    image: ImageInput,
+    base_image_input_size: list[int],
+    resample: PILImageResampling,
+    image_mean: list[float],
+    image_std: list[float],
+    image_patch_size: int,
+    image_pooling_w: int,
+    image_pooling_h: int,
+) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """Resize and normalize `image` to the base size, then return its pooled grid, patches and pooling indices.
+    :return image_grid, the `[h, w]` shape of the patch grid after pooling
+    :return crops, the resized image split into flattened patches (via `batch_pixels_to_patches`)
+    :return pooled_patch_idx, one row per pooled position holding the `pooling_h * pooling_w` indices of the
+                                patches in `crops` to pool for that position, masked with -1
+    """
+    if isinstance(base_image_input_size, int):  # a bare int is treated as a square size
+        base_image_input_size = (base_image_input_size, base_image_input_size)
+
+    pooling_w = image_pooling_w
+    pooling_h = image_pooling_h
+
+    resized, resize_idx = build_resized_image(
+        image,
+        base_image_input_size,
+        resample,
+        image_mean,
+        image_std,
+        image_patch_size,
+    )
+    pooling_idx = arange_for_pooling(resize_idx, pooling_h, pooling_w)
+    h, w = pooling_idx.shape[:2]
+    pooling_idx = pooling_idx.reshape([-1, pooling_h * pooling_w])  # flatten the pooled grid to one row per position
+    image_grid = [h, w]
+    return (
+        image_grid,
+        batch_pixels_to_patches(resized, image_patch_size),
+        pooling_idx,
+    )
+
+
+def get_candidate_target_fps(
+    video_fps: int | float,
+    sampling_fps: int | float,
+    max_fps: int | float = MAX_VIDEO_FPS,
+) -> list[float]:
+    """
+    Return the divisors of `video_fps` that are multiples of `sampling_fps` and do not exceed `max_fps`.
+
+    All three arguments are truncated with `int()`; the result is an ascending list of floats.
+    Raises `ValueError` if `sampling_fps` is None, a rate is not positive, or `sampling_fps` does not divide `video_fps`.
+    Examples:
+        >>> get_candidate_target_fps(video_fps=6, sampling_fps=2)
+        [2.0, 6.0]
+        >>> get_candidate_target_fps(video_fps=5, sampling_fps=1)
+        [1.0, 5.0]
+        >>> get_candidate_target_fps(video_fps=5, sampling_fps=2)
+        Traceback (most recent call last):
+            ...
+        ValueError: sampling_fps=2 must divide video_fps=5.
+    """
+    if sampling_fps is None:  # must run before the conversions below: `int(None)` would raise TypeError instead
+        raise ValueError("sampling_fps must be provided")
+
+    video_fps = int(video_fps)
+    sampling_fps = int(sampling_fps)
+    max_fps = int(max_fps)
+    if video_fps <= 0 or sampling_fps <= 0:
+        raise ValueError(f"video_fps and sampling_fps must be positive (got {video_fps}, {sampling_fps})")
+    if video_fps % sampling_fps != 0:
+        raise ValueError(f"sampling_fps={sampling_fps} must divide video_fps={video_fps}.")
+
+    candidates = []
+    for candidate in range(sampling_fps, video_fps + 1, sampling_fps):  # multiples of sampling_fps, ascending
+        if candidate > max_fps:
+            break
+        if video_fps % candidate == 0:  # keep only rates that divide the source fps
+            candidates.append(float(candidate))
+
+    return candidates
+
+
+def read_video_decord(
+    video_path,
+    sample_timestamps_fn: Callable,
+    **kwargs,
+) -> tuple[np.ndarray, VideoMetadata]:
+    """
+    Decode a video using the Decord backend.
+
+    Args:
+        video_path (`str`):
+            Path to the video file.
+        sample_timestamps_fn (`Callable`):
+            Called as `sample_timestamps_fn(metadata=metadata, **kwargs)`; returns the timestamps to sample.
+
+    Returns:
+        tuple[`np.array`, `VideoMetadata`]: A tuple containing:
+            - Numpy array of frames in RGB (shape: [num_frames, height, width, 3]).
+            - `VideoMetadata` object.
+    """
+    # Lazy import from decord
+    import importlib
+
+    decord = importlib.import_module("decord")
+
+    vr = decord.VideoReader(uri=video_path, ctx=decord.cpu(0))  # decord has problems with gpu
+    video_fps = vr.get_avg_fps()
+    total_num_frames = len(vr)
+    time_stamps = vr.get_frame_timestamp(list(range(total_num_frames)))  # presumably (start, end) per frame; confirm
+    duration = time_stamps[-1, 1] - time_stamps[0, 0]
+
+    metadata = VideoMetadata(
+        total_num_frames=int(total_num_frames),
+        fps=float(video_fps),
+        duration=float(duration),
+        video_backend="decord",
+    )
+
+    target_timestamps = sample_timestamps_fn(metadata=metadata, **kwargs)
+    target_timestamps = np.array(target_timestamps)
+    offset = time_stamps[0, 0]  # targets are relative to the first frame's timestamp
+
+    ix = np.searchsorted(time_stamps[:, 1], target_timestamps + offset, side="right")
+    ix = np.minimum(ix, len(time_stamps) - 1)  # clamp targets past the last frame onto the last frame
+
+    video = vr.get_batch(ix).asnumpy()
+    metadata.update(
+        {
+            "frames_indices": target_timestamps * video_fps,
+            "height": video.shape[1],
+            "width": video.shape[2],
+        }
+    )
+    return video, metadata
+
+
+def read_video_torchcodec(
+    video_path,
+    sample_timestamps_fn: Callable,
+    **kwargs,
+) -> tuple[np.ndarray, VideoMetadata]:
+    """
+    Decode a video using torchcodec decoder.
+
+    Args:
+        video_path (`str`):
+            Path to the video file.
+        sample_timestamps_fn (`Callable`):
+            Called as `sample_timestamps_fn(metadata=metadata, **kwargs)`; returns the timestamps to sample.
+
+    Returns:
+        tuple[`np.array`, `VideoMetadata`]: A tuple containing:
+            - Numpy array of frames in RGB (shape: [num_frames, height, width, 3]).
+            - `VideoMetadata` object.
+    """
+    # Lazy import torchcodec
+    import importlib
+
+    torchcodec = importlib.import_module("torchcodec")
+
+    decoder = torchcodec.decoders.VideoDecoder(
+        video_path,
+        # Interestingly `exact` mode takes less than approximate when we load the whole video
+        seek_mode="exact",
+        # Let FFmpeg decide on the number of threads for efficiency
+        num_ffmpeg_threads=0,
+    )
+    # If the first frame starts at > 0, we effectively clip the video starting at that time
+    # since (most) video players would also skip to that time
+    time_offset = decoder.metadata.begin_stream_seconds_from_content
+    # Note this duration does assume we started playing at `time_offset`
+    duration = decoder.metadata.duration_seconds
+
+    metadata = VideoMetadata(
+        total_num_frames=decoder.metadata.num_frames,
+        fps=decoder.metadata.average_fps,
+        duration=duration,
+        video_backend="torchcodec",
+        height=decoder.metadata.height,
+        width=decoder.metadata.width,
+    )
+
+    target_timestamps = sample_timestamps_fn(metadata=metadata, **kwargs)
+
+    # Floating point/rounding issues might cause `target_timestamps` to be very slightly
+    # out-of-bounds, to handle this we sanity check then clip them
+    assert all(x >= 0 for x in target_timestamps)
+    assert all(x < duration + 1e-6 for x in target_timestamps)
+    # 1e-6 padding since torchcodec can throw out-of-bounds errors even if you ask for the
+    # exact boundary value, we should still get the first/last frame anyway
+    max_timestamp = decoder.metadata.end_stream_seconds_from_content - 1e-6
+    min_timestamp = decoder.metadata.begin_stream_seconds_from_content + 1e-6
+    # Note we avoid using numpy ops here to reduce floating precision issues
+    timestamps = [x + time_offset for x in target_timestamps]  # shift targets into the stream's own time base
+    timestamps = [max(min_timestamp, min(max_timestamp, x)) for x in timestamps]
+
+    video = (
+        decoder.get_frames_played_at(timestamps).data.numpy().transpose(0, 2, 3, 1)
+    )  # Convert to THWC format
+    target_timestamps = np.array(target_timestamps)
+    metadata.frames_indices = target_timestamps * metadata.fps
+
+    return video, metadata
+
+
+def read_video_pyav(
+    video_path,
+    sample_timestamps_fn: Callable,
+    **kwargs,
+) -> tuple[np.ndarray, VideoMetadata]:
+    """
+    Decode a video using the PyAV backend.
+
+    Args:
+        video_path (`str`):
+            Path to the video file.
+        sample_timestamps_fn (`Callable`):
+            A callable function that will return timestamps at which the video should be sampled.
+
+    Returns:
+        tuple[`np.array`, `VideoMetadata`]: A tuple containing:
+            - Numpy array of frames in RGB (shape: [num_frames, height, width, 3]).
+            - `VideoMetadata` object.
+    """
+    # Lazy import PyAV, it is only needed when this backend is selected
+    import importlib
+
+    av = importlib.import_module("av")
+
+    with av.open(video_path) as container:
+        video_stream = container.streams.video[0]
+        fps = video_stream.average_rate or video_stream.guessed_rate
+        frames = list(container.decode(video=0))
+
+        start = frames[0].pts * video_stream.time_base
+        container_end = video_stream.duration
+        if container_end is not None:
+            container_end *= video_stream.time_base
+        if container_end is None or container_end < frames[-1].pts:
+            # Some problem with stream duration, so use the frame PTS directly
+            # and guess the duration of the last frame
+            end = frames[-1].pts * video_stream.time_base + 1 / fps
+        else:
+            end = container_end
+        duration = float(end - start)
+
+        metadata = VideoMetadata(
+            total_num_frames=len(frames),
+            fps=float(fps),
+            duration=float(duration),
+            video_backend="pyav",
+            height=video_stream.height,
+            width=video_stream.width,
+        )
+
+        target_timestamps = sample_timestamps_fn(metadata=metadata, **kwargs)
+        offset = float(start)
+
+        target_timestamps = np.array(target_timestamps)
+        # Frame i ends where frame i + 1 starts and the last frame ends at `end`. Keep every entry in
+        # absolute stream time (not `duration`) so the array stays sorted and matches `+ offset` below.
+        end_time_stamps = np.array([float(f.pts * video_stream.time_base) for f in frames[1:]] + [float(end)])
+        indices = np.searchsorted(end_time_stamps, target_timestamps + offset, side="right")
+        indices = np.minimum(indices, len(end_time_stamps) - 1)
+
+        video = np.stack(
+            [frames[i].to_ndarray(format="rgb24", channel_last=True) for i in indices],
+            axis=0,
+        )
+
+        metadata.frames_indices = target_timestamps * fps
+
+        return video, metadata
+
+
+VIDEO_DECODERS = {
+    "decord": read_video_decord,
+    "torchcodec": read_video_torchcodec,
+    "pyav": read_video_pyav,
+}
+
+
+def load_video(
+    video: VideoInput,
+    backend: str = "decord",
+    sample_timestamps_fn: Callable | None = None,
+    **kwargs,
+):
+    """
+    Loads `video` to a numpy array.
+
+    Args:
+        video (`VideoInput`):
+            The video to convert to the numpy array format. Can be a link to video or local path.
+        backend (`str`, *optional*, defaults to `"decord"`):
+            The backend to use when loading the video. Can be any of ["decord", "pyav", "torchcodec"]. Defaults to "decord".
+        sample_timestamps_fn (`Callable`):
+            A callable function that will return timestamps at which the video should be sampled.
+
+    Returns:
+        `tuple`: The decoded frames and their metadata; non-string input is returned as-is with `None` metadata.
+    """
+
+    # Early exit if provided an array or `PIL` frames
+    if not isinstance(video, str):
+        metadata = [None] * len(video)
+        return video, metadata
+
+    if urlparse(video).netloc in ["www.youtube.com", "youtube.com"]:
+        if not is_yt_dlp_available():
+            raise ImportError("To load a video from YouTube url you have  to install `yt_dlp` first.")
+        # Lazy import from yt_dlp
+        import importlib
+
+        yt_dlp = importlib.import_module("yt_dlp")
+
+        buffer = BytesIO()
+        with redirect_stdout(buffer), yt_dlp.YoutubeDL() as f:
+            f.download([video])
+        file_obj = BytesIO(buffer.getvalue())
+    elif video.startswith(("http://", "https://")):
+        file_obj = BytesIO(requests.get(video, timeout=10).content)
+    elif os.path.isfile(video):
+        file_obj = video
+    else:
+        raise TypeError(
+            "Incorrect format used for video. Should be an url linking to an video or a local path."
+        )
+
+    # Only decoders registered in `VIDEO_DECODERS` can be dispatched to below (e.g. not "opencv")
+    if backend not in VIDEO_DECODERS:
+        raise ValueError(f"Unsupported backend={backend}, choose one of {sorted(VIDEO_DECODERS)}.")
+
+    if (
+        (not is_decord_available() and backend == "decord")
+        or (not is_torchcodec_available() and backend == "torchcodec")
+        or (not is_av_available() and backend == "pyav")
+    ):
+        raise ImportError(
+            f"You chose backend={backend} for loading the video but the required library is not found in your environment "
+            f"Make sure to install {backend} before loading the video."
+        )
+
+    video_decoder = VIDEO_DECODERS[backend]
+    video, metadata = video_decoder(file_obj, sample_timestamps_fn, **kwargs)
+    return video, metadata
+
+
+def get_target_fps(
+    video_fps: float,
+    max_frames: int,
+    total_frames: int,
+    frame_sample_mode: str,
+    candidate_target_fps: tuple[float],
+) -> float:
+    """
+    Get the target fps that best spans the video and has the most frames sampled
+    """
+    num_frames_sampled = 0
+    selected_target_fps = None
+    for target_fps in candidate_target_fps:
+        step_size = max(int(video_fps / target_fps), 1)
+        num_frames_sampled_at_fps = int(total_frames / step_size)
+        if num_frames_sampled == 0:
+            if "uniform" in frame_sample_mode:
+                if num_frames_sampled_at_fps > max_frames:
+                    break
+            selected_target_fps = target_fps
+            num_frames_sampled = num_frames_sampled_at_fps
+
+        else:
+            # the candidate sampling fps increases so frame count can't decrease
+            assert num_frames_sampled <= num_frames_sampled_at_fps
+            if num_frames_sampled_at_fps > max_frames:
+                # choose the sampling fps that spans the video
+                continue
+
+            elif num_frames_sampled_at_fps > num_frames_sampled:
+                # both are less than max_frames, choose the one with higher density of frames sampled
+                selected_target_fps = target_fps
+                num_frames_sampled = num_frames_sampled_at_fps
+    return selected_target_fps
+
+
+def get_frame_times_and_chosen_fps(selected_target_fps, total_frames, max_frames, video_fps):
+    if selected_target_fps is None:
+        frame_indices = np.linspace(0, total_frames, max_frames, endpoint=False, dtype=int)
+    else:
+        step_size = max(int(video_fps / selected_target_fps), 1)
+        frame_indices = np.arange(0, total_frames, step_size)
+    if len(frame_indices) > max_frames:
+        frame_indices = frame_indices[:max_frames]
+    return selected_target_fps, frame_indices
+
+
+class MolmoAct2VideoProcessorKwargs(VideosKwargs, total=False):
+    patch_size: int | None
+    pooling_size: list[int] | None
+    frame_sample_mode: str | None
+    max_fps: int | None
+    sampling_fps: int | None
+
+
+class MolmoAct2VideoProcessor(BaseVideoProcessor):
+    resample = PILImageResampling.BILINEAR
+    size = {"height": 378, "width": 378}
+    image_mean = IMAGENET_STANDARD_MEAN
+    image_std = IMAGENET_STANDARD_STD
+    do_resize = True
+    do_rescale = True
+    do_normalize = True
+    do_convert_rgb = True
+    patch_size = 14
+    pooling_size = [3, 3]
+    do_sample_frames = True
+    frame_sample_mode = "uniform_last_frame"
+    max_fps = 2
+    sampling_fps = 2
+    valid_kwargs = MolmoAct2VideoProcessorKwargs
+    model_input_names = ["pixel_values_videos", "video_token_pooling", "video_grids"]
+
+    def __init__(self, **kwargs: Unpack[MolmoAct2VideoProcessorKwargs]):
+        super().__init__(**kwargs)
+        if self.size is not None and (
+            self.size.get("height", None) is None or self.size.get("width", None) is None
+        ):
+            raise ValueError("size must contain 'height' and 'width' keys.")
+
+    def _further_process_kwargs(
+        self,
+        size: SizeDict | None = None,
+        **kwargs,
+    ) -> dict:
+        """
+        Update kwargs that need further processing before being validated
+        Can be overridden by subclasses to customize the processing of kwargs.
+        """
+        if size is not None and ("height" not in size or "width" not in size):
+            raise ValueError("size must contain 'height' and 'width' keys.")
+
+        return super()._further_process_kwargs(size=size, **kwargs)
+
+    def sample_times(
+        self,
+        metadata: VideoMetadata,
+        frame_sample_mode: str,
+        num_frames: int,
+        max_fps: int | None = None,
+        sampling_fps: int | None = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Time-based sampling, used when the video has to be decoded from a path or URL
+        Args:
+            metadata (`VideoMetadata`):
+                Metadata of the video containing information about total duration, fps and total number of frames.
+            frame_sample_mode (`str`, *optional*):
+                Mode to sample frames. Defaults to `self.frame_sample_mode`.
+            num_frames (`int`, *optional*):
+                Maximum number of frames to sample. Defaults to `self.num_frames`.
+            max_fps (`int`, *optional*):
+                Maximum frames per second to sample.
+            sampling_fps (`int`, *optional*):
+                Sampling frames per second. Defaults to `self.sampling_fps`.
+                Used when `frame_sample_mode` is `"fps"`.
+        """
+        frame_sample_mode = frame_sample_mode or self.frame_sample_mode
+        num_frames = num_frames or self.num_frames
+        sampling_fps = sampling_fps or self.sampling_fps
+
+        duration = metadata.duration or metadata.total_num_frames / metadata.fps
+        if frame_sample_mode == "fps":
+            candidate_target_fps = get_candidate_target_fps(metadata.fps, sampling_fps)
+            # Try larger and larger FPSs until we hit one that can't span the video
+            target_fps = candidate_target_fps[0]
+            for candidate_fps in candidate_target_fps[1:]:
+                if num_frames / candidate_fps < duration:
+                    break
+                target_fps = candidate_fps
+            times = np.arange(0, num_frames) / target_fps
+            times = times[times < duration]
+            return times
+        elif frame_sample_mode == "uniform_last_frame":
+            if max_fps is not None:
+                max_duration = (num_frames - 1) / max_fps  # -1 to include the last frame
+                if max_duration < duration:
+                    times = np.linspace(0, duration, num=num_frames, endpoint=True, dtype=np.float64)
+                else:
+                    times = np.arange(0.0, stop=duration, step=1 / max_fps)
+                    times = np.concatenate([times, [duration]], axis=0)
+                    assert len(times) <= num_frames
+            else:
+                times = np.linspace(0, duration, num=num_frames, endpoint=True, dtype=np.float64)
+            return times
+        else:
+            raise NotImplementedError(frame_sample_mode)
+
+    def sample_frames(
+        self,
+        metadata: VideoMetadata,
+        frame_sample_mode: str | None = None,
+        num_frames: int | None = None,
+        max_fps: int | None = None,
+        sampling_fps: int | None = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Frame-based sampling if an array video is passed
+        Args:
+            metadata (`VideoMetadata`):
+                Metadata of the video containing information about total duration, fps and total number of frames.
+            frame_sample_mode (`str`, *optional*):
+                Mode to sample frames. Defaults to `self.frame_sample_mode`.
+            num_frames (`int`, *optional*):
+                Maximum number of frames to sample. Defaults to `self.num_frames`.
+            max_fps (`int`, *optional*):
+                Maximum frames per second to sample.
+            sampling_fps (`int`, *optional*):
+                Sampling frames per second. Defaults to `self.sampling_fps`.
+                Used when `frame_sample_mode` is `"fps"`.
+        """
+        frame_sample_mode = frame_sample_mode or self.frame_sample_mode
+        num_frames = num_frames or self.num_frames
+        sampling_fps = sampling_fps or self.sampling_fps
+
+        total_num_frames = metadata.total_num_frames
+        if frame_sample_mode == "uniform_last_frame" and max_fps is not None:
+            duration = total_num_frames / metadata.fps
+            if total_num_frames <= 2:
+                return np.arange(total_num_frames).astype(int)
+            if duration > (num_frames - 1) / max_fps:  # -1 to include the last frame
+                # uniform fallback
+                indices = np.linspace(
+                    0,
+                    total_num_frames - 1,
+                    num=min(num_frames, total_num_frames),
+                    endpoint=True,
+                ).astype(int)
+                return indices
+            else:
+                float_indices = np.arange(
+                    0.0,
+                    stop=total_num_frames - 1,
+                    step=float(metadata.fps / max_fps),
+                )
+                if np.round(float_indices[-1]) != total_num_frames - 1:
+                    float_indices = np.concatenate([float_indices, [total_num_frames - 1]], axis=0)
+                indices = np.round(float_indices).astype(int)
+                assert indices[-1] < total_num_frames
+                assert len(float_indices) <= num_frames
+                return indices
+        elif frame_sample_mode == "uniform_last_frame":
+            indices = np.linspace(
+                0,
+                total_num_frames - 1,
+                num=min(num_frames, total_num_frames),
+                endpoint=True,
+            ).astype(int)
+            return indices
+        elif frame_sample_mode == "fps":
+            candidate_target_fps = get_candidate_target_fps(metadata.fps, sampling_fps)
+            selected_target_fps = get_target_fps(
+                metadata.fps,
+                num_frames,
+                total_num_frames,
+                frame_sample_mode,
+                candidate_target_fps,
+            )
+            _, indices = get_frame_times_and_chosen_fps(
+                selected_target_fps,
+                total_num_frames,
+                num_frames,
+                metadata.fps,
+            )
+            return indices
+        else:
+            raise NotImplementedError(frame_sample_mode)
+
+    def fetch_videos(self, video_url_or_urls: str | list[str] | list[list[str]], sample_timestamps_fn=None):
+        """
+        Convert a single or a list of urls into the corresponding `np.array` objects.
+
+        If a single url is passed, the return value will be a single object. If a list is passed a list of objects is
+        returned.
+        """
+        if (not is_decord_available()) and (not is_torchcodec_available()) and (not is_av_available()):
+            raise ImportError(
+                "MolmoAct2VideoProcessor requires `decord`, `torchcodec`, or `av` to be installed."
+            )
+
+        if is_decord_available():
+            backend = "decord"
+        elif is_torchcodec_available():
+            warnings.warn(
+                "`decord` is not installed and cannot be used to decode the video by default. "
+                "Falling back to `torchcodec`."
+            )
+            backend = "torchcodec"
+        else:
+            warnings.warn(
+                "`decord` is not installed and cannot be used to decode the video by default. "
+                "Falling back to `PyAV`."
+            )
+            backend = "pyav"
+
+        if isinstance(video_url_or_urls, list):
+            return list(
+                zip(
+                    *[
+                        self.fetch_videos(x, sample_timestamps_fn=sample_timestamps_fn)
+                        for x in video_url_or_urls
+                    ]
+                )
+            )
+        else:
+            return load_video(video_url_or_urls, backend=backend, sample_timestamps_fn=sample_timestamps_fn)
+
+    def _decode_and_sample_videos(
+        self,
+        videos: VideoInput,
+        video_metadata: VideoMetadata | dict,
+        do_sample_frames: bool | None = None,
+        sample_indices_fn: Callable | None = None,
+        sample_timestamps_fn: Callable | None = None,
+    ):
+        """
+        Decode input videos and sample frames if needed.
+        """
+        videos = make_batched_videos(videos)
+        video_metadata = make_batched_metadata(videos, video_metadata=video_metadata)
+
+        # Frame-based sampling if an array video is passed
+        # Otherwise, time-based sampling with decoding
+        if is_valid_video(videos[0]) and do_sample_frames:
+            assert video_metadata[0].fps is not None, "FPS must be provided for video input"
+            sampled_videos = []
+            sampled_metadata = []
+            for video, metadata in zip(videos, video_metadata):
+                indices = sample_indices_fn(metadata=metadata)
+                metadata.frames_indices = indices
+                sampled_videos.append(video[indices])
+                sampled_metadata.append(metadata)
+            videos = sampled_videos
+            video_metadata = sampled_metadata
+        elif not is_valid_video(videos[0]):
+            if sample_indices_fn is None:
+                logger.warning(
+                    "do_sample_frames is False, but video array is not provided: "
+                    "Will decode the video and sample frames using MolmoAct2's default sampling mode"
+                )
+            if isinstance(videos[0], list):
+                raise ValueError("A list of images is not supported for video input!")
+            else:
+                videos, video_metadata = self.fetch_videos(videos, sample_timestamps_fn=sample_timestamps_fn)
+
+        return videos, video_metadata
+
+    def _prepare_input_videos(
+        self,
+        videos: VideoInput,
+        **kwargs,
+    ) -> list[np.ndarray]:
+        processed_videos = [to_numpy(video) for video in videos]
+        return processed_videos
+
+    def preprocess(
+        self,
+        videos: VideoInput,
+        **kwargs: Unpack[MolmoAct2VideoProcessorKwargs],
+    ) -> BatchFeature:
+        validate_kwargs(
+            captured_kwargs=kwargs.keys(),
+            valid_processor_keys=list(self.valid_kwargs.__annotations__.keys()) + ["return_tensors"],
+        )
+
+        # Set default kwargs from self. This ensures that if a kwarg is not provided
+        # by the user, it gets its default value from the instance, or is set to None.
+        for kwarg_name in self.valid_kwargs.__annotations__:
+            kwargs.setdefault(kwarg_name, getattr(self, kwarg_name, None))
+
+        do_sample_frames = kwargs.pop("do_sample_frames")
+        video_metadata = kwargs.pop("video_metadata")
+
+        sample_indices_fn = partial(self.sample_frames, **kwargs) if do_sample_frames else None
+        sample_timestamps_fn = partial(self.sample_times, **kwargs)
+        videos, video_metadata = self._decode_and_sample_videos(
+            videos,
+            video_metadata=video_metadata,
+            do_sample_frames=do_sample_frames,
+            sample_indices_fn=sample_indices_fn,
+            sample_timestamps_fn=sample_timestamps_fn,
+        )
+        videos = self._prepare_input_videos(videos=videos)
+
+        kwargs = self._further_process_kwargs(**kwargs)
+
+        return_metadata = kwargs.pop("return_metadata")
+        preprocessed_videos = self._preprocess(videos=videos, **kwargs)
+        if return_metadata:
+            preprocessed_videos["video_metadata"] = video_metadata
+        return preprocessed_videos
+
+    def _preprocess(
+        self,
+        videos: list[np.ndarray],
+        size: SizeDict | None = None,
+        resample: PILImageResampling | None = None,
+        image_mean: float | list[float] | None = None,
+        image_std: float | list[float] | None = None,
+        do_convert_rgb: bool | None = None,
+        patch_size: int | None = None,
+        pooling_size: list[int] | None = None,
+        return_tensors: str | TensorType | None = None,
+        **kwargs,
+    ) -> BatchFeature:
+        """
+        Preprocess a video for the model.
+        Args:
+            videos (`VideoInput`):
+                Video to preprocess.
+            size (`SizeDict`, *optional*, defaults to `self.size`):
+                Size of the image after resizing.
+            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
+                Resampling filter to use when resizing the image. This can be one of the enum `PILImageResampling`. Only
+                has an effect if `do_resize` is set to `True`.
+            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
+            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
+                `True`.
+            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+                Whether to convert the image to RGB.
+            patch_size (`int`, *optional*, defaults to `self.patch_size`):
+                The spatial patch size of the vision encoder.
+            pooling_size (`list[int]`, *optional*, defaults to `self.pooling_size`):
+                The pooling size of the vision adapter.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                - Unset: Return a list of `np.ndarray`.
+                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+
+        Returns:
+            A `BatchFeature` containing the following keys:
+                - `pixel_values_videos`: The preprocessed videos.
+                - `video_token_pooling`: The indices of the patches in `crops` to pool for each token in `video_tokens`.
+                - `video_grids`: The video grids.
+        """
+        if size.height is None or size.width is None:
+            raise ValueError("size must contain 'height' and 'width' keys.")
+
+        base_image_input_size = [size.height, size.width]
+
+        resample = resample or self.resample
+        image_mean = image_mean or self.image_mean
+        image_std = image_std or self.image_std
+        do_convert_rgb = do_convert_rgb or self.do_convert_rgb
+
+        patch_size = patch_size or self.patch_size
+        pooling_size = pooling_size or self.pooling_size
+
+        image_pooling_h, image_pooling_w = pooling_size
+
+        batch_grids = []
+        batch_crops = []
+        batch_pooled_patches_idx = []
+
+        for video in videos:
+            all_crops = []
+            pooled_patches_idx = []
+
+            for frame in video:
+                image_grid, crops, pooled_idx = image_to_patches_and_grids(
+                    frame,
+                    base_image_input_size,
+                    resample,
+                    image_mean,
+                    image_std,
+                    patch_size,
+                    image_pooling_w,
+                    image_pooling_h,
+                )
+                offset = sum(np.prod(x.shape[:2]) for x in all_crops)
+                pooled_idx_with_offset = np.where(pooled_idx >= 0, pooled_idx + offset, pooled_idx)
+                pooled_patches_idx.append(pooled_idx_with_offset)
+                all_crops.append(crops)
+
+            video_grid = np.array([len(video), image_grid[0], image_grid[1]])
+            all_crops = np.concatenate(all_crops, 0)
+            pooled_patches_idx = np.concatenate(pooled_patches_idx, 0)
+
+            batch_grids.append(video_grid)
+            batch_crops.append(all_crops)
+            batch_pooled_patches_idx.append(pooled_patches_idx)
+
+        video_grids = np.stack(batch_grids, 0)
+        pixel_values_videos = np.concatenate(batch_crops, 0)
+        video_token_pooling = np.concatenate(batch_pooled_patches_idx, 0)
+
+        data = dict(
+            pixel_values_videos=pixel_values_videos,
+            video_token_pooling=video_token_pooling,
+            video_grids=video_grids,
+        )
+
+        return BatchFeature(data, tensor_type=return_tensors)
+
+
+MolmoAct2VideoProcessor.register_for_auto_class()
diff --git a/src/lerobot/policies/molmoact2/modeling_molmoact2.py b/src/lerobot/policies/molmoact2/modeling_molmoact2.py
new file mode 100644
index 000000000..f86be0904
--- /dev/null
+++ b/src/lerobot/policies/molmoact2/modeling_molmoact2.py
@@ -0,0 +1,1551 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The Allen Institute for Artificial Intelligence and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import json
+import os
+import types
+from collections import deque
+from contextlib import nullcontext
+from typing import TYPE_CHECKING, Any
+
+import numpy as np
+import torch
+import torch.nn.functional as F  # noqa: N812
+from safetensors.torch import load_file as load_safetensors_file
+from torch import Tensor
+from torch.distributions import Beta
+
+from lerobot.policies.pretrained import PreTrainedPolicy
+from lerobot.utils.constants import ACTION
+from lerobot.utils.import_utils import _scipy_available, _transformers_available, require_package
+
+from ..rtc.modeling_rtc import RTCProcessor
+from .configuration_molmoact2 import MolmoAct2Config, _hf_token, _resolve_checkpoint_location
+
+if TYPE_CHECKING or _transformers_available:
+    from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME
+
+    from .hf_model.configuration_molmoact2 import MolmoAct2Config as HFMolmoAct2Config
+    from .hf_model.modeling_molmoact2 import MolmoAct2ForConditionalGeneration
+else:
+    SAFE_WEIGHTS_INDEX_NAME = "model.safetensors.index.json"
+    SAFE_WEIGHTS_NAME = "model.safetensors"
+    HFMolmoAct2Config = None
+    MolmoAct2ForConditionalGeneration = None
+
+if TYPE_CHECKING or (_transformers_available and _scipy_available):
+    from .hf_model.action_tokenizer import UniversalActionProcessor
+else:
+    UniversalActionProcessor = None
+
+_MODEL_INPUT_KEYS = {
+    "input_ids",
+    "pixel_values",
+    "image_token_pooling",
+    "image_grids",
+    "image_num_crops",
+    "pixel_values_videos",
+    "video_token_pooling",
+    "video_grids",
+    "attention_mask",
+    "position_ids",
+    "past_key_values",
+    "token_type_ids",
+    "inputs_embeds",
+}
+
+
+def _strict_load_safetensors_weights(model: torch.nn.Module, checkpoint_location: str) -> None:
+    index_path = os.path.join(checkpoint_location, SAFE_WEIGHTS_INDEX_NAME)
+    single_file_path = os.path.join(checkpoint_location, SAFE_WEIGHTS_NAME)
+    if os.path.isfile(index_path):
+        with open(index_path, encoding="utf-8") as f:
+            index = json.load(f)
+        weight_map = index["weight_map"]
+        loaded_keys = set(weight_map)
+        model_keys = set(model.state_dict())
+        missing_keys = sorted(model_keys - loaded_keys)
+        unexpected_keys = sorted(loaded_keys - model_keys)
+        if missing_keys or unexpected_keys:
+            message = ["MolmoAct2 safetensors do not match the local model implementation."]
+            if missing_keys:
+                message.append(f"Missing keys: {missing_keys[:8]}")
+            if unexpected_keys:
+                message.append(f"Unexpected keys: {unexpected_keys[:8]}")
+            raise RuntimeError(" ".join(message))
+        for shard_file in sorted(set(weight_map.values())):
+            state_dict = load_safetensors_file(os.path.join(checkpoint_location, shard_file), device="cpu")
+            model.load_state_dict(state_dict, strict=False)
+            del state_dict
+        return
+    if os.path.isfile(single_file_path):
+        state_dict = load_safetensors_file(single_file_path, device="cpu")
+        model.load_state_dict(state_dict, strict=True)
+        return
+    raise FileNotFoundError(
+        f"MolmoAct2 checkpoint at {checkpoint_location} must contain {SAFE_WEIGHTS_NAME} "
+        f"or {SAFE_WEIGHTS_INDEX_NAME}."
+    )
+
+
+def _torch_dtype(dtype: str) -> torch.dtype:
+    if dtype == "float32":
+        return torch.float32
+    if dtype == "bfloat16":
+        return torch.bfloat16
+    if dtype == "float16":
+        return torch.float16
+    raise ValueError(f"Unsupported dtype: {dtype}")
+
+
+def _sample_beta_timesteps(
+    *,
+    batch_size: int,
+    device: torch.device,
+    cutoff: float,
+    time_offset: float,
+    time_scale: float,
+    alpha: float,
+    beta: float,
+) -> Tensor:
+    if cutoff < time_offset:
+        raise ValueError(f"flow-matching cutoff must be >= time_offset, got {cutoff} < {time_offset}")
+    if time_scale <= 0:
+        raise ValueError(f"flow-matching time_scale must be > 0, got {time_scale}")
+    upper = min(cutoff, time_offset + time_scale)
+    dist = Beta(torch.tensor(alpha, device=device), torch.tensor(beta, device=device))
+    samples = dist.sample((batch_size,))
+    scale = upper - time_offset
+    if scale == 0:
+        return torch.full((batch_size,), time_offset, device=device, dtype=samples.dtype)
+    return time_offset + scale * samples
+
+
+class MolmoAct2Policy(PreTrainedPolicy):
+    config_class = MolmoAct2Config
+    name = "molmoact2"
+
+    def __init__(
+        self,
+        config: MolmoAct2Config,
+        *inputs,
+        dataset_stats: dict[str, dict[str, Tensor]] | None = None,
+        dataset_meta: Any | None = None,
+        **kwargs,
+    ) -> None:
+        super().__init__(config, *inputs, **kwargs)
+        self.config.apply_norm_tag_metadata()
+        self.config.validate_features()
+        del inputs, kwargs, dataset_stats, dataset_meta  # dataset_stats / dataset_meta are accepted but never read
+        self._checkpoint_action_mode = self.config.saved_policy_action_mode()
+        self._action_queue: deque[Tensor] = deque(maxlen=self.config.n_action_steps)  # holds at most n_action_steps
+        self._rollout_action_generator: torch.Generator | None = None  # built lazily, cleared by reset()
+        self._rollout_task_key: tuple[Any, ...] | None = None  # task signature of the most recent rollout
+        self._rollout_index_for_task = -1  # the first rollout moves this to 0
+        self.rtc_processor: RTCProcessor | None = None  # assigned by init_rtc_processor() below
+        self.action_tokenizer: Any | None = None  # loaded on first use by _load_discrete_action_tokenizer()
+        self._load_hf_model()  # populates self.model
+        self.config.validate_inference_action_mode(self._checkpoint_action_mode)
+        if self.config.enable_lora_vlm:
+            self._apply_lora_adapters()
+        self.init_rtc_processor()
+
+    def _load_hf_model(self) -> None:  # load the HF checkpoint into self.model and validate its config
+        require_package("transformers", extra="molmoact2")
+        # Resolve where the checkpoint lives and which dtype to load it in.
+        checkpoint_location = _resolve_checkpoint_location(
+            self.config.checkpoint_path,
+            revision=self.config.checkpoint_revision,
+            force_download=bool(self.config.checkpoint_force_download),
+        )
+        model_dtype = _torch_dtype(self.config.model_dtype)
+        if HFMolmoAct2Config is None or MolmoAct2ForConditionalGeneration is None:
+            raise RuntimeError("transformers is required to load MolmoAct2 checkpoints.")
+        hf_config = HFMolmoAct2Config.from_pretrained(
+            checkpoint_location,
+            token=_hf_token(),
+        )
+        self.model = MolmoAct2ForConditionalGeneration.from_pretrained(
+            checkpoint_location,
+            config=hf_config,
+            dtype=model_dtype,
+            low_cpu_mem_usage=True,
+            token=_hf_token(),
+        )
+        # Keep Hub loading limited to local code plus safetensors, and verify the
+        # local implementation exactly matches the checkpoint key space.
+        _strict_load_safetensors_weights(self.model, checkpoint_location)
+        hf_max_action_dim = int(getattr(self.model.config, "max_action_dim", -1))  # -1 when the field is missing
+        if hf_max_action_dim != int(self.config.expected_max_action_dim):
+            raise ValueError(
+                "MolmoAct2 checkpoint max_action_dim mismatch: "
+                f"checkpoint={hf_max_action_dim}, expected={self.config.expected_max_action_dim}."
+            )
+        if hf_max_action_dim != 32:  # hard requirement, independent of expected_max_action_dim
+            raise ValueError(
+                f"MolmoAct2 released checkpoints must have max_action_dim=32, got {hf_max_action_dim}."
+            )
+        # The checkpoint must declare max_action_horizon; its value is then replaced by the policy chunk_size.
+        if not hasattr(self.model.config, "max_action_horizon"):
+            raise ValueError("MolmoAct2 HF checkpoints must define `max_action_horizon`.")
+        self._override_loaded_max_action_horizon(int(self.config.chunk_size))
+        # action_mode is mandatory and is validated against the policy config.
+        if not hasattr(self.model.config, "action_mode"):
+            raise ValueError(
+                "MolmoAct2 HF checkpoints must define `action_mode`. If this is a released "
+                "MolmoAct2 checkpoint, refresh the local Hub cache with "
+                "`policy.checkpoint_force_download=true` after the updated files are pushed."
+            )
+        checkpoint_action_mode = str(self.model.config.action_mode)
+        self.config.validate_checkpoint_action_mode(
+            checkpoint_action_mode,
+            has_action_expert=bool(getattr(self.model.config, "add_action_expert", False)),
+        )
+        # Apply the optional freezing / gradient-checkpointing switches from the policy config.
+        if self.config.freeze_embedding:
+            self._freeze_input_embeddings()
+        if self.config.train_action_expert_only:
+            self._freeze_non_action_expert_parameters()
+        if self.config.gradient_checkpointing:
+            self._enable_gradient_checkpointing()
+        self.train(self.training)  # re-apply the current mode through the train() override
+
+    def reset(self) -> None:  # clear per-rollout state; the task key and rollout index are left untouched
+        self._action_queue = deque(maxlen=self.config.n_action_steps)  # fresh, empty action queue
+        self._rollout_action_generator = None  # makes _rollout_generator_for_inputs build a new generator
+
+    def _set_inference_cuda_graph_enabled(self, enabled: bool) -> None:
+        """Toggle every CUDA-graph manager exposed by the loaded model; does nothing before `self.model` exists."""
+        if not hasattr(self, "model"):
+            return
+        hf_model = self._hf_model()
+        # The config flag can only veto CUDA graphs (it counts as True when the config lacks it).
+        effective = bool(enabled and getattr(self.config, "enable_inference_cuda_graph", True))
+        lookups = (
+            (self._backbone(), "action_cuda_graph_manager"),
+            (hf_model, "action_cuda_graph_manager"),
+            (hf_model, "depth_decode_cuda_graph_manager"),
+        )
+        found = [getattr(owner, attr_name, None) for owner, attr_name in lookups]
+        # Key by id() so a manager reachable through several attributes is toggled only once.
+        unique = {id(manager): manager for manager in found if manager is not None}
+        for manager in unique.values():
+            if callable(toggle := getattr(manager, "set_enabled", None)):
+                toggle(effective)
+
+    def init_rtc_processor(self) -> None:  # (re)build self.rtc_processor from the current config
+        self.rtc_processor = None
+        if self.config.rtc_config is not None:  # without an rtc_config the processor stays None
+            self.rtc_processor = RTCProcessor(self.config.rtc_config)
+
+    def _rtc_enabled(self) -> bool:
+        return self.config.rtc_config is not None and self.config.rtc_config.enabled  # needs a config AND its flag
+
+    def _action_expert(self) -> torch.nn.Module:
+        return self._backbone()._require_action_expert()  # delegates to the backbone's own accessor
+
+    def _enable_gradient_checkpointing(self) -> None:
+        """Enable gradient checkpointing through the HF hook when present, then flag the backbone sub-modules."""
+        hf_enable = getattr(self._hf_model(), "gradient_checkpointing_enable", None)
+        if not callable(hf_enable):
+            # Without the HF hook the text transformer flag is the only lever, so it must exist.
+            if getattr(self._backbone(), "transformer", None) is None:
+                raise RuntimeError("gradient_checkpointing=true, but MolmoAct2 exposes no text transformer.")
+        else:
+            try:
+                hf_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
+            except TypeError:
+                # Retry bare when the call raises TypeError (e.g. the keyword is not accepted).
+                hf_enable()
+
+        # Set the flag directly on whichever of these sub-modules the backbone exposes.
+        for submodule_name in ("transformer", "vision_backbone"):
+            submodule = getattr(self._backbone(), submodule_name, None)
+            if submodule is not None:
+                submodule.gradient_checkpointing = True
+
+    def _freeze_non_action_expert_parameters(self) -> None:
+        """Leave only parameters named with "action_expert" trainable; raise if that keeps zero elements."""
+        kept_numel = 0
+        for param_name, parameter in self.named_parameters():
+            parameter.requires_grad = "action_expert" in param_name
+            kept_numel += parameter.numel() * int(parameter.requires_grad)
+        if not kept_numel:
+            raise RuntimeError("train_action_expert_only=true, but no action_expert parameters were found.")
+
+    def _unfreeze_action_expert_parameters(self) -> None:
+        """Turn gradients back on for every parameter named with "action_expert"; raise if there are none."""
+        expert_params = [param for name, param in self.named_parameters() if "action_expert" in name]
+        for parameter in expert_params:
+            parameter.requires_grad_(True)
+        # NOTE(review): the message names enable_lora_vlm, so this presumably runs after LoRA wrapping - confirm.
+        if sum(parameter.numel() for parameter in expert_params) == 0:
+            raise RuntimeError("enable_lora_vlm=true, but no action_expert parameters were found.")
+
+    def train(self, mode: bool = True):  # train() override that also handles expert-only mode and CUDA graphs
+        super().train(mode)
+        if getattr(self.config, "train_action_expert_only", False) and hasattr(self, "model"):
+            self._hf_model().eval()  # the whole HF model goes to eval ...
+            self._action_expert().train(mode)  # ... then only the action expert follows `mode`
+        self._set_inference_cuda_graph_enabled(not mode)  # CUDA graphs are requested only when not training
+        return self
+
+    def _freeze_input_embeddings(self) -> None:
+        """Freeze the input-embedding parameters exposed by the HF model and its backbone.
+
+        Raises:
+            RuntimeError: if no input embeddings are exposed, or if they share parameters with `lm_head`.
+        """
+        hf_model = self._hf_model()
+        embedding_modules: dict[int, torch.nn.Module] = {}
+        for owner in (hf_model, self._backbone()):
+            getter = getattr(owner, "get_input_embeddings", None)
+            embeddings = getter() if callable(getter) else None
+            if embeddings is not None:
+                # Keyed by id() so an embedding module returned by both owners is collected once.
+                embedding_modules.setdefault(id(embeddings), embeddings)
+        if not embedding_modules:
+            raise RuntimeError("freeze_embedding=true, but MolmoAct2 checkpoint exposes no input embeddings.")
+
+        lm_head = getattr(hf_model, "lm_head", None)
+        head_param_ids = set() if lm_head is None else {id(param) for param in lm_head.parameters()}
+        to_freeze = [param for module in embedding_modules.values() for param in module.parameters()]
+        if not head_param_ids.isdisjoint(id(param) for param in to_freeze):
+            raise RuntimeError(
+                "freeze_embedding=true would also freeze lm_head because input embeddings and lm_head "
+                "share parameters in this checkpoint."
+            )
+        for parameter in to_freeze:
+            parameter.requires_grad = False
+
+    def get_optim_params(self) -> list[dict[str, Any]]:
+        """Group trainable parameters by component and attach a learning rate to each group.
+
+        A parameter goes to the first rule whose substring occurs in its name; unmatched ones
+        land in the VLM group. Frozen parameters and empty groups are left out.
+        """
+        # Rule order matters: the pooling/projector names are tested before the broader vision substrings.
+        routing = (
+            ("action_expert", ("action_expert",)),
+            ("connector", ("image_pooling_2d", "image_projector")),
+            ("vit", ("vision", "image_encoder", "vit")),
+            ("connector", ("multi_modal_projector", "connector", "mm_projector")),
+        )
+
+        buckets: dict[str, list[Tensor]] = {"vlm": [], "vit": [], "connector": [], "action_expert": []}
+        for name, param in self.named_parameters():
+            if not param.requires_grad:
+                continue
+            matched = (group for group, needles in routing if any(needle in name for needle in needles))
+            buckets[next(matched, "vlm")].append(param)
+
+        # With LoRA on the VLM, every non-expert group uses a fixed 5e-5 instead of its configured rate.
+        use_lora = self.config.enable_lora_vlm
+        shared_rates = {
+            "vlm": 5e-5 if use_lora else self.config.optimizer_lr,
+            "vit": 5e-5 if use_lora else self.config.optimizer_vit_lr,
+            "connector": 5e-5 if use_lora else self.config.optimizer_connector_lr,
+        }
+        # Groups are emitted in a fixed order: VLM, ViT, connector, then the action expert.
+        groups = [{"params": buckets[k], "lr": lr} for k, lr in shared_rates.items() if buckets[k]]
+        if buckets["action_expert"]:
+            groups.append({"params": buckets["action_expert"], "lr": self.config.optimizer_action_expert_lr})
+        return groups
+
+    def _model_inputs(self, batch: dict[str, Tensor]) -> dict[str, Tensor]:
+        """Keep the non-None `_MODEL_INPUT_KEYS` entries of `batch`, casting float tensors to the model dtype."""
+        compute_dtype = _torch_dtype(self.config.model_dtype)
+        kept = {key: value for key, value in batch.items() if key in _MODEL_INPUT_KEYS and value is not None}
+        # Non-tensor values (e.g. a prebuilt mask mapping) have no `is_floating_point`; forward them untouched.
+        needs_cast = {k for k, v in kept.items() if torch.is_tensor(v) and v.is_floating_point()}
+        return {k: v.to(dtype=compute_dtype) if k in needs_cast else v for k, v in kept.items()}
+
+    def _output_action_dim(self, batch: dict[str, Tensor]) -> int:
+        """Return the action width from `output_features`, else from a uniform `action_dim_is_pad` mask."""
+        feature = self.config.output_features.get(ACTION)
+        configured = int(feature.shape[0]) if feature is not None and feature.shape else 0
+        if configured > 0:
+            return configured
+
+        # Fallback: count non-padded dims along the last axis; trusted only when every entry agrees.
+        pad_mask = batch.get("action_dim_is_pad")
+        if pad_mask is not None:
+            valid_counts = (~pad_mask.to(dtype=torch.bool)).sum(dim=-1)
+            if bool((valid_counts == valid_counts[0]).all()) and int(valid_counts[0]) > 0:
+                return int(valid_counts[0])
+        raise RuntimeError("MolmoAct2 inference requires a positive action dimension in output_features.")
+
+    def _hf_model(self):
+        """Return `self.model.base_model.model` when that chain exists and is not None, else `self.model`."""
+        inner = getattr(getattr(self.model, "base_model", None), "model", None)
+        return self.model if inner is None else inner
+
+    def _backbone(self):
+        return self._hf_model().model  # the `.model` attribute of whatever _hf_model() resolves to
+
+    def _override_loaded_max_action_horizon(self, action_horizon: int) -> None:
+        """Write `action_horizon` into `max_action_horizon` on the HF model config and the backbone config."""
+        if action_horizon < 1:
+            raise ValueError(f"action_horizon must be >= 1, got {action_horizon}.")
+        for owner in (self._hf_model(), self._backbone()):
+            if (loaded_config := getattr(owner, "config", None)) is not None:
+                loaded_config.max_action_horizon = int(action_horizon)
+
+    def _generation_action_horizon(self) -> int:
+        """Resolve the generation horizon: policy `chunk_size` first, then a loaded `max_action_horizon`."""
+        configured = getattr(self.config, "chunk_size", None)
+        if configured is not None:
+            return int(configured)
+        # Fall back to the HF model config, then the backbone config.
+        loaded_configs = [getattr(owner, "config", None) for owner in (self._hf_model(), self._backbone())]
+        for loaded_config in loaded_configs:
+            horizon = getattr(loaded_config, "max_action_horizon", None)
+            if horizon is not None:
+                return int(horizon)
+        raise RuntimeError("MolmoAct2 could not resolve an action generation horizon.")
+
+    @staticmethod
+    def _mask_discrete_action_spans(
+        *,
+        input_ids: Tensor,
+        mask: Tensor,
+        start_token_id: int | None,
+        end_token_id: int | None,
+    ) -> Tensor:
+        """Return a copy of `mask` with every start..end token span (inclusive) set to False.
+
+        Each start token is paired with the next unused end token after it; a start with no
+        end left masks the rest of its row. `mask` itself is returned when either id is None.
+        """
+        if start_token_id is None or end_token_id is None:
+            return mask
+        masked = mask.clone()
+        for row_idx, row in enumerate(input_ids):
+            span_ends = iter((row == int(end_token_id)).nonzero(as_tuple=False).flatten().tolist())
+            for span_start in (row == int(start_token_id)).nonzero(as_tuple=False).flatten().tolist():
+                # Pulling from the shared iterator also consumes the matched end, so it is never reused.
+                span_end = next((end for end in span_ends if end >= span_start), None)
+                if span_end is None:
+                    masked[row_idx, span_start:] = False
+                    break
+                masked[row_idx, span_start : span_end + 1] = False
+        return masked
+
+    def _encoder_attention_mask_for_action_expert(
+        self,
+        *,
+        input_ids: Tensor | None,
+        attention_mask: Tensor | None,
+    ) -> Tensor | None:
+        backbone = self._backbone()
+        get_encoder_attention_mask = getattr(backbone, "_get_encoder_attention_mask", None)
+        if callable(get_encoder_attention_mask):
+            mask = get_encoder_attention_mask(input_ids, attention_mask)  # prefer the backbone's own hook
+        elif attention_mask is not None:
+            mask = attention_mask.to(dtype=torch.bool)
+        elif input_ids is not None:
+            mask = input_ids != -1  # no mask given: only token id -1 counts as masked
+        else:
+            return None  # nothing to derive a mask from
+        # The extra filtering below only applies in "both" action mode, with token ids and a mask available.
+        if getattr(self.config, "action_mode", None) != "both" or input_ids is None or mask is None:
+            return mask
+        # Clear the mask at EOS tokens and inside action-start..action-end token spans.
+        mask = mask.to(dtype=torch.bool).clone()  # clone so the caller's mask is not edited in place
+        eos_token_id = getattr(self.model.config, "eos_token_id", None)
+        if eos_token_id is not None:
+            mask &= input_ids != int(eos_token_id)
+        return self._mask_discrete_action_spans(
+            input_ids=input_ids,
+            mask=mask,
+            start_token_id=getattr(self.model.config, "action_start_token_id", None),
+            end_token_id=getattr(self.model.config, "action_end_token_id", None),
+        )
+
+    @staticmethod
+    def _drop_trivial_attention_mask(model_inputs: dict[str, Tensor]) -> dict[str, Tensor]:
+        """Return a copy without "attention_mask" when that mask is an all-true tensor, else the input itself."""
+        candidate = model_inputs.get("attention_mask")
+        if not torch.is_tensor(candidate) or not bool(candidate.to(dtype=torch.bool).all().item()):
+            return model_inputs
+        return {key: value for key, value in model_inputs.items() if key != "attention_mask"}
+
+    def _load_discrete_action_tokenizer(self) -> Any:
+        """Return the discrete action tokenizer, loading and caching it on first use."""
+        if self.action_tokenizer is not None:
+            return self.action_tokenizer
+        for package in ("transformers", "scipy"):
+            require_package(package, extra="molmoact2")
+        if UniversalActionProcessor is None:
+            raise RuntimeError("transformers and scipy are required to load MolmoAct2 action tokenizer.")
+        tokenizer_source = self.config.discrete_action_tokenizer
+        self.action_tokenizer = UniversalActionProcessor.from_pretrained_local(tokenizer_source)
+        return self.action_tokenizer
+
+    def _resolve_inference_action_mode(self, requested_mode: str | None) -> str:
+        return self.config.resolve_inference_action_mode(requested_mode, self._checkpoint_action_mode)  # config decides
+
+    @staticmethod
+    def _combine_rollout_seeds(first_seed: int, batch_size: int) -> int:
+        """Fold the seeds first_seed .. first_seed + batch_size - 1 into one value in [0, 2**63 - 1)."""
+        weighted = ((position + 1) * (first_seed + position) for position in range(batch_size))
+        # With exact Python ints, reducing once at the end equals reducing after every step.
+        return sum(weighted) % (2**63 - 1)
+
+    @staticmethod
+    def _rollout_task_signature(batch: dict[str, Any]) -> tuple[Any, ...] | None:
+        """Build a tuple task key from "task", else "observation.language"; None when neither is set."""
+        # The generator is lazy, so the fallback key is only read when "task" is missing or None.
+        lookups = (batch.get(key) for key in ("task", "observation.language"))
+        task = next((value for value in lookups if value is not None), None)
+        if task is None:
+            return None
+        # A list/tuple contributes one stringified entry per element; anything else becomes a 1-tuple.
+        if isinstance(task, (list, tuple)):
+            return tuple(map(str, task))
+        return (task if isinstance(task, str) else str(task),)
+
+    def _rollout_generator_for_inputs(
+        self,
+        batch: dict[str, Any],
+        *,
+        batch_size: int,
+        device: torch.device,
+    ) -> torch.Generator | None:
+        if not bool(getattr(self.config, "per_episode_seed", False)):
+            return None  # per-episode seeding is off (or the config lacks the flag)
+        if self._rollout_action_generator is not None:
+            return self._rollout_action_generator  # reused until reset() clears it
+        # New rollout: restart the per-task counter when the task signature changes, otherwise advance it.
+        task_signature = self._rollout_task_signature(batch)
+        if task_signature != self._rollout_task_key:
+            self._rollout_task_key = task_signature
+            self._rollout_index_for_task = 0
+        else:
+            self._rollout_index_for_task += 1
+        # Seed a fresh generator from eval_seed (0 when missing or falsy) and the rollout index.
+        base_seed = int(getattr(self.config, "eval_seed", None) or 0)
+        first_seed = base_seed + self._rollout_index_for_task * batch_size
+        generator_device = (
+            device if device.type == "cuda" and torch.cuda.is_available() else torch.device("cpu")
+        )  # any non-CUDA device (or CUDA being unavailable) falls back to a CPU generator
+        generator = torch.Generator(device=generator_device)
+        generator.manual_seed(self._combine_rollout_seeds(first_seed, batch_size))
+        self._rollout_action_generator = generator
+        return generator
+
+    @staticmethod
+    def _expand_mask(mask: Tensor | None, num_flow_timesteps: int) -> Tensor | None:
+        """Repeat each batch row `num_flow_timesteps` times back to back; None is passed through."""
+        if mask is None:
+            return None
+        trailing = tuple(mask.shape[1:])
+        # Insert a repeat axis after the batch axis, broadcast along it, then fold it into the batch.
+        tiled = mask.unsqueeze(1).expand(-1, num_flow_timesteps, *trailing)
+        return tiled.reshape(mask.shape[0] * num_flow_timesteps, *trailing)
+
+    @staticmethod
+    def _action_dim_valid_mask(target: Tensor, action_dim_is_pad: Tensor | None) -> Tensor | None:
+        """Boolean mask of non-padded action dims, shaped to broadcast against `target` (None without a mask)."""
+        if action_dim_is_pad is None:
+            return None
+        valid = torch.logical_not(action_dim_is_pad.to(device=target.device, dtype=torch.bool))
+        valid = valid.unsqueeze(0) if valid.ndim == 1 else valid
+        if valid.shape[-1] != target.shape[-1]:
+            raise ValueError(
+                f"action_dim_is_pad width {valid.shape[-1]} does not match target width {target.shape[-1]}."
+            )
+        if valid.shape[0] != target.shape[0] and valid.shape[0] == 1:
+            valid = valid.expand(target.shape[0], -1)
+        if valid.shape[0] != target.shape[0]:
+            raise ValueError(
+                f"action_dim_is_pad batch {valid.shape[0]} does not match target batch {target.shape[0]}."
+            )
+        while valid.ndim < target.ndim:
+            valid = valid.unsqueeze(1)
+        return valid
+
+    @classmethod
+    def _mask_action_dim_tensor(cls, tensor: Tensor, action_dim_is_pad: Tensor | None) -> Tensor:
+        """Zero the padded action dims of `tensor`; return `tensor` itself when there is no padding mask."""
+        keep = None
+        if cls._mask_enabled_static(action_dim_is_pad):
+            keep = cls._action_dim_valid_mask(tensor, action_dim_is_pad)
+        # masked_fill is out-of-place, so the caller's tensor is never modified.
+        return tensor if keep is None else tensor.masked_fill(~keep, 0)
+
+    @staticmethod
+    def _mask_enabled_static(action_dim_is_pad: Tensor | None) -> bool:
+        return action_dim_is_pad is not None  # masking applies whenever a padding mask is supplied
+
+    @classmethod
+    def _apply_action_dim_padding_mask(cls, loss: Tensor, action_dim_is_pad: Tensor | None) -> Tensor:
+        """Average `loss` over its last axis using non-padded dims only; returned unreduced without a mask."""
+        keep = cls._action_dim_valid_mask(loss, action_dim_is_pad)
+        if keep is None:
+            return loss
+        weights = keep.to(dtype=loss.dtype)
+        return (loss * weights).sum(dim=-1) / weights.sum(dim=-1).clamp_min(1.0)  # clamp guards all-padded rows
+
+    @staticmethod
+    def _apply_action_chunk_padding_mask(loss: Tensor, action_horizon_is_pad: Tensor | None) -> Tensor:
+        """Zero `loss` at padded horizon steps; the mask gets unit axes at dim 1 and at the end to broadcast."""
+        if action_horizon_is_pad is None:
+            return loss
+        is_pad = action_horizon_is_pad.to(device=loss.device, dtype=torch.bool)
+        keep_step = torch.logical_not(is_pad).unsqueeze(1).unsqueeze(-1)
+        return loss * keep_step
+
+    def _prepare_flow_matching_tensors(
+        self,
+        *,
+        actions: Tensor,
+        action_dim_is_pad: Tensor | None,
+        timesteps: Tensor | None = None,
+        noise: Tensor | None = None,
+    ) -> tuple[Tensor, Tensor, Tensor, Tensor]:
+        action_expert = self._backbone()._require_action_expert()
+        action_dtype = next(action_expert.parameters()).dtype  # compute in the action expert's parameter dtype
+        actions = actions.to(dtype=action_dtype)
+        batch_size = int(actions.shape[0])
+        device = actions.device
+        num_flow_timesteps = max(1, int(self.config.num_flow_timesteps))  # at least one timestep per sample
+        # Timesteps: sampled from the Beta schedule unless the caller supplies (batch, num_flow_timesteps) values.
+        if timesteps is None:
+            timesteps = (
+                _sample_beta_timesteps(
+                    batch_size=batch_size * num_flow_timesteps,
+                    device=device,
+                    cutoff=self.config.flow_matching_cutoff,
+                    time_offset=self.config.flow_matching_time_offset,
+                    time_scale=self.config.flow_matching_time_scale,
+                    alpha=self.config.flow_matching_beta_alpha,
+                    beta=self.config.flow_matching_beta_beta,
+                )
+                .to(dtype=action_dtype)
+                .view(batch_size, num_flow_timesteps)
+            )
+        else:
+            expected_timesteps_shape = (batch_size, num_flow_timesteps)
+            timesteps = timesteps.to(device=device, dtype=action_dtype)
+            if tuple(timesteps.shape) != expected_timesteps_shape:
+                raise ValueError(
+                    f"flow timesteps must have shape {expected_timesteps_shape}, got {tuple(timesteps.shape)}."
+                )
+        # Zero the padded action dims (when enabled) before mixing with noise.
+        if self.config.mask_action_dim_padding:
+            actions = self._mask_action_dim_tensor(actions, action_dim_is_pad)
+        # Noise: one draw per (sample, flow timestep) with the actions' trailing two dims.
+        expected_noise_shape = (batch_size, num_flow_timesteps, actions.shape[1], actions.shape[2])
+        if noise is None:
+            noise = torch.randn(*expected_noise_shape, device=device, dtype=actions.dtype)
+        else:
+            noise = noise.to(device=device, dtype=actions.dtype)
+            if tuple(noise.shape) != expected_noise_shape:
+                raise ValueError(
+                    f"flow noise must have shape {expected_noise_shape}, got {tuple(noise.shape)}."
+                )
+        if self.config.mask_action_dim_padding:
+            noise = self._mask_action_dim_tensor(noise, action_dim_is_pad)
+        # xt interpolates noise (t=0) to actions (t=1); the target velocity is actions - noise.
+        t_broadcast = timesteps.view(batch_size, num_flow_timesteps, 1, 1)
+        actions_expanded = actions.unsqueeze(1).expand(-1, num_flow_timesteps, -1, -1)
+        xt = (1.0 - t_broadcast) * noise + t_broadcast * actions_expanded
+        target_velocity = actions_expanded - noise
+        return actions, timesteps, xt, target_velocity  # actions come back cast (and masked when enabled)
+
+    def _prepare_joint_training_backbone_inputs(
+        self,
+        model_inputs: dict[str, Tensor],
+    ) -> tuple[Tensor, Tensor | dict[str, Any], Tensor, Tensor]:
+        backbone = self._backbone()
+        input_ids = model_inputs.get("input_ids")
+        inputs_embeds = model_inputs.get("inputs_embeds")
+        if (input_ids is None) == (inputs_embeds is None):  # true when both or neither are given
+            raise ValueError(
+                "MolmoAct2 joint flow training requires exactly one of input_ids or inputs_embeds."
+            )
+        # Merge image/video inputs through the backbone hook when it exists.
+        images = None
+        token_pooling = None
+        merge_visual_inputs = getattr(backbone, "merge_visual_inputs", None)
+        if callable(merge_visual_inputs):
+            images, token_pooling = merge_visual_inputs(
+                input_ids=input_ids,
+                pixel_values=model_inputs.get("pixel_values"),
+                image_token_pooling=model_inputs.get("image_token_pooling"),
+                image_grids=model_inputs.get("image_grids"),
+                image_num_crops=model_inputs.get("image_num_crops"),
+                pixel_values_videos=model_inputs.get("pixel_values_videos"),
+                video_token_pooling=model_inputs.get("video_token_pooling"),
+                video_grids=model_inputs.get("video_grids"),
+            )
+        elif (
+            model_inputs.get("pixel_values") is not None
+            or model_inputs.get("pixel_values_videos") is not None
+        ):
+            raise RuntimeError("MolmoAct2 checkpoint does not expose merge_visual_inputs for joint training.")
+        # Precomputed embeddings cannot be combined with visual inputs; otherwise embed the token ids here.
+        if images is not None and inputs_embeds is not None:
+            raise ValueError("MolmoAct2 joint flow training cannot combine inputs_embeds with visual inputs.")
+        if inputs_embeds is None:
+            inputs_embeds, _image_features = backbone.build_input_embeddings(input_ids, images, token_pooling)
+        # Default positions are 0..seq_len-1 with a leading unit batch axis.
+        cache_position = torch.arange(0, inputs_embeds.shape[1], device=inputs_embeds.device)
+        position_ids = model_inputs.get("position_ids")
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+        # A dict attention_mask is used as the mask mapping as-is; anything else is built by the backbone.
+        attention_mask = model_inputs.get("attention_mask")
+        if isinstance(attention_mask, dict):
+            causal_mask_mapping = attention_mask
+        else:
+            causal_mask_mapping = backbone._build_native_attention_bias(
+                inputs_embeds=inputs_embeds,
+                attention_mask=attention_mask,
+                token_type_ids=model_inputs.get("token_type_ids"),
+                past_key_values=None,
+            )
+        return inputs_embeds, causal_mask_mapping, position_ids, cache_position
+
+    @staticmethod
+    def _decoder_layer_kv_outputs(
+        layer_outputs: tuple[Any, ...], *, output_attentions: bool
+    ) -> tuple[Tensor, Tensor]:
+        output_idx = 2 if output_attentions else 1  # presumably attention weights take slot 1 when returned - confirm
+        return layer_outputs[output_idx], layer_outputs[output_idx + 1]  # (key_states, value_states) per the caller
+
+    @staticmethod
+    def _action_time_conditioning(action_expert: torch.nn.Module, timesteps: Tensor) -> Tensor:
+        """Embed `timesteps` via the expert's `_time_conditioning` hook if callable, else via `time_embed`."""
+        hook = getattr(action_expert, "_time_conditioning", None)
+        embed = hook if callable(hook) else action_expert.time_embed
+        return embed(timesteps)
+
+    def _compute_flow_matching_loss_joint_per_layer(
+        self,
+        *,
+        batch: dict[str, Tensor],
+        model_inputs: dict[str, Tensor],
+        timesteps: Tensor | None = None,
+        noise: Tensor | None = None,
+        reduction: str = "mean",
+    ) -> tuple[Tensor, Tensor]:
+        if reduction not in {"mean", "none"}:
+            raise ValueError(f"Unsupported reduction={reduction!r}. Expected 'mean' or 'none'.")
+        backbone = self._backbone()
+        transformer = getattr(backbone, "transformer", None)
+        action_expert = backbone._require_action_expert()
+        if transformer is None:
+            raise RuntimeError("MolmoAct2 joint flow training requires a patchable text transformer.")
+        if len(action_expert.blocks) != int(transformer.config.num_hidden_layers):
+            raise RuntimeError(
+                "MolmoAct2 joint flow training requires one action expert block per text transformer layer."
+            )
+
+        actions, timesteps, xt, target_velocity = self._prepare_flow_matching_tensors(
+            actions=batch[ACTION],
+            action_dim_is_pad=batch.get("action_dim_is_pad"),
+            timesteps=timesteps,
+            noise=noise,
+        )
+        num_flow_timesteps = max(1, int(self.config.num_flow_timesteps))
+        batch_size = int(actions.shape[0])
+        device = actions.device
+        xt_flat = xt.reshape(batch_size * num_flow_timesteps, actions.shape[1], actions.shape[2])
+        timesteps_flat = timesteps.reshape(batch_size * num_flow_timesteps)
+
+        hidden_states, causal_mask_mapping, position_ids, cache_position = (
+            self._prepare_joint_training_backbone_inputs(model_inputs)
+        )
+        if hidden_states.shape[0] != batch_size:
+            raise ValueError(
+                f"Backbone batch size {hidden_states.shape[0]} does not match action batch size {batch_size}."
+            )
+
+        encoder_attention_mask = self._encoder_attention_mask_for_action_expert(
+            input_ids=model_inputs.get("input_ids"),
+            attention_mask=model_inputs.get("attention_mask"),
+        )
+        action_attention_mask = None
+        if batch.get("action_horizon_is_pad") is not None:
+            action_attention_mask = ~batch["action_horizon_is_pad"].to(device=device, dtype=torch.bool)
+
+        valid_action = None
+        if action_attention_mask is not None:
+            valid_action = action_attention_mask.to(device=device, dtype=actions.dtype).unsqueeze(-1)
+            valid_action = self._expand_mask(valid_action, num_flow_timesteps)
+
+        rope_cache = None
+        if len(action_expert.blocks) > 0 and action_expert.blocks[0].self_attn.rope is not None:
+            rope_cache = action_expert.blocks[0].self_attn.rope.build_cache(
+                seq_len=actions.shape[1],
+                device=device,
+                dtype=actions.dtype,
+            )
+
+        cross_mask = action_expert._build_cross_attention_mask(
+            encoder_attention_mask,
+            batch_size,
+            actions.dtype,
+        )
+        cross_mask = self._expand_mask(cross_mask, num_flow_timesteps)
+        self_mask = action_expert._build_self_attention_mask(
+            action_attention_mask,
+            actions.shape[1],
+            device,
+            actions.dtype,
+        )
+        self_mask = self._expand_mask(self_mask, num_flow_timesteps)
+
+        conditioning = self._action_time_conditioning(action_expert, timesteps_flat)
+        action_hidden = action_expert.action_embed(xt_flat)
+        if valid_action is not None:
+            action_hidden = action_hidden * valid_action
+
+        if transformer.config.rope_scaling_layers is not None:
+            position_embeddings_mapping = {
+                "default": transformer.rotary_embs["default"](hidden_states, position_ids),
+                "scaling": transformer.rotary_embs["scaling"](hidden_states, position_ids),
+            }
+        else:
+            position_embeddings = transformer.rotary_emb(hidden_states, position_ids)
+
+        use_gradient_checkpointing = bool(
+            getattr(self.config, "gradient_checkpointing", False)
+            and self.training
+            and torch.is_grad_enabled()
+        )
+
+        def run_layer(
+            layer_idx: int, layer_hidden: Tensor, layer_action_hidden: Tensor
+        ) -> tuple[Tensor, Tensor]:
+            decoder_block = transformer.blocks[layer_idx]
+            action_block = action_expert.blocks[layer_idx]
+            if transformer.config.rope_scaling_layers is not None:
+                position_embeddings_i = (
+                    position_embeddings_mapping["scaling"]
+                    if layer_idx in transformer.config.rope_scaling_layers
+                    else position_embeddings_mapping["default"]
+                )
+            else:
+                position_embeddings_i = position_embeddings
+
+            layer_outputs = decoder_block(
+                layer_hidden,
+                position_embeddings=position_embeddings_i,
+                attention_mask=causal_mask_mapping,
+                position_ids=position_ids,
+                past_key_values=None,
+                output_attentions=False,
+                use_cache=False,
+                cache_position=cache_position,
+                collect_layer_kv_states=True,
+            )
+            next_hidden = layer_outputs[0]
+            key_states, value_states = self._decoder_layer_kv_outputs(layer_outputs, output_attentions=False)
+            key_states = backbone._cache_to_sequence(key_states)
+            value_states = backbone._cache_to_sequence(value_states)
+            if self.config.enable_knowledge_insulation:
+                key_states = key_states.detach()
+                value_states = value_states.detach()
+
+            k_ctx = action_expert._project_kv_tensor(key_states, action_expert.context_k_proj)
+            v_ctx = action_expert._project_kv_tensor(value_states, action_expert.context_v_proj)
+            k_norm = action_block.cross_attn.k_norm
+            if k_norm is not None:
+                k_ctx = k_norm(k_ctx.transpose(1, 2)).transpose(1, 2)
+            if num_flow_timesteps != 1:
+                k_ctx = self._expand_mask(k_ctx, num_flow_timesteps)
+                v_ctx = self._expand_mask(v_ctx, num_flow_timesteps)
+
+            next_action_hidden = action_block(
+                layer_action_hidden,
+                conditioning,
+                cross_kv=(k_ctx, v_ctx),
+                self_attn_mask=self_mask,
+                attn_mask=cross_mask,
+                is_causal=action_expert.config.causal_attn,
+                modulation=None,
+                rope_cache=rope_cache,
+            )
+            if valid_action is not None:
+                next_action_hidden = next_action_hidden * valid_action
+            return next_hidden, next_action_hidden
+
+        for layer_idx in range(int(transformer.config.num_hidden_layers)):
+            if use_gradient_checkpointing:
+                hidden_states, action_hidden = torch.utils.checkpoint.checkpoint(
+                    lambda layer_hidden, layer_action_hidden, idx=layer_idx: run_layer(
+                        idx,
+                        layer_hidden,
+                        layer_action_hidden,
+                    ),
+                    hidden_states,
+                    action_hidden,
+                    use_reentrant=False,
+                )
+            else:
+                hidden_states, action_hidden = run_layer(layer_idx, hidden_states, action_hidden)
+
+        hidden_states = transformer.ln_f(hidden_states)
+        pred_velocity = action_expert.final_layer(action_hidden, conditioning)
+        if valid_action is not None:
+            pred_velocity = pred_velocity * valid_action
+        pred_velocity = pred_velocity.reshape(
+            batch_size, num_flow_timesteps, actions.shape[1], actions.shape[2]
+        )
+
+        loss = F.mse_loss(pred_velocity, target_velocity, reduction="none")
+        loss = self._apply_action_chunk_padding_mask(loss, batch.get("action_horizon_is_pad"))
+        if self.config.mask_action_dim_padding:
+            loss = self._apply_action_dim_padding_mask(loss, batch.get("action_dim_is_pad"))
+        loss = loss.reshape(batch_size, -1).mean(dim=1)
+        if reduction == "mean":
+            loss = loss.mean()
+        return loss, hidden_states
+
+    def _discrete_token_weights(self, valid_positions: Tensor) -> Tensor | None:
+        """Return a ``2 / sqrt(row token count)`` weight per valid token, or None for unweighted modes."""
+        weighting = self.config.discrete_loss_token_weighting
+        if weighting in {"none", "token", "root_subsegments"}:
+            return None
+        if weighting not in {"root_subsegments_root_tokens", "root_tokens"}:
+            raise ValueError(f"Unsupported discrete_loss_token_weighting={weighting!r}.")
+
+        # Rows without a valid token get weight 0; the boolean selection below never picks them anyway.
+        counts = valid_positions.sum(dim=1).to(dtype=torch.float32)
+        per_row = torch.where(counts > 0, 2.0 / torch.sqrt(counts.clamp_min(1.0)), torch.zeros_like(counts))
+        return per_row[:, None].expand_as(valid_positions)[valid_positions].to(dtype=torch.float32)
+
+    @staticmethod
+    def _weighted_mean(values: Tensor, weights: Tensor | None) -> Tensor:
+        """Return ``values.mean()``, or ``dot(values, weights) / max(weights.sum(), 1)`` when weighted."""
+        # Weights are moved to the device and dtype of ``values`` before the dot product.
+        cast = None if weights is None else weights.to(device=values.device, dtype=values.dtype)
+        return values.mean() if cast is None else torch.dot(values, cast) / cast.sum().clamp_min(1.0)
+
+    @staticmethod
+    def _weighted_per_example(
+        values: Tensor,
+        weights: Tensor | None,
+        example_indices: Tensor,
+        batch_size: int,
+    ) -> Tensor:
+        """Return per-example weighted sums rescaled by ``batch_size / max(total weight, 1)``."""
+        token_values = values.float()
+        if weights is not None:
+            token_weights = weights.to(device=token_values.device, dtype=token_values.dtype)
+        else:
+            token_weights = torch.ones_like(token_values)  # no weights: every token counts once
+        zeros = torch.zeros(batch_size, device=token_values.device, dtype=torch.float32)
+        # Out-of-place scatter_add leaves ``zeros`` untouched, so it can seed both accumulators.
+        per_example_loss = zeros.scatter_add(0, example_indices, token_values * token_weights)
+        per_example_weight = zeros.scatter_add(0, example_indices, token_weights)
+        return per_example_loss * float(batch_size) / per_example_weight.sum().clamp_min(1.0)
+
+    def _discrete_loss_from_backbone_outputs(
+        self,
+        batch: dict[str, Tensor],
+        outputs: Any,
+        reduction: str = "mean",
+    ) -> tuple[Tensor, Tensor | None]:
+        """Compute next-token cross-entropy (plus optional z-loss) over the labelled tokens.
+
+        Labels are shifted left by one and positions whose shifted label is -100 are dropped;
+        logits are computed only for the kept positions from ``self.model.lm_head.weight``.
+
+        Args:
+            batch: must provide ``"labels"``.
+            outputs: object exposing ``last_hidden_state``.
+            reduction: ``"mean"`` for a scalar loss, ``"none"`` for one value per example.
+
+        Returns:
+            ``(ce_loss, z_loss)``; ``z_loss`` is None when ``config.softmax_auxiliary_loss`` is falsy.
+
+        Raises:
+            ValueError: for an unsupported ``reduction``.
+            RuntimeError: if labels or hidden states are missing, or no label position is valid.
+        """
+        if reduction not in {"mean", "none"}:
+            raise ValueError(f"Unsupported reduction={reduction!r}. Expected 'mean' or 'none'.")
+        labels = batch.get("labels")
+        if labels is None:
+            raise RuntimeError("MolmoAct2 discrete training requires labels.")
+        hidden_states = outputs.last_hidden_state
+        if hidden_states is None:
+            raise RuntimeError("MolmoAct2 backbone did not return last_hidden_state.")
+        shifted = F.pad(labels, (0, 1), value=-100)[..., 1:].contiguous()
+        valid_positions = shifted != -100
+        if not bool(valid_positions.any()):
+            raise RuntimeError("MolmoAct2 discrete training labels contain no valid action tokens.")
+
+        keep = valid_positions.reshape(-1)
+        kept_hidden = hidden_states.reshape(-1, hidden_states.shape[-1])[keep]
+        kept_labels = shifted.reshape(-1)[keep].to(device=hidden_states.device)
+        logits = F.linear(kept_hidden, self.model.lm_head.weight).float()
+        log_z = logits.logsumexp(dim=-1)
+        token_ce_loss = log_z - logits.gather(dim=-1, index=kept_labels[:, None]).squeeze(-1)
+        token_weights = self._discrete_token_weights(valid_positions)
+        example_rows = None
+        if reduction == "none":
+            example_rows = valid_positions.nonzero(as_tuple=False)[:, 0].to(device=hidden_states.device)
+
+        def reduce_tokens(per_token: Tensor) -> Tensor:
+            if example_rows is None:
+                return self._weighted_mean(per_token, token_weights)
+            return self._weighted_per_example(per_token, token_weights, example_rows, int(labels.shape[0]))
+
+        ce_loss = reduce_tokens(token_ce_loss)
+        if not self.config.softmax_auxiliary_loss:
+            return ce_loss, None
+        return ce_loss, self.config.softmax_auxiliary_loss_scale * reduce_tokens(log_z.pow(2))
+
+    @staticmethod
+    def _extract_discrete_token_bins(
+        generated_ids: list[int],
+        start_token_id: int,
+        end_token_id: int,
+        token_id_to_bin: dict[int, int],
+    ) -> list[int]:
+        """Map the tokens between the start and end markers to their bin indices.
+
+        The span opens right after the first ``start_token_id`` and closes at the first
+        ``end_token_id`` that follows it. Without a start marker the whole sequence is
+        scanned; without a closing marker the span runs to the end. Ids missing from
+        ``token_id_to_bin`` are skipped.
+        """
+        # list.index raises ValueError when the marker is absent from the searched range.
+        try:
+            span_start = generated_ids.index(start_token_id) + 1
+        except ValueError:
+            span = generated_ids
+        else:
+            try:
+                span = generated_ids[span_start : generated_ids.index(end_token_id, span_start)]
+            except ValueError:
+                span = generated_ids[span_start:]
+        return [int(token_id_to_bin[token_id]) for token_id in span if token_id in token_id_to_bin]
+
+    def _action_token_id_to_bin(self) -> dict[int, int]:
+        """Return the token-id -> bin map from the model helper, else from the config's id range."""
+        model_helper = getattr(self.model, "_action_token_id_to_bin", None)
+        if callable(model_helper):
+            return dict(model_helper())
+        model_config = self.model.config
+        first_id = getattr(model_config, "action_token_start_id", None)
+        count = int(getattr(model_config, "num_action_tokens", 0) or 0)
+        return {} if first_id is None or count <= 0 else {int(first_id) + k: k for k in range(count)}
+
+    def _require_discrete_eos_token_id(self) -> int:
+        """Resolve the EOS id from the model helper, its config, or its generation config, else raise."""
+        if callable(model_helper := getattr(self.model, "_require_eos_token_id", None)):
+            return int(model_helper())
+        eos = getattr(self.model.config, "eos_token_id", None)
+        if eos is None:  # getattr on a missing (None) generation config also yields None
+            eos = getattr(getattr(self.model, "generation_config", None), "eos_token_id", None)
+        if isinstance(eos, (list, tuple)):
+            eos = eos[0] if eos else None
+        if eos is None:
+            raise RuntimeError("Discrete action generation requires eos_token_id in the checkpoint config.")
+        return int(eos)
+
+    def _discrete_generation_max_steps(self) -> int:
+        """Return the configured decode-step cap, else ``max(1, 16 * action horizon)``."""
+        configured = self.config.discrete_generation_max_steps
+        return max(1, self._generation_action_horizon() * 16) if configured is None else int(configured)
+
+    def _continue_discrete_generation_from_output(
+        self,
+        initial_output: Any,
+        *,
+        past_key_values: Any | None,
+        attention_mask: Tensor | None,
+        end_token_id: int,
+        max_steps: int,
+        attention_bias: Tensor | None = None,
+    ) -> Tensor:
+        """Greedily decode after the prefill until every batch row has emitted ``end_token_id``.
+
+        Rows are tracked individually: a row that has emitted the end token is held at that token,
+        so batches whose rows stop on different steps terminate instead of exhausting ``max_steps``.
+        Decoding uses ``_consume_generation_tokens`` when ``attention_bias`` is None, otherwise the
+        AR decode-step helper. Returns the generated ids stacked along dim 1. Raises RuntimeError if
+        the needed helper is missing, no token is generated, or a row never emits the end token.
+        """
+        consume = getattr(self.model, "_consume_generation_tokens", None)
+        decode_step = getattr(self.model, "_run_ar_decode_step", None)
+        if decode_step is None:
+            decode_step = getattr(self.model, "_run_depth_decode_step", None)
+        if attention_bias is None and not callable(consume):
+            raise RuntimeError("MolmoAct2 checkpoint does not expose discrete token generation helpers.")
+        if attention_bias is not None and not callable(decode_step):
+            raise RuntimeError("MolmoAct2 checkpoint does not expose graph-backed AR decode helpers.")
+
+        generated_tokens: list[Tensor] = []
+        output, cache, mask = initial_output, past_key_values, attention_mask
+        finished: Tensor | None = None
+        hit_end = False
+        for _ in range(int(max_steps)):
+            next_token = torch.argmax(output.logits[:, -1, :], dim=-1)
+            if finished is not None:
+                # Hold rows that already ended at the end token so they stay marked as finished.
+                next_token = next_token.masked_fill(finished, int(end_token_id))
+            finished = next_token == int(end_token_id)
+            generated_tokens.append(next_token)
+            if bool(finished.all()):
+                hit_end = True
+                break
+            if attention_bias is None:
+                output, mask = consume(next_token, past_key_values=cache, attention_mask=mask)
+                cache = output.past_key_values
+            else:
+                hidden, cache = decode_step(next_token, past_key_values=cache, attention_bias=attention_bias)
+                output = types.SimpleNamespace(logits=self.model.lm_head(hidden), past_key_values=cache)
+        if not generated_tokens:
+            raise RuntimeError("Discrete continuation generated no tokens.")
+        if not hit_end:
+            raise RuntimeError(
+                f"Discrete continuation did not emit end token {int(end_token_id)} within {int(max_steps)} steps."
+            )
+        return torch.stack(generated_tokens, dim=1)
+
+    def _make_discrete_ar_graph_decode_inputs(
+        self,
+        model_inputs: dict[str, Tensor],
+        *,
+        max_steps: int,
+    ) -> tuple[Any | None, Tensor | None]:
+        """Build the static KV cache and attention bias for graph-backed AR decoding.
+
+        Returns ``(None, None)`` if CUDA-graph inference is off, training mode is on, or a helper is missing.
+        """
+        graph_disabled = not bool(getattr(self.config, "enable_inference_cuda_graph", False))
+        if graph_disabled or self.training or self.model.training:
+            return None, None
+        decode_step = getattr(self.model, "_run_ar_decode_step", None)
+        if decode_step is None:
+            decode_step = getattr(self.model, "_run_depth_decode_step", None)
+        bias_factory = getattr(self.model, "_make_depth_decode_attention_bias", None)
+        if not (callable(decode_step) and callable(bias_factory)):
+            return None, None
+        cache_factory = getattr(self.model, "_make_ar_decode_static_cache", None)
+        if callable(cache_factory):
+            static_cache = cache_factory(model_inputs, max_steps=max_steps)
+        else:
+            manager = getattr(self.model, "depth_decode_cuda_graph_manager", None)
+            manager_cache_factory = getattr(manager, "make_static_cache", None)
+            if not callable(manager_cache_factory):
+                return None, None
+            prompt_len = int(model_inputs["input_ids"].shape[1])
+            static_cache = manager_cache_factory(max_cache_len=prompt_len + max(1, int(max_steps)))
+        return static_cache, bias_factory(model_inputs, static_cache)
+
+    def _decode_discrete_action_chunk(self, generated_token_ids: Tensor, *, action_dim: int) -> Tensor:
+        """Decode generated token ids into a float32 action tensor, one 2-D chunk per row.
+
+        Accepts a 1-D, 2-D or 3-D id tensor (a 3-D input keeps only index 0 of dim 1). Per row, the
+        tokens between the action start/end markers are mapped to bins and passed to the tokenizer.
+
+        Raises:
+            RuntimeError: if the marker ids or the token-to-bin map are missing, a row has no
+                decodable action token, or a decoded chunk cannot be brought to two dimensions.
+            ValueError: if ``generated_token_ids`` has an unsupported rank.
+        """
+        marker_names = ("action_start_token_id", "action_end_token_id")
+        if any(getattr(self.model.config, name, None) is None for name in marker_names):
+            raise RuntimeError("Discrete action generation requires <action_start>/<action_end> token IDs.")
+        token_id_to_bin = self._action_token_id_to_bin()
+        if not token_id_to_bin:
+            raise RuntimeError(
+                "Discrete action generation requires indexed action tokens in the checkpoint config."
+            )
+        action_tokenizer = self._load_discrete_action_tokenizer()
+
+        if generated_token_ids.ndim == 1:
+            generated_token_ids = generated_token_ids.unsqueeze(0)
+        elif generated_token_ids.ndim == 3:
+            generated_token_ids = generated_token_ids[:, 0, :]
+        if generated_token_ids.ndim != 2:
+            raise ValueError(f"Unexpected generated token tensor shape {tuple(generated_token_ids.shape)}.")
+
+        chunks: list[Tensor] = []
+        for token_row in generated_token_ids:
+            row_ids = [int(token_id) for token_id in token_row.detach().cpu().tolist()]
+            start_id = int(self.model.config.action_start_token_id)
+            end_id = int(self.model.config.action_end_token_id)
+            bins = self._extract_discrete_token_bins(row_ids, start_id, end_id, token_id_to_bin)
+            if not bins:
+                raise RuntimeError(
+                    "Model generated no decodable action tokens between <action_start>/<action_end>."
+                )
+            try:
+                decoded = action_tokenizer.decode(
+                    [bins], time_horizon=self._generation_action_horizon(), action_dim=int(action_dim)
+                )
+            except TypeError:  # presumably a tokenizer whose decode() takes no shape keywords
+                decoded = action_tokenizer.decode([bins])
+            chunk = np.asarray(decoded, dtype=np.float32)
+            if chunk.ndim == 1:
+                chunk = chunk[None, :]
+            elif chunk.ndim == 3 and int(chunk.shape[0]) == 1:
+                chunk = chunk[0]
+            elif chunk.ndim >= 3:
+                chunk = chunk.reshape(chunk.shape[-2], chunk.shape[-1])
+            if chunk.ndim != 2:
+                raise RuntimeError(f"Decoded action chunk has unexpected shape {chunk.shape}.")
+            chunks.append(torch.as_tensor(chunk, device=token_row.device, dtype=torch.float32))
+        return torch.stack(chunks, dim=0)
+
+    def _generate_discrete_actions_from_inputs(
+        self,
+        *,
+        model_inputs: dict[str, Tensor],
+        action_dim: int,
+    ) -> Tensor:
+        """Prefill the model, greedily decode action tokens, and turn them into an action chunk.
+
+        When graph-backed decode inputs are available the prefill is given the static cache as
+        ``past_key_values`` and the continuation receives the matching attention bias. Decoding
+        stops at the EOS id from ``_require_discrete_eos_token_id`` and is capped by
+        ``_discrete_generation_max_steps``.
+        """
+        model_inputs = self._drop_trivial_attention_mask(model_inputs)
+        step_budget = self._discrete_generation_max_steps()
+        static_cache, attention_bias = self._make_discrete_ar_graph_decode_inputs(
+            model_inputs, max_steps=step_budget
+        )
+        extra: dict[str, Any] = {} if static_cache is None else {"past_key_values": static_cache}
+        prefill = self.model(
+            **model_inputs, use_cache=True, output_attentions=False, output_hidden_states=False, **extra
+        )
+        token_ids = self._continue_discrete_generation_from_output(
+            prefill,
+            past_key_values=prefill.past_key_values,
+            attention_mask=model_inputs.get("attention_mask"),
+            end_token_id=self._require_discrete_eos_token_id(),
+            max_steps=step_budget,
+            attention_bias=attention_bias,
+        )
+        return self._decode_discrete_action_chunk(token_ids, action_dim=action_dim)
+
+    def _generate_actions_from_inputs_with_rtc(
+        self,
+        *,
+        model_inputs: dict[str, Tensor],
+        action_dim_is_pad: Tensor | None,
+        num_steps: int | None,
+        generator: torch.Generator | None,
+        inference_delay: int | None,
+        prev_chunk_left_over: Tensor | None,
+        execution_horizon: int | None,
+    ) -> Tensor:
+        """Sample an action trajectory by Euler-integrating the action expert's velocity field.
+
+        The backbone runs once to produce key/value states (passed through its depth-gate
+        helpers), a Gaussian trajectory is drawn, and ``steps`` updates of size ``1 / steps`` are
+        applied at flow times ``0, 1/steps, ...``. While RTC is enabled each step goes through
+        ``rtc_processor.denoise_step``, which receives the negated velocity function and the
+        time ``1 - t``; its result is negated back before the update.
+
+        Args:
+            model_inputs: keyword inputs for the backbone.
+            action_dim_is_pad: mask handed to ``_mask_action_dim_tensor`` when
+                ``config.mask_action_dim_padding`` is set.
+            num_steps: number of integration steps; a falsy value selects
+                ``backbone.config.flow_matching_num_steps``.
+            generator: random generator for the initial noise.
+            inference_delay: forwarded to the RTC processor (None becomes 0).
+            prev_chunk_left_over: forwarded to the RTC processor.
+            execution_horizon: forwarded to the RTC processor.
+
+        Raises:
+            ValueError: if the resolved step count is not positive.
+            RuntimeError: if RTC is enabled but ``rtc_processor`` is None.
+        """
+        backbone, action_expert = self._backbone(), self._action_expert()
+        out = backbone(**model_inputs, use_cache=True, output_attentions=False, output_hidden_states=False)
+        kv_states = backbone._extract_kv_states(out.past_key_values)
+        input_ids = model_inputs.get("input_ids")
+        context_mask = self._encoder_attention_mask_for_action_expert(
+            input_ids=input_ids, attention_mask=model_inputs.get("attention_mask")
+        )
+        depth_gate, depth_mask = backbone._depth_gate_from_condition(
+            input_ids=input_ids, encoder_attention_mask=context_mask, layer_kv_states=kv_states
+        )
+        kv_states = backbone._apply_depth_gate_to_layer_kv_states(kv_states, depth_mask, depth_gate)
+
+        steps = int(num_steps or backbone.config.flow_matching_num_steps)
+        if steps <= 0:
+            raise ValueError(f"num_steps must be >= 1, got {steps}.")
+        reference = kv_states[0][0]
+        batch_size, device = int(reference.shape[0]), reference.device
+        mask_enabled = self.config.mask_action_dim_padding
+        noise_shape = (batch_size, self._generation_action_horizon(), int(backbone.config.max_action_dim))
+        trajectory = torch.randn(*noise_shape, device=device, dtype=torch.float32, generator=generator)
+        if mask_enabled:
+            trajectory = self._mask_action_dim_tensor(trajectory, action_dim_is_pad)
+
+        action_context = action_expert.prepare_context(
+            encoder_kv_states=kv_states,
+            encoder_attention_mask=context_mask,
+            state_embeddings=None,
+            batch_size=batch_size,
+            seq_len=trajectory.shape[1],
+            device=device,
+            dtype=trajectory.dtype,
+        )
+        flow_timesteps = [
+            torch.full((batch_size,), idx / steps, device=device, dtype=trajectory.dtype)
+            for idx in range(steps)
+        ]
+        modulation_cache = action_expert.get_or_prepare_modulation_cache(
+            flow_timesteps, cache_key=(steps, batch_size, device, trajectory.dtype)
+        )
+
+        dt = 1.0 / steps
+        for idx, flow_timestep in enumerate(flow_timesteps):
+            modulation = modulation_cache[idx]
+
+            def denoise_step(input_trajectory: Tensor, step_modulation=modulation) -> Tensor:
+                step_velocity = action_expert.forward_with_context(
+                    input_trajectory,
+                    step_modulation.conditioning,
+                    context=action_context,
+                    modulation=step_modulation,
+                )
+                if not mask_enabled:
+                    return step_velocity
+                return self._mask_action_dim_tensor(step_velocity, action_dim_is_pad)
+
+            if not self._rtc_enabled():
+                velocity = denoise_step(trajectory)
+            elif self.rtc_processor is None:
+                raise RuntimeError("RTC is enabled but rtc_processor is not initialized.")
+            else:
+                # RTC receives the negated velocity function and time 1 - t; its output is negated back.
+                flow_time = float(flow_timestep[0].item())
+                guided = self.rtc_processor.denoise_step(
+                    x_t=trajectory,
+                    prev_chunk_left_over=prev_chunk_left_over,
+                    inference_delay=int(inference_delay or 0),
+                    time=1.0 - flow_time,
+                    original_denoise_step_partial=lambda input_trajectory: -denoise_step(input_trajectory),
+                    execution_horizon=execution_horizon,
+                )
+                velocity = -guided
+
+            trajectory = trajectory + dt * velocity
+            if mask_enabled:
+                trajectory = self._mask_action_dim_tensor(trajectory, action_dim_is_pad)
+            if self.rtc_processor is not None and self.rtc_processor.is_debug_enabled():
+                self.rtc_processor.track(time=float(flow_timestep[0].item()), x_t=trajectory, v_t=velocity)
+
+        return trajectory
+
+    def forward(
+        self,
+        batch: dict[str, Tensor],
+        reduction: str = "mean",
+    ) -> tuple[Tensor, dict[str, Any]]:
+        """Compute the training loss for the configured ``action_mode``.
+
+        ``"discrete"`` trains the token cross-entropy (plus optional z-loss) on backbone outputs,
+        ``"continuous"`` trains the flow-matching loss, and any other mode sums both, computing
+        the discrete loss from the hidden states returned by the flow-matching pass.
+
+        Args:
+            batch: training batch handed to the loss helpers.
+            reduction: ``"mean"`` or ``"none"``, forwarded to the loss helpers.
+
+        Returns:
+            ``(loss, metrics)``; ``metrics`` maps each loss component and ``"loss"`` to the
+            Python float of its mean.
+
+        Raises:
+            ValueError: for an unsupported ``reduction``.
+        """
+        if reduction not in {"mean", "none"}:
+            raise ValueError(f"Unsupported reduction={reduction!r}. Expected 'mean' or 'none'.")
+        model_inputs = self._model_inputs(batch)
+        mode = self.config.action_mode
+        losses: list[Tensor] = []
+        metrics: dict[str, Any] = {}
+
+        # Only "discrete" runs the bare backbone; every other mode goes through the flow-matching pass.
+        flow_loss = None
+        if mode == "discrete":
+            outputs = self._backbone()(
+                **model_inputs,
+                use_cache=False,
+                output_attentions=False,
+                output_hidden_states=False,
+            )
+        else:
+            flow_loss, hidden_states = self._compute_flow_matching_loss_joint_per_layer(
+                batch=batch,
+                model_inputs=model_inputs,
+                reduction=reduction,
+            )
+            outputs = types.SimpleNamespace(last_hidden_state=hidden_states)
+
+        # Discrete loss first, then flow loss: this fixes the order of ``losses`` and of the metric keys.
+        if mode != "continuous":
+            ce_loss, z_loss = self._discrete_loss_from_backbone_outputs(batch, outputs, reduction=reduction)
+            losses.append(ce_loss if z_loss is None else ce_loss + z_loss)
+            metrics["discrete_ce_loss"] = ce_loss.detach().float().mean().item()
+            if z_loss is not None:
+                metrics["discrete_z_loss"] = z_loss.detach().float().mean().item()
+        if mode != "discrete":
+            losses.append(flow_loss)
+            metrics["action_flow_loss"] = flow_loss.detach().float().mean().item()
+
+        # torch.stack needs equally shaped components; summing over dim 0 adds them elementwise.
+        loss = torch.stack(losses).sum(dim=0)
+        metrics["loss"] = loss.detach().float().mean().item()
+        return loss, metrics
+
+    @torch.no_grad()
+    def predict_action_chunk(self, batch: dict[str, Tensor], **kwargs) -> Tensor:
+        """Predict an action chunk as float32, cropped to ``n_action_steps`` and the output action dim.
+
+        Reads ``inference_action_mode``, ``num_steps``, ``generator`` and the RTC options from ``kwargs``.
+        Raises TypeError for an ``action_mode`` keyword and ValueError for discrete inference with RTC.
+        """
+        if "action_mode" in kwargs:
+            raise TypeError(
+                "MolmoAct2 predict_action_chunk got unexpected keyword argument 'action_mode'; "
+                "use 'inference_action_mode'."
+            )
+        model_inputs = self._model_inputs(batch)
+        mode = self._resolve_inference_action_mode(kwargs.get("inference_action_mode"))
+        num_steps = kwargs.get("num_steps", getattr(self.config, "num_inference_steps", None))
+        generator = kwargs.get("generator")
+        dtype = _torch_dtype(self.config.model_dtype)
+        device = next(self.parameters()).device
+        batch_size = int(next(iter(model_inputs.values())).shape[0])
+        if generator is None:
+            generator = self._rollout_generator_for_inputs(batch, batch_size=batch_size, device=device)
+        action_dim = self._output_action_dim(batch)
+        if device.type in {"cuda", "cpu"} and dtype in {torch.bfloat16, torch.float16}:
+            autocast_context = torch.autocast(device_type=device.type, dtype=dtype)
+        else:
+            autocast_context = nullcontext()
+        with autocast_context:
+            rtc_enabled = self._rtc_enabled()
+            if mode == "discrete":
+                if rtc_enabled:
+                    raise ValueError("RTC is only supported for continuous MolmoAct2 inference.")
+                actions = self._generate_discrete_actions_from_inputs(
+                    model_inputs=model_inputs, action_dim=action_dim
+                )
+            elif rtc_enabled:
+                actions = self._generate_actions_from_inputs_with_rtc(
+                    model_inputs=model_inputs,
+                    action_dim_is_pad=batch.get("action_dim_is_pad"),
+                    num_steps=num_steps,
+                    generator=generator,
+                    inference_delay=kwargs.get("inference_delay"),
+                    prev_chunk_left_over=kwargs.get("prev_chunk_left_over"),
+                    execution_horizon=kwargs.get("execution_horizon"),
+                )
+            else:
+                actions = self._backbone().generate_actions_from_inputs(
+                    **model_inputs,
+                    action_dim_is_pad=batch.get("action_dim_is_pad"),
+                    action_horizon=self._generation_action_horizon(),
+                    num_steps=num_steps,
+                    generator=generator,
+                )
+        return actions[:, : self.config.n_action_steps, :action_dim].to(dtype=torch.float32)
+
+    @torch.no_grad()
+    def select_action(self, batch: dict[str, Tensor], **kwargs) -> Tensor:
+        if self._rtc_enabled():  # RTC callers are pointed to predict_action_chunk instead
+            raise AssertionError("RTC is not supported for select_action, use it with predict_action_chunk")
+        self.eval()
+        if not self._action_queue:  # queue drained: predict a new chunk and enqueue one entry per step
+            chunk = self.predict_action_chunk(batch, **kwargs)[:, : self.config.n_action_steps]
+            self._action_queue.extend(chunk.transpose(0, 1))
+        return self._action_queue.popleft()
+
+    def _get_default_peft_targets(self) -> dict[str, Any]:
+        """Return the LoRA settings with the target regex rooted at the ``model.model`` prefix."""
+        return {
+            "target_modules": self._lora_target_modules(prefix=r"model\.model"),
+            "modules_to_save": [],
+            "r": self.config.lora_rank,
+            "lora_alpha": self.config.lora_alpha,
+            "lora_dropout": self.config.lora_dropout,
+            "bias": self.config.lora_bias,
+        }
+
+    def _get_inner_peft_targets(self) -> dict[str, Any]:
+        """Return the LoRA settings with the target regex rooted at the bare ``model`` prefix."""
+        return {
+            "target_modules": self._lora_target_modules(prefix="model"),
+            "modules_to_save": [],
+            "r": self.config.lora_rank,
+            "lora_alpha": self.config.lora_alpha,
+            "lora_dropout": self.config.lora_dropout,
+            "bias": self.config.lora_bias,
+        }
+
+    def _lora_target_modules(self, *, prefix: str) -> str:
+        """Return the LoRA target-module regex for ``prefix``; action-expert paths are added if enabled."""
+        vlm_linear_leaves = "w1|w2|w3|wq|wk|wv|wo|att_proj|attn_out|ff_proj|ff_out|patch_embedding"
+        vlm_pattern = rf"{prefix}\.(transformer|vision_backbone)\.(?:.*\.)?({vlm_linear_leaves})$"
+        if not self.config.enable_lora_action_expert:
+            return vlm_pattern
+        expert_paths = "|".join(
+            [
+                r"time_embed\.(1|3)",
+                r"action_embed|context_k_proj|context_v_proj",
+                r"blocks\.\d+\.self_attn\.(qkv|out_proj)",
+                r"blocks\.\d+\.cross_attn\.(q_proj|out_proj)",
+                r"blocks\.\d+\.mlp\.(up_proj|gate_proj|down_proj)",
+                r"blocks\.\d+\.modulation\.linear",
+                r"final_layer\.(modulation\.linear|linear)",
+            ]
+        )
+        return rf"({vlm_pattern}|{prefix}\.action_expert\.({expert_paths})$)"
+
+    def _build_inner_lora_config(self):
+        """Return a ``peft.LoraConfig`` built from the ``_get_inner_peft_targets`` settings."""
+        require_package("peft", extra="molmoact2")
+        import peft
+        return peft.LoraConfig(**self._get_inner_peft_targets())
+
+    def _apply_lora_adapters(self) -> None:
+        """Freeze the model, wrap it with LoRA, and unfreeze the action expert unless it is LoRA-adapted."""
+        require_package("peft", extra="molmoact2")
+        import peft
+
+        lora_config = self._build_inner_lora_config()
+        self._validate_peft_config(lora_config)
+        for parameter in self.model.parameters():
+            parameter.requires_grad_(False)
+        self.model = peft.get_peft_model(self.model, lora_config)
+        if not self.config.enable_lora_action_expert:
+            self._unfreeze_action_expert_parameters()
+        self.train(self.training)  # re-apply the current train/eval flag after swapping the model
+
+    def _validate_peft_config(self, peft_config) -> None:
+        del peft_config  # deliberately unused; only the policy config is checked below
+        if not self.config.checkpoint_path:  # LoRA fine-tuning needs a checkpoint path to be configured
+            raise ValueError("MolmoAct2 LoRA fine-tuning requires `policy.checkpoint_path`.")
diff --git a/src/lerobot/policies/molmoact2/processor_molmoact2.py b/src/lerobot/policies/molmoact2/processor_molmoact2.py
new file mode 100644
index 000000000..6c7a3ed5c
--- /dev/null
+++ b/src/lerobot/policies/molmoact2/processor_molmoact2.py
@@ -0,0 +1,1083 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The Allen Institute for Artificial Intelligence and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import json
+import os
+import re
+from contextlib import suppress
+from copy import deepcopy
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+
+import numpy as np
+import torch
+from huggingface_hub import snapshot_download
+from torch import Tensor
+
+from lerobot.configs import FeatureType, PipelineFeatureType, PolicyFeature
+from lerobot.processor import (
+    AddBatchDimensionProcessorStep,
+    DeviceProcessorStep,
+    NormalizerProcessorStep,
+    PolicyAction,
+    PolicyProcessorPipeline,
+    ProcessorStep,
+    ProcessorStepRegistry,
+    RenameObservationsProcessorStep,
+    UnnormalizerProcessorStep,
+    policy_action_to_transition,
+    transition_to_policy_action,
+)
+from lerobot.types import EnvTransition, TransitionKey
+from lerobot.utils.constants import (
+    ACTION,
+    OBS_IMAGES,
+    OBS_STATE,
+    POLICY_POSTPROCESSOR_DEFAULT_NAME,
+    POLICY_PREPROCESSOR_DEFAULT_NAME,
+)
+from lerobot.utils.import_utils import _scipy_available, _transformers_available, require_package
+
+from .configuration_molmoact2 import MolmoAct2Config, infer_molmoact2_max_sequence_length
+
+if TYPE_CHECKING or _transformers_available:
+    from transformers import Qwen2Tokenizer
+
+    from .hf_model.image_processing_molmoact2 import MolmoAct2ImageProcessor
+    from .hf_model.processing_molmoact2 import MolmoAct2Processor
+    from .hf_model.video_processing_molmoact2 import MolmoAct2VideoProcessor
+else:
+    Qwen2Tokenizer = None
+    MolmoAct2ImageProcessor = None
+    MolmoAct2Processor = None
+    MolmoAct2VideoProcessor = None
+
+if TYPE_CHECKING or (_transformers_available and _scipy_available):
+    from .hf_model.action_tokenizer import UniversalActionProcessor
+else:
+    UniversalActionProcessor = None
+
+# Special marker strings spliced into the prompt / target text by the helpers in this module.
+# `# nosec B105` silences Bandit's hard-coded-password check, which misfires on "token" names.
+#
+# ACTION_OUTPUT_TOKEN ends the prompt built by `_build_robot_text`, right after the assistant header.
+ACTION_OUTPUT_TOKEN = "<action_output>"  # nosec B105
+# Discrete actions are rendered as ACTION_START_TOKEN + "<action_N>"... + ACTION_END_TOKEN
+# (see `_build_discrete_action_string`).
+ACTION_START_TOKEN = "<action_start>"  # nosec B105
+ACTION_END_TOKEN = "<action_end>"  # nosec B105
+ACTION_TOKEN_PREFIX = "<action_"  # nosec B105
+# Discretized states are rendered as STATE_START_TOKEN + "<state_N>"... + STATE_END_TOKEN
+# (see `_build_discrete_state_string`).
+STATE_START_TOKEN = "<state_start>"  # nosec B105
+STATE_END_TOKEN = "<state_end>"  # nosec B105
+STATE_TOKEN_PREFIX = "<state_"  # nosec B105
+# Optional wrappers around the setup description (see `_wrap_setup_text`).
+SETUP_START_TOKEN = "<setup_start>"  # nosec B105
+SETUP_END_TOKEN = "<setup_end>"  # nosec B105
+# Optional wrappers around the control mode (see `_wrap_control_text`).
+CONTROL_START_TOKEN = "<control_start>"  # nosec B105
+CONTROL_END_TOKEN = "<control_end>"  # nosec B105
+
+# Character sets and prefix patterns consumed by `_normalize_question_text`.
+# Trailing punctuation stripped from the end of the text (\u2026 is the ellipsis character).
+# The comma is listed twice, which is harmless because the string is only used as an `rstrip` set.
+_QUESTION_TRAILING_SENTENCE_PUNCTUATION = ".,!?;:,\u2026"
+# Closing quotes / brackets stripped from the end of the text.
+_QUESTION_TRAILING_CLOSERS = "\"'\u201d\u2019)]}"
+# Quotes, backticks and brackets stripped from both ends of the text.
+_QUESTION_SURROUNDING_DELIMITERS = "\"'`\u201c\u201d\u2018\u2019[](){}"
+# Case-insensitive lead-ins removed from the start of the text, e.g. "Task: " or "The task is to ".
+_QUESTION_PREFIX_PATTERNS = tuple(
+    re.compile(pattern, flags=re.IGNORECASE)
+    for pattern in (
+        r"^(?:task|instruction|language[_ ]instruction|goal)\s*[:\-]\s*",
+        r"^(?:the\s+task\s+is\s+to|your\s+task\s+is\s+to)\s+",
+    )
+)
+
+
+def _hf_token() -> str | None:
+    """Return the Hugging Face token from the environment.
+
+    ``HF_TOKEN`` wins when it is set to a non-empty value; otherwise the value of
+    ``HF_ACCESS_TOKEN`` is returned as-is (``None`` when that variable is unset).
+    """
+    primary = os.environ.get("HF_TOKEN")
+    if primary:
+        return primary
+    return os.environ.get("HF_ACCESS_TOKEN")
+
+
+def _resolve_checkpoint_location(
+    checkpoint_path: str,
+    *,
+    revision: str | None = None,
+    force_download: bool = False,
+) -> str:
+    """Resolve ``checkpoint_path`` to a local filesystem location.
+
+    A path that exists locally (after ``~`` expansion) is returned directly. Anything else is
+    treated as a Hub model repo id and fetched with ``snapshot_download``; Python sources and
+    bytecode (``*.py``, ``*.pyc``, ``__pycache__/*``) are excluded from the download.
+
+    Raises:
+        ValueError: If ``checkpoint_path`` is empty or only whitespace.
+    """
+    location = str(checkpoint_path or "").strip()
+    if location == "":
+        raise ValueError("MolmoAct2 policy requires `checkpoint_path`.")
+
+    candidate = Path(location).expanduser()
+    if not candidate.exists():
+        # Not on disk: treat the string as a Hub repo id.
+        return snapshot_download(
+            repo_id=location,
+            repo_type="model",
+            revision=revision,
+            force_download=force_download,
+            ignore_patterns=["*.py", "*.pyc", "__pycache__/*"],
+            token=_hf_token(),
+        )
+    return str(candidate)
+
+
+def _load_hf_norm_stats_for_tag(
+    checkpoint_path: str,
+    *,
+    revision: str | None,
+    force_download: bool,
+    norm_tag: str | None,
+) -> tuple[dict[str, dict[str, Any]], dict[str, Any]]:
+    """Load the normalization statistics stored under ``norm_tag`` in a checkpoint.
+
+    The stats file is ``norm_stats.json`` unless the checkpoint's ``config.json`` names another
+    file under ``norm_stats_filename``. Both JSON files are read as UTF-8 so the result does not
+    depend on the platform's locale encoding.
+
+    Args:
+        checkpoint_path: Local path or Hub repo id, resolved by ``_resolve_checkpoint_location``.
+        revision: Forwarded to ``_resolve_checkpoint_location``.
+        force_download: Forwarded to ``_resolve_checkpoint_location``.
+        norm_tag: Key to look up in the stats file's ``metadata_by_tag`` mapping.
+
+    Returns:
+        ``(stats, metadata)``. ``stats`` maps ``ACTION`` and ``OBS_STATE`` to deep copies of the
+        tag's ``action_stats`` / ``state_stats`` without the ``names`` entry and without any
+        list/tuple that contains strings. ``metadata`` is the raw entry stored for ``norm_tag``.
+
+    Raises:
+        ValueError: If ``norm_tag`` is empty or unknown, or the stats file is malformed.
+        FileNotFoundError: If the stats file does not exist in the checkpoint.
+    """
+    norm_tag = str(norm_tag or "").strip()
+    if not norm_tag:
+        raise ValueError("MolmoAct2 HF checkpoint inference requires `policy.norm_tag` for normalization.")
+
+    checkpoint_location = Path(
+        _resolve_checkpoint_location(
+            checkpoint_path,
+            revision=revision,
+            force_download=force_download,
+        )
+    )
+    config_path = checkpoint_location / "config.json"
+    norm_stats_filename = "norm_stats.json"
+    if config_path.exists():
+        # Best effort: an unreadable, undecodable or malformed config.json keeps the default name.
+        with suppress(OSError, json.JSONDecodeError, UnicodeDecodeError):
+            config_payload = json.loads(config_path.read_text(encoding="utf-8"))
+            # A config that is valid JSON but not an object has no filename to offer.
+            if isinstance(config_payload, dict):
+                norm_stats_filename = str(config_payload.get("norm_stats_filename") or norm_stats_filename)
+
+    stats_path = checkpoint_location / norm_stats_filename
+    if not stats_path.exists():
+        raise FileNotFoundError(
+            f"MolmoAct2 HF checkpoint is missing {norm_stats_filename!r}; cannot resolve norm_tag={norm_tag!r}."
+        )
+    payload = json.loads(stats_path.read_text(encoding="utf-8"))
+    metadata_by_tag = payload.get("metadata_by_tag")
+    if not isinstance(metadata_by_tag, dict):
+        raise ValueError(f"MolmoAct2 norm stats file {stats_path} has no metadata_by_tag mapping.")
+    metadata = metadata_by_tag.get(norm_tag)
+    if metadata is None:
+        available = sorted(str(tag) for tag in metadata_by_tag)
+        raise ValueError(f"Unknown MolmoAct2 norm_tag={norm_tag!r}. Available tags: {available}.")
+    if not isinstance(metadata, dict):
+        raise ValueError(f"MolmoAct2 norm_tag={norm_tag!r} metadata must be a mapping.")
+
+    def numeric_stats(raw_stats: dict[str, Any]) -> dict[str, Any]:
+        """Copy ``raw_stats`` without ``names`` and without sequences that contain strings."""
+        stats: dict[str, Any] = {}
+        for key, value in raw_stats.items():
+            if key == "names":
+                continue
+            if isinstance(value, (list, tuple)) and any(isinstance(item, str) for item in value):
+                continue
+            stats[key] = deepcopy(value)
+        return stats
+
+    action_stats = metadata.get("action_stats")
+    state_stats = metadata.get("state_stats")
+    if not isinstance(action_stats, dict) or not isinstance(state_stats, dict):
+        raise ValueError(f"MolmoAct2 norm_tag={norm_tag!r} must define action_stats and state_stats.")
+    return {ACTION: numeric_stats(action_stats), OBS_STATE: numeric_stats(state_stats)}, metadata
+
+
+def _strip_processor_config(config: dict[str, Any], *metadata_keys: str) -> dict[str, Any]:
+    """Return a copy of ``config`` without loader metadata.
+
+    ``auto_map``, ``processor_class`` and every name in ``metadata_keys`` are dropped; all other
+    entries are kept in their original order. ``config`` itself is not modified.
+    """
+    dropped = {"auto_map", "processor_class"}
+    dropped.update(metadata_keys)
+    kept: dict[str, Any] = {}
+    for name, setting in config.items():
+        if name in dropped:
+            continue
+        kept[name] = setting
+    return kept
+
+
+def _load_local_molmoact2_processor(checkpoint_location: str) -> Any:
+    """Build a ``MolmoAct2Processor`` from the files of a local checkpoint directory.
+
+    ``processor_config.json`` supplies the image / video processor settings (with loader metadata
+    removed by ``_strip_processor_config``) and the processor flags. The tokenizer is loaded with
+    ``Qwen2Tokenizer.from_pretrained`` and ``chat_template.jinja`` is used when present. Text files
+    are read as UTF-8 so the chat template and config do not depend on the locale encoding.
+
+    Raises:
+        RuntimeError: If the ``transformers``-backed classes could not be imported.
+        FileNotFoundError: If ``processor_config.json`` is missing.
+    """
+    if (
+        Qwen2Tokenizer is None
+        or MolmoAct2ImageProcessor is None
+        or MolmoAct2Processor is None
+        or MolmoAct2VideoProcessor is None
+    ):
+        raise RuntimeError("transformers is required to load MolmoAct2 processor.")
+
+    checkpoint_path = Path(checkpoint_location)
+    processor_config_path = checkpoint_path / "processor_config.json"
+    if not processor_config_path.exists():
+        raise FileNotFoundError(f"MolmoAct2 checkpoint is missing {processor_config_path}.")
+    processor_config = json.loads(processor_config_path.read_text(encoding="utf-8"))
+
+    # Sub-processor sections may be absent or null; both fall back to an empty config.
+    image_config = _strip_processor_config(
+        dict(processor_config.get("image_processor") or {}),
+        "image_processor_type",
+    )
+    video_config = _strip_processor_config(
+        dict(processor_config.get("video_processor") or {}),
+        "video_processor_type",
+    )
+    image_processor = MolmoAct2ImageProcessor(**image_config)
+    video_processor = MolmoAct2VideoProcessor(**video_config)
+    tokenizer = Qwen2Tokenizer.from_pretrained(
+        checkpoint_location,
+        token=_hf_token(),
+    )
+
+    chat_template_path = checkpoint_path / "chat_template.jinja"
+    chat_template = chat_template_path.read_text(encoding="utf-8") if chat_template_path.exists() else None
+    return MolmoAct2Processor(
+        image_processor=image_processor,
+        video_processor=video_processor,
+        tokenizer=tokenizer,
+        chat_template=chat_template,
+        image_use_col_tokens=processor_config.get("image_use_col_tokens", True),
+        use_single_crop_col_tokens=processor_config.get("use_single_crop_col_tokens"),
+        use_single_crop_start_token=processor_config.get("use_single_crop_start_token", True),
+        video_use_col_tokens=processor_config.get("video_use_col_tokens", False),
+        use_frame_special_tokens=processor_config.get("use_frame_special_tokens", True),
+    )
+
+
+def _to_numpy(value: Any) -> np.ndarray:
+    """Convert ``value`` to a NumPy array.
+
+    NumPy arrays are returned unchanged, tensors are detached and moved to the CPU, and anything
+    else goes through ``np.asarray``. bfloat16 tensors are upcast to float32 first because NumPy
+    has no bfloat16 dtype and ``Tensor.numpy()`` raises ``TypeError`` for them.
+    """
+    if isinstance(value, np.ndarray):
+        return value
+    if torch.is_tensor(value):
+        tensor = value.detach().cpu()
+        if tensor.dtype == torch.bfloat16:
+            tensor = tensor.to(torch.float32)
+        return tensor.numpy()
+    return np.asarray(value)
+
+
+def _normalize_image(value: Any) -> np.ndarray:
+    """Convert one image to an ``(H, W, 3)`` ``uint8`` array.
+
+    Steps, in order:
+      * leading singleton dimensions are dropped until at most three dimensions remain;
+      * a 2-D image is stacked into three identical channels;
+      * a channels-first array (first dim in {1, 3, 4}, last dim not) is moved to channels-last;
+      * a single channel is repeated three times and a fourth channel is dropped;
+      * float images whose maximum is <= 1.0 are scaled by 255; every non-uint8 input is clipped
+        to [0, 255] and cast to ``uint8``.
+
+    NaN pixels in float images are replaced by 0 before scaling, because casting NaN to ``uint8``
+    has no defined result.
+
+    Raises:
+        ValueError: If the array cannot be interpreted as a 3- or 4-channel image.
+    """
+    arr = _to_numpy(value)
+    while arr.ndim > 3 and int(arr.shape[0]) == 1:
+        arr = arr[0]
+    if arr.ndim == 2:
+        arr = np.stack([arr] * 3, axis=-1)
+    if arr.ndim == 3 and arr.shape[0] in {1, 3, 4} and arr.shape[-1] not in {1, 3, 4}:
+        arr = np.moveaxis(arr, 0, -1)
+    if arr.ndim == 3 and arr.shape[-1] == 1:
+        arr = np.repeat(arr, 3, axis=-1)
+    if arr.ndim != 3 or arr.shape[-1] not in {3, 4}:
+        raise ValueError(f"Unsupported image shape for MolmoAct2: {arr.shape}.")
+    if arr.shape[-1] == 4:
+        arr = arr[..., :3]
+    if arr.dtype in (np.float16, np.float32, np.float64):
+        # Zero out NaNs only; infinities are left for the clip below to saturate.
+        arr = np.where(np.isnan(arr), arr.dtype.type(0), arr)
+        if arr.size > 0 and float(arr.max()) <= 1.0:
+            arr = arr * 255.0
+        arr = np.clip(arr, 0, 255).astype(np.uint8)
+    elif arr.dtype != np.uint8:
+        arr = np.clip(arr, 0, 255).astype(np.uint8)
+    return arr
+
+
+def _normalize_question_text(text: str) -> str:
+    """Normalize a task / instruction string into a lowercase phrase.
+
+    Whitespace runs are collapsed to single spaces. Surrounding quotes and brackets, lead-ins
+    matched by ``_QUESTION_PREFIX_PATTERNS`` and trailing punctuation / closers are then removed
+    repeatedly until the text stops changing. If sentence punctuation (``.``, ``!``, ``?``) still
+    splits the text into several non-empty parts, they are joined with ``"; "``.
+    Falsy input yields ``""``.
+    """
+    question = re.sub(r"\s+", " ", str(text or "")).strip()
+    if question == "":
+        return ""
+
+    def _strip_once(current: str) -> str:
+        """Apply one round of delimiter, prefix and trailing-character stripping."""
+        current = current.strip().strip(_QUESTION_SURROUNDING_DELIMITERS).strip()
+        for prefix_pattern in _QUESTION_PREFIX_PATTERNS:
+            current = prefix_pattern.sub("", current, count=1).strip()
+        # Punctuation is stripped again after the closers so that `."` and `".` both disappear.
+        for trailing_chars in (
+            _QUESTION_TRAILING_SENTENCE_PUNCTUATION,
+            _QUESTION_TRAILING_CLOSERS,
+            _QUESTION_TRAILING_SENTENCE_PUNCTUATION,
+        ):
+            current = current.rstrip(trailing_chars).rstrip()
+        return current
+
+    # Iterate to a fixed point (or until nothing is left).
+    last_seen = None
+    while question and question != last_seen:
+        last_seen = question
+        question = _strip_once(question)
+
+    sentences = []
+    for piece in re.split(r"[.!?]+", question):
+        piece = piece.strip()
+        if piece:
+            sentences.append(piece)
+    if len(sentences) > 1:
+        question = "; ".join(sentences)
+    return question.lower()
+
+
+def _wrap_setup_text(setup_type: str, add_setup_tokens: bool) -> str:
+    """Return the setup description, wrapped in the setup start/end tokens when requested.
+
+    The text is returned as-is when it is empty, already wrapped, or when ``add_setup_tokens``
+    is false. Falsy input becomes ``""``.
+    """
+    text = str(setup_type or "")
+    already_wrapped = text.startswith(SETUP_START_TOKEN) and text.endswith(SETUP_END_TOKEN)
+    if already_wrapped or not text or not add_setup_tokens:
+        return text
+    return SETUP_START_TOKEN + text + SETUP_END_TOKEN
+
+
+def _wrap_control_text(control_mode: str, add_control_tokens: bool) -> str:
+    """Return the control mode, wrapped in the control start/end tokens when requested.
+
+    The text is returned as-is when it is empty, already wrapped, or when ``add_control_tokens``
+    is false. Falsy input becomes ``""``.
+    """
+    text = str(control_mode or "")
+    already_wrapped = text.startswith(CONTROL_START_TOKEN) and text.endswith(CONTROL_END_TOKEN)
+    if already_wrapped or not text or not add_control_tokens:
+        return text
+    return CONTROL_START_TOKEN + text + CONTROL_END_TOKEN
+
+
+def _build_discrete_state_string(state: np.ndarray, num_state_tokens: int) -> str:
+    """Render a state vector as a string of discrete state tokens.
+
+    Values are sanitized (NaN -> 0, +inf -> 1, -inf -> -1), clipped to [-1, 1] and mapped
+    linearly onto the integer bins ``0 .. num_state_tokens - 1`` with ``np.rint``. The flattened
+    bins are emitted as ``<state_N>`` pieces between the state start and end tokens.
+
+    Raises:
+        ValueError: If ``num_state_tokens`` is not positive.
+    """
+    if num_state_tokens <= 0:
+        raise ValueError(f"num_state_tokens must be > 0, got {num_state_tokens}.")
+
+    values = np.nan_to_num(np.asarray(state, dtype=np.float32), nan=0.0, posinf=1.0, neginf=-1.0)
+    values = np.clip(values, -1.0, 1.0)
+    # Map [-1, 1] onto [0, num_state_tokens - 1].
+    positions = (values + 1.0) / 2.0 * float(num_state_tokens - 1)
+    bins = np.rint(positions).astype(np.int64)
+    bins = np.clip(bins, 0, int(num_state_tokens) - 1).reshape(-1)
+    body = "".join(f"{STATE_TOKEN_PREFIX}{int(bin_index)}>" for bin_index in bins)
+    return f"{STATE_START_TOKEN}{body}{STATE_END_TOKEN}"
+
+
+def _build_robot_text(
+    *,
+    task: str,
+    discrete_state_string: str,
+    setup_type: str,
+    control_mode: str,
+    add_setup_tokens: bool,
+    add_control_tokens: bool,
+    num_images: int,
+) -> str:
+    """Assemble the full chat-formatted prompt for one example.
+
+    The user turn states the task, the setup, the robot state (only when
+    ``discrete_state_string`` is non-empty) and the expected control mode. Image placeholders come
+    before the user turn: none for ``num_images <= 0``, a bare ``<|image|>`` for one image and
+    ``Image k<|image|>`` (1-based) for several. The prompt ends with the assistant header followed
+    by ``ACTION_OUTPUT_TOKEN``.
+    """
+    if num_images > 1:
+        image_prefix = "".join(f"Image {position}<|image|>" for position in range(1, num_images + 1))
+    elif num_images == 1:
+        image_prefix = "<|image|>"
+    else:
+        image_prefix = ""
+
+    setup_text = _wrap_setup_text(setup_type, add_setup_tokens=add_setup_tokens)
+    control_text = _wrap_control_text(control_mode, add_control_tokens=add_control_tokens)
+    if discrete_state_string:
+        state_clause = f" The current state of the robot is {discrete_state_string}."
+    else:
+        state_clause = ""
+
+    prompt = (
+        f"The task is to {task}. The setup is {setup_text}.{state_clause} "
+        f"The expected control mode is {control_text}. Given these, what action should the robot take to complete the task?"
+    )
+    return f"{image_prefix}<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n{ACTION_OUTPUT_TOKEN}"
+
+
+def _as_text_list(value: Any, batch_size: int) -> list[str]:
+    """Coerce ``value`` into a list of exactly ``batch_size`` strings.
+
+    ``None`` becomes empty strings; a single string, scalar, 0-d array/tensor or one-element
+    sequence is repeated ``batch_size`` times; arrays and tensors are flattened; every item is
+    converted with ``str``.
+
+    Raises:
+        ValueError: If the number of items is neither 1 nor ``batch_size``.
+    """
+    if value is None:
+        return [""] * batch_size
+    if isinstance(value, str):
+        return [value] * batch_size
+
+    value_is_tensor = torch.is_tensor(value)
+    if value_is_tensor or isinstance(value, np.ndarray):
+        if value.ndim == 0:
+            return [str(value.item())] * batch_size
+        flat_source = value.detach().cpu() if value_is_tensor else value
+        entries = list(map(str, flat_source.reshape(-1).tolist()))
+    elif isinstance(value, (list, tuple)):
+        entries = list(map(str, value))
+    else:
+        entries = [str(value)]
+
+    if len(entries) not in (batch_size, 1):
+        raise ValueError(f"Expected {batch_size} task strings, got {len(entries)}.")
+    return entries if len(entries) == batch_size else entries * batch_size
+
+
+def _tokenize_discrete_action(action: np.ndarray, processor: Any) -> list[int]:
+    """Tokenize one action (chunk) with ``processor`` and return the token ids as ints.
+
+    The action is converted to float32 and given leading singleton dimensions so that 1-D and 2-D
+    inputs reach the processor as a 3-D array. The processor output may be a list, a NumPy array,
+    a tensor, or any mapping holding one of those (``input_ids`` is preferred, otherwise the first
+    value is used). When the result is nested, only its first row is returned.
+
+    Raises:
+        ValueError: If the processor returns an empty mapping.
+        TypeError: If the processor output cannot be reduced to a list.
+    """
+    # Local import: keeps the module-level import block untouched.
+    from collections.abc import Mapping
+
+    arr = np.asarray(action, dtype=np.float32)
+    if arr.ndim == 2:
+        arr = arr[None, :, :]
+    elif arr.ndim == 1:
+        arr = arr[None, None, :]
+    tokens_out = processor(arr)
+    # Accept any mapping (plain dicts as well as UserDict-style containers).
+    if isinstance(tokens_out, Mapping):
+        if "input_ids" in tokens_out:
+            tokens_out = tokens_out["input_ids"]
+        elif len(tokens_out) > 0:
+            tokens_out = next(iter(tokens_out.values()))
+        else:
+            raise ValueError("Discrete action tokenizer returned an empty mapping.")
+    if isinstance(tokens_out, np.ndarray):
+        tokens_out = tokens_out.tolist()
+    if torch.is_tensor(tokens_out):
+        tokens_out = tokens_out.detach().cpu().tolist()
+    if not isinstance(tokens_out, list):
+        raise TypeError(f"Unexpected discrete action tokenizer output type: {type(tokens_out)}")
+    # Batched output: keep the first (only) row.
+    if tokens_out and isinstance(tokens_out[0], (list, tuple, np.ndarray)):
+        tokens_out = tokens_out[0]
+    return [int(token_id) for token_id in tokens_out]
+
+
+def _build_discrete_action_string(action: np.ndarray, processor: Any) -> str:
+    """Render an action as ``<action_N>`` pieces between the action start and end tokens.
+
+    The token ids come from ``_tokenize_discrete_action(action, processor)``.
+    """
+    pieces = [
+        f"{ACTION_TOKEN_PREFIX}{int(token_id)}>" for token_id in _tokenize_discrete_action(action, processor)
+    ]
+    return ACTION_START_TOKEN + "".join(pieces) + ACTION_END_TOKEN
+
+
+def _single_token_id(tokenizer: Any, token: str) -> int:
+    """Return the id of ``token``, which must encode to exactly one id.
+
+    The token is encoded with ``add_special_tokens=False``.
+
+    Raises:
+        ValueError: If the encoding has zero or more than one id.
+    """
+    encoded = tokenizer.encode(token, add_special_tokens=False)
+    if len(encoded) == 1:
+        return int(encoded[0])
+    raise ValueError(f"MolmoAct2 token {token!r} must encode to one token, got {encoded}.")
+
+
+def _flatten_feature_names(raw_names: Any) -> list[str] | None:
+    """Flatten a feature ``names`` entry into a flat list of strings.
+
+    * ``None`` -> ``None``
+    * list / tuple -> its items as strings
+    * dict -> the values in order; list/tuple values are expanded, ``None`` values are skipped
+    * anything else -> a one-element list
+
+    An empty result is reported as ``None``.
+    """
+    if raw_names is None:
+        return None
+    if isinstance(raw_names, (list, tuple)):
+        return [str(entry) for entry in raw_names] or None
+    if not isinstance(raw_names, dict):
+        return [str(raw_names)]
+
+    flattened: list[str] = []
+    for group in raw_names.values():
+        if isinstance(group, (list, tuple)):
+            flattened += [str(entry) for entry in group]
+        elif group is not None:
+            flattened.append(str(group))
+    return flattened or None
+
+
+def _feature_dim(stats: dict[str, Any] | None) -> int | None:
+    """Infer the feature dimension from a per-feature stats mapping.
+
+    The first entry among ``mean``, ``std``, ``min``, ``max``, ``q01``, ``q99``, ``q10``, ``q90``
+    and ``mask`` that is not ``None`` decides the result: the size of its last dimension, or
+    ``None`` when it is a scalar. ``None`` is also returned when ``stats`` is not a dict or holds
+    none of these keys.
+    """
+    if not isinstance(stats, dict):
+        return None
+    stat_names = ("mean", "std", "min", "max", "q01", "q99", "q10", "q90", "mask")
+    candidates = (stats.get(stat_name) for stat_name in stat_names)
+    first = next((candidate for candidate in candidates if candidate is not None), None)
+    if first is None:
+        return None
+    if not torch.is_tensor(first):
+        first = np.asarray(first)
+    return int(first.shape[-1]) if first.ndim > 0 else None
+
+
+def _stats_array(value: Any) -> np.ndarray | None:
+    """Return ``value`` as a NumPy array, or ``None`` for ``None`` and scalar (0-d) inputs.
+
+    Tensors are detached and moved to the CPU before conversion.
+    """
+    if value is None:
+        return None
+    if torch.is_tensor(value):
+        if value.ndim == 0:
+            return None
+        return value.detach().cpu().numpy()
+    as_array = np.asarray(value)
+    if as_array.ndim == 0:
+        return None
+    return as_array
+
+
+def _validate_masked_passthrough_stats(feature_stats: dict[str, Any], mask: list[bool], key: str) -> None:
+    """Check that features excluded from normalization already lie in [-1, 1].
+
+    Uses the ``min`` / ``max`` statistics of the features whose mask entry is false. The check is
+    skipped when either statistic is missing or scalar, when the mask is not 1-D, when its length
+    differs from the statistics' last dimension, or when no feature is masked out.
+
+    Raises:
+        ValueError: If a masked-out feature has ``min < -1`` or ``max > 1``.
+    """
+    lower = _stats_array(feature_stats.get("min"))
+    upper = _stats_array(feature_stats.get("max"))
+    if lower is None or upper is None:
+        return
+
+    normalized_dims = np.asarray(mask, dtype=bool)
+    if normalized_dims.ndim != 1:
+        return
+    width = normalized_dims.shape[0]
+    if lower.shape[-1] != width or upper.shape[-1] != width:
+        return
+    passthrough_dims = ~normalized_dims
+    if not bool(passthrough_dims.any()):
+        return
+
+    below = lower[..., passthrough_dims] < -1.0
+    above = upper[..., passthrough_dims] > 1.0
+    if bool((below | above).any()):
+        raise ValueError(
+            f"MolmoAct2 {key} gripper values are not under [-1, 1]. Please set normalize_gripper=True."
+        )
+
+
+def _feature_names_from_meta(dataset_meta: Any | None, feature_key: str) -> list[str] | None:
+    """Look up the per-dimension names of ``feature_key`` in dataset metadata.
+
+    Sources are tried in this order and the first non-empty result wins:
+      1. ``<root>/<repo_id>/meta/info.json`` (only when ``repo_id`` is non-empty),
+      2. ``<root>/meta/info.json``,
+      3. ``dataset_meta.info.features``,
+      4. ``dataset_meta.features``.
+
+    Reading the JSON files is best effort: I/O errors, malformed JSON and unexpected structure are
+    ignored and the next source is tried. Returns ``None`` when nothing is found.
+    """
+    if dataset_meta is None:
+        return None
+
+    search_roots: list[Path] = []
+    root = getattr(dataset_meta, "root", None)
+    if root is not None:
+        repo_id = str(getattr(dataset_meta, "repo_id", "") or "").strip()
+        search_roots = [Path(root) / repo_id, Path(root)] if repo_id else [Path(root)]
+
+    for search_root in search_roots:
+        info_file = search_root / "meta" / "info.json"
+        if not info_file.exists():
+            continue
+        # AttributeError covers JSON whose shape is not the expected nesting of objects.
+        with suppress(OSError, json.JSONDecodeError, AttributeError):
+            with info_file.open("r", encoding="utf-8") as handle:
+                info = json.load(handle)
+            feature_entry = (info.get("features") or {}).get(feature_key, {})
+            on_disk_names = _flatten_feature_names(feature_entry.get("names"))
+            if on_disk_names:
+                return on_disk_names
+
+    in_memory_sources = (
+        getattr(getattr(dataset_meta, "info", None), "features", None),
+        getattr(dataset_meta, "features", None),
+    )
+    for features in in_memory_sources:
+        feature = features.get(feature_key) if isinstance(features, dict) else None
+        if isinstance(feature, dict):
+            in_memory_names = _flatten_feature_names(feature.get("names"))
+            if in_memory_names:
+                return in_memory_names
+    return None
+
+
+def _add_gripper_masks_to_stats(
+    dataset_stats: dict[str, dict[str, Any]] | None,
+    dataset_meta: Any | None,
+    *,
+    normalize_gripper: bool,
+    dataset_feature_names: dict[str, Any] | None = None,
+) -> dict[str, dict[str, Any]] | None:
+    """Return a copy of ``dataset_stats`` with a normalization ``mask`` for action and state.
+
+    With ``normalize_gripper`` the mask is all true. Otherwise a dimension is masked out (false)
+    when its name contains "gripper" (case-insensitive). Names are taken from
+    ``dataset_feature_names``, then from ``dataset_meta``, then from the stats' own ``names``
+    entry. A feature is left without a mask when its dimension cannot be inferred, no names are
+    available, or the number of names does not match the dimension.
+
+    Falsy ``dataset_stats`` is returned unchanged; otherwise the input is deep-copied first.
+
+    Raises:
+        ValueError: Propagated from ``_validate_masked_passthrough_stats`` when a masked-out
+            dimension has min/max statistics outside [-1, 1].
+    """
+    if not dataset_stats:
+        return dataset_stats
+
+    def _names_for(key: str, feature_stats: dict[str, Any]) -> list[str] | None:
+        """Resolve dimension names from the explicit mapping, the metadata, then the stats."""
+        found = _flatten_feature_names((dataset_feature_names or {}).get(key))
+        if found is None:
+            found = _feature_names_from_meta(dataset_meta, key)
+        if found is None:
+            found = _flatten_feature_names(feature_stats.get("names"))
+        return found
+
+    masked_stats = deepcopy(dataset_stats)
+    for key in (ACTION, OBS_STATE):
+        feature_stats = masked_stats.get(key)
+        if not isinstance(feature_stats, dict):
+            continue
+        dim = _feature_dim(feature_stats)
+        if dim is None:
+            continue
+
+        if normalize_gripper:
+            feature_stats["mask"] = [True] * dim
+            continue
+
+        names = _names_for(key, feature_stats)
+        if names is None or len(names) != dim:
+            continue
+        gripper_mask = ["gripper" not in name.lower() for name in names]
+        _validate_masked_passthrough_stats(feature_stats, gripper_mask, key)
+        feature_stats["mask"] = gripper_mask
+    return masked_stats
+
+
+def _normalization_masks_from_stats(
+    dataset_stats: dict[str, dict[str, Any]] | None,
+) -> dict[str, list[bool]]:
+    """Collect the ``mask`` entries for action and state as plain lists of bools.
+
+    Tensor masks are converted with ``tolist()``. A feature is included only when its mask ends up
+    as a list whose items are all ``bool``.
+    """
+    stats_by_key = dataset_stats or {}
+    collected: dict[str, list[bool]] = {}
+    for key in (ACTION, OBS_STATE):
+        feature_stats = stats_by_key.get(key)
+        mask = feature_stats.get("mask") if isinstance(feature_stats, dict) else None
+        if isinstance(mask, Tensor):
+            mask = mask.detach().cpu().tolist()
+        if not isinstance(mask, list):
+            continue
+        if all(isinstance(flag, bool) for flag in mask):
+            collected[key] = mask
+    return collected
+
+
+class _MolmoAct2MaskedNormalizationMixin:
+    """Mixin that restricts a (un)normalizer's transform to the features selected by a mask.
+
+    It must come before a class that provides ``_apply_transform`` in the MRO. The mask is read
+    from ``self._tensor_stats[key]["mask"]``: features whose entry is true take the parent's
+    transformed value, all others keep their input value.
+    """
+
+    @staticmethod
+    def _broadcast_feature_mask(mask: Tensor, tensor: Tensor) -> Tensor | None:
+        """Return ``mask`` as a bool tensor that broadcasts against ``tensor``.
+
+        ``None`` is returned when the mask is not 1-D or its length differs from the last
+        dimension of ``tensor``.
+        """
+        feature_mask = mask.to(device=tensor.device, dtype=torch.bool)
+        if feature_mask.ndim != 1 or tensor.shape[-1] != feature_mask.shape[0]:
+            return None
+        # Prepend singleton dimensions until the ranks match.
+        missing_dims = max(tensor.ndim - feature_mask.ndim, 0)
+        return feature_mask.reshape((1,) * missing_dims + (feature_mask.shape[0],))
+
+    @staticmethod
+    def _validate_masked_passthrough_range(tensor: Tensor, mask: Tensor, key: str) -> None:
+        """Raise ``ValueError`` when a masked-out value of ``tensor`` lies outside [-1, 1]."""
+        passthrough = ~mask.expand_as(tensor)
+        if not bool(passthrough.any()):
+            return
+        values = tensor[passthrough]
+        out_of_range = torch.logical_or(values < -1.0, values > 1.0)
+        if bool(out_of_range.any()):
+            raise ValueError(
+                f"MolmoAct2 {key} gripper values are not under [-1, 1]. Please set normalize_gripper=True."
+            )
+
+    def _apply_transform(
+        self, tensor: Tensor, key: str, feature_type: Any, *, inverse: bool = False
+    ) -> Tensor:
+        """Run the parent transform, then restore the input values of masked-out features.
+
+        Without a usable mask for ``key`` the parent's result is returned unchanged. On the
+        forward pass (``inverse=False``) the masked-out input values are range-checked first.
+        """
+        transformed = super()._apply_transform(tensor, key, feature_type, inverse=inverse)
+        key_stats = getattr(self, "_tensor_stats", {}).get(key, {})
+        raw_mask = key_stats.get("mask") if isinstance(key_stats, dict) else None
+        feature_mask = None if raw_mask is None else self._broadcast_feature_mask(raw_mask, tensor)
+        if feature_mask is None:
+            return transformed
+        if not inverse:
+            self._validate_masked_passthrough_range(tensor, feature_mask, key)
+        return torch.where(feature_mask, transformed, tensor)
+
+
+@ProcessorStepRegistry.register(name="molmoact2_masked_normalizer")
+@dataclass
+class MolmoAct2MaskedNormalizerProcessorStep(_MolmoAct2MaskedNormalizationMixin, NormalizerProcessorStep):
+    """``NormalizerProcessorStep`` whose transform goes through ``_MolmoAct2MaskedNormalizationMixin``.
+
+    Registered as ``molmoact2_masked_normalizer``. Features whose stats ``mask`` entry is false
+    keep their input value instead of the normalized one.
+    """
+
+    pass
+
+
+@ProcessorStepRegistry.register(name="molmoact2_masked_unnormalizer")
+@dataclass
+class MolmoAct2MaskedUnnormalizerProcessorStep(_MolmoAct2MaskedNormalizationMixin, UnnormalizerProcessorStep):
+    """``UnnormalizerProcessorStep`` whose transform goes through ``_MolmoAct2MaskedNormalizationMixin``.
+
+    Registered as ``molmoact2_masked_unnormalizer``. Features whose stats ``mask`` entry is false
+    keep their input value instead of the unnormalized one.
+    """
+
+    pass
+
+
+@ProcessorStepRegistry.register(name="molmoact2_clamp_normalized")
+@dataclass
+class MolmoAct2ClampNormalizedProcessorStep(ProcessorStep):
+    """Clamp q01/q99-normalized state and action to [-1, 1], the range used by the old trainer.
+
+    Features whose entry in ``normalization_masks[key]`` is false are not clamped; they are
+    required to lie in [-1, 1] already and a ``ValueError`` is raised otherwise.
+    """
+
+    # Optional per-key (ACTION / OBS_STATE) masks; true marks a feature that gets clamped.
+    normalization_masks: dict[str, list[bool]] | None = None
+
+    @staticmethod
+    def _broadcast_feature_mask(mask: list[bool], tensor: Tensor) -> Tensor | None:
+        """Turn ``mask`` into a bool tensor that broadcasts against ``tensor``.
+
+        ``None`` is returned when the mask is not 1-D or its length differs from the last
+        dimension of ``tensor``.
+        """
+        feature_mask = torch.tensor(mask, device=tensor.device, dtype=torch.bool)
+        if feature_mask.ndim != 1 or tensor.shape[-1] != feature_mask.shape[0]:
+            return None
+        # Prepend singleton dimensions until the ranks match.
+        missing_dims = max(tensor.ndim - feature_mask.ndim, 0)
+        return feature_mask.reshape((1,) * missing_dims + (feature_mask.shape[0],))
+
+    @staticmethod
+    def _validate_masked_passthrough_range(tensor: Tensor, mask: Tensor, key: str) -> None:
+        """Raise ``ValueError`` when a masked-out value of ``tensor`` lies outside [-1, 1]."""
+        passthrough = ~mask.expand_as(tensor)
+        if not bool(passthrough.any()):
+            return
+        values = tensor[passthrough]
+        out_of_range = torch.logical_or(values < -1.0, values > 1.0)
+        if bool(out_of_range.any()):
+            raise ValueError(
+                f"MolmoAct2 {key} gripper values are not under [-1, 1]. Please set normalize_gripper=True."
+            )
+
+    def _clamp_tensor(self, tensor: Tensor, key: str) -> Tensor:
+        """Clamp ``tensor`` to [-1, 1], leaving masked-out features of ``key`` untouched.
+
+        Without a usable mask for ``key`` the whole tensor is clamped.
+        """
+        configured_mask = (self.normalization_masks or {}).get(key)
+        feature_mask = (
+            None if configured_mask is None else self._broadcast_feature_mask(configured_mask, tensor)
+        )
+        if feature_mask is None:
+            return tensor.clamp(-1.0, 1.0)
+        self._validate_masked_passthrough_range(tensor, feature_mask, key)
+        return torch.where(feature_mask, tensor.clamp(-1.0, 1.0), tensor)
+
+    def __call__(self, transition: EnvTransition) -> EnvTransition:
+        """Return a shallow copy of ``transition`` with the state and the action clamped.
+
+        The observation dict is copied before its state entry is replaced, so the caller's
+        transition and observation are left unmodified.
+        """
+        clamped_transition = transition.copy()
+
+        observation = clamped_transition.get(TransitionKey.OBSERVATION)
+        if isinstance(observation, dict) and OBS_STATE in observation:
+            clamped_observation = observation.copy()
+            state = torch.as_tensor(clamped_observation[OBS_STATE])
+            clamped_observation[OBS_STATE] = self._clamp_tensor(state, OBS_STATE)
+            clamped_transition[TransitionKey.OBSERVATION] = clamped_observation
+
+        action = clamped_transition.get(TransitionKey.ACTION)
+        if action is not None:
+            clamped_transition[TransitionKey.ACTION] = self._clamp_tensor(torch.as_tensor(action), ACTION)
+        return clamped_transition
+
+    def transform_features(
+        self, features: dict[PipelineFeatureType, dict[str, PolicyFeature]]
+    ) -> dict[PipelineFeatureType, dict[str, PolicyFeature]]:
+        """Return ``features`` unchanged; clamping does not alter feature definitions."""
+        return features
+
+
+@ProcessorStepRegistry.register(name="molmoact2_pack_inputs")
+@dataclass
+class MolmoAct2PackInputsProcessorStep(ProcessorStep):
+    """Processor step registered as ``molmoact2_pack_inputs``.
+
+    ``__post_init__`` loads the MolmoAct2 processor from ``checkpoint_path`` and, for the
+    ``"discrete"`` and ``"both"`` action modes, the discrete action tokenizer.
+    """
+
+    # Local directory or Hub repo id of the checkpoint (resolved by `_resolve_checkpoint_location`).
+    checkpoint_path: str
+    # Revision and force-download flag forwarded when resolving `checkpoint_path`.
+    checkpoint_revision: str | None = None
+    checkpoint_force_download: bool = False
+    # "discrete" and "both" make `__post_init__` load the action tokenizer.
+    action_mode: str = "both"
+    # Identifier passed to `UniversalActionProcessor.from_pretrained_local`.
+    discrete_action_tokenizer: str = "allenai/MolmoAct2-FAST-Tokenizer"
+    # Observation keys to use as images; when empty, image keys are discovered from the observation.
+    image_keys: list[str] = field(default_factory=list)
+    # When configured keys are missing, fall back to discovered image keys instead of raising.
+    allow_image_key_fallback: bool = False
+    # NOTE(review): the prompt-related fields below presumably feed `_build_robot_text` /
+    # `_normalize_question_text` / `_build_discrete_state_string` — confirm against their call sites.
+    setup_type: str = ""
+    control_mode: str = ""
+    normalize_language: bool = True
+    add_setup_tokens: bool = True
+    add_control_tokens: bool = True
+    num_state_tokens: int = 256
+    # Explicit sequence length; `None` lets `_resolve_max_sequence_length` infer one.
+    max_sequence_length: int | None = None
+    # NOTE(review): presumably the action horizon — confirm.
+    chunk_size: int = 30
+    # Width actions are zero-padded to in `_pad_action`; larger action dims raise.
+    max_action_dim: int = 32
+    # NOTE(review): presumably the environment's real action dimension — confirm.
+    env_action_dim: int | None = None
+
+    def __post_init__(self) -> None:
+        """Load the processor, the optional action tokenizer and cache special-token data.
+
+        Sets ``self.processor``, ``self.action_processor`` (``None`` unless ``action_mode`` is
+        ``"discrete"`` or ``"both"``), the ids of the action start/end tokens and the
+        tokenizer's EOS token / id.
+
+        Raises:
+            RuntimeError: If the action tokenizer is needed but could not be imported.
+        """
+        require_package("transformers", extra="molmoact2")
+
+        resolved_checkpoint = _resolve_checkpoint_location(
+            self.checkpoint_path,
+            revision=self.checkpoint_revision,
+            force_download=bool(self.checkpoint_force_download),
+        )
+        self.processor = _load_local_molmoact2_processor(resolved_checkpoint)
+
+        needs_action_tokenizer = self.action_mode in {"discrete", "both"}
+        self.action_processor = None
+        if needs_action_tokenizer:
+            require_package("scipy", extra="molmoact2")
+            if UniversalActionProcessor is None:
+                raise RuntimeError("transformers and scipy are required to load MolmoAct2 action tokenizer.")
+            self.action_processor = UniversalActionProcessor.from_pretrained_local(
+                self.discrete_action_tokenizer,
+            )
+
+        tokenizer = self.processor.tokenizer
+        self._action_start_id = _single_token_id(tokenizer, ACTION_START_TOKEN)
+        self._action_end_id = _single_token_id(tokenizer, ACTION_END_TOKEN)
+        # A tokenizer without an EOS token yields an empty string here.
+        self._eos_token = tokenizer.eos_token or ""
+        self._eos_token_id = tokenizer.eos_token_id
+
+    def get_config(self) -> dict[str, Any]:
+        """Return the step's configuration fields as a plain dict.
+
+        ``image_keys`` is copied into a new list; every other value is returned as stored.
+        """
+        field_names = (
+            "checkpoint_path",
+            "checkpoint_revision",
+            "checkpoint_force_download",
+            "action_mode",
+            "discrete_action_tokenizer",
+            "image_keys",
+            "allow_image_key_fallback",
+            "setup_type",
+            "control_mode",
+            "normalize_language",
+            "add_setup_tokens",
+            "add_control_tokens",
+            "num_state_tokens",
+            "max_sequence_length",
+            "chunk_size",
+            "max_action_dim",
+            "env_action_dim",
+        )
+        config = {name: getattr(self, name) for name in field_names}
+        # Hand out a copy so callers cannot mutate the step's own list.
+        config["image_keys"] = list(self.image_keys)
+        return config
+
+    def _resolve_max_sequence_length(
+        self,
+        *,
+        num_images: int,
+        state_dim: int,
+        action_dim: int,
+        action_horizon: int,
+        include_discrete_action: bool,
+    ) -> int:
+        """Return the configured ``max_sequence_length`` or infer one from the example layout.
+
+        When ``self.max_sequence_length`` is ``None`` all arguments are forwarded to
+        ``infer_molmoact2_max_sequence_length``.
+        """
+        if self.max_sequence_length is None:
+            return infer_molmoact2_max_sequence_length(
+                num_images=num_images,
+                state_dim=state_dim,
+                action_dim=action_dim,
+                action_horizon=action_horizon,
+                include_discrete_action=include_discrete_action,
+            )
+        return int(self.max_sequence_length)
+
+    def _batch_size(self, observation: dict[str, Any], action: Tensor | None) -> int:
+        """Infer the batch size of a transition.
+
+        Priority: the first dimension of ``action``; else the state (first dimension when it has
+        more than one dimension, otherwise 1); else the first array-like image (first dimension
+        when it is 4-D, otherwise 1); else 1.
+        """
+
+        def _is_array(candidate: Any) -> bool:
+            return torch.is_tensor(candidate) or isinstance(candidate, np.ndarray)
+
+        if action is not None:
+            return int(action.shape[0])
+
+        state = observation.get(OBS_STATE)
+        if _is_array(state):
+            if getattr(state, "ndim", 0) > 1:
+                return int(state.shape[0])
+            return 1
+
+        for image_key in self._resolve_image_keys(observation):
+            image = observation[image_key]
+            if not _is_array(image):
+                continue
+            if getattr(image, "ndim", 0) == 4:
+                return int(image.shape[0])
+            return 1
+        return 1
+
+    @staticmethod
+    def _observation_image_keys(observation: dict[str, Any]) -> list[str]:
+        """Return the sorted observation keys that look like image entries.
+
+        Keys starting with ``f"{OBS_IMAGES}."`` are preferred; only when there are none are keys
+        starting with ``"observation.image"`` used instead.
+        """
+        for prefix in (f"{OBS_IMAGES}.", "observation.image"):
+            matches = [key for key in observation if str(key).startswith(prefix)]
+            if matches:
+                return sorted(matches)
+        return []
+
+    def _resolve_image_keys(self, observation: dict[str, Any]) -> list[str]:
+        """Decide which observation keys are used as images.
+
+        Without configured ``image_keys`` the keys are discovered from the observation. With
+        configured keys, they are returned in their configured order when all are present; if any
+        is missing, the discovered keys are used only when ``allow_image_key_fallback`` is set.
+
+        Raises:
+            ValueError: If no image key can be found, or configured keys are missing and the
+                fallback is disabled or finds nothing.
+        """
+        if not self.image_keys:
+            discovered = self._observation_image_keys(observation)
+            if not discovered:
+                raise ValueError("MolmoAct2 requires at least one image observation.")
+            return sorted(discovered)
+
+        missing = [key for key in self.image_keys if key not in observation]
+        if not missing:
+            return list(self.image_keys)
+
+        fallback_keys = self._observation_image_keys(observation)
+        if self.allow_image_key_fallback and fallback_keys:
+            return fallback_keys
+        raise ValueError(f"MolmoAct2 image_keys missing from observation: {missing}.")
+
+    def _extract_images(self, observation: dict[str, Any], batch_size: int) -> list[list[np.ndarray]]:
+        """Collect the normalized images of every example in the batch.
+
+        Returns one list per example with one image per resolved image key, in key order.
+        Arrays / tensors with four or more dimensions are indexed by batch position; any other
+        value is reused as-is for every example.
+        """
+        per_example: list[list[np.ndarray]] = [[] for _ in range(batch_size)]
+        for image_key in self._resolve_image_keys(observation):
+            value = observation[image_key]
+            is_array = torch.is_tensor(value) or isinstance(value, np.ndarray)
+            is_batched = is_array and getattr(value, "ndim", 0) >= 4
+            for batch_idx, example_images in enumerate(per_example):
+                frame = value[batch_idx] if is_batched else value
+                example_images.append(_normalize_image(frame))
+        return per_example
+
+    def _extract_state(self, observation: dict[str, Any], batch_size: int) -> Tensor:
+        """Return the state as float32 (1-D input gains a batch axis); raise on batch-size mismatch."""
+        if OBS_STATE not in observation:
+            raise ValueError("MolmoAct2 requires observation.state for discrete state prompting.")
+        raw = torch.as_tensor(observation[OBS_STATE], dtype=torch.float32)
+        state = raw.unsqueeze(0) if raw.ndim == 1 else raw
+        if int(state.shape[0]) == batch_size:
+            return state
+        raise ValueError(f"State batch size {state.shape[0]} does not match batch size {batch_size}.")
+
+    def _pad_action(self, action: Tensor, action_is_pad: Any | None) -> tuple[Tensor, Tensor, Tensor]:
+        """Zero-pad actions to ``max_action_dim`` and build the horizon and dimension padding masks.
+
+        Returns ``(padded, action_horizon_is_pad, action_dim_is_pad)``; a 2-D action first gains a
+        horizon axis of length 1. Raises ``ValueError`` for a wrong rank, an action dim above
+        ``max_action_dim``, or an ``action_is_pad`` shape that differs from ``action.shape[:2]``.
+        """
+        if action.ndim == 2:
+            action = action.unsqueeze(1)
+        if action.ndim != 3:
+            raise ValueError(f"MolmoAct2 expected action shape [B, T, D], got {tuple(action.shape)}.")
+        batch, horizon, dim = action.shape
+        if dim > self.max_action_dim:
+            raise ValueError(f"Action dim {dim} exceeds MolmoAct2 max_action_dim={self.max_action_dim}.")
+        device = action.device
+        padded = torch.zeros((batch, horizon, self.max_action_dim), device=device, dtype=torch.float32)
+        padded[..., :dim] = action.to(dtype=torch.float32)
+        # Feature dims beyond the real action dim are zero-filled above and flagged as padding here.
+        action_dim_is_pad = torch.ones((batch, self.max_action_dim), device=device, dtype=torch.bool)
+        action_dim_is_pad[:, :dim] = False
+        if action_is_pad is None:
+            return padded, torch.zeros((batch, horizon), device=device, dtype=torch.bool), action_dim_is_pad
+        action_horizon_is_pad = torch.as_tensor(action_is_pad, device=device, dtype=torch.bool)
+        if action_horizon_is_pad.ndim == 1:
+            action_horizon_is_pad = action_horizon_is_pad.unsqueeze(0)
+        if tuple(action_horizon_is_pad.shape) != (batch, horizon):
+            raise ValueError(
+                "action_is_pad must match action horizon shape: "
+                f"got {tuple(action_horizon_is_pad.shape)} for action {tuple(action.shape)}."
+            )
+        return padded, action_horizon_is_pad, action_dim_is_pad
+
+    def _build_labels(self, input_ids: Tensor, attention_mask: Tensor) -> Tensor:
+        """Build LM labels that supervise only the discrete action spans of each row.
+
+        Every ``<action_start>``..``<action_end>`` span (plus an EOS token directly after it) keeps
+        its token ids; all other positions, and positions masked out by ``attention_mask``, are -100.
+        Raises ``ValueError`` when a row has no span or a start token has no matching end token.
+        """
+        labels = torch.full_like(input_ids, -100)
+        eos_id = None if self._eos_token_id is None else int(self._eos_token_id)
+        for row_idx, row in enumerate(input_ids):
+            starts = (row == self._action_start_id).nonzero(as_tuple=False).flatten().tolist()
+            if not starts:
+                raise ValueError("No discrete action span found in MolmoAct2 training text.")
+            ends = (row == self._action_end_id).nonzero(as_tuple=False).flatten().tolist()
+            cursor = 0
+            for start in starts:
+                # Skip end tokens that precede this start; each end closes at most one span.
+                while cursor < len(ends) and ends[cursor] < start:
+                    cursor += 1
+                if cursor >= len(ends):
+                    raise ValueError(
+                        "Found <action_start> without matching <action_end> in MolmoAct2 labels."
+                    )
+                stop = int(ends[cursor]) + 1
+                if eos_id is not None and stop < int(row.shape[0]) and int(row[stop]) == eos_id:
+                    stop += 1
+                labels[row_idx, start:stop] = row[start:stop]
+                cursor += 1
+            keep = attention_mask[row_idx].to(dtype=torch.bool)
+            labels[row_idx] = torch.where(keep, labels[row_idx], torch.full_like(labels[row_idx], -100))
+        return labels
+
+    def __call__(self, transition: EnvTransition) -> EnvTransition:  # Pack a transition into model inputs.
+        transition = transition.copy()  # work on a copy of the incoming transition
+        observation = transition.get(TransitionKey.OBSERVATION) or {}
+        if not isinstance(observation, dict):
+            raise ValueError("MolmoAct2 expected an observation dictionary.")
+        complementary = dict(transition.get(TransitionKey.COMPLEMENTARY_DATA) or {})
+        # The action is optional; when present, its leading dimension is used as the batch size.
+        raw_action = transition.get(TransitionKey.ACTION)
+        action = torch.as_tensor(raw_action, dtype=torch.float32) if raw_action is not None else None
+        batch_size = self._batch_size(observation, action)
+        state = self._extract_state(observation, batch_size)
+        images_by_example = self._extract_images(observation, batch_size)
+        # Task text: the first non-None value among the keys below, tried in this order.
+        task_source = complementary.get("task")
+        if task_source is None:
+            task_source = observation.get("task")
+        if task_source is None:
+            task_source = observation.get("observation.language")
+        if task_source is None:
+            task_source = complementary.get("language_instruction")
+        tasks = _as_text_list(task_source, batch_size)
+        if self.normalize_language:
+            tasks = [_normalize_question_text(task) for task in tasks]
+        complementary["task"] = tasks
+        # Defaults for the action-free case: every action dimension starts out marked as padding.
+        action_padded = None
+        action_horizon_is_pad = None
+        action_dim_is_pad = torch.ones((batch_size, self.max_action_dim), dtype=torch.bool)
+        real_action_dim = int(self.env_action_dim or 0)
+        if action is not None:
+            action_is_pad = complementary.get("action_is_pad")
+            if action_is_pad is None:
+                action_is_pad = complementary.get("action_horizon_is_pad")
+            action_padded, action_horizon_is_pad, action_dim_is_pad = self._pad_action(action, action_is_pad)
+            real_action_dim = int(action.shape[-1])
+        elif real_action_dim > 0:
+            action_dim_is_pad[:, :real_action_dim] = False  # unmask the configured env action dims
+        # Build one prompt per example; with action labels, append the discrete action string and EOS.
+        prompt_texts: list[str] = []
+        full_texts: list[str] = []
+        flat_images: list[np.ndarray] = []
+        state_np = state.detach().cpu().numpy()
+        build_action_labels = action is not None and self.action_mode in {"discrete", "both"}
+        for batch_idx in range(batch_size):
+            images = images_by_example[batch_idx]
+            flat_images.extend(images)  # images are flattened in example order
+            discrete_state = _build_discrete_state_string(state_np[batch_idx], self.num_state_tokens)
+            prompt = _build_robot_text(
+                task=tasks[batch_idx],
+                discrete_state_string=discrete_state,
+                setup_type=self.setup_type,
+                control_mode=self.control_mode,
+                add_setup_tokens=self.add_setup_tokens,
+                add_control_tokens=self.add_control_tokens,
+                num_images=len(images),
+            )
+            prompt_texts.append(prompt)
+            if build_action_labels:
+                if self.action_processor is None:
+                    raise ValueError("Discrete MolmoAct2 training requires an action tokenizer.")
+                answer = _build_discrete_action_string(
+                    action[batch_idx].detach().cpu().numpy(), self.action_processor
+                )
+                full_texts.append(f"{prompt}{answer}{self._eos_token}")
+            else:
+                full_texts.append(prompt)
+        # Full texts (prompt + answer + EOS) are used only when labels are built; otherwise the prompts.
+        text = full_texts if build_action_labels else prompt_texts
+        inputs = self.processor(text=text, images=flat_images, return_tensors="pt", padding=True)
+        if action is None:
+            action_horizon = self.chunk_size  # no action given: use the configured chunk size
+        elif action.ndim == 2:
+            action_horizon = 1
+        else:
+            action_horizon = int(action.shape[1])
+        max_sequence_length = self._resolve_max_sequence_length(
+            num_images=max((len(images) for images in images_by_example), default=0),
+            state_dim=int(state.shape[-1]),
+            action_dim=max(real_action_dim, 1),
+            action_horizon=action_horizon,
+            include_discrete_action=build_action_labels,
+        )
+        if int(inputs["input_ids"].shape[1]) > max_sequence_length:
+            raise ValueError(  # over-long sequences are rejected, not truncated
+                f"MolmoAct2 sequence length {int(inputs['input_ids'].shape[1])} exceeds "
+                f"max_sequence_length={max_sequence_length}."
+            )
+        # Labels are built only when discrete action answers were appended to the text.
+        if build_action_labels:
+            inputs["labels"] = self._build_labels(inputs["input_ids"], inputs["attention_mask"])
+        # Merge the processor outputs and the padding masks into the complementary data.
+        complementary.update(dict(inputs))
+        complementary["action_dim_is_pad"] = action_dim_is_pad
+        if action_horizon_is_pad is not None:
+            complementary["action_horizon_is_pad"] = action_horizon_is_pad
+        # Replace the action with its padded version when an action was provided.
+        if action_padded is not None:
+            transition[TransitionKey.ACTION] = action_padded
+        transition[TransitionKey.COMPLEMENTARY_DATA] = complementary
+        return transition
+
+    def transform_features(
+        self, features: dict[PipelineFeatureType, dict[str, PolicyFeature]]
+    ) -> dict[PipelineFeatureType, dict[str, PolicyFeature]]:
+        return features  # returned unchanged: this method adds or alters no feature entries
+
+
+@ProcessorStepRegistry.register(name="molmoact2_clamp_action")
+@dataclass
+class MolmoAct2ClampActionProcessorStep(ProcessorStep):
+    def __call__(self, transition: EnvTransition) -> EnvTransition:
+        """Return a copy of ``transition`` whose action, if present, is clamped to ``[-1, 1]``."""
+        clamped = transition.copy()
+        if (action := clamped.get(TransitionKey.ACTION)) is not None:
+            clamped[TransitionKey.ACTION] = torch.as_tensor(action).clamp(min=-1.0, max=1.0)
+        return clamped
+
+    def transform_features(
+        self, features: dict[PipelineFeatureType, dict[str, PolicyFeature]]
+    ) -> dict[PipelineFeatureType, dict[str, PolicyFeature]]:
+        return features
+
+
+def make_molmoact2_pre_post_processors(
+    config: MolmoAct2Config,
+    dataset_stats: dict[str, dict[str, torch.Tensor]] | None = None,
+    dataset_meta: Any | None = None,
+) -> tuple[
+    PolicyProcessorPipeline[dict[str, Any], dict[str, Any]],
+    PolicyProcessorPipeline[PolicyAction, PolicyAction],
+]:  # Returns the (preprocessor, postprocessor) pipelines built from ``config``.
+    env_action_dim = None
+    if config.output_features and ACTION in config.output_features:
+        env_action_dim = int(config.output_features[ACTION].shape[0])
+    # Without caller-supplied stats, a non-empty norm_tag triggers loading stats and metadata.
+    hf_metadata: dict[str, Any] = {}
+    if dataset_stats is None and str(config.norm_tag or "").strip():
+        dataset_stats, hf_metadata = _load_hf_norm_stats_for_tag(
+            config.checkpoint_path,
+            revision=config.checkpoint_revision,
+            force_download=bool(config.checkpoint_force_download),
+            norm_tag=config.norm_tag,
+        )
+    # Image keys: config.image_keys first, else compatible metadata camera_keys, else visual features.
+    image_keys = list(config.image_keys)
+    visual_feature_keys = [
+        key for key, feature in config.input_features.items() if feature.type == FeatureType.VISUAL
+    ]
+    if not image_keys and isinstance(hf_metadata.get("camera_keys"), list):
+        metadata_image_keys = [str(key) for key in hf_metadata["camera_keys"]]
+        if not visual_feature_keys or all(key in config.input_features for key in metadata_image_keys):
+            image_keys = metadata_image_keys
+    if not image_keys:
+        image_keys = visual_feature_keys
+    setup_type = config.setup_type or str(hf_metadata.get("setup_type") or "")
+    control_mode = config.control_mode or str(hf_metadata.get("control_mode") or "")
+    chunk_size = int(hf_metadata.get("action_horizon") or config.chunk_size)  # metadata wins if set
+    # Add gripper masks to the stats; the derived masks also configure the clamp step below.
+    masked_dataset_stats = _add_gripper_masks_to_stats(
+        dataset_stats,
+        dataset_meta,
+        normalize_gripper=config.normalize_gripper,
+        dataset_feature_names=config.dataset_feature_names,
+    )
+    normalization_masks = _normalization_masks_from_stats(masked_dataset_stats)
+    # Preprocessor order: rename, add batch dim, normalize, clamp, pack model inputs, move to device.
+    input_steps: list[ProcessorStep] = [
+        RenameObservationsProcessorStep(rename_map={}),
+        AddBatchDimensionProcessorStep(),
+        MolmoAct2MaskedNormalizerProcessorStep(
+            features={**config.input_features, **config.output_features},
+            norm_map=config.normalization_mapping,
+            stats=masked_dataset_stats,
+        ),
+        MolmoAct2ClampNormalizedProcessorStep(normalization_masks=normalization_masks),
+        MolmoAct2PackInputsProcessorStep(
+            checkpoint_path=config.checkpoint_path,
+            checkpoint_revision=config.checkpoint_revision,
+            checkpoint_force_download=config.checkpoint_force_download,
+            action_mode=config.action_mode,
+            discrete_action_tokenizer=config.discrete_action_tokenizer,
+            image_keys=image_keys,
+            allow_image_key_fallback=not bool(config.image_keys),  # strict only for explicit config keys
+            setup_type=setup_type,
+            control_mode=control_mode,
+            normalize_language=config.normalize_language,
+            add_setup_tokens=config.add_setup_tokens,
+            add_control_tokens=config.add_control_tokens,
+            num_state_tokens=config.num_state_tokens,
+            max_sequence_length=config.max_sequence_length,
+            chunk_size=chunk_size,
+            max_action_dim=config.expected_max_action_dim,
+            env_action_dim=env_action_dim,
+        ),
+        DeviceProcessorStep(device=config.device),
+    ]
+    # Postprocessor order: clamp actions to [-1, 1], unnormalize them, move to CPU.
+    output_steps: list[ProcessorStep] = [
+        MolmoAct2ClampActionProcessorStep(),
+        MolmoAct2MaskedUnnormalizerProcessorStep(
+            features=config.output_features,
+            norm_map=config.normalization_mapping,
+            stats=masked_dataset_stats,
+        ),
+        DeviceProcessorStep(device="cpu"),
+    ]
+    # The postprocessor is given explicit action <-> transition converters; the preprocessor is not.
+    return (
+        PolicyProcessorPipeline[dict[str, Any], dict[str, Any]](
+            steps=input_steps,
+            name=POLICY_PREPROCESSOR_DEFAULT_NAME,
+        ),
+        PolicyProcessorPipeline[PolicyAction, PolicyAction](
+            steps=output_steps,
+            name=POLICY_POSTPROCESSOR_DEFAULT_NAME,
+            to_transition=policy_action_to_transition,
+            to_output=transition_to_policy_action,
+        ),
+    )
diff --git a/tests/policies/molmoact2/test_molmoact2.py b/tests/policies/molmoact2/test_molmoact2.py
new file mode 100644
index 000000000..3631bcc9b
--- /dev/null
+++ b/tests/policies/molmoact2/test_molmoact2.py
@@ -0,0 +1,1397 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The Allen Institute for Artificial Intelligence and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Unit tests for MolmoAct2's LeRobot policy interface."""
+
+# ruff: noqa: E402
+
+from __future__ import annotations
+
+import json
+from collections import deque
+from types import SimpleNamespace
+
+import numpy as np
+import pytest
+import torch
+import torch.nn.functional as F  # noqa: N812
+
+pytest.importorskip("transformers")
+pytest.importorskip("scipy")
+
+from lerobot.configs import FeatureType, NormalizationMode, PolicyFeature
+from lerobot.policies import get_policy_class, make_policy_config
+from lerobot.policies.molmoact2 import (
+    configuration_molmoact2 as molmoact2_config,
+    modeling_molmoact2 as molmoact2_modeling,
+    processor_molmoact2 as molmoact2_processor,
+)
+from lerobot.policies.molmoact2.configuration_molmoact2 import (
+    MolmoAct2Config,
+    MolmoAct2CosineDecayWithWarmupSchedulerConfig,
+    infer_molmoact2_max_sequence_length,
+)
+from lerobot.policies.molmoact2.modeling_molmoact2 import MolmoAct2Policy
+from lerobot.policies.molmoact2.processor_molmoact2 import (
+    MolmoAct2ClampNormalizedProcessorStep,
+    MolmoAct2MaskedNormalizerProcessorStep,
+    MolmoAct2MaskedUnnormalizerProcessorStep,
+    MolmoAct2PackInputsProcessorStep,
+    _add_gripper_masks_to_stats,
+    _build_discrete_state_string,
+    _normalize_question_text,
+    make_molmoact2_pre_post_processors,
+)
+from lerobot.policies.rtc.configuration_rtc import RTCConfig
+from lerobot.types import TransitionKey
+from lerobot.utils.constants import ACTION, OBS_STATE
+
+
+def test_molmoact2_policy_registration():
+    """The "molmoact2" type resolves to the expected config defaults and to ``MolmoAct2Policy``."""
+    config = make_policy_config("molmoact2", checkpoint_path="/tmp/not-a-real-checkpoint")
+    assert get_policy_class("molmoact2") is MolmoAct2Policy
+    assert config.type == "molmoact2"
+    assert config.action_mode == "both"
+    assert config.normalize_language is True
+    assert config.freeze_embedding is True
+    assert config.normalize_gripper is False
+    assert config.enable_knowledge_insulation is False
+    assert config.per_episode_seed is False
+    assert config.eval_seed is None
+    assert config.get_scheduler_preset().num_decay_steps is None
+    assert config.action_delta_indices == list(range(config.chunk_size))
+
+
+def test_molmoact2_checkpoint_download_ignores_remote_python(monkeypatch):
+    """Resolving a hub checkpoint must ask the download to ignore Python sources and bytecode."""
+    captured = {}
+
+    def record_download(**kwargs):
+        # Stand-in for ``snapshot_download`` that only records how it was called.
+        captured.update(kwargs)
+        return "/tmp/downloaded-molmoact2"
+
+    monkeypatch.setattr(molmoact2_config, "snapshot_download", record_download)
+    resolved = molmoact2_config._resolve_checkpoint_location("allenai/MolmoAct2")
+    assert resolved == "/tmp/downloaded-molmoact2"
+    assert captured["ignore_patterns"] == ["*.py", "*.pyc", "__pycache__/*"]
+
+
+def test_molmoact2_scheduler_decay_steps_auto_match_training_steps():
+    """After all 100 training steps with ``num_decay_steps=None`` the learning rate should be 1e-4."""
+    total_steps = 100
+    weight = torch.nn.Parameter(torch.ones(()))
+    optimizer = torch.optim.AdamW([weight], lr=0.001)
+    scheduler_config = MolmoAct2CosineDecayWithWarmupSchedulerConfig(
+        peak_lr=0.01,
+        decay_lr=0.001,
+        num_warmup_steps=10,
+        num_decay_steps=None,
+    )
+    scheduler = scheduler_config.build(optimizer, num_training_steps=total_steps)
+    for _ in range(total_steps):
+        optimizer.step()
+        scheduler.step()
+    assert scheduler.get_last_lr() == pytest.approx([0.0001])
+
+
+def test_molmoact2_rollout_generator_uses_eval_seed_per_task():
+    """Check the generator seeds expected across resets for a repeated task and for a new task."""
+    policy = object.__new__(MolmoAct2Policy)
+    torch.nn.Module.__init__(policy)
+    policy.config = MolmoAct2Config(per_episode_seed=True, eval_seed=1000)
+    policy._rollout_action_generator = None
+    policy._rollout_task_key = None
+    policy._rollout_index_for_task = -1
+    policy.reset()
+    gen_a = policy._rollout_generator_for_inputs(
+        {"task": ["pick"] * 3},
+        batch_size=3,
+        device=torch.device("cpu"),
+    )
+    ref_a = torch.Generator().manual_seed(
+        MolmoAct2Policy._combine_rollout_seeds(first_seed=1000, batch_size=3)
+    )
+    assert torch.allclose(torch.rand(4, generator=gen_a), torch.rand(4, generator=ref_a))
+    # Same task after another reset: the expected first seed moves from 1000 to 1003.
+    policy.reset()
+    gen_b = policy._rollout_generator_for_inputs(
+        {"task": ["pick"] * 3},
+        batch_size=3,
+        device=torch.device("cpu"),
+    )
+    ref_b = torch.Generator().manual_seed(
+        MolmoAct2Policy._combine_rollout_seeds(first_seed=1003, batch_size=3)
+    )
+    assert torch.allclose(torch.rand(4, generator=gen_b), torch.rand(4, generator=ref_b))
+    # A different task: the expected first seed is 1000 again.
+    policy.reset()
+    gen_c = policy._rollout_generator_for_inputs(
+        {"task": ["place"] * 3},
+        batch_size=3,
+        device=torch.device("cpu"),
+    )
+    ref_c = torch.Generator().manual_seed(
+        MolmoAct2Policy._combine_rollout_seeds(first_seed=1000, batch_size=3)
+    )
+    assert torch.allclose(torch.rand(4, generator=gen_c), torch.rand(4, generator=ref_c))
+
+
+def test_molmoact2_gripper_mask_uses_feature_names(tmp_path):
+    """Dims named "gripper" in ``meta/info.json`` get ``mask=False`` and pass through unnormalized."""
+    info_dir = tmp_path / "meta"
+    info_dir.mkdir()
+    (info_dir / "info.json").write_text(
+        json.dumps(
+            {
+                "features": {
+                    ACTION: {"names": {"motors": ["x", "gripper"]}},
+                    OBS_STATE: {"names": {"motors": ["joint", "gripper"]}},
+                }
+            }
+        ),
+        encoding="utf-8",
+    )
+    meta = SimpleNamespace(root=tmp_path)
+    raw_stats = {
+        ACTION: {"q01": [0.0, 0.0], "q99": [10.0, 10.0]},
+        OBS_STATE: {"q01": [0.0, 0.0], "q99": [10.0, 10.0]},
+    }
+    masked_stats = _add_gripper_masks_to_stats(raw_stats, meta, normalize_gripper=False)
+
+    assert masked_stats is not None
+    assert masked_stats[ACTION]["mask"] == [True, False]
+    assert masked_stats[OBS_STATE]["mask"] == [True, False]
+
+    policy_features = {
+        ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(2,)),
+        OBS_STATE: PolicyFeature(type=FeatureType.STATE, shape=(2,)),
+    }
+    modes = {
+        FeatureType.ACTION: NormalizationMode.QUANTILES,
+        FeatureType.STATE: NormalizationMode.QUANTILES,
+    }
+    sample = {
+        TransitionKey.OBSERVATION: {OBS_STATE: torch.tensor([[5.0, 0.7]])},
+        TransitionKey.ACTION: torch.tensor([[5.0, -0.7]]),
+    }
+    normalizer = MolmoAct2MaskedNormalizerProcessorStep(
+        features=policy_features,
+        norm_map=modes,
+        stats=masked_stats,
+    )
+    result = normalizer(sample)
+    # The first dim is quantile-normalized (5.0 -> 0.0); the gripper dim keeps its raw value.
+    assert torch.equal(result[TransitionKey.OBSERVATION][OBS_STATE], torch.tensor([[0.0, 0.7]]))
+    assert torch.equal(result[TransitionKey.ACTION], torch.tensor([[0.0, -0.7]]))
+    # An unnormalized gripper value outside [-1, 1] must be rejected.
+    with pytest.raises(ValueError, match="gripper values are not under \\[-1, 1\\]"):
+        normalizer(
+            {
+                TransitionKey.OBSERVATION: {OBS_STATE: torch.tensor([[5.0, 7.0]])},
+                TransitionKey.ACTION: torch.tensor([[5.0, -0.7]]),
+            }
+        )
+    # Unnormalizing restores the first dim and again leaves the gripper dim untouched.
+    inverse = MolmoAct2MaskedUnnormalizerProcessorStep(
+        features={ACTION: policy_features[ACTION]},
+        norm_map=modes,
+        stats=masked_stats,
+    )
+    restored = inverse({TransitionKey.ACTION: torch.tensor([[0.0, -0.7]])})
+
+    assert torch.equal(restored[TransitionKey.ACTION], torch.tensor([[5.0, -0.7]]))
+
+
+def test_molmoact2_gripper_mask_validates_dataset_stats(tmp_path):
+    """Gripper stats outside [-1, 1] are rejected unless the gripper is normalized too."""
+    info_dir = tmp_path / "meta"
+    info_dir.mkdir()
+    (info_dir / "info.json").write_text(
+        json.dumps({"features": {ACTION: {"names": ["x", "gripper"]}}}),
+        encoding="utf-8",
+    )
+    meta = SimpleNamespace(root=tmp_path)
+    gripper_stats = {
+        ACTION: {"min": [-0.5, -2.0], "max": [0.5, 0.5]},
+    }
+
+    # The gripper minimum of -2.0 is outside [-1, 1], so leaving it unnormalized must fail.
+    with pytest.raises(ValueError, match="gripper values are not under \\[-1, 1\\]"):
+        _add_gripper_masks_to_stats(gripper_stats, meta, normalize_gripper=False)
+
+    masked = _add_gripper_masks_to_stats(gripper_stats, meta, normalize_gripper=True)
+    assert masked is not None
+    assert masked[ACTION]["mask"] == [True, True]
+
+
+def test_molmoact2_clamp_normalized_respects_masked_gripper_dims():
+    """Normalized dims are clamped to [-1, 1]; masked-out dims pass through but must be in range."""
+    masks = {ACTION: [True, False], OBS_STATE: [True, False]}
+    clamp_step = MolmoAct2ClampNormalizedProcessorStep(normalization_masks=masks)
+    inputs = {
+        TransitionKey.OBSERVATION: {OBS_STATE: torch.tensor([[-2.0, 0.8]])},
+        TransitionKey.ACTION: torch.tensor([[2.0, -0.8]]),
+    }
+
+    outputs = clamp_step(inputs)
+
+    # Only the first (normalized) dim is clamped; the masked second dim keeps its value.
+    assert torch.equal(outputs[TransitionKey.OBSERVATION][OBS_STATE], torch.tensor([[-1.0, 0.8]]))
+    assert torch.equal(outputs[TransitionKey.ACTION], torch.tensor([[1.0, -0.8]]))
+
+    # A masked dim outside [-1, 1] is rejected rather than clamped.
+    out_of_range = {TransitionKey.OBSERVATION: {OBS_STATE: torch.tensor([[0.0, 1.2]])}}
+    with pytest.raises(ValueError, match="gripper values are not under \\[-1, 1\\]"):
+        clamp_step(out_of_range)
+
+
+def test_molmoact2_normalize_gripper_true_keeps_all_dims_normalized(tmp_path):
+    """With ``normalize_gripper=True`` the gripper dim stays in the normalization mask."""
+    info_dir = tmp_path / "meta"
+    info_dir.mkdir()
+    (info_dir / "info.json").write_text(
+        json.dumps({"features": {ACTION: {"names": ["x", "gripper"]}}}),
+        encoding="utf-8",
+    )
+    quantile_stats = {ACTION: {"q01": [0.0, 0.0], "q99": [10.0, 10.0]}}
+    masked = _add_gripper_masks_to_stats(
+        quantile_stats,
+        SimpleNamespace(root=tmp_path),
+        normalize_gripper=True,
+    )
+
+    assert masked is not None
+    assert masked[ACTION]["mask"] == [True, True]
+
+
+def test_molmoact2_uses_supplied_stats_with_repo_scoped_names(tmp_path):
+    """With the info file under ``root/<repo_id>/meta``, supplied stats are kept and the gripper masked."""
+    dataset_dir = tmp_path / "test-org" / "libero"
+    (dataset_dir / "meta").mkdir(parents=True)
+    (dataset_dir / "meta" / "info.json").write_text(
+        json.dumps({"features": {ACTION: {"names": ["x", "gripper"]}}}),
+        encoding="utf-8",
+    )
+    supplied = {ACTION: {"q01": [0.0, 0.0], "q99": [10.0, 10.0]}}
+    masked = _add_gripper_masks_to_stats(
+        supplied,
+        SimpleNamespace(root=tmp_path, repo_id="test-org/libero"),
+        normalize_gripper=False,
+    )
+
+    assert masked is not None
+    assert masked[ACTION]["q01"] == [0.0, 0.0]
+    assert masked[ACTION]["mask"] == [True, False]
+
+
+def test_molmoact2_uses_config_feature_names_without_dataset_meta():
+    """Without dataset metadata, ``dataset_feature_names`` alone identifies the gripper dim."""
+    supplied = {ACTION: {"q01": [0.0, 0.0], "q99": [10.0, 10.0]}}
+    masked = _add_gripper_masks_to_stats(
+        supplied,
+        None,
+        normalize_gripper=False,
+        dataset_feature_names={ACTION: ["x", "gripper"]},
+    )
+
+    assert masked is not None
+    assert masked[ACTION]["mask"] == [True, False]
+
+
+def test_molmoact2_processor_uses_available_visual_features_over_missing_metadata_keys(monkeypatch):
+    """Metadata camera keys missing from the config features lose to the config's visual features."""
+    metadata = {"camera_keys": ["observation.images.image", "observation.images.wrist_image"]}
+    monkeypatch.setattr(
+        molmoact2_processor,
+        "_load_hf_norm_stats_for_tag",
+        lambda *args, **kwargs: ({}, metadata),
+    )
+    # ``__post_init__`` is stubbed out (presumably it loads checkpoint assets; confirm).
+    monkeypatch.setattr(MolmoAct2PackInputsProcessorStep, "__post_init__", lambda self: None)
+    config = MolmoAct2Config(
+        checkpoint_path="/tmp/not-a-real-checkpoint",
+        norm_tag="libero",
+        input_features={
+            "observation.images.image": PolicyFeature(type=FeatureType.VISUAL, shape=(3, 224, 224)),
+            "observation.images.image2": PolicyFeature(type=FeatureType.VISUAL, shape=(3, 224, 224)),
+            OBS_STATE: PolicyFeature(type=FeatureType.STATE, shape=(7,)),
+        },
+        output_features={ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(7,))},
+    )
+
+    preprocessor, _ = make_molmoact2_pre_post_processors(config)
+    pack_steps = [
+        step for step in preprocessor.steps if isinstance(step, MolmoAct2PackInputsProcessorStep)
+    ]
+
+    assert pack_steps[0].image_keys == ["observation.images.image", "observation.images.image2"]
+    assert pack_steps[0].allow_image_key_fallback is True
+
+
+def test_molmoact2_metadata_image_keys_can_fall_back_to_observation_keys():
+    """With fallback allowed, a missing configured key yields the observation's own image keys."""
+    packer = object.__new__(MolmoAct2PackInputsProcessorStep)
+    packer.allow_image_key_fallback = True
+    packer.image_keys = ["observation.images.image", "observation.images.wrist_image"]
+    obs = {
+        "observation.images.image": torch.zeros(3, 4, 4),
+        "observation.images.image2": torch.zeros(3, 4, 4),
+    }
+    assert packer._resolve_image_keys(obs) == ["observation.images.image", "observation.images.image2"]
+
+
+def test_molmoact2_explicit_image_keys_stay_strict():
+    """With fallback disabled, a missing configured image key raises and names that key."""
+    packer = object.__new__(MolmoAct2PackInputsProcessorStep)
+    packer.allow_image_key_fallback = False
+    packer.image_keys = ["observation.images.image", "observation.images.wrist_image"]
+    obs = {
+        "observation.images.image": torch.zeros(3, 4, 4),
+        "observation.images.image2": torch.zeros(3, 4, 4),
+    }
+    with pytest.raises(ValueError, match="wrist_image"):
+        packer._resolve_image_keys(obs)
+
+
+def test_enable_lora_vlm_builds_policy_local_peft_config():
+    """The policy builds its own LoRA config (rank and targets) while ``use_peft`` stays falsy."""
+    pytest.importorskip("peft")
+    config = MolmoAct2Config(
+        checkpoint_path="/tmp/not-a-real-checkpoint",
+        device="cpu",
+        enable_lora_vlm=True,
+        lora_rank=64,
+        push_to_hub=False,
+    )
+    policy = object.__new__(MolmoAct2Policy)
+    torch.nn.Module.__init__(policy)
+    policy.config = config
+    lora_config = policy._build_inner_lora_config()
+
+    assert lora_config.r == 64
+    assert lora_config.target_modules == policy._get_inner_peft_targets()["target_modules"]
+    assert not config.use_peft
+
+
+def test_cuda_graph_managers_are_inference_only():
+    """``train()`` disables both graph managers; ``eval()`` enables them only if the config allows."""
+    class FakeManager:
+        def __init__(self):
+            self.enabled = None
+
+        def set_enabled(self, enabled):
+            self.enabled = enabled
+
+    class FakeBackbone(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.action_cuda_graph_manager = FakeManager()
+
+        def _require_action_expert(self):
+            return torch.nn.Linear(1, 1)
+
+    class FakeModel(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.model = FakeBackbone()
+            self.depth_decode_cuda_graph_manager = FakeManager()
+
+    policy = object.__new__(MolmoAct2Policy)
+    torch.nn.Module.__init__(policy)
+    policy.config = SimpleNamespace(train_action_expert_only=False, enable_inference_cuda_graph=True)
+    policy.model = FakeModel()
+    policy.train()
+    assert policy.model.model.action_cuda_graph_manager.enabled is False
+    assert policy.model.depth_decode_cuda_graph_manager.enabled is False
+    # Evaluation mode with the config flag on: both managers are enabled.
+    policy.eval()
+    assert policy.model.model.action_cuda_graph_manager.enabled is True
+    assert policy.model.depth_decode_cuda_graph_manager.enabled is True
+    # Evaluation mode with the config flag off: both managers stay disabled.
+    policy.config.enable_inference_cuda_graph = False
+    policy.eval()
+    assert policy.model.model.action_cuda_graph_manager.enabled is False
+    assert policy.model.depth_decode_cuda_graph_manager.enabled is False
+
+
+def test_lora_action_expert_target_is_opt_in():
+    """``action_expert`` appears in the default LoRA targets only after the config opts in."""
+    policy = object.__new__(MolmoAct2Policy)
+    torch.nn.Module.__init__(policy)
+    policy.config = SimpleNamespace(
+        lora_rank=64,
+        lora_alpha=16,
+        lora_dropout=0.05,
+        lora_bias="none",
+        enable_lora_action_expert=False,
+    )
+    default_targets = policy._get_default_peft_targets()["target_modules"]
+
+    assert "transformer|vision_backbone" in default_targets
+    assert "action_expert" not in default_targets
+
+    policy.config.enable_lora_action_expert = True
+    opted_in_targets = policy._get_default_peft_targets()["target_modules"]
+
+    assert "action_expert" in opted_in_targets
+    assert "state_encoder" not in opted_in_targets
+    assert "state_norm" not in opted_in_targets
+    assert "kv_proj" not in opted_in_targets
+
+
+def test_enable_lora_vlm_wraps_loaded_hf_model_locally():
+    """Applying LoRA wraps the model, adds trainable ``lora_`` weights and keeps the expert trainable."""
+    pytest.importorskip("peft")
+
+    class FakeInnerModel(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.transformer = torch.nn.Module()
+            self.transformer.wq = torch.nn.Linear(2, 2)
+            self.action_expert = torch.nn.Module()
+            self.action_expert.action_embed = torch.nn.Linear(2, 2)
+
+    class FakeHFModel(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.config = {}
+            self.model = FakeInnerModel()
+
+        def forward(self, x):
+            return self.model.transformer.wq(x)
+
+    policy = object.__new__(MolmoAct2Policy)
+    torch.nn.Module.__init__(policy)
+    policy.config = SimpleNamespace(
+        checkpoint_path="/tmp/base",
+        lora_rank=2,
+        lora_alpha=4,
+        lora_dropout=0.0,
+        lora_bias="none",
+        enable_lora_action_expert=False,
+        train_action_expert_only=False,
+        enable_inference_cuda_graph=False,
+    )
+    policy.model = FakeHFModel()
+    policy._apply_lora_adapters()
+
+    assert policy._backbone() is policy.model.base_model.model.model
+    trainable_names = [name for name, param in policy.named_parameters() if param.requires_grad]
+    assert trainable_names
+    assert any("lora_" in name for name in trainable_names)
+    assert any("action_expert.action_embed" in name and "lora_" not in name for name in trainable_names)
+    assert policy.model(torch.ones(1, 2)).shape == (1, 2)
+
+
+def test_lora_vlm_unfreezes_action_expert_base_weights():  # only action_expert params are re-enabled
+    class DummyInnerModel(torch.nn.Module):  # exposes transformer.wq and action_expert.action_embed
+        def __init__(self):
+            super().__init__()
+            self.transformer = torch.nn.Module()
+            self.transformer.wq = torch.nn.Linear(2, 2)
+            self.action_expert = torch.nn.Module()
+            self.action_expert.action_embed = torch.nn.Linear(2, 2)
+
+    class DummyHFModel(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.model = DummyInnerModel()
+
+    policy = object.__new__(MolmoAct2Policy)  # allocate without running MolmoAct2Policy.__init__
+    torch.nn.Module.__init__(policy)
+    policy.model = DummyHFModel()
+
+    for param in policy.parameters():  # start from a fully frozen policy
+        param.requires_grad_(False)
+    policy._unfreeze_action_expert_parameters()
+
+    trainable = [name for name, param in policy.named_parameters() if param.requires_grad]
+    assert trainable  # something was unfrozen
+    assert all("action_expert" in name for name in trainable)  # and nothing outside action_expert
+
+
+def test_train_action_expert_only_requires_continuous_action_mode():  # config validation rules
+    with pytest.raises(ValueError, match="requires action_mode='continuous'"):
+        MolmoAct2Config(action_mode="both", train_action_expert_only=True)  # "both" is rejected
+
+    with pytest.raises(ValueError, match="incompatible with enable_lora_vlm"):
+        MolmoAct2Config(action_mode="continuous", train_action_expert_only=True, enable_lora_vlm=True)
+
+    cfg = MolmoAct2Config(action_mode="continuous", train_action_expert_only=True)  # accepted
+    assert cfg.train_action_expert_only
+
+
+def test_molmoact2_sequence_length_is_inferred_from_fixed_token_budget():  # no explicit length given
+    cfg = MolmoAct2Config(
+        action_mode="both",
+        chunk_size=10,
+        n_action_steps=10,
+        image_keys=["observation.images.image", "observation.images.wrist_image"],
+        input_features={OBS_STATE: PolicyFeature(type=FeatureType.STATE, shape=(8,))},
+        output_features={ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(7,))},
+    )
+
+    assert cfg.max_sequence_length is None  # the field itself stays unset
+    assert cfg.inferred_max_sequence_length() == 640
+    assert cfg.inferred_max_sequence_length(include_discrete_action=False) == 576  # smaller budget
+    assert (
+        infer_molmoact2_max_sequence_length(  # module-level helper called with explicit dimensions
+            num_images=2,
+            state_dim=8,
+            action_dim=7,
+            action_horizon=30,
+            include_discrete_action=True,
+        )
+        == 768
+    )
+
+
+def test_molmoact2_sequence_length_override_is_preserved():  # explicit max_sequence_length wins
+    cfg = MolmoAct2Config(max_sequence_length=1024)
+
+    assert cfg.inferred_max_sequence_length(num_images=2, state_dim=8, action_dim=7) == 1024
+
+
+def test_train_action_expert_only_freezes_non_action_expert_params():  # only action_expert trains
+    class DummyBackbone(torch.nn.Module):  # three Linear stand-ins for the backbone submodules
+        def __init__(self):
+            super().__init__()
+            self.transformer = torch.nn.Linear(2, 2)
+            self.vision_backbone = torch.nn.Linear(2, 2)
+            self.action_expert = torch.nn.Linear(2, 2)
+
+        def _require_action_expert(self):
+            return self.action_expert
+
+    class DummyModel(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.model = DummyBackbone()
+            self.lm_head = torch.nn.Linear(2, 2)
+
+    policy = object.__new__(MolmoAct2Policy)  # allocate without running MolmoAct2Policy.__init__
+    torch.nn.Module.__init__(policy)
+    policy.config = SimpleNamespace(train_action_expert_only=True)
+    policy.model = DummyModel()
+
+    policy._freeze_non_action_expert_parameters()
+    policy.train()  # asserts below: action_expert trains, policy.model and transformer do not
+
+    assert policy.model.model.action_expert.training
+    assert not policy.model.training
+    assert not policy.model.model.transformer.training
+    assert all(param.requires_grad for param in policy.model.model.action_expert.parameters())
+    assert not any(param.requires_grad for param in policy.model.model.transformer.parameters())
+    assert not any(param.requires_grad for param in policy.model.model.vision_backbone.parameters())
+    assert not any(param.requires_grad for param in policy.model.lm_head.parameters())
+
+
+def test_load_hf_model_accepts_max_action_horizon_schema(monkeypatch):  # loaders are all stubbed
+    class DummyLoadedModel(torch.nn.Module):  # what the patched from_pretrained returns
+        def __init__(self):
+            super().__init__()
+            self.config = SimpleNamespace(
+                max_action_dim=32,
+                max_action_horizon=30,
+                action_mode="both",
+                add_action_expert=True,
+            )
+            self.model = torch.nn.Module()
+            self.embed_tokens = torch.nn.Embedding(4, 4)
+            self.lm_head = torch.nn.Linear(4, 4, bias=False)
+
+        def get_input_embeddings(self):
+            return self.embed_tokens
+
+    loaded_model = DummyLoadedModel()
+    resolved_kwargs = {}  # kwargs received by the patched checkpoint resolver
+
+    def fake_resolve_checkpoint_location(checkpoint_path, **kwargs):
+        resolved_kwargs.update(kwargs)
+        return checkpoint_path  # echo the path back unchanged
+
+    config_kwargs = {}  # kwargs received by DummyHFConfig.from_pretrained
+    model_kwargs = {}  # kwargs received by the dummy model's from_pretrained
+
+    class DummyHFConfig:
+        @classmethod
+        def from_pretrained(cls, *args, **kwargs):
+            del args
+            config_kwargs.update(kwargs)
+            return SimpleNamespace()
+
+    class DummyMolmoAct2ForConditionalGeneration:
+        @classmethod
+        def from_pretrained(cls, *args, **kwargs):
+            del args
+            model_kwargs.update(kwargs)
+            return loaded_model
+
+    monkeypatch.setattr(molmoact2_modeling, "_resolve_checkpoint_location", fake_resolve_checkpoint_location)
+    monkeypatch.setattr(molmoact2_modeling, "HFMolmoAct2Config", DummyHFConfig)
+    monkeypatch.setattr(
+        molmoact2_modeling,
+        "MolmoAct2ForConditionalGeneration",
+        DummyMolmoAct2ForConditionalGeneration,
+    )
+    monkeypatch.setattr(molmoact2_modeling, "_strict_load_safetensors_weights", lambda *args: None)
+    policy = object.__new__(MolmoAct2Policy)  # allocate without running MolmoAct2Policy.__init__
+    torch.nn.Module.__init__(policy)
+    policy.config = MolmoAct2Config(
+        checkpoint_path="/tmp/new-schema-checkpoint",
+        checkpoint_revision="main",
+        checkpoint_force_download=True,
+        chunk_size=10,
+        n_action_steps=10,
+        action_mode="both",
+    )
+
+    policy._load_hf_model()
+
+    assert policy.model is loaded_model
+    assert not hasattr(policy.model.config, "action_horizon")  # legacy field is not added
+    assert policy.model.config.max_action_horizon == 10  # dummy's 30 replaced by chunk_size=10
+    assert policy._generation_action_horizon() == 10
+    assert resolved_kwargs == {"revision": "main", "force_download": True}
+    assert "trust_remote_code" not in config_kwargs
+    assert "trust_remote_code" not in model_kwargs
+
+
+def test_load_hf_model_chunk_size_overrides_larger_than_checkpoint_horizon(monkeypatch):
+    class DummyLoadedModel(torch.nn.Module):  # what the patched from_pretrained returns
+        def __init__(self):
+            super().__init__()
+            self.config = SimpleNamespace(
+                max_action_dim=32,
+                max_action_horizon=10,  # smaller than the chunk_size=30 requested below
+                action_mode="both",
+                add_action_expert=True,
+            )
+            self.model = torch.nn.Module()
+            self.embed_tokens = torch.nn.Embedding(4, 4)
+            self.lm_head = torch.nn.Linear(4, 4, bias=False)
+
+        def get_input_embeddings(self):
+            return self.embed_tokens
+
+    loaded_model = DummyLoadedModel()
+    monkeypatch.setattr(
+        molmoact2_modeling,
+        "_resolve_checkpoint_location",
+        lambda checkpoint_path, **kwargs: checkpoint_path,  # echo the path back unchanged
+    )
+
+    class DummyHFConfig:
+        @classmethod
+        def from_pretrained(cls, *args, **kwargs):
+            del args, kwargs
+            return SimpleNamespace()
+
+    class DummyMolmoAct2ForConditionalGeneration:
+        @classmethod
+        def from_pretrained(cls, *args, **kwargs):
+            del args, kwargs
+            return loaded_model
+
+    monkeypatch.setattr(molmoact2_modeling, "HFMolmoAct2Config", DummyHFConfig)
+    monkeypatch.setattr(
+        molmoact2_modeling,
+        "MolmoAct2ForConditionalGeneration",
+        DummyMolmoAct2ForConditionalGeneration,
+    )
+    monkeypatch.setattr(molmoact2_modeling, "_strict_load_safetensors_weights", lambda *args: None)
+    policy = object.__new__(MolmoAct2Policy)  # allocate without running MolmoAct2Policy.__init__
+    torch.nn.Module.__init__(policy)
+    policy.config = MolmoAct2Config(
+        checkpoint_path="/tmp/new-schema-checkpoint",
+        chunk_size=30,
+        n_action_steps=30,
+        action_mode="both",
+    )
+
+    policy._load_hf_model()
+
+    assert policy.model.config.max_action_horizon == 30  # raised from the dummy's 10
+    assert policy._generation_action_horizon() == 30
+
+
+def test_load_hf_model_rejects_legacy_action_horizon_schema(monkeypatch):  # loaders are all stubbed
+    class DummyLoadedModel(torch.nn.Module):  # what the patched from_pretrained returns
+        def __init__(self):
+            super().__init__()
+            self.config = SimpleNamespace(
+                max_action_dim=32,
+                action_horizon=30,  # legacy field name; max_action_horizon is absent
+                action_mode="both",
+                add_action_expert=True,
+            )
+            self.model = torch.nn.Module()
+
+    monkeypatch.setattr(
+        molmoact2_modeling,
+        "_resolve_checkpoint_location",
+        lambda checkpoint_path, **kwargs: checkpoint_path,  # echo the path back unchanged
+    )
+
+    class DummyHFConfig:
+        @classmethod
+        def from_pretrained(cls, *args, **kwargs):
+            del args, kwargs
+            return SimpleNamespace()
+
+    class DummyMolmoAct2ForConditionalGeneration:
+        @classmethod
+        def from_pretrained(cls, *args, **kwargs):
+            del args, kwargs
+            return DummyLoadedModel()
+
+    monkeypatch.setattr(molmoact2_modeling, "HFMolmoAct2Config", DummyHFConfig)
+    monkeypatch.setattr(
+        molmoact2_modeling,
+        "MolmoAct2ForConditionalGeneration",
+        DummyMolmoAct2ForConditionalGeneration,
+    )
+    monkeypatch.setattr(molmoact2_modeling, "_strict_load_safetensors_weights", lambda *args: None)
+    policy = object.__new__(MolmoAct2Policy)  # allocate without running MolmoAct2Policy.__init__
+    torch.nn.Module.__init__(policy)
+    policy.config = MolmoAct2Config(
+        checkpoint_path="/tmp/legacy-schema-checkpoint",
+        chunk_size=10,
+        n_action_steps=10,
+        action_mode="both",
+    )
+
+    with pytest.raises(ValueError, match="max_action_horizon"):  # the error must name the new field
+        policy._load_hf_model()
+
+
+def test_rtc_processor_initialization_and_select_action_guard():  # RTC on => select_action refuses
+    policy = object.__new__(MolmoAct2Policy)  # allocate without running MolmoAct2Policy.__init__
+    torch.nn.Module.__init__(policy)
+    policy.config = SimpleNamespace(rtc_config=RTCConfig(enabled=True))
+
+    policy.init_rtc_processor()
+
+    assert policy.rtc_processor is not None  # init_rtc_processor populated the attribute
+    with pytest.raises(AssertionError, match="RTC is not supported for select_action"):
+        policy.select_action({})
+
+
+def test_select_action_uses_single_full_batch_queue():  # one prediction serves two select_action calls
+    policy = object.__new__(MolmoAct2Policy)  # allocate without running MolmoAct2Policy.__init__
+    torch.nn.Module.__init__(policy)
+    policy.config = SimpleNamespace(rtc_config=None, n_action_steps=2)
+    policy._action_queue = deque(maxlen=2)
+    calls = 0  # how many times the stubbed predict_action_chunk ran
+
+    def predict_action_chunk(batch, **kwargs):
+        nonlocal calls
+        del batch, kwargs
+        calls += 1
+        return torch.tensor(  # shape (2, 2, 1): two rows, two steps each, one value per step
+            [
+                [[1.0], [2.0]],
+                [[3.0], [4.0]],
+            ]
+        )
+
+    policy.predict_action_chunk = predict_action_chunk  # instance attribute shadows the real method
+
+    first = policy.select_action({})
+    second = policy.select_action({})
+
+    assert calls == 1  # the second call did not trigger another prediction
+    assert torch.equal(first, torch.tensor([[1.0], [3.0]]))  # step 0 of both rows
+    assert torch.equal(second, torch.tensor([[2.0], [4.0]]))  # step 1 of both rows
+
+
+def test_inference_action_mode_is_explicit_and_has_no_action_mode_alias():  # two rejection paths
+    policy = object.__new__(MolmoAct2Policy)  # allocate without running MolmoAct2Policy.__init__
+    torch.nn.Module.__init__(policy)
+    policy.config = MolmoAct2Config(action_mode="both", inference_action_mode=None)
+    policy._checkpoint_action_mode = None
+
+    with pytest.raises(ValueError, match="inference_action_mode.*explicitly"):  # nothing to resolve from
+        policy._resolve_inference_action_mode(None)
+    with pytest.raises(TypeError, match="unexpected keyword argument 'action_mode'"):  # no such kwarg
+        policy.predict_action_chunk({}, action_mode="continuous")
+
+
+def test_rtc_generation_uses_previous_chunk_prefix():  # a previous-chunk prefix must change the output
+    class DummyActionExpert(torch.nn.Module):  # stub whose forward_with_context returns ones * weight
+        def __init__(self):
+            super().__init__()
+            self.weight = torch.nn.Parameter(torch.tensor(1.0))
+
+        def prepare_context(self, **kwargs):
+            del kwargs
+            return SimpleNamespace()
+
+        def get_or_prepare_modulation_cache(self, timesteps, *, cache_key=None):
+            del cache_key
+            return [SimpleNamespace(conditioning=timestep) for timestep in timesteps]
+
+        def forward_with_context(self, actions, timesteps, *, context, modulation=None):
+            del timesteps, context, modulation
+            return torch.ones_like(actions) * self.weight
+
+    class DummyBackbone(torch.nn.Module):  # minimal backbone with pass-through or constant helpers
+        def __init__(self):
+            super().__init__()
+            self.config = SimpleNamespace(
+                flow_matching_num_steps=2,
+                max_action_horizon=4,
+                max_action_dim=3,
+            )
+            self.action_expert = DummyActionExpert()
+            self.batch_size = 1
+
+        def _require_action_expert(self):
+            return self.action_expert
+
+        def forward(self, **kwargs):
+            self.batch_size = int(kwargs["input_ids"].shape[0])  # remembered for _extract_kv_states
+            return SimpleNamespace(past_key_values=object())
+
+        def _extract_kv_states(self, past_key_values):
+            del past_key_values
+            kv = torch.zeros(self.batch_size, 1, 1)
+            return [(kv, kv)]  # a single (key, value) pair of zeros
+
+        def _get_encoder_attention_mask(self, input_ids, attention_mask):
+            del input_ids
+            return attention_mask
+
+        def _depth_gate_from_condition(self, **kwargs):
+            del kwargs
+            return None, None
+
+        def _apply_depth_gate_to_layer_kv_states(self, encoder_kv_states, depth_mask, depth_gate):
+            del depth_mask, depth_gate
+            return encoder_kv_states
+
+    policy = object.__new__(MolmoAct2Policy)  # allocate without running MolmoAct2Policy.__init__
+    torch.nn.Module.__init__(policy)
+    policy.config = SimpleNamespace(
+        mask_action_dim_padding=True,
+        rtc_config=RTCConfig(enabled=True, execution_horizon=2, max_guidance_weight=1.0),
+    )
+    policy.rtc_processor = None
+    policy.model = torch.nn.Module()
+    policy.model.model = DummyBackbone()
+    policy.init_rtc_processor()
+    model_inputs = {
+        "input_ids": torch.ones(1, 2, dtype=torch.long),
+        "attention_mask": torch.ones(1, 2, dtype=torch.long),
+    }
+    action_dim_is_pad = torch.tensor([[False, False, False]])  # no action dimension is marked padded
+
+    without_prefix = policy._generate_actions_from_inputs_with_rtc(
+        model_inputs=model_inputs,
+        action_dim_is_pad=action_dim_is_pad,
+        num_steps=2,
+        generator=torch.Generator().manual_seed(0),  # same seed as the second call
+        inference_delay=0,
+        prev_chunk_left_over=None,
+        execution_horizon=None,
+    )
+    with_prefix = policy._generate_actions_from_inputs_with_rtc(
+        model_inputs=model_inputs,
+        action_dim_is_pad=action_dim_is_pad,
+        num_steps=2,
+        generator=torch.Generator().manual_seed(0),
+        inference_delay=0,
+        prev_chunk_left_over=torch.zeros(1, 4, 3),  # the only argument that differs from the first call
+        execution_horizon=None,
+    )
+
+    assert without_prefix.shape == (1, 4, 3)
+    assert not torch.allclose(without_prefix, with_prefix)  # seeds match, so the prefix made the change
+
+
+def test_discrete_state_string_matches_molmoact2_bins():  # covers finite, nan and +/-inf inputs
+    state = np.asarray([-1.0, 0.0, 1.0, np.nan, np.inf, -np.inf], dtype=np.float32)
+
+    assert _build_discrete_state_string(state, 256) == (  # expected bins: 0, 128, 255, 128, 255, 0
+        "<state_start><state_0><state_128><state_255><state_128><state_255><state_0><state_end>"
+    )
+
+
+def test_question_normalization_matches_release_prompt_style():  # pins two literal input/output pairs
+    assert _normalize_question_text("Instruction: Pick up the cube, please!") == "pick up the cube, please"
+    assert (
+        _normalize_question_text("The task is to open drawer. Then close it.") == "open drawer; then close it"
+    )
+
+
+def test_action_padding_marks_only_real_dimensions():  # 7 real action dims padded out to 32
+    step = object.__new__(MolmoAct2PackInputsProcessorStep)  # allocate without running __init__
+    step.max_action_dim = 32
+    action = torch.ones(2, 3, 7)
+
+    padded, horizon_is_pad, dim_is_pad = step._pad_action(action, None)
+
+    assert padded.shape == (2, 3, 32)  # only the last axis grew
+    assert torch.equal(padded[..., :7], action)  # original values are kept in front
+    assert torch.count_nonzero(padded[..., 7:]) == 0  # the added tail is all zeros
+    assert not horizon_is_pad.any()
+    assert not dim_is_pad[:, :7].any()
+    assert dim_is_pad[:, 7:].all()
+
+
+def test_action_dim_padding_loss_reduces_like_old_trainer():  # mean over unpadded dims, per sample
+    loss = torch.arange(2 * 2 * 3 * 4, dtype=torch.float32).reshape(2, 2, 3, 4)
+    action_dim_is_pad = torch.tensor(
+        [
+            [False, False, True, True],  # sample 0: first two dims are real
+            [False, True, True, True],  # sample 1: only the first dim is real
+        ]
+    )
+
+    reduced = MolmoAct2Policy._apply_action_dim_padding_mask(loss, action_dim_is_pad)
+
+    expected = torch.stack(  # hand-computed reference: sum of the real dims / their count
+        [
+            loss[0, :, :, :2].sum(dim=-1) / 2,
+            loss[1, :, :, :1].sum(dim=-1) / 1,
+        ],
+        dim=0,
+    )
+    assert torch.equal(reduced, expected)
+
+
+def test_action_chunk_padding_keeps_old_mean_denominator():  # mean is taken over all entries
+    loss = torch.ones(1, 2, 4, 3)
+    action_horizon_is_pad = torch.tensor([[False, False, True, True]])  # 2 of 4 steps are padding
+
+    masked = MolmoAct2Policy._apply_action_chunk_padding_mask(loss, action_horizon_is_pad)
+
+    assert masked.mean().item() == 0.5  # all-ones loss with half the steps padded averages to 0.5
+
+
+def test_selected_discrete_loss_matches_full_causal_lm_loss():  # compares with F.cross_entropy
+    policy = object.__new__(MolmoAct2Policy)  # allocate without running MolmoAct2Policy.__init__
+    torch.nn.Module.__init__(policy)
+    policy.config = SimpleNamespace(
+        softmax_auxiliary_loss=False,  # the z-loss is expected to be None below
+        softmax_auxiliary_loss_scale=1e-4,
+        discrete_loss_token_weighting="none",
+    )
+    policy.model = torch.nn.Module()
+    policy.model.lm_head = torch.nn.Linear(3, 5, bias=False)
+    outputs = type("Outputs", (), {})()  # bare object that only carries last_hidden_state
+    outputs.last_hidden_state = torch.randn(2, 4, 3)
+    labels = torch.tensor(
+        [
+            [-100, 1, 2, -100],
+            [-100, -100, 3, 4],
+        ]
+    )
+
+    selected_loss, z_loss = policy._discrete_loss_from_backbone_outputs({"labels": labels}, outputs)
+
+    logits = policy.model.lm_head(outputs.last_hidden_state)
+    shift_labels = F.pad(labels, (0, 1), value=-100)[..., 1:].contiguous()  # shift left, pad with -100
+    expected_loss = F.cross_entropy(logits.float().view(-1, 5), shift_labels.view(-1), ignore_index=-100)
+    assert torch.allclose(selected_loss, expected_loss)
+    assert z_loss is None
+
+
+def test_discrete_z_loss_matches_old_trainer_formula():  # reference: scale * mean(logsumexp ** 2)
+    policy = object.__new__(MolmoAct2Policy)  # allocate without running MolmoAct2Policy.__init__
+    torch.nn.Module.__init__(policy)
+    policy.config = SimpleNamespace(
+        softmax_auxiliary_loss=True,
+        softmax_auxiliary_loss_scale=1e-4,  # same factor as in expected_z below
+        discrete_loss_token_weighting="none",
+    )
+    policy.model = torch.nn.Module()
+    policy.model.lm_head = torch.nn.Linear(3, 5, bias=False)
+    outputs = type("Outputs", (), {})()  # bare object that only carries last_hidden_state
+    outputs.last_hidden_state = torch.randn(2, 4, 3)
+    labels = torch.tensor(
+        [
+            [-100, 1, 2, -100],
+            [-100, -100, 3, 4],
+        ]
+    )
+
+    ce_loss, z_loss = policy._discrete_loss_from_backbone_outputs({"labels": labels}, outputs)
+
+    logits = policy.model.lm_head(outputs.last_hidden_state).float()
+    shift_labels = F.pad(labels, (0, 1), value=-100)[..., 1:].contiguous()  # shift left, pad with -100
+    valid = shift_labels != -100  # positions that contribute to the reference losses
+    expected_ce = F.cross_entropy(logits.view(-1, 5), shift_labels.view(-1), ignore_index=-100)
+    expected_z = 1e-4 * logits.logsumexp(dim=-1)[valid].pow(2).mean()
+    assert torch.allclose(ce_loss, expected_ce)
+    assert z_loss is not None
+    assert torch.allclose(z_loss, expected_z)
+
+
+def test_discrete_reduction_none_preserves_mean_loss():  # mean of per-sample losses == "mean" result
+    policy = object.__new__(MolmoAct2Policy)  # allocate without running MolmoAct2Policy.__init__
+    torch.nn.Module.__init__(policy)
+    policy.config = SimpleNamespace(
+        softmax_auxiliary_loss=True,
+        softmax_auxiliary_loss_scale=1e-4,
+        discrete_loss_token_weighting="root_subsegments_root_tokens",
+    )
+    policy.model = torch.nn.Module()
+    policy.model.lm_head = torch.nn.Linear(3, 5, bias=False)
+    outputs = type("Outputs", (), {})()  # bare object that only carries last_hidden_state
+    outputs.last_hidden_state = torch.randn(3, 5, 3)
+    labels = torch.tensor(  # rows carry 1, 2 and 4 supervised tokens respectively
+        [
+            [-100, 1, -100, -100, -100],
+            [-100, -100, 2, 3, -100],
+            [-100, 4, 3, 2, 1],
+        ]
+    )
+
+    ce_mean, z_mean = policy._discrete_loss_from_backbone_outputs(
+        {"labels": labels},
+        outputs,
+        reduction="mean",
+    )
+    ce_none, z_none = policy._discrete_loss_from_backbone_outputs(
+        {"labels": labels},
+        outputs,
+        reduction="none",
+    )
+
+    assert ce_none.shape == (3,)  # one value per batch row
+    assert z_none is not None
+    assert z_none.shape == (3,)
+    assert torch.allclose(ce_none.mean(), ce_mean)
+    assert torch.allclose(z_none.mean(), z_mean)
+
+
+def test_forward_reduction_none_returns_per_sample_discrete_loss():  # goes through policy.forward
+    class DummyBackbone(torch.nn.Module):  # returns the fixed hidden states whatever the input
+        def __init__(self, hidden_states):
+            super().__init__()
+            self.hidden_states = hidden_states
+
+        def forward(self, **kwargs):
+            del kwargs
+            return SimpleNamespace(last_hidden_state=self.hidden_states)
+
+    policy = object.__new__(MolmoAct2Policy)  # allocate without running MolmoAct2Policy.__init__
+    torch.nn.Module.__init__(policy)
+    policy.config = SimpleNamespace(
+        action_mode="discrete",
+        inference_action_mode="discrete",
+        model_dtype="float32",
+        softmax_auxiliary_loss=True,
+        softmax_auxiliary_loss_scale=1e-4,
+        discrete_loss_token_weighting="none",
+    )
+    policy.model = torch.nn.Module()
+    policy.model.lm_head = torch.nn.Linear(3, 5, bias=False)
+    hidden_states = torch.randn(2, 4, 3)
+    policy._backbone = lambda: DummyBackbone(hidden_states)  # instance attribute replaces the accessor
+    batch = {
+        "input_ids": torch.ones(2, 4, dtype=torch.long),
+        "labels": torch.tensor(
+            [
+                [-100, 1, 2, -100],
+                [-100, -100, 3, 4],
+            ]
+        ),
+    }
+
+    loss_none, metrics_none = policy.forward(batch, reduction="none")
+    loss_mean, metrics_mean = policy.forward(batch, reduction="mean")
+
+    assert loss_none.shape == (2,)  # one loss per batch row
+    assert torch.allclose(loss_none.mean(), loss_mean)
+    assert metrics_none["loss"] == pytest.approx(metrics_mean["loss"])  # reported metric agrees too
+
+
+def test_discrete_root_token_weighting_matches_old_loss_mask_scaling():  # weighted CE and z-loss
+    policy = object.__new__(MolmoAct2Policy)  # allocate without running MolmoAct2Policy.__init__
+    torch.nn.Module.__init__(policy)
+    policy.config = SimpleNamespace(
+        softmax_auxiliary_loss=True,
+        softmax_auxiliary_loss_scale=1e-4,
+        discrete_loss_token_weighting="root_subsegments_root_tokens",
+    )
+    policy.model = torch.nn.Module()
+    policy.model.lm_head = torch.nn.Linear(3, 5, bias=False)
+    outputs = type("Outputs", (), {})()  # bare object that only carries last_hidden_state
+    outputs.last_hidden_state = torch.randn(2, 4, 3)
+    labels = torch.tensor(  # row 0 has one supervised token, row 1 has three
+        [
+            [-100, -100, 1, -100],
+            [-100, 2, 3, 4],
+        ]
+    )
+
+    ce_loss, z_loss = policy._discrete_loss_from_backbone_outputs({"labels": labels}, outputs)
+
+    logits = policy.model.lm_head(outputs.last_hidden_state).float()
+    shift_labels = F.pad(labels, (0, 1), value=-100)[..., 1:].contiguous()  # shift left, pad with -100
+    valid = shift_labels != -100
+    log_z = logits.logsumexp(dim=-1)
+    token_ce = log_z - logits.gather(dim=-1, index=shift_labels.clamp_min(0).unsqueeze(-1)).squeeze(-1)
+    weights = torch.zeros_like(token_ce)
+    counts = valid.sum(dim=1).float()  # supervised tokens per row
+    weights[valid] = (2.0 / torch.sqrt(counts))[:, None].expand_as(weights)[valid]
+    expected_ce = (token_ce * weights).sum() / weights.sum()  # weighted mean over supervised tokens
+    expected_z = 1e-4 * (log_z.pow(2) * weights).sum() / weights.sum()
+    assert torch.allclose(ce_loss, expected_ce)
+    assert z_loss is not None
+    assert torch.allclose(z_loss, expected_z)
+
+
+class _DummyActionTokenizer:  # test stub: decode() ignores token meaning and encodes the row sum
+    def decode(self, tokens, *, time_horizon=None, action_dim=None):
+        decoded = []
+        for token_row in tokens:  # one (time_horizon, action_dim) array per token row
+            decoded.append(np.full((time_horizon, action_dim), sum(token_row), dtype=np.float32))
+        return np.stack(decoded)  # stacked along a new leading axis
+
+
+def test_discrete_decode_extracts_action_bins_for_each_batch():  # two rows decoded independently
+    policy = object.__new__(MolmoAct2Policy)  # allocate without running MolmoAct2Policy.__init__
+    torch.nn.Module.__init__(policy)
+    policy.config = SimpleNamespace(chunk_size=2)
+    policy.action_tokenizer = _DummyActionTokenizer()  # stub fills each output with the row's token sum
+    policy.model = torch.nn.Module()
+    policy.model.config = SimpleNamespace(
+        action_start_token_id=10,
+        action_end_token_id=11,
+        action_token_start_id=100,
+        num_action_tokens=4,
+        action_horizon=2,
+    )
+
+    actions = policy._decode_discrete_action_chunk(
+        torch.tensor(
+            [
+                [10, 100, 101, 11, 2],
+                [10, 102, 103, 11, 2],
+            ]
+        ),
+        action_dim=2,
+    )
+
+    assert actions.shape == (2, 2, 2)
+    assert torch.equal(actions[0], torch.ones(2, 2))  # row sum 1; presumably bins 0 + 1
+    assert torch.equal(actions[1], torch.full((2, 2), 5.0))  # row sum 5; presumably bins 2 + 3
+
+
+def test_discrete_predict_action_chunk_uses_hf_cached_generation_path():  # scripted token-by-token run
+    class DummyOutput:  # logits that strongly favour a single token id
+        def __init__(self, token_id, batch_size):
+            logits = torch.full((batch_size, 1, 128), -1e9)
+            logits[:, :, token_id] = 1.0
+            self.logits = logits
+            self.past_key_values = object()
+
+    class DummyModel(torch.nn.Module):  # replays self.tokens one entry at a time
+        def __init__(self):
+            super().__init__()
+            self.weight = torch.nn.Parameter(torch.tensor(1.0))
+            self.config = SimpleNamespace(
+                action_start_token_id=10,
+                action_end_token_id=11,
+                action_token_start_id=100,
+                num_action_tokens=4,
+                action_horizon=2,
+            )
+            self.tokens = [10, 100, 101, 11, 2]  # start id, two action ids, end id, eos id
+            self.index = 0
+
+        def forward(self, **kwargs):
+            batch_size = int(kwargs["input_ids"].shape[0])
+            return DummyOutput(self.tokens[self.index], batch_size)
+
+        def _consume_generation_tokens(self, token_ids, *, past_key_values, attention_mask):
+            del past_key_values
+            self.index += 1  # advance to the next scripted token
+            if attention_mask is not None:
+                attention_mask = torch.cat([attention_mask, torch.ones_like(token_ids[:, None])], dim=-1)
+            return DummyOutput(self.tokens[self.index], int(token_ids.shape[0])), attention_mask
+
+        def _require_eos_token_id(self):
+            return 2
+
+        def _action_token_id_to_bin(self):
+            return {100: 0, 101: 1, 102: 2, 103: 3}
+
+    policy = object.__new__(MolmoAct2Policy)  # allocate without running MolmoAct2Policy.__init__
+    torch.nn.Module.__init__(policy)
+    policy.config = MolmoAct2Config(
+        action_mode="discrete",
+        inference_action_mode="discrete",
+        model_dtype="float32",
+        output_features={ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(2,))},
+        discrete_generation_max_steps=None,
+        discrete_action_tokenizer="unused",
+        chunk_size=2,
+        n_action_steps=1,
+        rtc_config=None,
+    )
+    policy._checkpoint_action_mode = None
+    policy.model = DummyModel()
+    policy.action_tokenizer = _DummyActionTokenizer()
+
+    actions = policy.predict_action_chunk(
+        {
+            "input_ids": torch.ones(1, 3, dtype=torch.long),
+            "attention_mask": torch.ones(1, 3, dtype=torch.long),
+        }
+    )
+
+    assert policy.model.index == 4  # advanced to the last scripted token
+    assert actions.shape == (1, 1, 2)
+    assert torch.equal(actions, torch.ones(1, 1, 2))
+
+
+def test_discrete_predict_action_chunk_uses_graph_backed_ar_decode_when_enabled():
+    class DummyOutput:  # logits that strongly favour a single token id
+        def __init__(self, token_id, past_key_values):
+            logits = torch.full((1, 1, 128), -1e9)
+            logits[:, :, token_id] = 1.0
+            self.logits = logits
+            self.past_key_values = past_key_values
+
+    class DummyLmHead(torch.nn.Module):  # reads a token id from hidden_states[0, 0, 0]
+        def forward(self, hidden_states):
+            token_id = int(hidden_states[0, 0, 0].item())
+            logits = torch.full((1, 1, 128), -1e9)
+            logits[:, :, token_id] = 1.0
+            return logits
+
+    class DummyModel(torch.nn.Module):  # replays self.tokens and records which path was used
+        def __init__(self):
+            super().__init__()
+            self.weight = torch.nn.Parameter(torch.tensor(1.0))
+            self.lm_head = DummyLmHead()
+            self.config = SimpleNamespace(
+                action_start_token_id=10,
+                action_end_token_id=11,
+                action_token_start_id=100,
+                num_action_tokens=4,
+                action_horizon=2,
+            )
+            self.tokens = [10, 100, 101, 11, 2]  # start id, two action ids, end id, eos id
+            self.index = 0
+            self.used_static_cache = False
+            self.graph_steps = 0  # number of _run_ar_decode_step calls
+
+        def forward(self, **kwargs):
+            self.used_static_cache = kwargs.get("past_key_values") == "static-cache"
+            return DummyOutput(self.tokens[self.index], kwargs.get("past_key_values"))
+
+        def _make_ar_decode_static_cache(self, inputs, *, max_steps):
+            assert int(inputs["input_ids"].shape[1]) == 3
+            assert max_steps == 32
+            return "static-cache"  # sentinel the other stubs check for
+
+        def _make_depth_decode_attention_bias(self, inputs, past_key_values):
+            assert past_key_values == "static-cache"
+            return torch.ones(1, 1, 35, 35, dtype=torch.float32)
+
+        def _run_ar_decode_step(self, token_ids, *, past_key_values, attention_bias):
+            assert past_key_values == "static-cache"
+            assert attention_bias.shape == (1, 1, 35, 35)
+            self.index += 1  # advance to the next scripted token
+            self.graph_steps += 1
+            return torch.tensor([[[float(self.tokens[self.index])]]]), past_key_values
+
+        def _require_eos_token_id(self):
+            return 2
+
+        def _action_token_id_to_bin(self):
+            return {100: 0, 101: 1, 102: 2, 103: 3}
+
+    policy = object.__new__(MolmoAct2Policy)  # allocate without running MolmoAct2Policy.__init__
+    torch.nn.Module.__init__(policy)
+    policy.config = MolmoAct2Config(
+        action_mode="discrete",
+        inference_action_mode="discrete",
+        model_dtype="float32",
+        output_features={ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(2,))},
+        discrete_generation_max_steps=None,
+        discrete_action_tokenizer="unused",
+        chunk_size=2,
+        n_action_steps=1,
+        rtc_config=None,
+        enable_inference_cuda_graph=True,
+    )
+    policy._checkpoint_action_mode = None
+    policy.model = DummyModel()
+    policy.action_tokenizer = _DummyActionTokenizer()
+    torch.nn.Module.train(policy, False)  # eval mode via the base-class method, bypassing any override
+
+    actions = policy.predict_action_chunk(
+        {
+            "input_ids": torch.ones(1, 3, dtype=torch.long),
+            "attention_mask": torch.ones(1, 3, dtype=torch.long),
+        }
+    )
+
+    assert policy.model.used_static_cache  # forward() received the "static-cache" sentinel
+    assert policy.model.graph_steps == 4  # one decode step per token after the first
+    assert actions.shape == (1, 1, 2)
+    assert torch.equal(actions, torch.ones(1, 1, 2))
+
+
+class _DummyMolmoBackbone(torch.nn.Module):  # test stub holding a single 5x3 embedding table
+    def __init__(self):
+        super().__init__()
+        self.embed = torch.nn.Embedding(5, 3)
+
+    def get_input_embeddings(self):  # returns the embedding module itself
+        return self.embed
+
+
+class _DummyMolmoModel(torch.nn.Module):  # test stub: backbone plus an lm_head, optionally weight-tied
+    def __init__(self, *, tie_lm_head: bool = False):
+        super().__init__()
+        self.model = _DummyMolmoBackbone()
+        self.lm_head = torch.nn.Linear(3, 5, bias=False)
+        if tie_lm_head:
+            self.lm_head.weight = self.model.embed.weight  # share one Parameter between the two
+
+    def get_input_embeddings(self):  # returns the backbone's embedding module
+        return self.model.embed
+
+
+def test_freeze_embedding_freezes_input_embeddings_only_when_untied():  # untied lm_head stays trainable
+    policy = object.__new__(MolmoAct2Policy)  # allocate without running MolmoAct2Policy.__init__
+    torch.nn.Module.__init__(policy)
+    policy.model = _DummyMolmoModel()  # default tie_lm_head=False
+
+    policy._freeze_input_embeddings()
+
+    assert not policy.model.model.embed.weight.requires_grad  # embedding table is frozen
+    assert policy.model.lm_head.weight.requires_grad  # separate lm_head weight is untouched
+
+
+def test_freeze_embedding_rejects_tied_lm_head_without_mutating():  # tied weights raise, nothing frozen
+    policy = object.__new__(MolmoAct2Policy)  # allocate without running MolmoAct2Policy.__init__
+    torch.nn.Module.__init__(policy)
+    policy.model = _DummyMolmoModel(tie_lm_head=True)  # lm_head shares the embedding Parameter
+
+    with pytest.raises(RuntimeError, match="would also freeze lm_head"):
+        policy._freeze_input_embeddings()
+
+    assert policy.model.model.embed.weight.requires_grad  # still trainable after the failed call
diff --git a/uv.lock b/uv.lock
index 3eb1dda23..eebbb7f95 100644
--- a/uv.lock
+++ b/uv.lock
@@ -2915,6 +2915,11 @@ metaworld = [
     { name = "scipy" },
     { name = "torchcodec", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'AMD64' and sys_platform == 'linux') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'arm64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or sys_platform == 'win32'" },
 ]
+molmoact2 = [
+    { name = "peft" },
+    { name = "scipy" },
+    { name = "transformers" },
+]
 motorbridge-dep = [
     { name = "motorbridge" },
 ]
@@ -3131,6 +3136,7 @@ requires-dist = [
     { name = "lerobot", extras = ["matplotlib-dep"], marker = "extra == 'sarm'" },
     { name = "lerobot", extras = ["matplotlib-dep"], marker = "extra == 'unitree-g1'" },
     { name = "lerobot", extras = ["metaworld"], marker = "extra == 'all'" },
+    { name = "lerobot", extras = ["molmoact2"], marker = "extra == 'all'" },
     { name = "lerobot", extras = ["motorbridge-dep"], marker = "extra == 'rebot'" },
     { name = "lerobot", extras = ["motorbridge-smart-servo-dep"], marker = "extra == 'rebot'" },
     { name = "lerobot", extras = ["multi-task-dit"], marker = "extra == 'all'" },
@@ -3138,6 +3144,7 @@ requires-dist = [
     { name = "lerobot", extras = ["openarms"], marker = "extra == 'all'" },
     { name = "lerobot", extras = ["peft"], marker = "extra == 'all'" },
     { name = "lerobot", extras = ["peft-dep"], marker = "extra == 'groot'" },
+    { name = "lerobot", extras = ["peft-dep"], marker = "extra == 'molmoact2'" },
     { name = "lerobot", extras = ["peft-dep"], marker = "extra == 'peft'" },
     { name = "lerobot", extras = ["peft-dep"], marker = "extra == 'wallx'" },
     { name = "lerobot", extras = ["phone"], marker = "extra == 'all'" },
@@ -3165,6 +3172,7 @@ requires-dist = [
     { name = "lerobot", extras = ["scipy-dep"], marker = "extra == 'aloha'" },
     { name = "lerobot", extras = ["scipy-dep"], marker = "extra == 'libero'" },
     { name = "lerobot", extras = ["scipy-dep"], marker = "extra == 'metaworld'" },
+    { name = "lerobot", extras = ["scipy-dep"], marker = "extra == 'molmoact2'" },
     { name = "lerobot", extras = ["scipy-dep"], marker = "extra == 'phone'" },
     { name = "lerobot", extras = ["scipy-dep"], marker = "extra == 'pi'" },
     { name = "lerobot", extras = ["scipy-dep"], marker = "extra == 'wallx'" },
@@ -3176,6 +3184,7 @@ requires-dist = [
     { name = "lerobot", extras = ["transformers-dep"], marker = "extra == 'groot'" },
     { name = "lerobot", extras = ["transformers-dep"], marker = "extra == 'hilserl'" },
     { name = "lerobot", extras = ["transformers-dep"], marker = "extra == 'libero'" },
+    { name = "lerobot", extras = ["transformers-dep"], marker = "extra == 'molmoact2'" },
     { name = "lerobot", extras = ["transformers-dep"], marker = "extra == 'multi-task-dit'" },
     { name = "lerobot", extras = ["transformers-dep"], marker = "extra == 'peft'" },
     { name = "lerobot", extras = ["transformers-dep"], marker = "extra == 'pi'" },
@@ -3249,7 +3258,7 @@ requires-dist = [
     { name = "transformers", marker = "extra == 'transformers-dep'", specifier = ">=5.4.0,<5.6.0" },
     { name = "wandb", marker = "extra == 'training'", specifier = ">=0.24.0,<0.25.0" },
 ]
-provides-extras = ["dataset", "training", "hardware", "viz", "core-scripts", "evaluation", "dataset-viz", "av-dep", "pygame-dep", "placo-dep", "transformers-dep", "grpcio-dep", "can-dep", "peft-dep", "scipy-dep", "diffusers-dep", "qwen-vl-utils-dep", "matplotlib-dep", "pyserial-dep", "deepdiff-dep", "pynput-dep", "pyzmq-dep", "motorbridge-dep", "motorbridge-smart-servo-dep", "feetech", "dynamixel", "damiao", "robstride", "openarms", "gamepad", "hopejr", "lekiwi", "unitree-g1", "reachy2", "rebot", "kinematics", "intelrealsense", "phone", "diffusion", "wallx", "pi", "smolvla", "multi-task-dit", "groot", "sarm", "topreward", "xvla", "eo1", "hilserl", "async", "peft", "dev", "notebook", "test", "video-benchmark", "aloha", "pusht", "libero", "metaworld", "all"]
+provides-extras = ["dataset", "training", "hardware", "viz", "core-scripts", "evaluation", "dataset-viz", "av-dep", "pygame-dep", "placo-dep", "transformers-dep", "grpcio-dep", "can-dep", "peft-dep", "scipy-dep", "diffusers-dep", "qwen-vl-utils-dep", "matplotlib-dep", "pyserial-dep", "deepdiff-dep", "pynput-dep", "pyzmq-dep", "motorbridge-dep", "motorbridge-smart-servo-dep", "feetech", "dynamixel", "damiao", "robstride", "openarms", "gamepad", "hopejr", "lekiwi", "unitree-g1", "reachy2", "rebot", "kinematics", "intelrealsense", "phone", "diffusion", "wallx", "pi", "molmoact2", "smolvla", "multi-task-dit", "groot", "sarm", "topreward", "xvla", "eo1", "hilserl", "async", "peft", "dev", "notebook", "test", "video-benchmark", "aloha", "pusht", "libero", "metaworld", "all"]
 
 [[package]]
 name = "librt"

From b8ad81bf397d59dda69ccfc7e74e847f0a9d4fbf Mon Sep 17 00:00:00 2001
From: Khalil Meftah <khalil.meftah@huggingface.co>
Date: Fri, 29 May 2026 21:45:39 +0200
Subject: [PATCH 07/45] feat(rewards): add ROBOMETER reward model (#3627)

* feat/add ROBOMETER reward model

* feat(rewards): add Robometer offline progress labeling script

* fix(rewards/robometer): add missing input keys mm_token_type_ids

* chore(rewards/robometer): default to lerobot/Robometer-4b model

* doc(rewards/robometer): update citation and original github link

* feat(rewards/robometer): add image key argument to compute Robometer progress
---
 docs/source/_toctree.yml                      |   2 +
 docs/source/robometer.mdx                     | 185 +++++++
 pyproject.toml                                |   2 +
 src/lerobot/rewards/__init__.py               |   2 +
 src/lerobot/rewards/factory.py                |  18 +-
 src/lerobot/rewards/robometer/__init__.py     |  19 +
 .../rewards/robometer/compute_rabc_weights.py | 320 ++++++++++++
 .../robometer/configuration_robometer.py      | 158 ++++++
 .../rewards/robometer/modeling_robometer.py   | 481 ++++++++++++++++++
 .../rewards/robometer/processor_robometer.py  | 338 ++++++++++++
 .../lerobot_rewardmodel_modelcard_template.md |   2 +
 tests/rewards/test_modeling_robometer.py      | 340 +++++++++++++
 tests/rewards/test_robometer_processor.py     | 354 +++++++++++++
 uv.lock                                       |  11 +-
 14 files changed, 2229 insertions(+), 3 deletions(-)
 create mode 100644 docs/source/robometer.mdx
 create mode 100644 src/lerobot/rewards/robometer/__init__.py
 create mode 100644 src/lerobot/rewards/robometer/compute_rabc_weights.py
 create mode 100644 src/lerobot/rewards/robometer/configuration_robometer.py
 create mode 100644 src/lerobot/rewards/robometer/modeling_robometer.py
 create mode 100644 src/lerobot/rewards/robometer/processor_robometer.py
 create mode 100644 tests/rewards/test_modeling_robometer.py
 create mode 100644 tests/rewards/test_robometer_processor.py

diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index 1d4d9e770..a216548d8 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -75,6 +75,8 @@
 - sections:
   - local: sarm
     title: SARM
+  - local: robometer
+    title: ROBOMETER
   - local: topreward
     title: TOPReward
   title: "Reward Models"
diff --git a/docs/source/robometer.mdx b/docs/source/robometer.mdx
new file mode 100644
index 000000000..5af6882d3
--- /dev/null
+++ b/docs/source/robometer.mdx
@@ -0,0 +1,185 @@
+# ROBOMETER
+
+ROBOMETER is a **general-purpose video-language robotic reward model**. It predicts dense, frame-level task progress and frame-level success from a trajectory video and a task description.
+
+- **Paper**: [ROBOMETER: Scaling General-Purpose Robotic Reward Models via Trajectory Comparisons](https://arxiv.org/abs/2603.02115)
+- **Project**: [robometer.github.io](https://robometer.github.io/)
+- **Original code**: [github.com/robometer/robometer](https://github.com/robometer/robometer)
+- **Checkpoint**: [lerobot/Robometer-4B](https://huggingface.co/lerobot/Robometer-4B)
+
+## Overview
+
+ROBOMETER builds on `Qwen/Qwen3-VL-4B-Instruct` and adds three lightweight prediction heads:
+
+- **Progress head**: predicts per-frame task progress in `[0, 1]`.
+- **Success head**: predicts per-frame task success probability.
+- **Preference head**: predicts which of two trajectories better completes the task during training.
+
+The paper trains ROBOMETER with a composite objective:
+
+```text
+L = L_pref + L_prog + L_succ
+```
+
+The LeRobot integration is currently **inference-only**. It preserves the preference head so that the published `Robometer-4B` checkpoint loads without remapping, but `compute_reward()` queries the progress or success head only.
+
+## What the LeRobot Integration Covers
+
+- Standard `reward_model.type=robometer` configuration through LeRobot.
+- Qwen3-VL image and text preprocessing through `RobometerEncoderProcessorStep`.
+- LeRobot reward-model save/load APIs through `PreTrainedRewardModel`.
+- Dense, frame-level progress and success predictions internally.
+- A scalar reward through `compute_reward()` for downstream LeRobot reward-model usage.
+
+This page focuses on using the published ROBOMETER checkpoint as a zero-shot reward model. Training ROBOMETER from scratch is outside the current LeRobot integration.
+
+## Installation Requirements
+
+1. Install LeRobot by following the [Installation Guide](./installation).
+2. Install the ROBOMETER dependencies:
+
+```bash
+pip install -e ".[robometer]"
+```
+
+If you use `uv` directly from a source checkout:
+
+```bash
+uv sync --extra robometer
+```
+
+ROBOMETER uses a Qwen3-VL-4B backbone, so GPU inference is strongly recommended.
+
+## Model Inputs and Outputs
+
+ROBOMETER expects:
+
+- A trajectory video or sequence of frames.
+- A natural-language task description.
+
+In LeRobot datasets, the preprocessor reads:
+
+| Config field              | Default                  | Meaning                                               |
+| ------------------------- | ------------------------ | ----------------------------------------------------- |
+| `reward_model.image_key`  | `observation.images.top` | Camera/video observation used by ROBOMETER            |
+| `reward_model.task_key`   | `task`                   | Key in complementary data that stores the task string |
+| `reward_model.max_frames` | `8`                      | Maximum number of frames passed to ROBOMETER          |
+
+The model predicts per-frame progress and success internally. The LeRobot reward API returns a scalar per sample:
+
+- `reward_output="progress"` (default): return the last-frame progress, clamped to `[0, 1]`.
+- `reward_output="success"`: return `1.0` if the last-frame success probability is above `success_threshold`, otherwise `0.0`.
+
+## Usage
+
+### Load the Reward Model Directly
+
+```python
+from lerobot.rewards.robometer import RobometerConfig, RobometerRewardModel
+
+cfg = RobometerConfig(
+    pretrained_path="lerobot/Robometer-4B",
+    device="cuda",
+    reward_output="progress",
+)
+reward_model = RobometerRewardModel.from_pretrained(cfg.pretrained_path, config=cfg)
+```
+
+### Encode Frames and Compute a Reward
+
+For a direct Python call, provide frames as `uint8` arrays with shape `(T, H, W, C)` and a task string:
+
+```python
+from lerobot.rewards.robometer.modeling_robometer import ROBOMETER_FEATURE_PREFIX
+from lerobot.rewards.robometer.processor_robometer import RobometerEncoderProcessorStep
+
+# frames: np.ndarray, shape (T, H, W, C), dtype uint8
+# task: str
+encoder = RobometerEncoderProcessorStep(
+    base_model_id=cfg.base_model_id,
+    use_multi_image=cfg.use_multi_image,
+    use_per_frame_progress_token=cfg.use_per_frame_progress_token,
+    max_frames=cfg.max_frames,
+)
+
+encoded = encoder.encode_samples([(frames, task)])
+batch = {f"{ROBOMETER_FEATURE_PREFIX}{key}": value for key, value in encoded.items()}
+
+reward = reward_model.compute_reward(batch)
+```
+
+`reward` is a tensor of shape `(batch_size,)`.
+
+### Use the Reward Factory
+
+You can also instantiate ROBOMETER through the reward factory:
+
+```python
+from lerobot.rewards import make_reward_model, make_reward_model_config, make_reward_pre_post_processors
+
+cfg = make_reward_model_config(
+    "robometer",
+    pretrained_path="lerobot/Robometer-4B",
+    device="cuda",
+    image_key="observation.images.top",
+)
+reward_model = make_reward_model(cfg)
+preprocessor, postprocessor = make_reward_pre_post_processors(cfg)
+```
+
+The preprocessor writes Qwen-VL tensors under the `observation.robometer.*` namespace, and `compute_reward()` reads those encoded tensors.
+
+## Configuration Notes
+
+### Backbone and Vocabulary
+
+The published checkpoint uses a Qwen3-VL-4B backbone. ROBOMETER adds five special tokens to the tokenizer in a fixed order:
+
+```text
+<|split_token|>
+<|reward_token|>
+<|pref_token|>
+<|sim_token|>
+<|prog_token|>
+```
+
+`<|prog_token|>` is inserted after each frame and is the hidden-state position used for per-frame progress and success prediction. `<|split_token|>` and `<|pref_token|>` are used by the paper's pairwise trajectory preference objective. `<|reward_token|>` and `<|sim_token|>` are preserved for checkpoint compatibility.
+
+The LeRobot config stores a serialized `vlm_config` with the post-resize vocabulary so the model can reload from `config.json` without downloading the base Qwen weights first. For `Qwen/Qwen3-VL-4B-Instruct`, the tokenizer length is `151669`, and the five ROBOMETER tokens produce the checkpoint vocabulary size `151674`.
+
+### Progress Prediction
+
+In the published checkpoint, progress is discrete. The progress head outputs logits over `progress_discrete_bins=10` uniformly spaced bin centers in `[0, 1]`. LeRobot converts these logits into a continuous value by applying a softmax and taking the expectation over bin centers, matching the upstream ROBOMETER implementation.
+
+### Success Prediction
+
+The success head outputs raw logits per frame. LeRobot converts them to probabilities with `sigmoid`. When `reward_output="success"`, `compute_reward()` thresholds the last-frame success probability using `success_threshold`.
+
+## Limitations
+
+- The current LeRobot integration is inference-only; it does not implement ROBOMETER training or preference-pair training.
+- `compute_reward()` returns a scalar per sample for the LeRobot reward-model API, even though ROBOMETER predicts per-frame progress and success internally.
+- ROBOMETER is video-language based; it does not use privileged robot state such as contact forces or object poses.
+
+## References
+
+- [ROBOMETER project](https://robometer.github.io/)
+- [ROBOMETER paper](https://arxiv.org/abs/2603.02115)
+- [Original ROBOMETER code](https://github.com/robometer/robometer)
+- [Published ROBOMETER-4B checkpoint](https://huggingface.co/lerobot/Robometer-4B)
+- [Qwen3-VL-4B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct)
+
+## Citation
+
+```bibtex
+@inproceedings{liang2026robometer,
+title = {Robometer: Scaling General-Purpose Robotic Reward Models via Trajectory Comparisons},
+author={Anthony Liang and Yigit Korkmaz and Jiahui Zhang and Minyoung Hwang and Abrar Anwar and Sidhant Kaushik and Aditya Shah and Alex S. Huang and Luke Zettlemoyer and Dieter Fox and Yu Xiang and Anqi Li and Andreea Bobu and Abhishek Gupta and Stephen Tu and Erdem Biyik and Jesse Zhang},
+year={2026},
+booktitle={Robotics: Science and Systems 2026},
+}
+```
+
+## License
+
+This LeRobot integration follows the **Apache 2.0 License** used by LeRobot. Check the upstream ROBOMETER code and model pages for the licenses of the original implementation and released checkpoints.
diff --git a/pyproject.toml b/pyproject.toml
index a6785c564..ef7a36873 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -212,6 +212,7 @@ groot = [
     "flash-attn>=2.5.9,<3.0.0 ; sys_platform != 'darwin'"
 ]
 sarm = ["lerobot[transformers-dep]", "pydantic>=2.0.0,<3.0.0", "faker>=33.0.0,<35.0.0", "lerobot[matplotlib-dep]", "lerobot[qwen-vl-utils-dep]"]
+robometer = ["lerobot[transformers-dep]", "lerobot[qwen-vl-utils-dep]", "lerobot[peft-dep]"]
 topreward = ["lerobot[transformers-dep]"]
 xvla = ["lerobot[transformers-dep]"]
 eo1 = ["lerobot[transformers-dep]", "lerobot[qwen-vl-utils-dep]"]
@@ -291,6 +292,7 @@ all = [
     "lerobot[libero]; sys_platform == 'linux'",
     "lerobot[metaworld]",
     "lerobot[sarm]",
+    "lerobot[robometer]",
     "lerobot[topreward]",
     "lerobot[peft]",
     # "lerobot[unitree_g1]", TODO: Unitree requires specific installation instructions for unitree_sdk2
diff --git a/src/lerobot/rewards/__init__.py b/src/lerobot/rewards/__init__.py
index ae23424e3..093a34cb2 100644
--- a/src/lerobot/rewards/__init__.py
+++ b/src/lerobot/rewards/__init__.py
@@ -20,12 +20,14 @@ from .factory import (
     make_reward_pre_post_processors as make_reward_pre_post_processors,
 )
 from .pretrained import PreTrainedRewardModel as PreTrainedRewardModel
+from .robometer.configuration_robometer import RobometerConfig as RobometerConfig
 from .sarm.configuration_sarm import SARMConfig as SARMConfig
 from .topreward.configuration_topreward import TOPRewardConfig as TOPRewardConfig
 
 __all__ = [
     # Configuration classes
     "RewardClassifierConfig",
+    "RobometerConfig",
     "SARMConfig",
     "TOPRewardConfig",
     # Base class
diff --git a/src/lerobot/rewards/factory.py b/src/lerobot/rewards/factory.py
index d500cc593..2d73ae575 100644
--- a/src/lerobot/rewards/factory.py
+++ b/src/lerobot/rewards/factory.py
@@ -25,6 +25,7 @@ from lerobot.processor import PolicyAction, PolicyProcessorPipeline
 
 from .classifier.configuration_classifier import RewardClassifierConfig
 from .pretrained import PreTrainedRewardModel
+from .robometer.configuration_robometer import RobometerConfig
 from .sarm.configuration_sarm import SARMConfig
 from .topreward.configuration_topreward import TOPRewardConfig
 
@@ -38,7 +39,7 @@ def get_reward_model_class(name: str) -> type[PreTrainedRewardModel]:
 
     Args:
         name: The name of the reward model. Supported names are "reward_classifier",
-              "sarm", "topreward".
+              "sarm", "robometer", "topreward".
 
     Returns:
         The reward model class corresponding to the given name.
@@ -54,6 +55,10 @@ def get_reward_model_class(name: str) -> type[PreTrainedRewardModel]:
         from lerobot.rewards.sarm.modeling_sarm import SARMRewardModel
 
         return SARMRewardModel
+    elif name == "robometer":
+        from lerobot.rewards.robometer.modeling_robometer import RobometerRewardModel
+
+        return RobometerRewardModel
     elif name == "topreward":
         from lerobot.rewards.topreward.modeling_topreward import TOPRewardModel
 
@@ -74,7 +79,7 @@ def make_reward_model_config(reward_type: str, **kwargs) -> RewardModelConfig:
 
     Args:
         reward_type: The type of the reward model. Supported types include
-                     "reward_classifier", "sarm", "topreward".
+                     "reward_classifier", "sarm", "robometer", "topreward".
         **kwargs: Keyword arguments to be passed to the configuration class constructor.
 
     Returns:
@@ -87,6 +92,8 @@ def make_reward_model_config(reward_type: str, **kwargs) -> RewardModelConfig:
         return RewardClassifierConfig(**kwargs)
     elif reward_type == "sarm":
         return SARMConfig(**kwargs)
+    elif reward_type == "robometer":
+        return RobometerConfig(**kwargs)
     elif reward_type == "topreward":
         return TOPRewardConfig(**kwargs)
     else:
@@ -168,6 +175,13 @@ def make_reward_pre_post_processors(
             dataset_stats=kwargs.get("dataset_stats"),
             dataset_meta=kwargs.get("dataset_meta"),
         )
+    elif isinstance(reward_cfg, RobometerConfig):
+        from lerobot.rewards.robometer.processor_robometer import make_robometer_pre_post_processors
+
+        return make_robometer_pre_post_processors(
+            config=reward_cfg,
+            dataset_stats=kwargs.get("dataset_stats"),
+        )
 
     elif isinstance(reward_cfg, TOPRewardConfig):
         from lerobot.rewards.topreward.processor_topreward import make_topreward_pre_post_processors
diff --git a/src/lerobot/rewards/robometer/__init__.py b/src/lerobot/rewards/robometer/__init__.py
new file mode 100644
index 000000000..d20d92d37
--- /dev/null
+++ b/src/lerobot/rewards/robometer/__init__.py
@@ -0,0 +1,19 @@
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .configuration_robometer import RobometerConfig
+from .modeling_robometer import RobometerRewardModel
+from .processor_robometer import make_robometer_pre_post_processors
+
+__all__ = ["RobometerConfig", "RobometerRewardModel", "make_robometer_pre_post_processors"]
diff --git a/src/lerobot/rewards/robometer/compute_rabc_weights.py b/src/lerobot/rewards/robometer/compute_rabc_weights.py
new file mode 100644
index 000000000..8e2b016fb
--- /dev/null
+++ b/src/lerobot/rewards/robometer/compute_rabc_weights.py
@@ -0,0 +1,320 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Compute per-frame Robometer progress curves for a LeRobot dataset.
+
+For each episode, builds per-frame sub-samples using the frame-steps
+strategy from the Robometer eval server: for each original frame ``t``,
+linspace-subsample ``[0, t]`` into ``K`` frames (default 4, matching
+``NUM_SUBSAMPLED_FRAMES`` in the eval server), run one forward through
+the Robometer processor + model, and keep the last-frame progress value.
+All sub-samples are the same size ``K`` so they batch cleanly.
+
+The parquet uses the same schema as SARM's
+:mod:`lerobot.rewards.sarm.compute_rabc_weights` so existing consumers —
+:class:`lerobot.rewards.sarm.rabc.RABCWeights` (which reads
+``progress_sparse``) and the progress-overlay script in
+``examples/dataset/create_progress_videos.py`` — work without modification.
+
+Usage:
+    # Dense per-frame progress for one episode
+    python -m lerobot.rewards.robometer.compute_rabc_weights \\
+        --dataset-repo-id lerobot/libero_10_image \\
+        --reward-model-path lerobot/Robometer-4B \\
+        --episodes 0
+
+    # All episodes with batching
+    python -m lerobot.rewards.robometer.compute_rabc_weights \\
+        --dataset-repo-id lerobot/libero_10_image \\
+        --reward-model-path lerobot/Robometer-4B \\
+        --batch-size 16
+"""
+
+from __future__ import annotations
+
+import argparse
+import logging
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import pyarrow as pa
+import pyarrow.parquet as pq
+import torch
+from tqdm import tqdm
+
+from lerobot.datasets import LeRobotDataset
+from lerobot.rewards.robometer.configuration_robometer import RobometerConfig
+from lerobot.rewards.robometer.modeling_robometer import RobometerRewardModel
+from lerobot.rewards.robometer.processor_robometer import RobometerEncoderProcessorStep
+from lerobot.types import TransitionKey
+
+DEFAULT_OUTPUT_FILENAME = "robometer_progress.parquet"
+
+# Upstream Robometer eval server uses K=4 for frame-steps sub-samples.
+DEFAULT_NUM_SUBSAMPLED_FRAMES = 4
+
+
+def get_reward_model_path_from_parquet(parquet_path: Path) -> str | None:
+    """Return the ``reward_model_path`` stored in the parquet schema metadata, or ``None``."""
+    if not parquet_path.exists():
+        return None
+    try:
+        schema_meta = pq.read_metadata(parquet_path).schema.to_arrow_schema().metadata or {}
+        encoded = schema_meta.get(b"reward_model_path")
+        return None if encoded is None else encoded.decode()
+    except Exception:  # nosec B110
+        # Best-effort lookup: any failure while reading or decoding means "no metadata".
+        return None
+
+
+def _resolve_task(sample: dict[str, Any], default: str) -> str:
+    """Return the sample's ``"task"`` entry when it is a non-empty string, else ``default``."""
+    candidate = sample.get("task")
+    if not isinstance(candidate, str) or not candidate:
+        return default
+    return candidate
+
+
+def _build_subsample_indices(num_frames: int, num_subsampled_frames: int) -> list[np.ndarray]:
+    """Frame-steps linspace expansion: one fixed-size int64 index array per frame ``t``.
+
+    Entry ``t`` is ``np.linspace(0, t, num_subsampled_frames)`` rounded, so frames ``0`` and ``t``
+    are both present; with a single sub-sampled frame the start is ``t`` so the entry is ``[t]``.
+    """
+    # linspace(0, t, 1) would yield [0] for every t, i.e. the current frame would never be scored.
+    starts = range(num_frames) if num_subsampled_frames == 1 else [0] * num_frames
+    return [np.linspace(s, t, num_subsampled_frames).round().astype(np.int64) for t, s in enumerate(starts)]
+
+
+def compute_robometer_progress(
+    dataset_repo_id: str,  # forwarded to ``LeRobotDataset``
+    reward_model_path: str,  # checkpoint to load; also stored in the parquet metadata
+    output_path: str | None = None,  # None -> ``<dataset.root>/robometer_progress.parquet``
+    device: str = "cuda",
+    batch_size: int = 32,  # sub-samples (one per original frame) per model forward
+    num_subsampled_frames: int = DEFAULT_NUM_SUBSAMPLED_FRAMES,  # frames per sub-sample
+    episodes: list[int] | None = None,  # None -> every episode of the dataset
+    image_key: str | None = None,  # None -> keep the config's ``image_key``
+) -> Path:  # location of the written parquet file
+    """Run Robometer over a dataset and write one ``compute_reward`` value per frame to parquet."""
+    logging.info(f"Loading Robometer: {reward_model_path}")
+    config = RobometerConfig(pretrained_path=reward_model_path, device=device)
+    if image_key is not None:
+        config.image_key = image_key
+    model = RobometerRewardModel.from_pretrained(reward_model_path, config=config)
+    model.to(device).eval()
+
+    encoder = RobometerEncoderProcessorStep(
+        base_model_id=config.base_model_id,
+        image_key=config.image_key,
+        task_key=config.task_key,
+        default_task=config.default_task,
+        max_frames=num_subsampled_frames,
+        use_multi_image=config.use_multi_image,
+        use_per_frame_progress_token=config.use_per_frame_progress_token,
+    )
+
+    image_key = config.image_key  # from here on always the resolved (non-None) key
+
+    logging.info(f"Loading dataset: {dataset_repo_id}")
+    dataset = LeRobotDataset(dataset_repo_id, download_videos=True)
+    logging.info(f"Dataset: {dataset.num_episodes} episodes, {dataset.num_frames} frames")
+
+    episode_indices = list(range(dataset.num_episodes)) if episodes is None else episodes
+    logging.info(f"Processing {len(episode_indices)} episode(s)")
+
+    all_index: list[int] = []
+    all_episode: list[int] = []
+    all_frame: list[int] = []
+    all_progress: list[float] = []
+
+    for episode_idx in tqdm(episode_indices, desc="Episodes"):
+        ep = dataset.meta.episodes[episode_idx]
+        ep_start = int(ep["dataset_from_index"])
+        ep_end = int(ep["dataset_to_index"])
+        num_frames = ep_end - ep_start
+        if num_frames <= 0:
+            continue
+
+        first_sample = dataset[ep_start]
+        task = _resolve_task(first_sample, default=config.default_task or "perform the task")
+
+        ep_frames = torch.stack([dataset[ep_start + i][image_key] for i in range(num_frames)])
+
+        sub_indices = _build_subsample_indices(num_frames, num_subsampled_frames)
+
+        progress_per_frame = np.zeros(num_frames, dtype=np.float32)
+
+        for start in tqdm(range(0, num_frames, batch_size), desc=f"  Ep {episode_idx}", leave=False):
+            end = min(start + batch_size, num_frames)
+            frames_batch = torch.stack([ep_frames[sub_indices[i]] for i in range(start, end)])
+
+            transition = {
+                TransitionKey.OBSERVATION: {image_key: frames_batch},
+                TransitionKey.COMPLEMENTARY_DATA: {config.task_key: task},  # same key the encoder was given
+            }
+            encoded = encoder(transition)
+            obs = encoded[TransitionKey.OBSERVATION]
+            batch = {
+                key: value.to(device) if isinstance(value, torch.Tensor) else value
+                for key, value in obs.items()
+            }
+
+            with torch.no_grad():
+                rewards = model.compute_reward(batch)
+            progress_per_frame[start:end] = rewards.float().cpu().numpy()  # .float(): NumPy lacks bfloat16
+
+        for local in range(num_frames):
+            all_index.append(ep_start + local)
+            all_episode.append(episode_idx)
+            all_frame.append(local)
+            all_progress.append(float(progress_per_frame[local]))
+
+        if device.startswith("cuda"):
+            torch.cuda.empty_cache()
+
+    table = pa.table(
+        {
+            "index": np.asarray(all_index, dtype=np.int64),
+            "episode_index": np.asarray(all_episode, dtype=np.int64),
+            "frame_index": np.asarray(all_frame, dtype=np.int64),
+            "progress_sparse": np.asarray(all_progress, dtype=np.float32),
+        }
+    ).replace_schema_metadata({b"reward_model_path": reward_model_path.encode()})
+
+    out = Path(dataset.root) / DEFAULT_OUTPUT_FILENAME if output_path is None else Path(output_path)
+    out.parent.mkdir(parents=True, exist_ok=True)
+    pq.write_table(table, out)
+    logging.info(f"Saved {len(table)} frame values to {out}")
+
+    progress_arr = np.asarray(all_progress, dtype=np.float32)
+    if progress_arr.size:
+        logging.info(
+            f"Progress: mean={float(progress_arr.mean()):.4f}, "
+            f"std={float(progress_arr.std()):.4f}, "
+            f"min={float(progress_arr.min()):.4f}, "
+            f"max={float(progress_arr.max()):.4f}"
+        )
+    return out
+
+
+def main():  # CLI entry point: parse flags, compute the progress parquet, optionally upload it
+    parser = argparse.ArgumentParser(
+        description="Compute per-frame Robometer progress curves for RA-BC weighting.",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+    # Dense per-frame progress for one episode
+    python -m lerobot.rewards.robometer.compute_rabc_weights \\
+        --dataset-repo-id lerobot/libero_10_image \\
+        --reward-model-path lerobot/Robometer-4B \\
+        --episodes 0
+
+    # All episodes, smaller batches for memory-constrained GPUs
+    python -m lerobot.rewards.robometer.compute_rabc_weights \\
+        --dataset-repo-id lerobot/libero_10_image \\
+        --reward-model-path lerobot/Robometer-4B \\
+        --batch-size 16
+        """,
+    )
+    parser.add_argument(
+        "--dataset-repo-id", type=str, required=True, help="HuggingFace dataset repo id or local path."
+    )
+    parser.add_argument(
+        "--reward-model-path", type=str, default=None, help="Robometer checkpoint repo id or local path."
+    )
+    parser.add_argument("--output-path", type=str, default=None, help="Output parquet path.")
+    parser.add_argument("--device", type=str, default="cuda", help="Device to use (default: cuda).")
+    parser.add_argument(
+        "--batch-size", type=int, default=32, help="Sub-samples per Qwen forward (default: 32)."
+    )
+    parser.add_argument(
+        "--num-subsampled-frames",
+        type=int,
+        default=DEFAULT_NUM_SUBSAMPLED_FRAMES,
+        help=f"Frames per sub-sample (default: {DEFAULT_NUM_SUBSAMPLED_FRAMES}, matches eval server).",
+    )
+    parser.add_argument(
+        "--episodes", type=int, nargs="+", default=None, help="Process only these episode indices."
+    )
+    parser.add_argument(
+        "--image-key", type=str, default=None, help="Image observation key (default: from config)."
+    )
+    parser.add_argument(
+        "--push-to-hub", action="store_true", help="Upload to the dataset repo on HuggingFace Hub."
+    )
+
+    args = parser.parse_args()
+
+    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+
+    reward_model_path = args.reward_model_path
+    if reward_model_path is None:  # fall back to the checkpoint recorded in an existing parquet
+        temp_dataset = LeRobotDataset(args.dataset_repo_id, download_videos=False)  # only ``root`` is used
+        parquet_path = Path(temp_dataset.root) / DEFAULT_OUTPUT_FILENAME
+        reward_model_path = get_reward_model_path_from_parquet(parquet_path)
+        if reward_model_path:
+            logging.info(f"Using reward model from parquet metadata: {reward_model_path}")
+        else:
+            raise ValueError(
+                "--reward-model-path is required (no existing parquet with model metadata found)."
+            )
+
+    output_path = compute_robometer_progress(
+        dataset_repo_id=args.dataset_repo_id,
+        reward_model_path=reward_model_path,
+        output_path=args.output_path,
+        device=args.device,
+        batch_size=args.batch_size,
+        num_subsampled_frames=args.num_subsampled_frames,
+        episodes=args.episodes,
+        image_key=args.image_key,
+    )
+
+    print(f"\nRobometer progress saved to: {output_path}")
+
+    if args.push_to_hub:
+        from huggingface_hub import HfApi  # imported lazily: only needed when uploading
+
+        api = HfApi()
+        hub_path = DEFAULT_OUTPUT_FILENAME  # the file is uploaded at the root of the dataset repo
+
+        print(f"\nUploading to Hub: {args.dataset_repo_id}/{hub_path}")
+        api.upload_file(
+            path_or_fileobj=str(output_path),
+            path_in_repo=hub_path,
+            repo_id=args.dataset_repo_id,
+            repo_type="dataset",
+        )
+        print(
+            "Successfully uploaded to: "
+            f"https://huggingface.co/datasets/{args.dataset_repo_id}/blob/main/{hub_path}"
+        )
+
+        print("\nTo use in training, add to your config:")
+        print("  use_rabc: true")
+        print(f"  rabc_progress_path: hf://datasets/{args.dataset_repo_id}/{hub_path}")
+        print("  rabc_head_mode: sparse")
+    else:  # same training hint, pointing at the local parquet instead of the Hub copy
+        print("\nTo use in training, add to your config:")
+        print("  use_rabc: true")
+        print(f"  rabc_progress_path: {output_path}")
+        print("  rabc_head_mode: sparse")
+
+if __name__ == "__main__":
+    main()
diff --git a/src/lerobot/rewards/robometer/configuration_robometer.py b/src/lerobot/rewards/robometer/configuration_robometer.py
new file mode 100644
index 000000000..fdaf7c9fd
--- /dev/null
+++ b/src/lerobot/rewards/robometer/configuration_robometer.py
@@ -0,0 +1,158 @@
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from copy import deepcopy
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Any
+
+from lerobot.configs import FeatureType, NormalizationMode, PolicyFeature
+from lerobot.configs.rewards import RewardModelConfig
+from lerobot.utils.constants import OBS_IMAGES
+from lerobot.utils.import_utils import _transformers_available, require_package
+
+if TYPE_CHECKING or _transformers_available:
+    from transformers import AutoConfig, AutoTokenizer
+else:
+    AutoConfig = None  # type: ignore[assignment]
+    AutoTokenizer = None  # type: ignore[assignment]
+
+
+# Special tokens Robometer adds to the Qwen-VL tokenizer at construction time.
+# The order is part of the data contract: upstream resized ``embed_tokens``
+# after adding these tokens in this exact order, so changing the set or order
+# would silently misalign the saved embedding rows with their token ids.
+# ``<|reward_token|>`` and ``<|sim_token|>`` are leftover from earlier upstream
+# heads (never read at inference) but still occupy rows the checkpoint expects.
+ROBOMETER_SPECIAL_TOKENS = (
+    "<|split_token|>",
+    "<|reward_token|>",
+    "<|pref_token|>",
+    "<|sim_token|>",
+    "<|prog_token|>",
+)
+
+
+@RewardModelConfig.register_subclass("robometer")
+@dataclass
+class RobometerConfig(RewardModelConfig):
+    """Configuration for the Robometer reward model; validated and completed in ``__post_init__``."""
+
+    pretrained_path: str | None = "lerobot/Robometer-4B"  # ``None`` => start from ``base_model_id`` weights
+    image_key: str = OBS_IMAGES + ".top"
+    task_key: str = "task"
+    default_task: str | None = None
+
+    max_frames: int | None = 8  # ``None`` or >= 1 (checked in ``__post_init__``)
+    reward_output: str = "progress"  # "progress" or "success"
+    success_threshold: float = 0.5  # success probability above this gives reward 1.0 (``reward_output="success"``)
+
+    license: str | None = "apache-2.0"
+    tags: list[str] | None = field(
+        default_factory=lambda: ["reward-model", "vision-language", "qwen3-vl", "zero-shot"]
+    )
+
+    base_model_id: str = "Qwen/Qwen3-VL-4B-Instruct"
+    torch_dtype: str = "bfloat16"  # name of a ``torch`` dtype attribute
+    use_multi_image: bool = True
+    use_per_frame_progress_token: bool = True  # requires ``use_multi_image=True``
+    average_temporal_patches: bool = True
+    frame_pooling: str = "mean"  # "mean" | "boundary" | "attention"
+    frame_pooling_attn_temperature: float = 1.0  # must be > 0
+    progress_loss_type: str = "discrete"  # "l1" | "l2" | "discrete"
+    progress_discrete_bins: int = 10  # progress-head output width when ``progress_loss_type == "discrete"``
+
+    # Serialised Qwen backbone config (post-resize). Always populated by
+    # ``__post_init__`` from ``base_model_id`` + ``len(tokenizer) + 5``, so it
+    # is non-empty after construction. Saved into ``config.json`` automatically
+    # by the base ``_save_pretrained``.
+    vlm_config: dict[str, Any] = field(default_factory=dict)
+
+    input_features: dict[str, PolicyFeature] = field(default_factory=dict)
+    output_features: dict[str, PolicyFeature] = field(default_factory=dict)
+    normalization_mapping: dict[str, NormalizationMode] = field(
+        default_factory=lambda: {
+            "VISUAL": NormalizationMode.IDENTITY,
+            "REWARD": NormalizationMode.IDENTITY,
+        }
+    )
+
+    def __post_init__(self) -> None:
+        super().__post_init__()
+        if self.reward_output not in {"progress", "success"}:
+            raise ValueError(f"reward_output must be 'progress' or 'success', got {self.reward_output!r}")
+        if self.max_frames is not None and self.max_frames < 1:
+            raise ValueError(f"max_frames must be >= 1, got {self.max_frames}")
+        if self.frame_pooling not in {"mean", "boundary", "attention"}:
+            raise ValueError(f"frame_pooling must be mean/boundary/attention; got {self.frame_pooling!r}")
+        if self.frame_pooling_attn_temperature <= 0:
+            raise ValueError("frame_pooling_attn_temperature must be > 0")
+        if self.progress_loss_type not in {"l1", "l2", "discrete"}:
+            raise ValueError(f"progress_loss_type must be l1/l2/discrete; got {self.progress_loss_type!r}")
+        if self.use_per_frame_progress_token and not self.use_multi_image:
+            raise ValueError("use_per_frame_progress_token=True requires use_multi_image=True")
+
+        if self.image_key not in self.input_features:  # add a default image feature unless one was supplied
+            self.input_features[self.image_key] = PolicyFeature(shape=(3, 224, 224), type=FeatureType.VISUAL)
+        self.output_features.setdefault("progress", PolicyFeature(shape=(1,), type=FeatureType.REWARD))
+        self.output_features.setdefault("success", PolicyFeature(shape=(1,), type=FeatureType.REWARD))
+
+        # Deterministically populate ``vlm_config`` so it is non-empty after
+        # construction. For ``Qwen/Qwen3-VL-4B-Instruct`` this gives
+        # ``len(tokenizer) + 5 = 151,669 + 5 = 151,674`` — the exact post-resize
+        # vocab the published ``Robometer-4B`` checkpoint was saved with.
+        if not self.vlm_config:
+            require_package("transformers", extra="robometer")
+            vlm = AutoConfig.from_pretrained(self.base_model_id).to_dict()
+            tokenizer = AutoTokenizer.from_pretrained(self.base_model_id)
+            text_config = vlm.get("text_config")
+            if not isinstance(text_config, dict):
+                raise ValueError(
+                    f"Backbone config for {self.base_model_id!r} has no nested `text_config`; "
+                    "Robometer expects a Qwen-VL-style config."
+                )
+            text_config["vocab_size"] = len(tokenizer) + len(ROBOMETER_SPECIAL_TOKENS)  # edits ``vlm`` in place
+            self.vlm_config = vlm
+
+    @property
+    def use_discrete_progress(self) -> bool:
+        """Whether the progress head outputs distribution logits over bins."""
+        return self.progress_loss_type.lower() == "discrete"
+
+    @property
+    def vlm_backbone_config(self):
+        """Reconstruct the Qwen backbone config from :attr:`vlm_config`."""
+        require_package("transformers", extra="robometer")
+        config_dict = deepcopy(self.vlm_config)  # copy so the ``pop`` below leaves ``self.vlm_config`` intact
+        model_type = config_dict.pop("model_type", None)
+        if model_type is None:
+            raise ValueError("vlm_config must include `model_type` to reconstruct the backbone config")
+        return AutoConfig.for_model(model_type, **config_dict)
+
+    @property
+    def observation_delta_indices(self) -> list[int] | None:
+        return None  # NOTE(review): presumably "no temporal offsets" — confirm against the base class
+
+    @property
+    def action_delta_indices(self) -> None:
+        return None  # NOTE(review): presumably "no temporal offsets" — confirm against the base class
+
+    @property
+    def reward_delta_indices(self) -> None:
+        return None  # NOTE(review): presumably "no temporal offsets" — confirm against the base class
+
+    def validate_features(self) -> None:  # raises ValueError when ``image_key`` is not an input feature
+        if self.image_key not in self.input_features:
+            raise ValueError(f"Robometer requires image input feature {self.image_key!r}")
diff --git a/src/lerobot/rewards/robometer/modeling_robometer.py b/src/lerobot/rewards/robometer/modeling_robometer.py
new file mode 100644
index 000000000..aea49deae
--- /dev/null
+++ b/src/lerobot/rewards/robometer/modeling_robometer.py
@@ -0,0 +1,481 @@
+# Copyright 2026 Anthony Liang, Yigit Korkmaz, Stephen Tu, Erdem Bıyık, Jesse Zhang
+# and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""ROBOMETER: Scaling General-Purpose Robotic Reward Models via Trajectory Comparisons.
+
+Paper:         https://arxiv.org/abs/2603.02115
+Project:       https://robometer.github.io
+Original code: https://github.com/aliang8/robometer
+Model:         https://huggingface.co/robometer/Robometer-4B
+
+Robometer is a general-purpose, video-language-input reward model built on
+``Qwen/Qwen3-VL-4B-Instruct``. It is trained with a dual reward-prediction
+objective:
+
+- A frame-level progress loss anchoring reward magnitude on expert data.
+- A trajectory-comparison preference loss imposing global ordering constraints
+  across trajectories sharing the same instruction.
+
+To support downstream RL it also predicts a frame-level binary success. The
+training prompt inserts three learnable tokens:
+
+- ``<|prog_token|>`` after each frame to read per-frame progress and success.
+- ``<|pref_token|>`` at the end to read pairwise preference (training-only).
+- ``<|split_token|>`` between two trajectories in preference samples
+  (training-only).
+
+Progress is modeled as a categorical distribution over ``progress_discrete_bins``
+uniformly-spaced centers in ``[0, 1]`` (C51-style), and the continuous estimate
+is recovered as the softmax-weighted mean of those centers — see
+:func:`convert_bins_to_continuous`.
+
+This LeRobot port is **inference-only**: the preference head is preserved in
+the state dict for byte-equivalence with the published ``Robometer-4B``
+checkpoint but is not queried by :meth:`RobometerRewardModel.compute_reward`,
+which returns the last-frame progress (clamped to ``[0, 1]``) or the last-frame
+success probability thresholded to 0/1, per :attr:`RobometerConfig.reward_output`.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING, Any
+
+import torch
+from torch import Tensor, nn
+
+from lerobot.rewards.pretrained import PreTrainedRewardModel
+from lerobot.rewards.robometer.configuration_robometer import RobometerConfig
+from lerobot.utils.constants import OBS_PREFIX
+from lerobot.utils.import_utils import _transformers_available, require_package
+
+if TYPE_CHECKING or _transformers_available:
+    from transformers import AutoModelForImageTextToText
+else:
+    AutoModelForImageTextToText = None  # type: ignore[assignment]
+
+logger = logging.getLogger(__name__)
+
+# Namespace for Robometer's pre-encoded Qwen-VL observation tensors.
+ROBOMETER_FEATURE_PREFIX = f"{OBS_PREFIX}robometer."
+ROBOMETER_QWEN_INPUT_KEYS = (
+    "input_ids",
+    "attention_mask",
+    "pixel_values",
+    "pixel_values_videos",
+    "image_grid_thw",
+    "video_grid_thw",
+    "second_per_grid_ts",
+    "mm_token_type_ids",
+)
+ROBOMETER_METADATA_KEYS = (
+    "prog_token_id",
+    "vision_start_token_id",
+    "vision_end_token_id",
+    "video_merge_size",
+)
+ROBOMETER_INPUT_KEYS = ROBOMETER_QWEN_INPUT_KEYS + ROBOMETER_METADATA_KEYS
+
+
+def convert_bins_to_continuous(bin_logits: Tensor) -> Tensor:
+    """Return the expected bin center under ``softmax(bin_logits)``.
+
+    The last dimension indexes the bins, whose centers are spaced evenly
+    over ``[0, 1]`` (both endpoints included); every leading dimension is
+    kept, so the result holds one value in ``[0, 1]`` per logit vector.
+    """
+    centers = torch.linspace(
+        0.0, 1.0, bin_logits.shape[-1], device=bin_logits.device, dtype=bin_logits.dtype
+    )
+    return torch.mul(bin_logits.softmax(dim=-1), centers).sum(dim=-1)
+
+
+def _squeeze_last_safe(x: Tensor) -> Tensor:
+    """Squeeze the last dim if it has size 1 and ``x`` has more than one dim; otherwise return ``x``."""
+    return x if x.ndim <= 1 or x.shape[-1] != 1 else x.squeeze(-1)
+
+
+def _torch_dtype(name: str) -> torch.dtype:
+    resolved = getattr(torch, name, None)  # e.g. "bfloat16" -> torch.bfloat16
+    if not isinstance(resolved, torch.dtype):
+        raise ValueError(f"Unknown torch dtype: {name!r}")
+    return resolved
+
+
+class RobometerPredictionHead(nn.Sequential):
+    """Two-layer MLP (Linear, LayerNorm, GELU, Dropout, Linear, optional Sigmoid) for Robometer's heads."""
+
+    def __init__(self, hidden_dim: int, output_size: int, *, dropout: float, with_sigmoid: bool) -> None:
+        bottleneck = hidden_dim // 2
+        squash: tuple[nn.Module, ...] = (nn.Sigmoid(),) if with_sigmoid else ()
+        super().__init__(
+            nn.Linear(hidden_dim, bottleneck),
+            nn.LayerNorm(bottleneck),
+            nn.GELU(),
+            nn.Dropout(dropout),
+            nn.Linear(bottleneck, output_size),
+            *squash,
+        )
+
+
+def decode_progress_outputs(
+    progress_logits: Tensor | None,
+    success_logits: Tensor | None,
+    *,
+    is_discrete_mode: bool,
+) -> dict[str, list[list[float]]]:
+    """Turn batched head outputs into nested Python lists of per-frame floats.
+
+    Args:
+        progress_logits: ``(B, T)`` values, or ``(B, T, num_bins)`` bin logits
+            when ``is_discrete_mode`` is set.
+        success_logits: ``(B, T)`` raw logits; a sigmoid maps them to
+            probabilities.
+        is_discrete_mode: reduce bin logits to one value per frame with
+            :func:`convert_bins_to_continuous` instead of using them directly.
+
+    Returns:
+        ``{"progress_pred": ..., "success_probs": ...}``; each entry holds one
+        flattened float list per sample, computed on CPU in float32.
+    """
+    # A ``None`` input is treated as an empty batch and produces an empty list.
+    progress_pred: list[list[float]] = []
+    for sample in (progress_logits if progress_logits is not None else ()):
+        frames = sample.detach().float().cpu()
+        if is_discrete_mode:
+            frames = convert_bins_to_continuous(frames)
+        progress_pred.append(frames.flatten().tolist())
+
+    success_probs: list[list[float]] = [
+        torch.sigmoid(sample.detach().float().cpu()).flatten().tolist()
+        for sample in (success_logits if success_logits is not None else ())
+    ]
+
+    return {"progress_pred": progress_pred, "success_probs": success_probs}
+
+
+class RobometerRewardModel(PreTrainedRewardModel):
+    """Robometer (RBM) reward model — inference-only LeRobot port.
+
+    Wraps a Qwen-VL backbone (default: ``Qwen/Qwen3-VL-4B-Instruct``) with three
+    prediction heads from the paper (progress, success, preference). At
+    inference time only the progress and success heads are queried; the
+    preference head is kept on the module so the published ``Robometer-4B``
+    safetensors load unchanged.
+    """
+
+    name = "robometer"
+    config_class = RobometerConfig
+
+    def __init__(self, config: RobometerConfig, *, dropout: float = 0.1) -> None:
+        """Build the Qwen-VL backbone and the Robometer heads; ``dropout`` is the rate inside each head."""
+        require_package("transformers", extra="robometer")
+        super().__init__(config)
+        self.config = config
+
+        # Two backbone-build paths (EO-1 style, branched on ``pretrained_path``):
+        #   - Fresh training (``pretrained_path is None``): download the base
+        #     Qwen weights and resize the embed table to match
+        #     ``vlm_config.text_config.vocab_size`` — populated deterministically
+        #     in ``RobometerConfig.__post_init__`` as
+        #     ``len(tokenizer) + len(ROBOMETER_SPECIAL_TOKENS)``
+        #
+        #   - Loading a saved checkpoint (``pretrained_path`` is set): rebuild
+        #     the empty architecture from ``vlm_config`` via
+        #     ``AutoModelForImageTextToText.from_config`` so the subsequent
+        #     ``model.safetensors`` load is a direct fill of the right shape —
+        #     no redundant Qwen weight download.
+        torch_dtype = _torch_dtype(config.torch_dtype)
+        if config.pretrained_path is None:
+            self.model = AutoModelForImageTextToText.from_pretrained(
+                config.base_model_id,
+                dtype=torch_dtype,
+                trust_remote_code=True,  # NOTE(review): allows code shipped with the hub repo to run — confirm needed
+            )
+            target_vocab = config.vlm_config["text_config"]["vocab_size"]
+            self.model.resize_token_embeddings(target_vocab)
+        else:
+            self.model = AutoModelForImageTextToText.from_config(
+                config.vlm_backbone_config,
+                dtype=torch_dtype,
+                trust_remote_code=True,
+            )
+
+        # All Qwen-VL backbones Robometer supports expose `text_config.hidden_size`.
+        # Falls back to the top-level `hidden_size` so future non-multimodal
+        # variants would still resolve.
+        backbone_config = self.model.config
+        text_config = getattr(backbone_config, "text_config", None)
+        hidden_size = getattr(text_config, "hidden_size", None) if text_config is not None else None
+        if hidden_size is None:
+            hidden_size = getattr(backbone_config, "hidden_size", None)
+        if hidden_size is None:
+            raise AttributeError(
+                f"Could not infer hidden_size from backbone config of {config.base_model_id}"
+            )
+        hidden_dim = int(hidden_size)
+
+        # Robometer's three prediction heads + frame-pool attention.
+        progress_output = config.progress_discrete_bins if config.use_discrete_progress else 1
+        self.progress_head = RobometerPredictionHead(
+            hidden_dim,
+            progress_output,
+            dropout=dropout,
+            with_sigmoid=not config.use_discrete_progress,  # continuous progress is squashed by a Sigmoid
+        )
+        self.preference_head = RobometerPredictionHead(hidden_dim, 1, dropout=dropout, with_sigmoid=False)
+        self.success_head = RobometerPredictionHead(hidden_dim, 1, dropout=dropout, with_sigmoid=False)
+        self.frame_pool_attn = nn.Linear(hidden_dim, 1, bias=False)  # scores tokens for "attention" pooling
+
+        # Match the dtype of the loaded base model so weight loading is a no-op cast.
+        model_dtype = next(self.model.parameters()).dtype
+        self.progress_head.to(dtype=model_dtype)
+        self.preference_head.to(dtype=model_dtype)
+        self.success_head.to(dtype=model_dtype)
+        self.frame_pool_attn.to(dtype=model_dtype)
+
+    def compute_reward(self, batch: dict[str, Tensor]) -> Tensor:
+        """Return one float32 reward per sample from a pre-encoded batch.
+
+        Runs backbone + heads in eval mode without gradients and keeps each
+        sample's last-frame value: progress clamped to ``[0, 1]``, or ``1.0`` /
+        ``0.0`` for a success probability above / not above ``success_threshold``.
+        Raises ``KeyError`` when the prefixed ``input_ids`` entry is missing.
+        """
+        if f"{ROBOMETER_FEATURE_PREFIX}input_ids" not in batch:
+            raise KeyError(
+                f"Robometer batch missing pre-encoded inputs (expected "
+                f"`{ROBOMETER_FEATURE_PREFIX}input_ids`). Make sure the "
+                "RobometerEncoderProcessorStep ran before `compute_reward`."
+            )
+
+        device = next(self.model.parameters()).device
+        inputs: dict[str, Any] = {}
+        for key in ROBOMETER_INPUT_KEYS:
+            name = f"{ROBOMETER_FEATURE_PREFIX}{key}"
+            if name in batch:
+                value = batch[name]
+                inputs[key] = value.to(device) if hasattr(value, "to") else value
+
+        self.eval()
+        with torch.no_grad():
+            progress_logits, success_logits = self._compute_rbm_logits(inputs)
+        decoded = decode_progress_outputs(
+            progress_logits, success_logits, is_discrete_mode=self.config.use_discrete_progress
+        )
+        wants_success = self.config.reward_output == "success"
+        per_sample = decoded["success_probs" if wants_success else "progress_pred"]
+        rewards = torch.stack([torch.as_tensor(seq, dtype=torch.float32)[-1] for seq in per_sample])
+        if wants_success:
+            rewards = (rewards > self.config.success_threshold).float()
+        else:
+            rewards = rewards.clamp(0.0, 1.0)  # same clamp as upstream's ``extract_rewards_from_output``
+        return rewards.to(self.config.device or "cpu")
+
+    def _compute_rbm_logits(
+        self,
+        inputs: dict[str, Any],
+    ) -> tuple[Tensor, Tensor]:
+        """Run the Qwen3-VL backbone and apply Robometer's heads.
+
+        ``inputs`` is the encoded batch produced by
+        :class:`RobometerEncoderProcessorStep`. It carries Qwen tensors as well
+        as Robometer-specific metadata (``prog_token_id``,
+        ``vision_start_token_id``, ``vision_end_token_id``, ``video_merge_size``)
+        — the metadata is popped here (``inputs`` is mutated) so the rest can be
+        forwarded straight to the Qwen model.
+
+        Returns ``(progress_logits, success_logits)``. Shapes:
+
+        - ``progress_logits``: ``(B, T)`` (continuous) or ``(B, T, num_bins)`` (discrete).
+        - ``success_logits``: ``(B, T)`` raw logits (sigmoid happens at decode time).
+        """
+        prog_token_id = inputs.pop("prog_token_id", None)
+        vision_start_token_id = inputs.pop("vision_start_token_id", None)
+        vision_end_token_id = inputs.pop("vision_end_token_id", None)
+        video_merge_size = inputs.pop("video_merge_size", 14)  # NOTE(review): 14 looks like a patch size — confirm
+
+        # Qwen3-VL doesn't reliably populate `last_hidden_state`; ask for the
+        # full hidden-state tuple and take the last layer. This matches the
+        # `is_qwen3` path in upstream Robometer's `RBM.forward_qwen` (main).
+        outputs = self.model(**inputs, output_hidden_states=True, return_dict=True)
+        hidden_state = (
+            outputs.hidden_states[-1]
+            if getattr(outputs, "hidden_states", None)
+            else outputs.last_hidden_state
+        )
+
+        input_ids = inputs["input_ids"]
+        if self.config.use_per_frame_progress_token:  # mode 1: read the heads at ``<|prog_token|>`` positions
+            if prog_token_id is None:
+                raise KeyError("`prog_token_id` missing in batch (run RobometerEncoderProcessorStep first)")
+            return self._process_token_extraction(hidden_state, input_ids, prog_token_id=prog_token_id)
+        if self.config.use_multi_image:  # mode 2: pool the tokens between vision start/end markers
+            if vision_start_token_id is None or vision_end_token_id is None:
+                raise KeyError(
+                    "`vision_start_token_id` / `vision_end_token_id` missing in batch "
+                    "(run RobometerEncoderProcessorStep first)"
+                )
+            return self._process_multi_image_frames(
+                hidden_state,
+                input_ids,
+                start_id=vision_start_token_id,
+                end_id=vision_end_token_id,
+            )
+        video_grid_thw = inputs.get("video_grid_thw")  # mode 3: video input, frames sliced by grid size
+        if video_grid_thw is None:
+            raise ValueError("video_grid_thw is required for video-mode Robometer inference")
+        if vision_start_token_id is None:
+            raise KeyError("`vision_start_token_id` missing in batch")
+        return self._process_video_frames(
+            hidden_state,
+            input_ids,
+            video_grid_thw,
+            start_id=vision_start_token_id,
+            merge_size=video_merge_size,
+        )
+
+    def _apply_heads_to_hidden_states(self, frame_embeddings: Tensor) -> tuple[Tensor, Tensor]:
+        """Return ``(progress, success)`` head outputs; a trailing size-1 dim is squeezed except on bin logits."""
+        progress = self.progress_head(frame_embeddings)
+        if not self.config.use_discrete_progress:
+            progress = _squeeze_last_safe(progress)
+        return progress, _squeeze_last_safe(self.success_head(frame_embeddings))
+
+    def _process_token_extraction(
+        self,
+        hidden_state: Tensor,
+        input_ids: Tensor,
+        *,
+        prog_token_id: int,
+    ) -> tuple[Tensor, Tensor]:
+        """Apply the heads at every ``<|prog_token|>`` position of each sample.
+
+        Raises ``ValueError`` when the batch, or any single sample, has no such token.
+        """
+        is_prog = input_ids == prog_token_id
+        if not bool(is_prog.any()):
+            raise ValueError("`<|prog_token|>` not found in any sequence")
+
+        progress_rows, success_rows = [], []
+        for row in range(input_ids.shape[0]):
+            frame_states = hidden_state[row][is_prog[row]]
+            if frame_states.shape[0] == 0:
+                raise ValueError("`<|prog_token|>` missing in a sequence")
+            progress, success = self._apply_heads_to_hidden_states(frame_states)
+            progress_rows.append(progress)
+            success_rows.append(success)
+
+        return torch.stack(progress_rows), torch.stack(success_rows)
+
+    def _process_multi_image_frames(
+        self,
+        hidden_state: Tensor,
+        input_ids: Tensor,
+        *,
+        start_id: int,
+        end_id: int,
+    ) -> tuple[Tensor, Tensor]:
+        """Multi-image mode: pool every ``start_id``..``end_id`` span per sample, then apply the heads."""
+        per_sample = [
+            self._apply_heads_to_hidden_states(
+                self._extract_hidden_states_from_token_pairs(
+                    hidden_state[row], input_ids[row], start_id, end_id
+                )
+            )
+            for row in range(input_ids.shape[0])
+        ]
+        progress_rows = [progress for progress, _ in per_sample]
+        success_rows = [success for _, success in per_sample]
+
+        return torch.stack(progress_rows), torch.stack(success_rows)
+
+    def _extract_hidden_states_from_token_pairs(
+        self,
+        hidden_state: Tensor,
+        input_ids: Tensor,
+        start_id: int,
+        end_id: int,
+    ) -> Tensor:
+        """Return one pooled embedding per ``start_id``/``end_id`` pair of a single sequence.
+
+        Pairs are matched in order of appearance; the tokens strictly between a
+        pair are pooled per ``config.frame_pooling``, and an empty span falls
+        back to the mean of the two marker states.
+        """
+        starts = torch.nonzero(input_ids == start_id, as_tuple=True)[0]
+        ends = torch.nonzero(input_ids == end_id, as_tuple=True)[0]
+        n_starts, n_ends = starts.numel(), ends.numel()
+        if n_starts == 0:
+            raise ValueError("`<|vision_start|>` not found in sequence")
+        if n_starts != n_ends:
+            raise ValueError(f"Mismatched vision token counts: {n_starts} start vs {n_ends} end")
+
+        mode = self.config.frame_pooling
+        pooled: list[Tensor] = []
+        for lo, hi in zip(starts.tolist(), ends.tolist(), strict=True):
+            if lo >= hi:
+                raise ValueError(f"Invalid vision token pair: start={lo} end={hi}")
+            span = hidden_state[lo + 1 : hi]
+            if span.shape[0] == 0:
+                frame = (hidden_state[lo] + hidden_state[hi]) / 2.0
+            elif mode == "mean":
+                frame = span.mean(dim=0)
+            elif mode == "boundary":
+                frame = span[-1]
+            else:  # "attention": softmax over temperature-scaled learned scores
+                logits = self.frame_pool_attn(span).squeeze(-1)
+                weights = torch.softmax(logits / self.config.frame_pooling_attn_temperature, dim=0)
+                frame = (weights.unsqueeze(-1) * span).sum(dim=0)
+            pooled.append(frame)
+
+        return torch.stack(pooled)
+
+    def _process_video_frames(
+        self,
+        hidden_state: Tensor,
+        input_ids: Tensor,
+        video_grid_thw: Tensor,
+        *,
+        start_id: int,
+        merge_size: int,
+    ) -> tuple[Tensor, Tensor]:
+        """Video mode: walk ``t`` equal token windows from the first ``start_id`` and apply the heads per frame."""
+        progress_list, success_list = [], []
+        for batch_idx in range(input_ids.shape[0]):
+            seq_ids = input_ids[batch_idx]
+            seq_hidden = hidden_state[batch_idx]
+            start_positions = (seq_ids == start_id).nonzero(as_tuple=True)[0]
+            if start_positions.numel() == 0:
+                raise ValueError("`<|vision_start|>` not found in sequence")
+            # NOTE(review): row ``batch_idx`` of the grid is used — assumes one video per sample; confirm.
+            t_dim, h_dim, w_dim = (int(x) for x in video_grid_thw[batch_idx].tolist())
+            tokens_per_frame = (h_dim * w_dim) // (merge_size**2)  # presumably tokens left after spatial merging
+            # NOTE(review): the cursor begins on the ``start_id`` token itself, not the token after it — confirm.
+            cursor = start_positions[0].item()
+            frame_embeddings: list[Tensor] = []
+            for _ in range(t_dim):
+                if self.config.average_temporal_patches:
+                    patch = seq_hidden[cursor : cursor + tokens_per_frame]
+                    frame_embeddings.append(patch.mean(dim=0))  # mean over the frame's token window
+                else:
+                    frame_embeddings.append(seq_hidden[cursor + tokens_per_frame])  # state at the window's far edge
+                cursor += tokens_per_frame
+
+            stacked = torch.stack(frame_embeddings)
+            progress, success = self._apply_heads_to_hidden_states(stacked)
+            progress_list.append(progress)
+            success_list.append(success)
+
+        return torch.stack(progress_list), torch.stack(success_list)
diff --git a/src/lerobot/rewards/robometer/processor_robometer.py b/src/lerobot/rewards/robometer/processor_robometer.py
new file mode 100644
index 000000000..d98f8b9aa
--- /dev/null
+++ b/src/lerobot/rewards/robometer/processor_robometer.py
@@ -0,0 +1,338 @@
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Robometer pre/post processing pipelines."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Any
+
+import numpy as np
+import torch
+from PIL import Image
+from torch import Tensor
+
+from lerobot.configs import PipelineFeatureType, PolicyFeature
+from lerobot.processor import (
+    AddBatchDimensionProcessorStep,
+    DeviceProcessorStep,
+    PolicyAction,
+    PolicyProcessorPipeline,
+    ProcessorStep,
+    ProcessorStepRegistry,
+    policy_action_to_transition,
+)
+from lerobot.rewards.robometer.configuration_robometer import (
+    ROBOMETER_SPECIAL_TOKENS,
+    RobometerConfig,
+)
+from lerobot.rewards.robometer.modeling_robometer import ROBOMETER_FEATURE_PREFIX
+from lerobot.types import EnvTransition, TransitionKey
+from lerobot.utils.constants import (
+    OBS_IMAGES,
+    POLICY_POSTPROCESSOR_DEFAULT_NAME,
+    POLICY_PREPROCESSOR_DEFAULT_NAME,
+)
+from lerobot.utils.import_utils import _transformers_available, require_package
+
+if TYPE_CHECKING or _transformers_available:
+    from transformers import AutoProcessor
+else:
+    AutoProcessor = None
+
+PROGRESS_PROMPT = (  # template with one ``{task}`` placeholder, filled in ``_build_conversation``
+    "The task for the robot is '{task}'. Given the trajectory video, predict "
+    "the task progress at each frame, how far along the robot is towards "
+    "completing the task, a float between 0 and 1, where 0 is the starting "
+    "state and 1 is when the task is completed. If the robot is not "
+    "performing the same task, predict 0 progress."
+)
+
+
+def _frames_to_pil(frames: np.ndarray) -> list[Image.Image]:
+    """Convert ``(T, H, W, C)`` frames to a list of PIL images (``C == 1`` gives mode ``L``)."""
+    if frames.ndim != 4:
+        raise ValueError(f"Expected (T,H,W,C) frames; got shape {frames.shape}")
+    frames = frames if frames.dtype == np.uint8 else np.clip(frames, 0, 255).astype(np.uint8)
+    # PIL cannot build an image from an (H, W, 1) array, so drop a singleton channel axis.
+    return [Image.fromarray(frame[..., 0] if frame.shape[-1] == 1 else frame) for frame in frames]
+
+
+def _video_to_numpy(video: Tensor, *, max_frames: int | None) -> np.ndarray:
+    """Return one trajectory as channel-last ``uint8`` frames, keeping only the last ``max_frames``."""
+    clip = video if max_frames is None else video[-max_frames:]
+    if clip.shape[1] in (1, 3):
+        clip = clip.permute(0, 2, 3, 1)  # channel-first (T,C,H,W) -> channel-last (T,H,W,C)
+    elif clip.shape[-1] not in (1, 3):
+        raise ValueError(f"Expected channel dim of size 1 or 3, got shape {tuple(clip.shape)}")
+
+    pixels = clip.detach().cpu().numpy()
+    unit_range = np.issubdtype(pixels.dtype, np.floating) and pixels.size > 0 and pixels.max() <= 1.0
+    if unit_range:
+        pixels = pixels * 255.0  # float data whose max is <= 1.0 is treated as [0, 1] and rescaled
+    return np.clip(pixels, 0, 255).astype(np.uint8)
+
+
+def _expand_tasks(task: Any, *, batch_size: int, default: str | None) -> list[str]:
+    """Broadcast ``task`` (or ``default`` when ``task`` is ``None``) to ``batch_size`` strings."""
+    resolved = default if task is None else task
+    if resolved is None:
+        raise KeyError("Robometer expected a task description in complementary data")
+    if isinstance(resolved, str):
+        return [resolved] * batch_size
+    # Tuples are accepted and handled as lists from here on.
+    names = list(resolved) if isinstance(resolved, tuple) else resolved
+    if not isinstance(names, list) or not all(isinstance(item, str) for item in names):
+        raise TypeError(f"Robometer task must be a string or list of strings, got {type(names)}")
+    if len(names) == 1 and batch_size > 1:
+        return names * batch_size
+    if len(names) != batch_size:
+        raise ValueError(f"Expected {batch_size} tasks, got {len(names)}")
+    return names
+
+
+@dataclass
+@ProcessorStepRegistry.register(name="robometer_encoder")
+class RobometerEncoderProcessorStep(ProcessorStep):
+    """Encode raw frames + task into Qwen-VL tensors for the Robometer model.
+
+    Loads a :class:`~transformers.AutoProcessor` matching ``base_model_id`` and
+    registers Robometer's special tokens on the tokenizer. The matching
+    embedding resize happens model-side in
+    :meth:`RobometerRewardModel.__init__`.
+
+    At call time the step reads:
+
+    - ``observation[image_key]``: ``(B, T, C, H, W)`` or ``(B, C, H, W)`` frames.
+    - ``complementary_data[task_key]``: a string or list of strings.
+
+    and writes ``observation[f"{ROBOMETER_FEATURE_PREFIX}<name>"]`` for:
+
+    - the Qwen-VL processor outputs: ``input_ids``, ``attention_mask``,
+      ``pixel_values``, ``image_grid_thw``, ``video_grid_thw``, ...
+    - Robometer-specific token ids consumed by the model heads:
+      ``prog_token_id``, ``vision_start_token_id``, ``vision_end_token_id``,
+      ``video_merge_size``.
+    """
+
+    base_model_id: str = "Qwen/Qwen3-VL-4B-Instruct"
+    image_key: str = OBS_IMAGES + ".top"
+    task_key: str = "task"
+    default_task: str | None = None
+    max_frames: int | None = 8
+    use_multi_image: bool = True
+    use_per_frame_progress_token: bool = True
+    max_length: int = 1024
+
+    _processor: Any = field(default=None, init=False, repr=False)
+
+    def __post_init__(self) -> None:
+        require_package("transformers", extra="robometer")
+        require_package("qwen-vl-utils", extra="robometer", import_name="qwen_vl_utils")
+
+        self._processor = AutoProcessor.from_pretrained(
+            self.base_model_id,
+            trust_remote_code=True,  # NOTE(review): executes code from the hub repo — confirm needed
+            do_sample_frames=False,
+            padding_side="right",
+        )
+
+        # Register Robometer's special tokens on the tokenizer. The matching
+        # embedding resize happens model-side in `RobometerRewardModel.__init__`.
+        tokenizer = self._processor.tokenizer
+        # Qwen tokenizers may not define a pad token, but batched prompts/videos
+        # require padding, so reuse EOS as the padding token.
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+        for token in ROBOMETER_SPECIAL_TOKENS:  # added one per call, in declaration order
+            if token not in tokenizer.get_vocab():  # skip tokens the tokenizer already knows
+                tokenizer.add_special_tokens({"additional_special_tokens": [token]})
+
+    def __call__(self, transition: EnvTransition) -> EnvTransition:
+        """Return a copy of ``transition`` with the encoded Robometer inputs added to its observation."""
+        observation = transition.get(TransitionKey.OBSERVATION)
+        if not isinstance(observation, dict):
+            raise ValueError("RobometerEncoderProcessorStep requires an observation dict")
+        if self.image_key not in observation:
+            raise KeyError(f"Robometer expected image key {self.image_key!r} in observation")
+        extras = transition.get(TransitionKey.COMPLEMENTARY_DATA) or {}
+
+        raw = observation[self.image_key]
+        clips = raw.detach().cpu() if isinstance(raw, Tensor) else torch.as_tensor(raw)
+        if clips.ndim not in (4, 5):
+            raise ValueError(
+                f"Expected Robometer frames with shape (B,C,H,W) or (B,T,C,H,W); got {tuple(clips.shape)}"
+            )
+        if clips.ndim == 4:
+            # A 4-D input is treated as (B,C,H,W): give every sample a one-frame time axis.
+            clips = clips.unsqueeze(1)
+
+        tasks = _expand_tasks(
+            extras.get(self.task_key, self.default_task),
+            batch_size=clips.shape[0],
+            default=self.default_task,
+        )
+        samples = [
+            (_video_to_numpy(clip, max_frames=self.max_frames), task)
+            for clip, task in zip(clips, tasks, strict=True)
+        ]
+        encoded = self.encode_samples(samples)
+
+        # Namespace every encoder output under the Robometer prefix; the input dicts are not mutated.
+        merged = dict(observation)
+        merged.update({f"{ROBOMETER_FEATURE_PREFIX}{key}": value for key, value in encoded.items()})
+
+        result = transition.copy()
+        result[TransitionKey.OBSERVATION] = merged
+        return result
+
+    def encode_samples(self, samples: list[tuple[np.ndarray, str]]) -> dict[str, Tensor]:
+        """Run the Qwen-VL processor on ``(frames, task)`` samples and append Robometer's token ids."""
+        from qwen_vl_utils import process_vision_info
+
+        conversations = [self._build_conversation(frames, task) for frames, task in samples]
+
+        texts = [
+            self._processor.apply_chat_template(
+                msg,
+                tokenize=False,
+                add_generation_prompt=False,
+                add_vision_id=True,
+                enable_thinking=False,
+                fps=1,
+            )
+            for msg in conversations
+        ]
+
+        process_kwargs: dict[str, Any] = {
+            "return_video_kwargs": True,
+            "return_video_metadata": True,
+        }
+        image_processor = getattr(self._processor, "image_processor", None)
+        if image_processor is not None and hasattr(image_processor, "patch_size"):
+            process_kwargs["image_patch_size"] = image_processor.patch_size
+
+        image_inputs, video_inputs, video_kwargs = process_vision_info(conversations, **process_kwargs)
+
+        videos: list[Any] | None = None
+        video_metadatas: list[Any] | None = None
+        if video_inputs:
+            if isinstance(video_inputs[0], tuple) and len(video_inputs[0]) == 2:  # (video, metadata) pairs
+                videos_seq, metadatas_seq = zip(*video_inputs, strict=False)
+                videos = list(videos_seq)
+                video_metadatas = list(metadatas_seq)
+            else:
+                videos = list(video_inputs)
+
+        processor_kwargs: dict[str, Any] = {
+            "text": texts,
+            "images": image_inputs,
+            "padding": True,
+            "truncation": False,  # NOTE(review): with truncation off, ``max_length`` presumably has no effect
+            "max_length": self.max_length,
+            "return_tensors": "pt",
+            "do_resize": False,
+        }
+        if videos is not None:
+            processor_kwargs["videos"] = videos
+        if video_metadatas is not None:
+            processor_kwargs["video_metadata"] = video_metadatas
+        if video_kwargs:
+            processor_kwargs.update(video_kwargs)
+
+        encoded = self._processor(**processor_kwargs)
+
+        # Ship Robometer's token ids and the video merge size with the batch so
+        # `RobometerRewardModel` needs no tokenizer of its own at inference. The
+        # merge-size fallback is 2, Qwen-VL's default spatial merge factor; a
+        # value of 14 would be a patch size and would skew tokens-per-frame.
+        tokenizer = self._processor.tokenizer
+        encoded["prog_token_id"] = tokenizer.convert_tokens_to_ids("<|prog_token|>")
+        encoded["vision_start_token_id"] = tokenizer.convert_tokens_to_ids("<|vision_start|>")
+        encoded["vision_end_token_id"] = tokenizer.convert_tokens_to_ids("<|vision_end|>")
+        video_processor = getattr(self._processor, "video_processor", None)
+        encoded["video_merge_size"] = int(getattr(video_processor, "merge_size", 2))
+        return encoded
+
+    def _build_conversation(self, frames: np.ndarray, task: str) -> list[dict[str, Any]]:
+        """Build the single user turn: the prompt, then the frames as images or as one video."""
+        images = _frames_to_pil(frames)
+        parts: list[dict[str, Any]] = [{"type": "text", "text": PROGRESS_PROMPT.format(task=task)}]
+
+        if not self.use_multi_image:
+            parts.append({"type": "video", "video": images, "sample_fps": 1.0})
+            return [{"role": "user", "content": parts}]
+
+        for image in images:
+            parts.append({"type": "image", "image": image})
+            if self.use_per_frame_progress_token:
+                parts.append({"type": "text", "text": "<|prog_token|>"})
+        return [{"role": "user", "content": parts}]
+
+    def transform_features(
+        self, features: dict[PipelineFeatureType, dict[str, PolicyFeature]]
+    ) -> dict[PipelineFeatureType, dict[str, PolicyFeature]]:
+        return features  # unchanged: the keys written by ``__call__`` are not declared as features
+
+    def get_config(self) -> dict[str, Any]:
+        """Return this step's init fields as a plain dict.
+
+        The keys are exactly the public dataclass fields; the loaded
+        ``_processor`` is runtime state and is not part of the result.
+        """
+        identity = ("base_model_id", "image_key", "task_key", "default_task")
+        encoding = ("max_frames", "use_multi_image", "use_per_frame_progress_token", "max_length")
+
+        # Keys come out in the same order as the fields are declared on the class.
+        return {name: getattr(self, name) for name in identity + encoding}
+
+
+def make_robometer_pre_post_processors(
+    config: RobometerConfig,
+    dataset_stats: dict[str, dict[str, Any]] | None = None,
+) -> tuple[
+    PolicyProcessorPipeline[dict[str, Any], dict[str, Any]],
+    PolicyProcessorPipeline[PolicyAction, PolicyAction],
+]:
+    """Build Robometer's pre/post processor pipelines from ``config``.
+
+    The preprocessor runs ``AddBatchDimensionProcessorStep``, then Robometer's
+    encoder, then moves the result to ``config.device`` (``"cpu"`` when unset).
+    The postprocessor is built without steps. ``dataset_stats`` is discarded and
+    ``max_length`` is not forwarded, so the encoder keeps its own default.
+    """
+    del dataset_stats  # Robometer has its own normalisation inside the Qwen-VL processor.
+
+    preprocessor = PolicyProcessorPipeline[dict[str, Any], dict[str, Any]](
+        steps=[
+            AddBatchDimensionProcessorStep(),
+            RobometerEncoderProcessorStep(
+                base_model_id=config.base_model_id,
+                image_key=config.image_key,
+                task_key=config.task_key,
+                default_task=config.default_task,
+                max_frames=config.max_frames,
+                use_multi_image=config.use_multi_image,
+                use_per_frame_progress_token=config.use_per_frame_progress_token,
+            ),
+            DeviceProcessorStep(device=config.device or "cpu"),
+        ],
+        name=POLICY_PREPROCESSOR_DEFAULT_NAME,
+    )
+    postprocessor = PolicyProcessorPipeline(  # NOTE(review): no ``to_output`` given — confirm the default fits
+        name=POLICY_POSTPROCESSOR_DEFAULT_NAME,
+        to_transition=policy_action_to_transition,
+    )
+    return preprocessor, postprocessor
diff --git a/src/lerobot/templates/lerobot_rewardmodel_modelcard_template.md b/src/lerobot/templates/lerobot_rewardmodel_modelcard_template.md
index 11df95de5..163aa530b 100644
--- a/src/lerobot/templates/lerobot_rewardmodel_modelcard_template.md
+++ b/src/lerobot/templates/lerobot_rewardmodel_modelcard_template.md
@@ -13,6 +13,8 @@
 A reward classifier is a lightweight neural network that scores observations or trajectories for task success, providing a learned reward signal or offline evaluation when explicit rewards are unavailable.
 {% elif model_name == "sarm" %}
 A Success-Aware Reward Model (SARM) predicts a dense reward signal from observations, typically used downstream for reinforcement learning or human-in-the-loop fine-tuning when task success is not directly observable.
+{% elif model_name == "robometer" %}
+ROBOMETER is a general-purpose video-language robotic reward model built on a fine-tuned Qwen3-VL-4B backbone with progress, preference, and success heads. Given a trajectory video and a task description, it predicts dense, frame-level task progress in [0, 1] and frame-level success probabilities for downstream robot learning, including offline RL, online RL, data filtering and retrieval, and automated failure detection.
 {% elif model_name == "topreward" %}
 TOPReward is a **zero-shot** reward model that extracts token log-probabilities from an off-the-shelf vision-language model (default Qwen3-VL) as a reward signal. Given a video trajectory and a task instruction, it returns the VLM's log-likelihood of the instruction being true, with no fine-tuning required.
 {% else %}
diff --git a/tests/rewards/test_modeling_robometer.py b/tests/rewards/test_modeling_robometer.py
new file mode 100644
index 000000000..19aba13fa
--- /dev/null
+++ b/tests/rewards/test_modeling_robometer.py
@@ -0,0 +1,340 @@
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for Robometer reward model."""
+
+from __future__ import annotations
+
+from types import SimpleNamespace
+
+import pytest
+import torch
+
+from lerobot.configs.rewards import RewardModelConfig
+from lerobot.rewards.factory import get_reward_model_class, make_reward_model_config
+from lerobot.rewards.robometer import RobometerConfig
+from lerobot.rewards.robometer.configuration_robometer import ROBOMETER_SPECIAL_TOKENS
+from lerobot.rewards.robometer.modeling_robometer import (
+    ROBOMETER_FEATURE_PREFIX,
+    convert_bins_to_continuous,
+    decode_progress_outputs,
+)
+from tests.utils import skip_if_package_missing
+
+# Length of the fake tokenizer used in `_patch_build`. The deterministic
+# resize target derived in ``RobometerConfig.__post_init__`` is therefore
+# ``_FAKE_TOKENIZER_LEN + len(ROBOMETER_SPECIAL_TOKENS)``.
+_FAKE_TOKENIZER_LEN = 100
+_EXPECTED_RESIZED_VOCAB = _FAKE_TOKENIZER_LEN + len(ROBOMETER_SPECIAL_TOKENS)
+
+
+class _FakeQwenConfig:
+    """Stand-in for a Qwen3-VL config (the `model.config` attribute).
+
+    ``to_dict`` matches HF's ``PretrainedConfig.to_dict`` closely enough for
+    ``RobometerConfig.__post_init__`` to snapshot a meaningful ``vlm_config``
+    into the saved ``config.json`` and for the reload path to round-trip
+    through ``AutoConfig.for_model``.
+    """
+
+    def __init__(self, hidden_dim: int = 8, vocab_size: int = _FAKE_TOKENIZER_LEN) -> None:
+        # `vocab_size` here is the *pre-resize* value the fake backbone advertises.
+        # `__post_init__` is expected to overwrite it with `len(tokenizer) + len(ROBOMETER_SPECIAL_TOKENS)`.
+        self.text_config = SimpleNamespace(hidden_size=hidden_dim, vocab_size=vocab_size)
+        self._hidden_dim = hidden_dim
+        self._vocab_size = vocab_size
+
+    def to_dict(self) -> dict:
+        return {
+            "model_type": "fake_qwen",
+            "text_config": {
+                "hidden_size": self._hidden_dim,
+                "vocab_size": self._vocab_size,
+            },
+        }
+
+
+class _FakeEmbeddings(torch.nn.Module):
+    def __init__(self, num_embeddings: int = _FAKE_TOKENIZER_LEN) -> None:
+        super().__init__()
+        self.num_embeddings = num_embeddings
+
+
+class _FakeBaseModel(torch.nn.Module):
+    """Stand-in for the Qwen3-VL backbone during tests.
+
+    Provides the minimum surface `RobometerRewardModel.__init__` and
+    `_compute_rbm_logits` rely on: a `parameters()` iterator (for dtype +
+    device), a `config.text_config.hidden_size`, a `config.to_dict()` so
+    `_save_pretrained` can snapshot `vlm_config`,
+    `get_input_embeddings()` / `resize_token_embeddings()` so the fresh-init
+    embed resize only records the new size, and a forward that returns a
+    `SimpleNamespace` with all-zero `hidden_states` / `last_hidden_state`.
+    """
+
+    def __init__(self, hidden_dim: int = 8) -> None:
+        super().__init__()
+        self._param = torch.nn.Parameter(torch.zeros(1))  # single dummy parameter for `parameters()`
+        self.hidden_dim = hidden_dim
+        self.config = _FakeQwenConfig(hidden_dim)
+        self._embeddings = _FakeEmbeddings()
+
+    def get_input_embeddings(self) -> _FakeEmbeddings:
+        return self._embeddings
+
+    def resize_token_embeddings(self, new_size: int) -> None:
+        self._embeddings.num_embeddings = new_size
+
+    def forward(self, **kwargs) -> SimpleNamespace:  # only ``input_ids`` is read; other inputs are ignored
+        input_ids = kwargs["input_ids"]
+        return SimpleNamespace(
+            hidden_states=(torch.zeros(input_ids.shape[0], input_ids.shape[1], self.hidden_dim),),
+            last_hidden_state=torch.zeros(input_ids.shape[0], input_ids.shape[1], self.hidden_dim),
+        )
+
+
+class _FakeTokenizer:
+    """Minimal stand-in for an HF tokenizer.
+
+    ``RobometerConfig.__post_init__`` uses ``len(tokenizer)`` to compute the
+    deterministic resize target ``len(tokenizer) + len(ROBOMETER_SPECIAL_TOKENS)``,
+    so a working ``__len__`` is all we need.
+    """
+
+    def __init__(self, length: int = _FAKE_TOKENIZER_LEN) -> None:
+        self._length = length
+
+    def __len__(self) -> int:
+        return self._length
+
+
+def _patch_build(monkeypatch) -> None:
+    """Stub out the HF AutoX calls so Robometer construction stays cheap in tests.
+
+    Covers (EO-1 style — no model-side override hooks):
+    * ``AutoConfig.from_pretrained`` (config side) — used by
+      ``RobometerConfig.__post_init__`` to snapshot the backbone config.
+    * ``AutoTokenizer.from_pretrained`` (config side) — used by
+      ``__post_init__`` to compute ``len(tokenizer) + len(ROBOMETER_SPECIAL_TOKENS)``.
+    * ``AutoConfig.for_model``                       — used by
+      ``RobometerConfig.vlm_backbone_config`` when rebuilding for ``from_config``.
+    * ``AutoModelForImageTextToText.from_pretrained`` — fresh-training path
+      (``pretrained_path is None``).
+    * ``AutoModelForImageTextToText.from_config``    — checkpoint-reload path
+      (``pretrained_path`` is set).
+    """
+    from lerobot.rewards.robometer import configuration_robometer, modeling_robometer
+
+    monkeypatch.setattr(
+        modeling_robometer.AutoModelForImageTextToText,
+        "from_pretrained",
+        lambda *args, **kwargs: _FakeBaseModel(hidden_dim=8),
+    )
+    monkeypatch.setattr(
+        modeling_robometer.AutoModelForImageTextToText,
+        "from_config",
+        lambda *args, **kwargs: _FakeBaseModel(hidden_dim=8),
+    )
+    monkeypatch.setattr(
+        configuration_robometer.AutoConfig,
+        "for_model",
+        lambda *args, **kwargs: _FakeQwenConfig(hidden_dim=8),
+    )
+    monkeypatch.setattr(
+        configuration_robometer.AutoConfig,
+        "from_pretrained",
+        lambda *args, **kwargs: _FakeQwenConfig(hidden_dim=8),
+    )
+    monkeypatch.setattr(
+        configuration_robometer.AutoTokenizer,
+        "from_pretrained",
+        lambda *args, **kwargs: _FakeTokenizer(length=_FAKE_TOKENIZER_LEN),
+    )
+
+
+def _make_batch(features: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
+    """Build a `compute_reward`-ready batch using Robometer's namespaced keys."""
+    return {f"{ROBOMETER_FEATURE_PREFIX}{key}": value for key, value in features.items()}
+
+
+@skip_if_package_missing("transformers")
+def test_robometer_config_registered(monkeypatch):
+    _patch_build(monkeypatch)
+    assert "robometer" in RewardModelConfig.get_known_choices()
+    assert RewardModelConfig.get_choice_class("robometer") is RobometerConfig
+    assert isinstance(make_reward_model_config("robometer", device="cpu"), RobometerConfig)
+
+
+def test_robometer_factory_returns_in_tree_class():
+    from lerobot.rewards.robometer.modeling_robometer import RobometerRewardModel
+
+    assert get_reward_model_class("robometer") is RobometerRewardModel
+
+
+def test_convert_bins_to_continuous_returns_expected_values():
+    # Two frames: first peaks at bin 0 (center 0.0), second peaks at bin 9 (center 1.0).
+    bin_logits = torch.full((2, 10), -10.0)
+    bin_logits[0, 0] = 10.0
+    bin_logits[1, -1] = 10.0
+    values = convert_bins_to_continuous(bin_logits)
+    assert values.shape == (2,)
+    assert torch.allclose(values, torch.tensor([0.0, 1.0]), atol=1e-3)
+
+
+def test_decode_progress_outputs_returns_last_frame_values():
+    progress = torch.tensor([[0.1, 0.9], [0.4, 0.6]])
+    success_logits = torch.tensor([[0.0, 5.0], [0.0, -5.0]])
+
+    outputs = decode_progress_outputs(progress, success_logits, is_discrete_mode=False)
+
+    assert outputs["progress_pred"] == [pytest.approx([0.1, 0.9]), pytest.approx([0.4, 0.6])]
+    assert outputs["success_probs"][0][-1] == pytest.approx(torch.sigmoid(torch.tensor(5.0)).item(), abs=1e-3)
+    assert outputs["success_probs"][1][-1] == pytest.approx(
+        torch.sigmoid(torch.tensor(-5.0)).item(), abs=1e-3
+    )
+
+
+def test_decode_progress_outputs_discrete_mode_softmaxes_over_bins():
+    # 2 frames, peaks at bin 0 and bin 9 → continuous predictions 0.0 and 1.0
+    bin_logits = torch.full((1, 2, 10), -10.0)
+    bin_logits[0, 0, 0] = 10.0
+    bin_logits[0, 1, -1] = 10.0
+
+    outputs = decode_progress_outputs(bin_logits, success_logits=None, is_discrete_mode=True)
+
+    assert outputs["success_probs"] == []
+    assert outputs["progress_pred"][0] == pytest.approx([0.0, 1.0], abs=1e-3)
+
+
+@skip_if_package_missing("transformers")
+def test_robometer_post_init_overwrites_vocab_size_with_tokenizer_length(monkeypatch):
+    """``RobometerConfig.__post_init__`` must overwrite the backbone's stale
+    ``text_config.vocab_size`` (which on the real Qwen3-VL config is the
+    padded embedding size, ``151,936``) with ``len(tokenizer)`` plus
+    ``len(ROBOMETER_SPECIAL_TOKENS)``. This is the contract that makes the
+    published ``Robometer-4B`` checkpoint load byte-equivalently."""
+    _patch_build(monkeypatch)
+
+    cfg = RobometerConfig(device="cpu", progress_loss_type="l2")
+
+    assert cfg.vlm_config["text_config"]["vocab_size"] == _EXPECTED_RESIZED_VOCAB
+
+
+@skip_if_package_missing("transformers")
+def test_robometer_compute_reward_reads_pre_encoded_inputs(monkeypatch):
+    from lerobot.rewards.robometer.modeling_robometer import RobometerRewardModel
+
+    progress = torch.tensor([[0.1, 0.9], [0.4, 0.6]])
+    success_logits = torch.tensor([[0.0, 5.0], [0.0, -5.0]])
+    _patch_build(monkeypatch)
+
+    cfg = RobometerConfig(device="cpu", reward_output="progress", progress_loss_type="l2")
+    model = RobometerRewardModel(cfg)
+    # Bypass the Qwen3-VL forward + head extraction with deterministic logits.
+    monkeypatch.setattr(model, "_compute_rbm_logits", lambda _inputs: (progress, success_logits))
+
+    batch = _make_batch({"input_ids": torch.zeros(2, 2, dtype=torch.long)})
+    rewards = model.compute_reward(batch)
+
+    assert torch.allclose(rewards, torch.tensor([0.9, 0.6]))
+
+
+@skip_if_package_missing("transformers")
+def test_robometer_compute_reward_can_return_binary_success(monkeypatch):
+    from lerobot.rewards.robometer.modeling_robometer import RobometerRewardModel
+
+    progress = torch.tensor([[0.1, 0.9], [0.4, 0.6]])
+    success_logits = torch.tensor([[0.0, 5.0], [0.0, -5.0]])  # sigmoid(5) > 0.5; sigmoid(-5) < 0.5
+    _patch_build(monkeypatch)
+
+    cfg = RobometerConfig(
+        device="cpu",
+        reward_output="success",
+        success_threshold=0.5,
+        progress_loss_type="l2",
+    )
+    model = RobometerRewardModel(cfg)
+    monkeypatch.setattr(model, "_compute_rbm_logits", lambda _inputs: (progress, success_logits))
+
+    batch = _make_batch({"input_ids": torch.zeros(2, 2, dtype=torch.long)})
+    rewards = model.compute_reward(batch)
+
+    assert torch.equal(rewards, torch.tensor([1.0, 0.0]))
+
+
+@skip_if_package_missing("transformers")
+def test_robometer_compute_reward_errors_when_inputs_missing(monkeypatch):
+    from lerobot.rewards.robometer.modeling_robometer import RobometerRewardModel
+
+    _patch_build(monkeypatch)
+
+    cfg = RobometerConfig(device="cpu", progress_loss_type="l2")
+    model = RobometerRewardModel(cfg)
+
+    with pytest.raises(KeyError, match=r"observation\.robometer\.input_ids"):
+        model.compute_reward({})
+
+
+@skip_if_package_missing("transformers")
+def test_robometer_save_pretrained_roundtrips(monkeypatch, tmp_path):
+    """Saving and reloading a Robometer model in LeRobot HF format must produce
+    a single ``model.safetensors`` + ``config.json`` (no Hydra ``config.yaml``),
+    must round-trip user-tunable config fields, and must persist all three
+    prediction heads (``progress_head``, ``success_head``, ``preference_head``)
+    so the published ``Robometer-4B`` checkpoint loads byte-equivalently.
+    """
+    from huggingface_hub.constants import CONFIG_NAME, SAFETENSORS_SINGLE_FILE
+    from safetensors.torch import load_file
+
+    from lerobot.rewards.robometer.modeling_robometer import RobometerRewardModel
+
+    _patch_build(monkeypatch)
+    cfg = RobometerConfig(
+        device="cpu",
+        pretrained_path="robometer/Robometer-4B",
+        # Knobs the user might tweak — must survive the round-trip.
+        image_key="observation.images.cam_top",
+        task_key="task",
+        reward_output="success",
+        success_threshold=0.7,
+        progress_loss_type="l2",
+    )
+    model = RobometerRewardModel(cfg)
+    model.save_pretrained(str(tmp_path))
+
+    # Exactly the files LeRobot's HubMixin promises.
+    assert (tmp_path / CONFIG_NAME).exists()
+    assert (tmp_path / SAFETENSORS_SINGLE_FILE).exists()
+    assert not (tmp_path / "config.yaml").exists()  # we want HF-style, not Hydra
+
+    # All three heads must be present in the saved safetensors. The preference
+    # head is unused at inference but the published checkpoint expects its
+    # rows — losing it would silently break weight loading.
+    state = load_file(str(tmp_path / SAFETENSORS_SINGLE_FILE))
+    assert any(k.startswith("progress_head.") for k in state), "progress_head weights missing"
+    assert any(k.startswith("success_head.") for k in state), "success_head weights missing"
+    assert any(k.startswith("preference_head.") for k in state), "preference_head weights missing"
+
+    # Reload from the local directory: no Hub fetch, no YAML overlay. The
+    # base class drives subclass dispatch via the `type` field in config.json.
+    reloaded_cfg = RewardModelConfig.from_pretrained(str(tmp_path))
+    assert isinstance(reloaded_cfg, RobometerConfig)
+    reloaded_cfg.pretrained_path = str(tmp_path)  # mimic lerobot-train's `validate()`
+    reloaded = RobometerRewardModel.from_pretrained(str(tmp_path), config=reloaded_cfg)
+
+    assert reloaded.config.image_key == "observation.images.cam_top"
+    assert reloaded.config.task_key == "task"
+    assert reloaded.config.reward_output == "success"
+    assert reloaded.config.success_threshold == 0.7
+    assert reloaded.config.progress_loss_type == "l2"  # came back from config.json
diff --git a/tests/rewards/test_robometer_processor.py b/tests/rewards/test_robometer_processor.py
new file mode 100644
index 000000000..cba8ad564
--- /dev/null
+++ b/tests/rewards/test_robometer_processor.py
@@ -0,0 +1,354 @@
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for Robometer's pre-processing helpers and encoder step.
+
+Covers the pure helpers (``_video_to_numpy``, ``_expand_tasks``,
+``_frames_to_pil``) directly, and exercises :class:`RobometerEncoderProcessorStep`
+with a stubbed ``AutoProcessor`` so we don't need to download Qwen-VL to test
+the step's plumbing (``transform_features`` / ``get_config`` / ``_build_conversation``).
+
+The full ``__call__`` path that runs ``process_vision_info`` + the Qwen
+processor is intentionally *not* covered here — it is essentially HF glue
+that's exercised by the integration / parity scripts.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+import numpy as np
+import pytest
+import torch
+
+from lerobot.configs import FeatureType, PipelineFeatureType, PolicyFeature
+from lerobot.rewards.robometer.processor_robometer import (
+    PROGRESS_PROMPT,
+    _expand_tasks,
+    _frames_to_pil,
+    _video_to_numpy,
+)
+from tests.utils import skip_if_package_missing
+
+
+def _skip_if_robometer_extras_missing(func):
+    """Apply both optional-dependency guards in one shot.
+
+    ``RobometerEncoderProcessorStep.__post_init__`` calls
+    ``require_package("transformers", ...)`` *and*
+    ``require_package("qwen-vl-utils", ...)``, so both need to be present
+    before we can instantiate the step.
+    """
+    func = skip_if_package_missing("qwen-vl-utils", import_name="qwen_vl_utils")(func)
+    func = skip_if_package_missing("transformers")(func)
+    return func
+
+
+# ---------------------------------------------------------------------------
+# _video_to_numpy — pure tensor → uint8 (T, H, W, C) conversion
+# ---------------------------------------------------------------------------
+
+
+def test_video_to_numpy_chw_float_is_converted_to_thwc_uint8():
+    video = torch.rand(4, 3, 8, 8)  # (T, C, H, W) floats in [0, 1]
+    array = _video_to_numpy(video, max_frames=None)
+
+    assert array.shape == (4, 8, 8, 3)
+    assert array.dtype == np.uint8
+    assert array.min() >= 0 and array.max() <= 255
+
+
+def test_video_to_numpy_already_thwc_uint8_passes_through():
+    video = torch.randint(0, 256, (3, 8, 8, 3), dtype=torch.uint8)  # (T, H, W, C)
+    array = _video_to_numpy(video, max_frames=None)
+
+    assert array.shape == (3, 8, 8, 3)
+    assert array.dtype == np.uint8
+
+
+def test_video_to_numpy_max_frames_tail_crops_recent_frames():
+    """``max_frames`` should keep the **last** K frames (most recent)."""
+    video = torch.zeros(10, 3, 4, 4)
+    for t in range(10):
+        video[t] = t / 9.0  # marker: 0 at t=0, ≈1 at t=9
+
+    array = _video_to_numpy(video, max_frames=3)
+
+    assert array.shape == (3, 4, 4, 3)
+    # The first kept frame is t=7 → marker ≈ 7/9 → uint8 ≈ 198
+    assert int(array[0, 0, 0, 0]) == int(round(7 / 9 * 255))
+    # The last kept frame is t=9 → marker = 1.0 → uint8 = 255
+    assert int(array[-1, 0, 0, 0]) == 255
+
+
+def test_video_to_numpy_rejects_3d_input():
+    with pytest.raises(ValueError, match="Expected channel dim"):
+        _video_to_numpy(torch.zeros(4, 8, 8), max_frames=None)
+
+
+def test_video_to_numpy_floats_above_one_pass_through_without_rescaling():
+    """If ``array.max() > 1`` the helper assumes the tensor is already in the
+    [0, 255] range (uint8-as-float), so values pass through unchanged."""
+    video = torch.full((1, 3, 2, 2), 5.0)
+    array = _video_to_numpy(video, max_frames=None)
+
+    assert array.shape == (1, 2, 2, 3)
+    assert int(array.max()) == 5
+
+
+def test_video_to_numpy_clips_very_large_floats_to_uint8_max():
+    """Out-of-uint8-range floats are clipped at 255 before the cast."""
+    video = torch.full((1, 3, 2, 2), 300.0)
+    array = _video_to_numpy(video, max_frames=None)
+
+    assert int(array.max()) == 255
+
+
+# ---------------------------------------------------------------------------
+# _expand_tasks — string / list / tuple broadcasting to batch size
+# ---------------------------------------------------------------------------
+
+
+def test_expand_tasks_string_is_broadcast_to_batch_size():
+    assert _expand_tasks("pick up", batch_size=3, default=None) == ["pick up", "pick up", "pick up"]
+
+
+def test_expand_tasks_list_of_matching_size_passes_through():
+    assert _expand_tasks(["a", "b", "c"], batch_size=3, default=None) == ["a", "b", "c"]
+
+
+def test_expand_tasks_tuple_is_normalised_to_list():
+    assert _expand_tasks(("a", "b"), batch_size=2, default=None) == ["a", "b"]
+
+
+def test_expand_tasks_single_element_list_is_broadcast():
+    assert _expand_tasks(["only one"], batch_size=3, default=None) == ["only one"] * 3
+
+
+def test_expand_tasks_size_mismatch_raises():
+    with pytest.raises(ValueError, match="Expected 3 tasks"):
+        _expand_tasks(["a", "b"], batch_size=3, default=None)
+
+
+def test_expand_tasks_missing_uses_default():
+    assert _expand_tasks(None, batch_size=2, default="fallback") == ["fallback", "fallback"]
+
+
+def test_expand_tasks_missing_without_default_raises():
+    with pytest.raises(KeyError, match="task description"):
+        _expand_tasks(None, batch_size=1, default=None)
+
+
+def test_expand_tasks_wrong_type_raises():
+    with pytest.raises(TypeError, match="must be a string or list"):
+        _expand_tasks(42, batch_size=1, default=None)
+
+
+# ---------------------------------------------------------------------------
+# _frames_to_pil — uint8 (T, H, W, C) → list[PIL.Image]
+# ---------------------------------------------------------------------------
+
+
+def test_frames_to_pil_returns_one_image_per_frame():
+    frames = np.zeros((4, 8, 8, 3), dtype=np.uint8)
+    images = _frames_to_pil(frames)
+
+    assert len(images) == 4
+    assert all(img.size == (8, 8) for img in images)
+
+
+def test_frames_to_pil_casts_floats_to_uint8():
+    frames = np.full((2, 4, 4, 3), 200.0, dtype=np.float32)
+    images = _frames_to_pil(frames)
+
+    assert len(images) == 2
+    # Only the dtype is asserted: the float frames must come back as uint8 images.
+    assert np.asarray(images[0]).dtype == np.uint8
+
+
+def test_frames_to_pil_rejects_non_4d_input():
+    with pytest.raises(ValueError, match=r"\(T,H,W,C\)"):
+        _frames_to_pil(np.zeros((4, 8, 8), dtype=np.uint8))
+
+
+# ---------------------------------------------------------------------------
+# Encoder step plumbing — exercise dataclass surface with a stubbed AutoProcessor
+# ---------------------------------------------------------------------------
+
+
+class _FakeTokenizer:
+    """Tokenizer surface the encoder step touches in ``__post_init__``."""
+
+    def __init__(self) -> None:
+        self.pad_token: str | None = None
+        self.eos_token = "<|endoftext|>"
+        self._vocab: dict[str, int] = {"<|endoftext|>": 0}
+        self.added: list[str] = []
+
+    def get_vocab(self) -> dict[str, int]:
+        return self._vocab
+
+    def add_special_tokens(self, payload: dict[str, Any]) -> int:
+        for token in payload.get("additional_special_tokens", []):
+            if token not in self._vocab:
+                self._vocab[token] = len(self._vocab)
+                self.added.append(token)
+        return len(self.added)
+
+
+class _FakeAutoProcessor:
+    """Stand-in returned by ``AutoProcessor.from_pretrained`` during tests."""
+
+    def __init__(self) -> None:
+        self.tokenizer = _FakeTokenizer()
+        self.image_processor = None
+        self.video_processor = None
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):  # noqa: ARG003
+        return cls()
+
+
+def _build_step(monkeypatch, **overrides):
+    from lerobot.rewards.robometer import processor_robometer
+
+    monkeypatch.setattr(processor_robometer, "AutoProcessor", _FakeAutoProcessor)
+
+    return processor_robometer.RobometerEncoderProcessorStep(**overrides)
+
+
+@_skip_if_robometer_extras_missing
+def test_encoder_step_registers_special_tokens_on_tokenizer(monkeypatch):
+    """``__post_init__`` must register Robometer's five special tokens on the
+    tokenizer that ships with the chosen Qwen-VL checkpoint."""
+    from lerobot.rewards.robometer.configuration_robometer import ROBOMETER_SPECIAL_TOKENS
+
+    step = _build_step(monkeypatch)
+
+    vocab = step._processor.tokenizer.get_vocab()
+    for token in ROBOMETER_SPECIAL_TOKENS:
+        assert token in vocab, f"{token} not registered on the tokenizer"
+
+
+@_skip_if_robometer_extras_missing
+def test_encoder_step_sets_pad_token_to_eos_when_missing(monkeypatch):
+    """Qwen tokenizers ship without a pad token; the step must reuse EOS so
+    batched processing doesn't crash on padding."""
+    step = _build_step(monkeypatch)
+
+    assert step._processor.tokenizer.pad_token == "<|endoftext|>"
+
+
+@_skip_if_robometer_extras_missing
+def test_encoder_step_get_config_roundtrips_user_fields(monkeypatch):
+    """``get_config`` must serialise every user-tunable field — these are what
+    the processor pipeline saves under ``preprocessor_config.json``."""
+    step = _build_step(
+        monkeypatch,
+        base_model_id="Qwen/Qwen3-VL-4B-Instruct",
+        image_key="observation.images.cam_top",
+        task_key="task",
+        default_task="do the thing",
+        max_frames=12,
+        use_multi_image=True,
+        use_per_frame_progress_token=True,
+        max_length=2048,
+    )
+
+    cfg = step.get_config()
+    assert cfg == {
+        "base_model_id": "Qwen/Qwen3-VL-4B-Instruct",
+        "image_key": "observation.images.cam_top",
+        "task_key": "task",
+        "default_task": "do the thing",
+        "max_frames": 12,
+        "use_multi_image": True,
+        "use_per_frame_progress_token": True,
+        "max_length": 2048,
+    }
+
+
+@_skip_if_robometer_extras_missing
+def test_encoder_step_transform_features_is_identity(monkeypatch):
+    """The encoder step writes Qwen tensors into ``observation`` at call time,
+    but it does **not** advertise new typed features at pipeline-build time —
+    the downstream model consumes them via the ``ROBOMETER_FEATURE_PREFIX``
+    namespace, not via the typed feature map.
+    """
+    step = _build_step(monkeypatch)
+
+    features = {
+        PipelineFeatureType.OBSERVATION: {
+            "observation.images.top": PolicyFeature(shape=(3, 224, 224), type=FeatureType.VISUAL),
+        }
+    }
+    assert step.transform_features(features) == features
+
+
+@_skip_if_robometer_extras_missing
+def test_encoder_step_build_conversation_inserts_prog_token_per_frame(monkeypatch):
+    """In multi-image mode with per-frame progress tokens, the conversation
+    must alternate ``image`` and ``<|prog_token|>`` text entries, one pair
+    per frame, after the task prompt."""
+    step = _build_step(
+        monkeypatch,
+        use_multi_image=True,
+        use_per_frame_progress_token=True,
+    )
+
+    frames = np.zeros((3, 8, 8, 3), dtype=np.uint8)
+    conversation = step._build_conversation(frames, task="pick up the cube")
+
+    assert len(conversation) == 1 and conversation[0]["role"] == "user"
+    content = conversation[0]["content"]
+
+    # First entry is the task prompt.
+    assert content[0] == {"type": "text", "text": PROGRESS_PROMPT.format(task="pick up the cube")}
+
+    # Then 3 (image, <|prog_token|>) pairs.
+    expected_tail = [
+        item
+        for _ in range(3)
+        for item in (
+            {"type": "image"},  # payload not compared; only "type" is checked below
+            {"type": "text", "text": "<|prog_token|>"},
+        )
+    ]
+    assert len(content) == 1 + len(expected_tail)
+    for got, exp in zip(content[1:], expected_tail, strict=True):
+        assert got["type"] == exp["type"]
+        if exp["type"] == "text":
+            assert got["text"] == exp["text"]
+
+
+@_skip_if_robometer_extras_missing
+def test_encoder_step_build_conversation_video_mode_uses_single_video_entry(monkeypatch):
+    """When ``use_multi_image=False``, frames are bundled into a single
+    ``video`` content entry instead of individual ``image`` entries."""
+    step = _build_step(
+        monkeypatch,
+        use_multi_image=False,
+        use_per_frame_progress_token=False,
+    )
+
+    frames = np.zeros((4, 8, 8, 3), dtype=np.uint8)
+    conversation = step._build_conversation(frames, task="pour the water")
+
+    content = conversation[0]["content"]
+    # Exactly two entries: the prompt and one video entry.
+    assert len(content) == 2
+    assert content[0]["type"] == "text"
+    assert content[1]["type"] == "video"
+    # The video entry carries all four frames.
+    assert len(content[1]["video"]) == 4
diff --git a/uv.lock b/uv.lock
index eebbb7f95..fbcdf1a83 100644
--- a/uv.lock
+++ b/uv.lock
@@ -2989,6 +2989,11 @@ rebot = [
     { name = "motorbridge" },
     { name = "motorbridge-smart-servo" },
 ]
+robometer = [
+    { name = "peft" },
+    { name = "qwen-vl-utils" },
+    { name = "transformers" },
+]
 robstride = [
     { name = "python-can" },
 ]
@@ -3146,6 +3151,7 @@ requires-dist = [
     { name = "lerobot", extras = ["peft-dep"], marker = "extra == 'groot'" },
     { name = "lerobot", extras = ["peft-dep"], marker = "extra == 'molmoact2'" },
     { name = "lerobot", extras = ["peft-dep"], marker = "extra == 'peft'" },
+    { name = "lerobot", extras = ["peft-dep"], marker = "extra == 'robometer'" },
     { name = "lerobot", extras = ["peft-dep"], marker = "extra == 'wallx'" },
     { name = "lerobot", extras = ["phone"], marker = "extra == 'all'" },
     { name = "lerobot", extras = ["pi"], marker = "extra == 'all'" },
@@ -3163,10 +3169,12 @@ requires-dist = [
     { name = "lerobot", extras = ["pyzmq-dep"], marker = "extra == 'lekiwi'" },
     { name = "lerobot", extras = ["pyzmq-dep"], marker = "extra == 'unitree-g1'" },
     { name = "lerobot", extras = ["qwen-vl-utils-dep"], marker = "extra == 'eo1'" },
+    { name = "lerobot", extras = ["qwen-vl-utils-dep"], marker = "extra == 'robometer'" },
     { name = "lerobot", extras = ["qwen-vl-utils-dep"], marker = "extra == 'sarm'" },
     { name = "lerobot", extras = ["qwen-vl-utils-dep"], marker = "extra == 'wallx'" },
     { name = "lerobot", extras = ["reachy2"], marker = "extra == 'all'" },
     { name = "lerobot", extras = ["rebot"], marker = "extra == 'all'" },
+    { name = "lerobot", extras = ["robometer"], marker = "extra == 'all'" },
     { name = "lerobot", extras = ["robstride"], marker = "extra == 'all'" },
     { name = "lerobot", extras = ["sarm"], marker = "extra == 'all'" },
     { name = "lerobot", extras = ["scipy-dep"], marker = "extra == 'aloha'" },
@@ -3188,6 +3196,7 @@ requires-dist = [
     { name = "lerobot", extras = ["transformers-dep"], marker = "extra == 'multi-task-dit'" },
     { name = "lerobot", extras = ["transformers-dep"], marker = "extra == 'peft'" },
     { name = "lerobot", extras = ["transformers-dep"], marker = "extra == 'pi'" },
+    { name = "lerobot", extras = ["transformers-dep"], marker = "extra == 'robometer'" },
     { name = "lerobot", extras = ["transformers-dep"], marker = "extra == 'sarm'" },
     { name = "lerobot", extras = ["transformers-dep"], marker = "extra == 'smolvla'" },
     { name = "lerobot", extras = ["transformers-dep"], marker = "extra == 'topreward'" },
@@ -3258,7 +3267,7 @@ requires-dist = [
     { name = "transformers", marker = "extra == 'transformers-dep'", specifier = ">=5.4.0,<5.6.0" },
     { name = "wandb", marker = "extra == 'training'", specifier = ">=0.24.0,<0.25.0" },
 ]
-provides-extras = ["dataset", "training", "hardware", "viz", "core-scripts", "evaluation", "dataset-viz", "av-dep", "pygame-dep", "placo-dep", "transformers-dep", "grpcio-dep", "can-dep", "peft-dep", "scipy-dep", "diffusers-dep", "qwen-vl-utils-dep", "matplotlib-dep", "pyserial-dep", "deepdiff-dep", "pynput-dep", "pyzmq-dep", "motorbridge-dep", "motorbridge-smart-servo-dep", "feetech", "dynamixel", "damiao", "robstride", "openarms", "gamepad", "hopejr", "lekiwi", "unitree-g1", "reachy2", "rebot", "kinematics", "intelrealsense", "phone", "diffusion", "wallx", "pi", "molmoact2", "smolvla", "multi-task-dit", "groot", "sarm", "topreward", "xvla", "eo1", "hilserl", "async", "peft", "dev", "notebook", "test", "video-benchmark", "aloha", "pusht", "libero", "metaworld", "all"]
+provides-extras = ["dataset", "training", "hardware", "viz", "core-scripts", "evaluation", "dataset-viz", "av-dep", "pygame-dep", "placo-dep", "transformers-dep", "grpcio-dep", "can-dep", "peft-dep", "scipy-dep", "diffusers-dep", "qwen-vl-utils-dep", "matplotlib-dep", "pyserial-dep", "deepdiff-dep", "pynput-dep", "pyzmq-dep", "motorbridge-dep", "motorbridge-smart-servo-dep", "feetech", "dynamixel", "damiao", "robstride", "openarms", "gamepad", "hopejr", "lekiwi", "unitree-g1", "reachy2", "rebot", "kinematics", "intelrealsense", "phone", "diffusion", "wallx", "pi", "molmoact2", "smolvla", "multi-task-dit", "groot", "sarm", "robometer", "topreward", "xvla", "eo1", "hilserl", "async", "peft", "dev", "notebook", "test", "video-benchmark", "aloha", "pusht", "libero", "metaworld", "all"]
 
 [[package]]
 name = "librt"

From 2bfaf44db2c1b559df788114e433e440a2391b24 Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Tue, 2 Jun 2026 10:01:47 +0200
Subject: [PATCH 08/45] annotations(steerable): structured action records +
 5-axis task augmentation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

EgoMimic-inspired additions to the plan module, both opt-in for back-compat.

1. PHASE 1a + 1b: per-subtask structured action records
   * cfg.action_records.enabled=True triggers, after Phase 1 subtask-span
     generation, one extra VLM call per subtask to extract a typed record:
       {verb, object, arm, grasp_type, destination, mistake}
   * A deterministic Python template (_render_action_record_to_subtask_text)
     renders the record back to canonical subtask text. When
     replace_subtask_text=True (default), this REPLACES the VLM's free-form
     text — eliminates cross-episode phrasing drift.
   * When emit_record_row=True (default), the structured record is also
     emitted as a row with style='action_record' (added to PERSISTENT_STYLES)
     so downstream training can consume the typed schema directly.
   * Verb + grasp vocabularies are configurable. Out-of-vocab values are
     rejected at extraction time.

2. STRUCTURED 5-AXIS TASK AUGMENTATION
   * cfg.task_aug_axes.enabled=True replaces the free-form n_task_rephrasings
     path with a structured prompt producing variants along 5 named axes:
       synonym_paraphrase (3)
       omit_arm           (3)
       omit_orientation   (2)
       omit_grasp_method  (2)
       combined_omissions (2)
     Total ~12 variants. Axes with nothing to omit emit fewer entries.
   * Each variant is emitted as a task_aug row at t=0 (existing style).

Inspired by https://github.com/GaTech-RL2/EgoVerse/tree/main/egomimic/scripts/language_process
— they pay Scale AI annotators to fill a structured form and then generate
language via a deterministic prompt. We get the same hallucination-reducing
structure via one extra VLM call per subtask.

Files:
  src/lerobot/datasets/language.py
  src/lerobot/annotations/steerable_pipeline/config.py
  src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
  src/lerobot/annotations/steerable_pipeline/prompts/module_1_action_record.txt
  src/lerobot/annotations/steerable_pipeline/prompts/module_1_task_aug_axes.txt

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../annotations/steerable_pipeline/config.py  | 130 +++++++++
 .../modules/plan_subtasks_memory.py           | 263 +++++++++++++++++-
 .../prompts/module_1_action_record.txt        |  64 +++++
 .../prompts/module_1_task_aug_axes.txt        |  60 ++++
 src/lerobot/datasets/language.py              |   2 +-
 5 files changed, 514 insertions(+), 5 deletions(-)
 create mode 100644 src/lerobot/annotations/steerable_pipeline/prompts/module_1_action_record.txt
 create mode 100644 src/lerobot/annotations/steerable_pipeline/prompts/module_1_task_aug_axes.txt

diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py
index da07d7998..cc6402f08 100644
--- a/src/lerobot/annotations/steerable_pipeline/config.py
+++ b/src/lerobot/annotations/steerable_pipeline/config.py
@@ -92,6 +92,136 @@ class PlanConfig:
     use_video_url: bool = False
     use_video_url_fps: float = 1.0
 
+    # Structured per-subtask action records (Phase 1a + 1b, inspired by
+    # EgoMimic's annotator form). For each generated subtask span, the
+    # VLM extracts a typed record (verb / object / arm / grasp_type /
+    # destination / mistake). A deterministic Python template renders
+    # that record back to canonical subtask text — reducing the VLM's
+    # "creative" surface to just the perception step. See
+    # ``ActionRecordsConfig`` for details. Off by default (back-compat).
+    action_records: "ActionRecordsConfig" = field(default_factory=lambda: ActionRecordsConfig())
+
+    # Structured 5-axis augmentation taxonomy for the t=0 task variants
+    # (replaces the free-form ``n_task_rephrasings`` flow when enabled).
+    # Mirrors EgoMimic's ``augment_prompt.txt`` taxonomy: instead of N
+    # free-form rephrasings, the VLM produces variants along named
+    # axes (synonym / omit_arm / omit_orientation / omit_grasp_method /
+    # combined). Off by default (back-compat).
+    task_aug_axes: "TaskAugAxesConfig" = field(default_factory=lambda: TaskAugAxesConfig())
+
+
+@dataclass
+class ActionRecordsConfig:
+    """Structured per-subtask action record extraction.
+
+    When ``enabled=True``, after the existing subtask-span generation in
+    ``plan_subtasks_memory.py``, the module makes one extra VLM call per
+    subtask to extract a typed record::
+
+        {
+          "verb": "pick" | "place" | "press" | ...,    # closed vocabulary
+          "object": "<canonical_object_name>",
+          "arm": "left" | "right" | "both" | null,
+          "grasp_type": "pinch" | "wrap" | "hook" | ... | null,
+          "destination": "<canonical_destination>" | null,
+          "mistake": "<short text>" | null,
+        }
+
+    A deterministic Python template then renders the record back to
+    canonical subtask text (e.g. ``pick blue cube with left arm using
+    pinch grip``). When ``replace_subtask_text=True`` (default), the
+    rendered text REPLACES the VLM's free-form subtask text — eliminating
+    cross-episode phrasing drift. When ``emit_record_row=True``
+    (default), the structured record is also emitted as a row with
+    ``style="action_record"`` so downstream consumers can train on the
+    typed schema directly.
+
+    Cost: one extra VLM call per subtask. For an 8-subtask episode this
+    means 8 additional VLM calls in the plan module — still cheap relative
+    to the action-expert training cost, but worth knowing.
+    """
+
+    enabled: bool = False
+
+    # When True, replace the VLM-generated subtask text with the
+    # deterministic template's rendering of the structured record.
+    # Strongly recommended — it's the whole point of the structured
+    # intermediate. Set False to keep both representations side by side.
+    replace_subtask_text: bool = True
+
+    # When True, emit a separate row with ``style="action_record"`` and
+    # ``content=json.dumps(record)`` at the subtask's start timestamp.
+    # Lets downstream training consume the typed schema directly (e.g.
+    # auxiliary supervision on verb/arm/grasp classification heads).
+    emit_record_row: bool = True
+
+    # Frame sampling for the per-subtask VLM call (similar to the
+    # interjection module's window). Anchored to the subtask span.
+    frames_per_subtask: int = 4
+
+    # Closed verb vocabulary. The prompt instructs the VLM to pick
+    # exactly one. Override per-dataset (e.g. ``["pick", "place", "open",
+    # "close"]`` for door-only manipulation) for tighter constraint.
+    verb_vocabulary: tuple[str, ...] = (
+        "pick", "place", "push", "pull", "open", "close", "turn",
+        "press", "lift", "insert", "pour", "move", "reach", "grasp",
+        "release", "wipe", "dump",
+    )
+
+    # Closed grasp-type vocabulary. ``null`` is always allowed (no
+    # contact / unclear). Adjust per-hardware (e.g. drop ``hook`` /
+    # ``key`` for parallel-jaw grippers).
+    grasp_vocabulary: tuple[str, ...] = (
+        "pinch", "wrap", "hook", "key", "lateral",
+    )
+
+
+@dataclass
+class TaskAugAxesConfig:
+    """Structured 5-axis augmentation taxonomy for t=0 task variants.
+
+    When ``enabled=True``, replaces the free-form ``n_task_rephrasings``
+    flow with a structured prompt that produces variants along five
+    named axes (mirroring EgoMimic's ``augment_prompt.txt``):
+
+      * ``synonym_paraphrase`` — different wording / verbs, all
+        information preserved.
+      * ``omit_arm`` — drop the left/right/both arm specification.
+      * ``omit_orientation`` — drop orientation cues (upright,
+        sideways, ...).
+      * ``omit_grasp_method`` — drop grip / grasp method specification.
+      * ``combined_omissions`` — combine two of the above
+        simultaneously.
+
+    Default counts (3+3+2+2+2 = 12 variants per task) match EgoMimic.
+    Axes that have nothing to omit in the source task (e.g. ``omit_arm``
+    when the task doesn't mention an arm) emit fewer entries rather
+    than pad — the prompt instructs the VLM accordingly.
+
+    Each variant is emitted as a ``task_aug`` row at ``t=0`` (same
+    style as the free-form variants), so the rest of the pipeline /
+    training recipe doesn't need to know about the taxonomy.
+    """
+
+    enabled: bool = False
+
+    synonym_paraphrase: int = 3
+    omit_arm: int = 3
+    omit_orientation: int = 2
+    omit_grasp_method: int = 2
+    combined_omissions: int = 2
+
+    @property
+    def total(self) -> int:
+        """Sum of requested variants across all axes (upper bound)."""
+        return (
+            self.synonym_paraphrase
+            + self.omit_arm
+            + self.omit_orientation
+            + self.omit_grasp_method
+            + self.combined_omissions
+        )
+
 
 @dataclass
 class InterjectionsConfig:
diff --git a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
index b9bae607e..46d678fd6 100644
--- a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
+++ b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
@@ -17,6 +17,7 @@
 
 from __future__ import annotations
 
+import json
 import logging
 from collections.abc import Sequence
 from dataclasses import dataclass, field
@@ -28,6 +29,7 @@ from ..frames import (
     FrameProvider,
     VideoFrameProvider,
     null_provider,
+    to_image_blocks,
     to_video_block,
     to_video_url_block,
 )
@@ -78,13 +80,37 @@ class PlanSubtasksMemoryModule:
         # ``task_aug`` rows at t=0 (role=user), one per rephrasing — the
         # message renderer rotates ``${task}`` deterministically through
         # them so the policy sees diverse phrasings during training.
+        # Two paths:
+        #   * ``task_aug_axes.enabled=True`` — structured 5-axis taxonomy
+        #     (synonym / omit_arm / omit_orientation / omit_grasp_method
+        #     / combined). Replaces the free-form rephrasings flow.
+        #   * Otherwise — free-form ``n_task_rephrasings`` (original).
         t0 = float(record.frame_timestamps[0]) if record.frame_timestamps else 0.0
-        if self.config.n_task_rephrasings > 0 and effective_task:
+        axes_cfg = self.config.task_aug_axes
+        if axes_cfg.enabled and effective_task:
+            variants = self._generate_task_aug_by_axes(effective_task, axes_cfg)
+            seen: set[str] = set()
+            ordered = [effective_task, *variants]
+            for phrasing in ordered:
+                key = phrasing.strip()
+                if not key or key in seen:
+                    continue
+                seen.add(key)
+                rows.append(
+                    {
+                        "role": "user",
+                        "content": key,
+                        "style": "task_aug",
+                        "timestamp": t0,
+                        "tool_calls": None,
+                    }
+                )
+        elif self.config.n_task_rephrasings > 0 and effective_task:
             rephrasings = self._generate_task_rephrasings(effective_task, n=self.config.n_task_rephrasings)
             # Always include the effective task itself as the first variant
             # so the rotation is guaranteed to cover the source-of-truth
             # phrasing, not just synthetic alternatives.
-            seen: set[str] = set()
+            seen = set()
             ordered = [effective_task, *rephrasings]
             for phrasing in ordered:
                 key = phrasing.strip()
@@ -102,8 +128,31 @@ class PlanSubtasksMemoryModule:
                 )
 
         subtask_spans = self._generate_subtasks(record, task=effective_task)
-        # subtask rows
-        for span in subtask_spans:
+
+        # ----------------------------------------------------------------
+        # Phase 1a + 1b: structured per-subtask action records
+        # ----------------------------------------------------------------
+        # When enabled, for every subtask span we ask the VLM for a typed
+        # ActionRecord (verb / object / arm / grasp_type / destination /
+        # mistake). A deterministic Python template renders the record
+        # back to canonical subtask text. The render replaces the
+        # free-form subtask text (cleaner conditioning) and the typed
+        # record is emitted as a separate row for downstream use.
+        records_cfg = self.config.action_records
+        action_records: list[dict[str, Any] | None] = [None] * len(subtask_spans)
+        if records_cfg.enabled and subtask_spans:
+            for i, span in enumerate(subtask_spans):
+                rec = self._extract_action_record(record, span, effective_task)
+                if rec is None:
+                    continue
+                action_records[i] = rec
+                if records_cfg.replace_subtask_text:
+                    canonical_text = self._render_action_record_to_subtask_text(rec)
+                    if canonical_text:
+                        span["text"] = canonical_text
+
+        # subtask rows (may now reflect canonical-rendered text)
+        for i, span in enumerate(subtask_spans):
             rows.append(
                 {
                     "role": "assistant",
@@ -113,6 +162,16 @@ class PlanSubtasksMemoryModule:
                     "tool_calls": None,
                 }
             )
+            if records_cfg.enabled and records_cfg.emit_record_row and action_records[i] is not None:
+                rows.append(
+                    {
+                        "role": "assistant",
+                        "content": json.dumps(action_records[i], sort_keys=True),
+                        "style": "action_record",
+                        "timestamp": snap_to_frame(span["start"], record.frame_timestamps),
+                        "tool_calls": None,
+                    }
+                )
         # Plan rows at every subtask boundary — including t=0 (start of
         # the first subtask). Because the plan is just a numbered list
         # of *still-todo* subtasks, re-emitting at each boundary makes
@@ -244,6 +303,202 @@ class PlanSubtasksMemoryModule:
         out = [item.strip().strip('"').strip("'") for item in raw if isinstance(item, str)]
         return [s for s in out if s][:n]
 
+    # ------------------------------------------------------------------
+    # Phase 1a + 1b: structured per-subtask action records
+    # ------------------------------------------------------------------
+
+    def _extract_action_record(
+        self,
+        record: EpisodeRecord,
+        span: dict[str, Any],
+        episode_task: str,
+    ) -> dict[str, Any] | None:
+        """Ask the VLM to extract a typed ``ActionRecord`` from a subtask span.
+
+        Requests ``frames_per_subtask`` timestamps uniformly spaced over
+        ``[span.start, span.end]`` plus the span's current subtask text.
+        ``verb`` must be a string from ``cfg.verb_vocabulary`` and
+        ``object`` a non-empty string, otherwise the record is rejected;
+        out-of-vocabulary or non-string optional fields become ``None``.
+
+        Returns ``None`` when no frames are available or the VLM reply is
+        not a usable record; callers fall back to the free-form subtask text.
+        """
+        cfg = self.config.action_records
+        start_t = float(span.get("start", 0.0))
+        end_t = float(span.get("end", start_t))
+        duration = max(0.0, end_t - start_t)
+
+        # Uniform timestamps within the span; fall back to a single
+        # center frame for very short spans.
+        n = max(1, int(cfg.frames_per_subtask))
+        if n == 1 or duration <= 0.0:
+            timestamps = [0.5 * (start_t + end_t)]
+        else:
+            step = duration / (n - 1)
+            timestamps = [start_t + i * step for i in range(n)]
+        frames = self.frame_provider.frames_at(record, timestamps)
+        if not frames:
+            logger.debug(
+                "action_record: no frames at span %.2f-%.2f for ep %s; skipping",
+                start_t, end_t, record.episode_index,
+            )
+            return None
+
+        prompt = load_prompt("module_1_action_record").format(
+            episode_task=episode_task,
+            subtask_text=span.get("text", ""),
+            start_time=start_t,
+            end_time=end_t,
+            duration=duration,
+            n_frames=len(frames),
+            verb_vocabulary=", ".join(cfg.verb_vocabulary),
+            grasp_vocabulary=" | ".join(f'"{g}"' for g in cfg.grasp_vocabulary),
+        )
+        message = [
+            {
+                "role": "user",
+                "content": [*to_image_blocks(frames), {"type": "text", "text": prompt}],
+            }
+        ]
+        result = self.vlm.generate_json([message])[0]
+        if not isinstance(result, dict):
+            return None
+
+        # Light validation + normalisation. Verb and object are required
+        # strings (any other JSON type rejects the record instead of
+        # raising); grasp_type / arm are clamped to their vocabularies.
+        verb, obj = result.get("verb"), result.get("object")
+        if not isinstance(verb, str) or not isinstance(obj, str):
+            return None
+        verb, obj = verb.strip().lower(), obj.strip()
+        if not verb or not obj or verb not in {v.lower() for v in cfg.verb_vocabulary}:
+            return None
+        grasp = result.get("grasp_type")
+        if isinstance(grasp, str):
+            grasp = grasp.strip().lower()
+            if grasp not in {g.lower() for g in cfg.grasp_vocabulary}:
+                grasp = None
+        else:
+            grasp = None
+        arm = result.get("arm")
+        if isinstance(arm, str):
+            arm = arm.strip().lower()
+            if arm not in {"left", "right", "both"}:
+                arm = None
+        else:
+            arm = None
+        destination = result.get("destination")
+        destination = destination.strip() if isinstance(destination, str) and destination.strip() else None
+        mistake = result.get("mistake")
+        mistake = mistake.strip() if isinstance(mistake, str) and mistake.strip() else None
+
+        return {
+            "verb": verb,
+            "object": obj,
+            "arm": arm,
+            "grasp_type": grasp,
+            "destination": destination,
+            "mistake": mistake,
+        }
+
+    @staticmethod
+    def _render_action_record_to_subtask_text(record: dict[str, Any]) -> str:
+        """Deterministic template: ``ActionRecord`` → canonical subtask text.
+
+        Field order is ``verb object [destination] [arm] [grasp]``, joined by
+        single spaces; empty fields are skipped and a missing verb yields ``""``.
+        Style follows the authoring guidance in ``module_1_subtasks.txt``.
+
+        Examples (record → rendered text)::
+
+            {verb=pick, object=blue cube}
+                → "pick blue cube"
+            {verb=pick, object=blue cube, arm=left, grasp_type=pinch}
+                → "pick blue cube with left arm using pinch grip"
+            {verb=place, object=blue cube, destination=green box}
+                → "place blue cube in green box"
+            {verb=move, object=mug, destination=stove}
+                → "move mug to stove"
+        """
+        verb = (record.get("verb") or "").strip().lower()
+        obj = (record.get("object") or "").strip()
+        arm = (record.get("arm") or "").strip().lower() if record.get("arm") else ""
+        grasp = (record.get("grasp_type") or "").strip().lower() if record.get("grasp_type") else ""
+        dest = (record.get("destination") or "").strip() if record.get("destination") else ""
+
+        if not verb:
+            return ""
+
+        parts: list[str] = [verb]
+        if obj:
+            parts.append(obj)
+        if dest:
+            # Preposition by verb family: "in" for placement, "to" for motion, "at" otherwise.
+            if verb in {"place", "put", "drop", "insert", "pour", "dump"}:
+                parts.append(f"in {dest}")
+            elif verb in {"move", "transport", "reach"}:
+                parts.append(f"to {dest}")
+            else:
+                parts.append(f"at {dest}")
+        if arm == "both":
+            parts.append("with both arms")
+        elif arm in {"left", "right"}:
+            parts.append(f"with {arm} arm")
+        if grasp:
+            parts.append(f"using {grasp} grip")
+        return " ".join(parts)
+
+    # ------------------------------------------------------------------
+    # Structured 5-axis task augmentation (EgoMimic-style taxonomy)
+    # ------------------------------------------------------------------
+
+    def _generate_task_aug_by_axes(self, base_task: str, axes_cfg: Any) -> list[str]:
+        """One VLM call → variants along the 5-axis taxonomy.
+
+        Variants are flattened into one list in fixed axis order
+        (synonym_paraphrase, omit_arm, omit_orientation, omit_grasp_method,
+        combined_omissions). Entries are stripped of whitespace and outer
+        quotes; empties, exact duplicates (within or across axes), non-list
+        axes and non-string items are dropped. Returns ``[]`` for an empty
+        ``base_task`` or a non-dict VLM reply.
+        """
+        if not base_task:
+            return []
+        prompt = load_prompt("module_1_task_aug_axes").format(
+            base_task=base_task,
+            n_synonym=axes_cfg.synonym_paraphrase,
+            n_omit_arm=axes_cfg.omit_arm,
+            n_omit_orientation=axes_cfg.omit_orientation,
+            n_omit_grasp_method=axes_cfg.omit_grasp_method,
+            n_combined=axes_cfg.combined_omissions,
+        )
+        result = self.vlm.generate_json([self._text_message(prompt)])[0]
+        if not isinstance(result, dict):
+            return []
+        ordered_axes = (
+            "synonym_paraphrase",
+            "omit_arm",
+            "omit_orientation",
+            "omit_grasp_method",
+            "combined_omissions",
+        )
+        flat: list[str] = []
+        seen: set[str] = set()
+        for axis in ordered_axes:
+            entries = result.get(axis)
+            if not isinstance(entries, list):
+                continue
+            for item in entries:
+                if not isinstance(item, str):
+                    continue
+                key = item.strip().strip('"').strip("'")
+                if not key or key in seen:
+                    continue
+                seen.add(key)
+                flat.append(key)
+        return flat
+
     def _episode_video_block(self, record: EpisodeRecord) -> list[dict[str, Any]]:
         """Same video block ``_generate_subtasks`` builds — extracted helper."""
         if not record.frame_timestamps:
diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_action_record.txt b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_action_record.txt
new file mode 100644
index 000000000..1bd127048
--- /dev/null
+++ b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_action_record.txt
@@ -0,0 +1,64 @@
+You are extracting a structured action record from a subtask span of a
+teleoperated robot demonstration. This is Phase 1a of a two-step
+process: you extract a typed record; a deterministic template then
+renders it back to canonical subtask text. Your job is the PERCEPTION
+step — not the language step.
+
+The user originally asked: "{episode_task}"
+The subtask span is:        "{subtask_text}"
+Span time window:           [{start_time:.2f}s, {end_time:.2f}s]
+                            ({duration:.2f}s of robot activity)
+
+You are shown {n_frames} frames sampled uniformly from the subtask
+window. Fill in a structured record describing the action that takes
+place between the first and last frame.
+
+Hard rules:
+- Use ONLY information visible in the frames. Do not infer details from
+  outside the span. Do not extrapolate from the original task wording.
+- Use canonical object names from the original task VERBATIM. Never
+  introduce synonyms: if the task says "cube", the record says "cube",
+  never "block" / "object" / "item".
+- For non-applicable fields, use ``null`` (not "n/a", not "none", not
+  an empty string).
+- For ``verb`` and ``grasp_type``, pick EXACTLY one value from the
+  vocabulary below. Never invent a new one.
+
+Field schema:
+
+  verb (required) — the imperative verb of the action. Vocabulary:
+    {verb_vocabulary}
+
+  object (required) — the manipulated object. Use the canonical noun
+    from the original task above.
+
+  arm — which arm performs the action. One of:
+    "left" | "right" | "both" | null
+    Use ``null`` when the source robot is single-arm or when the arm
+    is genuinely not visible in the frames.
+
+  grasp_type — which grip the gripper uses on contact. One of:
+    {grasp_vocabulary} | null
+    Use ``null`` when there is no contact in this span (e.g. a pure
+    ``move`` / ``reach`` subtask) or the grip is genuinely unclear.
+
+  destination — the target location for actions like ``place``,
+    ``move``, ``insert``, ``pour``. Use canonical names from the
+    original task. Use ``null`` for in-place actions (``press``,
+    ``turn``, ``grasp``, ``release``).
+
+  mistake — a brief one-clause description of any visible failure or
+    recovery during the span (e.g. "dropped the cube and re-grasped",
+    "missed the target on first attempt"). Use ``null`` when the span
+    completes cleanly with no visible recovery.
+
+Output strictly valid JSON of shape:
+
+  {{
+    "verb": "<one of vocabulary>",
+    "object": "<canonical noun>",
+    "arm": "left" | "right" | "both" | null,
+    "grasp_type": "<one of vocabulary>" | null,
+    "destination": "<canonical noun>" | null,
+    "mistake": "<short description>" | null
+  }}
diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_task_aug_axes.txt b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_task_aug_axes.txt
new file mode 100644
index 000000000..d8cd13104
--- /dev/null
+++ b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_task_aug_axes.txt
@@ -0,0 +1,60 @@
+You are generating structured augmentations of a robot task instruction
+for training a language-conditioned policy. Unlike free-form rephrasing,
+your variants follow a NAMED 5-axis taxonomy — each axis omits or varies
+a specific element of the task while preserving its meaning.
+
+Original task: "{base_task}"
+
+Produce variants along five named axes. Each axis has a target count.
+The whole batch should expose the policy to maximum linguistic diversity
+WITHOUT changing what the robot is supposed to do.
+
+Axes and target counts:
+
+  synonym_paraphrase ({n_synonym}):
+    Different wording / verbs / sentence structure. ALL information
+    from the original task is preserved — same object, same arm
+    specification if present, same orientation if present, same grasp
+    if present.
+
+  omit_arm ({n_omit_arm}):
+    Drop the left/right/both arm specification from the task. Skip
+    entirely (emit 0 entries) if the original task does NOT mention an
+    arm. Do not invent an arm specification just to omit it.
+
+  omit_orientation ({n_omit_orientation}):
+    Drop orientation cues (upright, sideways, facing the user,
+    long-edge-first, etc.). Skip entirely if no orientation cue is
+    present in the original task.
+
+  omit_grasp_method ({n_omit_grasp_method}):
+    Drop the grip / grasp method specification (pinch, wrap, hold by
+    the rim, etc.). Skip entirely if no grasp method is mentioned.
+
+  combined_omissions ({n_combined}):
+    Combine TWO of the above omissions simultaneously (e.g. drop both
+    arm and orientation). Skip entirely if fewer than two of (arm,
+    orientation, grasp_method) appear in the original task.
+
+Hard rules:
+- Each variant MUST preserve the core action and the target object.
+  Do not change which object is involved, the destination, or the
+  high-level action.
+- Each variant is plain prose, no markdown, no quotes, no list numbers.
+- Each variant must be DISTINCT from every other variant in the entire
+  output, both within and across axes. Near-duplicates are not allowed.
+- If an axis cannot reach its target count because the original task
+  lacks the omittable element, emit fewer entries — do NOT pad the
+  axis with paraphrases that belong to a different axis.
+- Variants should not all start with verbs — vary sentence structure
+  (some imperative, some polite request, some question).
+
+Output strictly valid JSON of shape:
+
+  {{
+    "synonym_paraphrase": ["<v1>", "<v2>", ...],
+    "omit_arm": ["<v1>", "<v2>", ...],
+    "omit_orientation": ["<v1>", ...],
+    "omit_grasp_method": ["<v1>", ...],
+    "combined_omissions": ["<v1>", ...]
+  }}
diff --git a/src/lerobot/datasets/language.py b/src/lerobot/datasets/language.py
index 124c25221..aaca34e23 100644
--- a/src/lerobot/datasets/language.py
+++ b/src/lerobot/datasets/language.py
@@ -46,7 +46,7 @@ CORE_STYLES = {
 EXTENDED_STYLES: set[str] = set()
 STYLE_REGISTRY = CORE_STYLES | EXTENDED_STYLES
 
-PERSISTENT_STYLES = {"subtask", "plan", "memory", "motion", "task_aug"}
+PERSISTENT_STYLES = {"subtask", "plan", "memory", "motion", "task_aug", "action_record"}
 EVENT_ONLY_STYLES = {"interjection", "vqa", "trace"}
 
 # Styles whose ``content`` is grounded in a specific camera view. Rows of these

From 5dbf0fac5f96ecfd13edddff6346d6ddbf4ce5a0 Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Tue, 2 Jun 2026 11:48:05 +0200
Subject: [PATCH 09/45] annotations(steerable): remove Phase 0 canonical
 vocabulary discovery

Drops the optional Phase 0 vocabulary-discovery feature entirely.
With the new structured action records (Phase 1a + 1b) providing
cross-episode consistency via the deterministic template renderer,
the older vocabulary-constraint path is redundant and adds a second
constraint mechanism that wasn't well-validated in practice.

Removed:
  * src/lerobot/annotations/steerable_pipeline/vocabulary.py
    (Vocabulary dataclass + VocabularyDiscoveryModule + load_/
    save_vocabulary helpers; canonical_vocabulary.json on-disk format)
  * src/lerobot/annotations/steerable_pipeline/prompts/module_0_vocabulary.txt
    (Phase 0 VLM prompt)
  * tests/annotations/test_vocabulary.py

Pruned wiring across:
  * config.py: VocabularyConfig dataclass + AnnotationPipelineConfig.
    vocabulary field
  * executor.py: vocabulary attribute on Executor + _run_vocabulary_
    phase method + Phase 0 phases.append call in run()
  * modules/plan_subtasks_memory.py: Vocabulary import + vocabulary
    attribute + _subtask_vocabulary_block / _memory_vocabulary_block
    helpers + _canonicalize_subtask / _normalize / _invalid_subtasks
    / _build_subtask_retry_message methods + vocabulary-gated retry
    path in _generate_subtasks + empty-episode warning + _NORMALIZE_
    STRIP_TOKENS constant
  * prompts/module_1_subtasks.txt: {vocabulary_block} placeholder
  * prompts/module_1_memory.txt: {vocabulary_block} placeholder
  * __init__.py: Vocabulary / VocabularyDiscoveryModule / load_
    vocabulary / save_vocabulary / vocabulary_path / VOCABULARY_
    FILENAME re-exports
  * scripts/lerobot_annotate.py: VocabularyDiscoveryModule import +
    instantiation + executor argument
  * examples/annotations/run_hf_job.py: --vocabulary.enabled=false
    flag + docstring references + inline phase-0 comment

The original free-form rephrasings path stays (PlanConfig.
n_task_rephrasings still works when task_aug_axes.enabled=False).
Action records remain the preferred mechanism for cross-episode
subtask consistency.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 examples/annotations/run_hf_job.py            |  16 +-
 .../steerable_pipeline/__init__.py            |  14 -
 .../annotations/steerable_pipeline/config.py  |  36 --
 .../steerable_pipeline/executor.py            |  61 ---
 .../modules/plan_subtasks_memory.py           | 162 -------
 .../prompts/module_0_vocabulary.txt           |  53 ---
 .../prompts/module_1_memory.txt               |   2 +-
 .../prompts/module_1_subtasks.txt             |   2 +-
 .../steerable_pipeline/vocabulary.py          | 222 ----------
 src/lerobot/scripts/lerobot_annotate.py       |   5 -
 tests/annotations/test_vocabulary.py          | 412 ------------------
 11 files changed, 4 insertions(+), 981 deletions(-)
 delete mode 100644 src/lerobot/annotations/steerable_pipeline/prompts/module_0_vocabulary.txt
 delete mode 100644 src/lerobot/annotations/steerable_pipeline/vocabulary.py
 delete mode 100644 tests/annotations/test_vocabulary.py

diff --git a/examples/annotations/run_hf_job.py b/examples/annotations/run_hf_job.py
index c8219d9e4..dcc9435ce 100644
--- a/examples/annotations/run_hf_job.py
+++ b/examples/annotations/run_hf_job.py
@@ -6,15 +6,11 @@ Spawns one ``h200x2`` job that:
   1. installs this branch of ``lerobot`` plus the annotation extras,
   2. boots two vllm servers (one per GPU) with Qwen3.6-35B-A3B-FP8,
   3. runs the plan / interjections / vqa modules across the dataset
-     in free-form mode (phase 0 canonical-vocabulary discovery is
-     disabled — each episode generates its own subtasks + memory),
+     in free-form mode (each episode generates its own subtasks +
+     memory),
   4. uploads the annotated dataset to ``--dest_repo_id`` (when set)
      or back to ``--repo_id``.
 
-Re-enable phase 0 with ``--vocabulary.enabled=true`` (optionally
-``--vocabulary.sample_episodes=N``) when the dataset is homogeneous
-enough to share one subtask + memory vocabulary across all episodes.
-
 Usage:
 
     HF_TOKEN=hf_... uv run python examples/annotations/run_hf_job.py
@@ -57,14 +53,6 @@ CMD = (
     "--executor.episode_parallelism=16 "
     "--vlm.chat_template_kwargs='{\"enable_thinking\": false}' "
     "--vlm.camera_key=observation.images.wrist "
-    # Phase 0 — canonical vocabulary discovery DISABLED by default.
-    # Heterogeneous datasets (different tasks/scenes across episodes)
-    # don't share a single small subtask + memory vocabulary, so each
-    # episode generates its subtasks + memory free-form. Flip to
-    # ``--vocabulary.enabled=true`` (optionally ``--vocabulary.sample_episodes=N``)
-    # for homogeneous datasets where a shared canonical vocabulary
-    # helps the downstream policy.
-    "--vocabulary.enabled=false "
     # Phase 1 — plan module (subtasks + plan + memory + task_aug).
     "--plan.frames_per_second=1.0 "
     "--plan.use_video_url=true "
diff --git a/src/lerobot/annotations/steerable_pipeline/__init__.py b/src/lerobot/annotations/steerable_pipeline/__init__.py
index 02d819604..a8da5e05e 100644
--- a/src/lerobot/annotations/steerable_pipeline/__init__.py
+++ b/src/lerobot/annotations/steerable_pipeline/__init__.py
@@ -26,25 +26,11 @@ outputs are staged per-episode before a final parquet rewrite:
 
 from .config import AnnotationPipelineConfig
 from .validator import StagingValidator, ValidationReport
-from .vocabulary import (
-    VOCABULARY_FILENAME,
-    Vocabulary,
-    VocabularyDiscoveryModule,
-    load_vocabulary,
-    save_vocabulary,
-    vocabulary_path,
-)
 from .writer import LanguageColumnsWriter
 
 __all__ = [
-    "VOCABULARY_FILENAME",
     "AnnotationPipelineConfig",
     "LanguageColumnsWriter",
     "StagingValidator",
     "ValidationReport",
-    "Vocabulary",
-    "VocabularyDiscoveryModule",
-    "load_vocabulary",
-    "save_vocabulary",
-    "vocabulary_path",
 ]
diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py
index cc6402f08..c60e58fee 100644
--- a/src/lerobot/annotations/steerable_pipeline/config.py
+++ b/src/lerobot/annotations/steerable_pipeline/config.py
@@ -21,41 +21,6 @@ from pathlib import Path
 from typing import Any
 
 
-@dataclass
-class VocabularyConfig:
-    """Phase 0 — dataset-level canonical vocabulary discovery.
-
-    Watches the first ``sample_episodes`` episode videos and asks the VLM
-    to derive a small canonical vocabulary (subtask labels + memory
-    milestones) that every episode in the dataset will reuse. The VLM
-    decides the count itself from what it sees in the clips — short
-    pick-and-place demos get ~6 labels, longer multi-step recipes more.
-    The output lands at ``meta/canonical_vocabulary.json`` and feeds
-    phase 1's subtask + memory generation as both a prompt-side
-    constraint and a post-VLM validation gate.
-
-    Why this exists: free-form LLM rephrasing per episode produces near-
-    unique subtask strings, which makes the downstream low-level policy's
-    conditioning effectively noise — at inference the policy generates a
-    *new* paraphrase the action expert has never seen and produces tiny
-    cautious actions. Forcing every episode onto the same small set of
-    canonical strings gives the action expert dense supervision per
-    string and a small target distribution to learn against.
-
-    Set ``enabled=False`` to fall back to free-form generation (original
-    behaviour). ``reuse_existing=True`` keeps a hand-edited vocabulary
-    file from being clobbered on re-runs.
-    """
-
-    enabled: bool = True
-    sample_episodes: int = 3
-    max_video_frames_per_episode: int = 32
-    # When True (default), an existing meta/canonical_vocabulary.json is
-    # loaded as-is and no VLM call is made — lets operators hand-edit the
-    # file. Set False to always rediscover from the sample episodes.
-    reuse_existing: bool = True
-
-
 @dataclass
 class PlanConfig:
     """``plan`` module: plan + subtasks + memory + task augmentation.
@@ -351,7 +316,6 @@ class AnnotationPipelineConfig:
 
     seed: int = 1729
 
-    vocabulary: VocabularyConfig = field(default_factory=VocabularyConfig)
     plan: PlanConfig = field(default_factory=PlanConfig)
     interjections: InterjectionsConfig = field(default_factory=InterjectionsConfig)
     vqa: VqaConfig = field(default_factory=VqaConfig)
diff --git a/src/lerobot/annotations/steerable_pipeline/executor.py b/src/lerobot/annotations/steerable_pipeline/executor.py
index 5c725fa65..355e25460 100644
--- a/src/lerobot/annotations/steerable_pipeline/executor.py
+++ b/src/lerobot/annotations/steerable_pipeline/executor.py
@@ -94,7 +94,6 @@ class Executor:
     vqa: Any  # GeneralVqaModule
     writer: LanguageColumnsWriter
     validator: StagingValidator
-    vocabulary: Any = None  # VocabularyDiscoveryModule | None
 
     def run(self, root: Path) -> PipelineRunSummary:
         records = list(iter_episodes(root, only_episodes=self.config.only_episodes))
@@ -109,10 +108,6 @@ class Executor:
 
         phases: list[PhaseResult] = []
 
-        # Phase 0: vocabulary discovery. Mutates ``self.plan.vocabulary``
-        # so subsequent per-episode plan calls see the canonical labels.
-        phases.append(self._run_vocabulary_phase(records, root))
-
         # Phase 1: ``plan`` module (plan + subtasks + memory)
         phases.append(self._run_module_phase("plan", records, staging_dir, self.plan))
         # Phase 2: ``interjections`` module (interjections + speech). It
@@ -183,62 +178,6 @@ class Executor:
                 flush=True,
             )
 
-    def _run_vocabulary_phase(
-        self, records: list[EpisodeRecord], root: Path
-    ) -> PhaseResult:
-        """Discover (or load) the canonical vocabulary, wire it into ``self.plan``.
-
-        Returns a ``PhaseResult`` whose ``episodes_processed`` is the number
-        of sample episodes consulted (0 when disabled or no VLM call was
-        needed); ``episodes_skipped`` is always ``0`` because vocabulary is
-        a once-per-dataset artifact, not a per-episode product.
-        """
-        from .vocabulary import load_vocabulary, save_vocabulary  # noqa: PLC0415
-
-        if self.vocabulary is None or not getattr(self.vocabulary, "enabled", False):
-            print(
-                "[annotate] phase=vocabulary skipped (module disabled or unset)",
-                flush=True,
-            )
-            return PhaseResult(name="vocabulary", episodes_processed=0, episodes_skipped=0)
-
-        existing = load_vocabulary(root)
-        if existing is not None and self.config.vocabulary.reuse_existing:
-            print(
-                f"[annotate] phase=vocabulary reusing {root / 'meta' / 'canonical_vocabulary.json'} "
-                f"({len(existing.subtasks)} subtask labels, "
-                f"{len(existing.memory_milestones)} memory milestones)",
-                flush=True,
-            )
-            self.plan.vocabulary = existing
-            return PhaseResult(name="vocabulary", episodes_processed=0, episodes_skipped=0)
-
-        sample_n = max(1, min(int(self.config.vocabulary.sample_episodes), len(records)))
-        print(
-            f"[annotate] phase=vocabulary discovering from {sample_n} sample episode(s)...",
-            flush=True,
-        )
-        t0 = time.time()
-        vocab = self.vocabulary.discover(records[:sample_n], existing=existing)
-        if vocab is None:
-            print(
-                "[annotate] phase=vocabulary returned no vocabulary — "
-                "plan module will fall back to free-form generation",
-                flush=True,
-            )
-            return PhaseResult(name="vocabulary", episodes_processed=0, episodes_skipped=0)
-
-        save_path = save_vocabulary(root, vocab)
-        print(
-            f"[annotate] phase=vocabulary wrote {save_path} "
-            f"({len(vocab.subtasks)} subtask labels, "
-            f"{len(vocab.memory_milestones)} memory milestones) in "
-            f"{time.time() - t0:.1f}s",
-            flush=True,
-        )
-        self.plan.vocabulary = vocab
-        return PhaseResult(name="vocabulary", episodes_processed=sample_n, episodes_skipped=0)
-
     def _run_module_phase(
         self,
         name: str,
diff --git a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
index 46d678fd6..f58ec2c91 100644
--- a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
+++ b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
@@ -37,7 +37,6 @@ from ..prompts import load as load_prompt
 from ..reader import EpisodeRecord, reconstruct_subtask_spans, snap_to_frame
 from ..staging import EpisodeStaging
 from ..vlm_client import VlmClient
-from ..vocabulary import Vocabulary
 
 logger = logging.getLogger(__name__)
 
@@ -60,11 +59,6 @@ class PlanSubtasksMemoryModule:
     vlm: VlmClient
     config: PlanConfig
     frame_provider: FrameProvider = field(default_factory=null_provider)
-    vocabulary: Vocabulary | None = None
-    """When set, the module constrains subtask + memory generation to the
-    canonical strings in ``vocabulary``. Phase 0 (vocabulary discovery)
-    populates this once per dataset; ``None`` falls back to free-form
-    generation (original behaviour)."""
 
     @property
     def enabled(self) -> bool:
@@ -575,28 +569,9 @@ class PlanSubtasksMemoryModule:
             min_subtask_seconds=self.config.min_subtask_seconds,
             max_steps=self.config.plan_max_steps,
             episode_duration=f"{episode_duration:.3f}",
-            vocabulary_block=self._subtask_vocabulary_block(),
         )
         messages = self._video_message(record, prompt)
         spans = self._vlm_field(messages, "subtasks")
-        # When a vocabulary is in force, do a single targeted retry if
-        # any returned subtask is off-vocab — strict exact-match only,
-        # no fuzzy snapping. The retry includes the offending strings
-        # and the full canonical list so the VLM can correct itself.
-        if self.vocabulary is not None and self.vocabulary.subtasks and spans:
-            invalid = self._invalid_subtasks(spans)
-            if invalid:
-                logger.info(
-                    "episode %d: VLM emitted %d off-vocab subtask(s) (%s); retrying once",
-                    record.episode_index,
-                    len(invalid),
-                    invalid,
-                )
-                retry_msg = self._build_subtask_retry_message(messages, invalid)
-                retried = self._vlm_field(retry_msg, "subtasks")
-                if retried:
-                    spans = retried
-
         if not spans:
             return []
         # clamp to [t0, t_last] and sort
@@ -614,21 +589,11 @@ class PlanSubtasksMemoryModule:
             end = max(t0, min(end, t_last))
             if end < start:
                 start, end = end, start
-            if not text:
-                continue
-            text = self._canonicalize_subtask(text)
             if not text:
                 continue
             cleaned.append({"text": text, "start": start, "end": end})
         cleaned.sort(key=lambda s: s["start"])
         cleaned = self._dedupe_starts_to_distinct_frames(cleaned, record)
-        if self.vocabulary is not None and self.vocabulary.subtasks and not cleaned:
-            logger.warning(
-                "episode %d: every VLM subtask was off-vocab even after retry — "
-                "episode left empty (extend meta/canonical_vocabulary.json to "
-                "cover the missing phase)",
-                record.episode_index,
-            )
         return cleaned
 
     @staticmethod
@@ -679,132 +644,6 @@ class PlanSubtasksMemoryModule:
             out.append(new_span)
         return out
 
-    # ------------------------------------------------------------------
-    # Canonical-vocabulary helpers
-    # ------------------------------------------------------------------
-
-    def _subtask_vocabulary_block(self) -> str:
-        """Bullet-list of canonical subtasks the VLM must pick from.
-
-        Returns an empty string when no vocabulary is configured —
-        ``module_1_subtasks.txt`` then falls back to its free-form
-        rules (original behaviour).
-        """
-        if self.vocabulary is None or not self.vocabulary.subtasks:
-            return ""
-        bullets = "\n".join(f"- {s}" for s in self.vocabulary.subtasks)
-        return (
-            "You MUST choose each subtask label verbatim from this canonical "
-            "vocabulary — pick the closest match for each phase of the demo, "
-            "and reuse the SAME string every time that phase recurs. The "
-            "low-level policy is conditioned on these exact strings; any "
-            "novel paraphrase you invent will make its conditioning OOD.\n"
-            "Canonical subtask labels:\n"
-            f"{bullets}\n\n"
-        )
-
-    def _memory_vocabulary_block(self) -> str:
-        """Bullet-list of canonical memory milestones the VLM must pick from."""
-        if self.vocabulary is None or not self.vocabulary.memory_milestones:
-            return ""
-        bullets = "\n".join(f"- {m}" for m in self.vocabulary.memory_milestones)
-        return (
-            "Compose the memory by picking ONLY from this canonical milestone "
-            "list — append a milestone (or rewrite the running memory to "
-            "compress past ones) using these exact phrases. Do not invent new "
-            "wording: every paraphrase weakens the downstream conditioning.\n"
-            "Canonical memory milestones:\n"
-            f"{bullets}\n\n"
-        )
-
-    _NORMALIZE_STRIP_TOKENS: frozenset[str] = frozenset({"the", "a", "an"})
-
-    def _canonicalize_subtask(self, text: str) -> str:
-        """Validate ``text`` against the canonical vocabulary; no fuzzy snap.
-
-        Without a vocabulary, the original text passes through. With a
-        vocabulary, accept the span only if its normalised form (lower-
-        cased, articles stripped, whitespace collapsed) matches a
-        canonical entry exactly — the canonical wording is returned so
-        the supervised string is byte-identical across episodes.
-
-        Off-vocab spans are dropped (empty string). Upstream
-        ``_generate_subtasks`` triggers a targeted retry before reaching
-        the drop path; this function never snaps or warps a span into
-        a different label.
-        """
-        if self.vocabulary is None or not self.vocabulary.subtasks:
-            return text.strip()
-        normalised = self._normalize(text)
-        if not normalised:
-            return ""
-        for candidate in self.vocabulary.subtasks:
-            if self._normalize(candidate) == normalised:
-                return candidate
-        return ""
-
-    @classmethod
-    def _normalize(cls, text: str) -> str:
-        """Lowercase, strip articles, collapse whitespace, drop punctuation."""
-        words = [
-            w.strip(".,:;\"'!?()")
-            for w in text.lower().replace(",", " ").split()
-        ]
-        return " ".join(w for w in words if w and w not in cls._NORMALIZE_STRIP_TOKENS)
-
-    def _invalid_subtasks(self, spans: list[dict[str, Any]]) -> list[str]:
-        """Return the unique off-vocab subtask strings the VLM produced."""
-        seen: list[str] = []
-        for span in spans:
-            text = str((span or {}).get("text") or "").strip()
-            if not text:
-                continue
-            if self._canonicalize_subtask(text):
-                continue
-            if text not in seen:
-                seen.append(text)
-        return seen
-
-    def _build_subtask_retry_message(
-        self, original_messages: list[dict[str, Any]], invalid: list[str]
-    ) -> list[dict[str, Any]]:
-        """Compose a one-shot correction prompt naming the off-vocab strings."""
-        assert self.vocabulary is not None
-        canonical = "\n".join(f"- {s}" for s in self.vocabulary.subtasks)
-        invalid_list = "\n".join(f"- {s!r}" for s in invalid)
-        correction = (
-            "Your previous response included subtask labels that are NOT in "
-            "the canonical vocabulary:\n"
-            f"{invalid_list}\n\n"
-            "Re-emit the same segmentation (same number of spans, same start/end "
-            "timestamps where they were valid) but replace every off-vocab "
-            "label with the EXACT canonical string for that phase, copied "
-            "verbatim from this list:\n"
-            f"{canonical}\n\n"
-            "Strict rules:\n"
-            "- Output strings must be byte-for-byte identical to entries above.\n"
-            "- No articles, no adverbs, no extra words.\n"
-            "- If a phase truly has no canonical match, omit that span entirely.\n"
-            "Return the same JSON shape as before."
-        )
-        # Append the correction as an additional user turn; the model
-        # sees the original prompt + its prior output is implied by the
-        # conversation context (the VLM client is stateless, so we
-        # re-send the original content plus this correction).
-        retry_messages = [
-            {
-                "role": m.get("role", "user"),
-                "content": (
-                    m.get("content")
-                    if isinstance(m.get("content"), str)
-                    else list(m.get("content") or [])
-                ),
-            }
-            for m in original_messages
-        ]
-        retry_messages.append({"role": "user", "content": correction})
-        return retry_messages
-
     def _generate_plan(
         self,
         record: EpisodeRecord,  # noqa: ARG002  (kept for signature stability)
@@ -866,7 +705,6 @@ class PlanSubtasksMemoryModule:
             prior_memory=prior_memory or "(none)",
             completed_subtask=completed,
             remaining_subtasks=", ".join(remaining) if remaining else "(none)",
-            vocabulary_block=self._memory_vocabulary_block(),
         )
         memory = self._vlm_field(self._text_message(prompt), "memory")
         return memory.strip() if isinstance(memory, str) else ""
diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_0_vocabulary.txt b/src/lerobot/annotations/steerable_pipeline/prompts/module_0_vocabulary.txt
deleted file mode 100644
index 00c29be4e..000000000
--- a/src/lerobot/annotations/steerable_pipeline/prompts/module_0_vocabulary.txt
+++ /dev/null
@@ -1,53 +0,0 @@
-You are inspecting {n_episodes} sample episode video(s) from a teleoperated
-robot dataset. Every episode in the dataset performs the SAME task; the
-user originally asked: "{episode_task}".
-
-Watch all the clips and produce a SHORT canonical vocabulary that every
-episode in this dataset will reuse. The downstream low-level policy is
-conditioned on these strings — duplicate phrasings (e.g. "grasp blue
-cube" vs "pick up the blue cube") would destroy the conditioning, so
-pick one wording per concept and reuse it everywhere.
-
-Decide how many entries each list needs YOURSELF based on what you see —
-the smallest set that still covers every recurring phase in the demos.
-A simple two-object pick-and-place might need ~6 subtask labels and 2
-memory milestones; a long multi-step recipe needs more. Err on the side
-of FEWER — extra entries that don't recur across episodes weaken the
-conditioning.
-
-You output two lists:
-
-1. `subtasks`: imperative, telegraphic commands the robot can execute.
-   - Verb-first. Drop articles, adverbs, qualifiers.
-   - Consistent object nouns (if the task says "cube", every subtask says
-     "cube" — never "block" / "object").
-   - Atomic — one skill per subtask (gripper-open events, contact, regrasps,
-     transitions all become cut points).
-   - Each label must recur across the demos. If you see a motion only
-     once across all sample clips, it probably isn't a canonical phase.
-   - Good: "move to blue cube", "grasp blue cube", "lift blue cube",
-     "place blue cube in box", "release blue cube", "retract arm".
-   - Bad: "the robot arm moves towards the blue cube" (third person,
-     too long), "carefully pick up the cube" (adverb, article),
-     "carrying the yellow cube over the green basket" (gerund — should
-     be imperative "transport yellow cube to green basket").
-
-2. `memory_milestones`: first-person past-tense sentences the running
-   memory composes from. Each subtask phase that produces a lasting
-   change should have a milestone; transient motions (move, retract)
-   should NOT.
-   - First person, past tense. Start with "I".
-   - One sentence. Functional outcome only — no grasp / motion detail.
-   - Good: "I picked up the blue cube.", "I placed the blue cube in
-     the green box.", "I wiped the counter."
-   - Bad: "The robot arm grasped the blue cube." (third person),
-     "I carefully grasped the blue cube with the parallel gripper."
-     (irrelevant detail), "I moved towards the blue cube." (transient
-     motion — should be omitted, not memorialised).
-
-Output strictly valid JSON of shape:
-
-  {{
-    "subtasks": ["<verb phrase>", ...],
-    "memory_milestones": ["I <past-tense sentence>.", ...]
-  }}
diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_memory.txt b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_memory.txt
index d066b9f73..b5278368b 100644
--- a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_memory.txt
+++ b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_memory.txt
@@ -13,7 +13,7 @@ Previous memory: {prior_memory}
 Just-completed subtask: "{completed_subtask}"
 Remaining subtasks (for relevance judgement only): {remaining_subtasks}
 
-{vocabulary_block}Write the memory as a short FIRST-PERSON, PAST-TENSE narrative of what the
+Write the memory as a short FIRST-PERSON, PAST-TENSE narrative of what the
 robot has accomplished so far — the running story it would tell itself.
 
 Authoring rules:
diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt
index 9314282be..a49096682 100644
--- a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt
+++ b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt
@@ -6,7 +6,7 @@ You are shown the entire demonstration as a single video. Watch the
 whole clip, then segment it into a list of consecutive atomic subtasks
 the robot performs.
 
-{vocabulary_block}Authoring rules — Hi Robot atom granularity, pi0.7-style short prompts:
+Authoring rules — Hi Robot atom granularity, pi0.7-style short prompts:
 
 - Each subtask = one COMPOSITE atomic skill the low-level policy can
   execute end-to-end. A "skill" bundles its own approach motion with
diff --git a/src/lerobot/annotations/steerable_pipeline/vocabulary.py b/src/lerobot/annotations/steerable_pipeline/vocabulary.py
deleted file mode 100644
index 121cef849..000000000
--- a/src/lerobot/annotations/steerable_pipeline/vocabulary.py
+++ /dev/null
@@ -1,222 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Dataset-level canonical vocabulary discovery (Phase 0).
-
-The downstream consumer of these annotations is a low-level action expert
-conditioned on the ``subtask`` string. Free-form per-episode LLM rephrasing
-gives near-unique strings per occurrence, which collapses the action
-expert's conditioning to noise and makes runtime subtask-paraphrase drift
-catastrophic. The Hi-Robot / π0.6-MEM recipe ships a small canonical
-vocabulary per environment (~10 strings) that every episode reuses; this
-module derives that vocabulary automatically from the first few episode
-videos and persists it next to the dataset.
-
-Pipeline-level flow:
-
-    Phase 0 (here): watch N sample episodes → produce vocabulary.json
-    Phase 1 (plan module): reuse vocabulary on every episode, both as
-                           prompt-side constraint *and* post-VLM validation
-
-The vocabulary is JSON, lives at ``<root>/meta/canonical_vocabulary.json``,
-and is human-inspectable / hand-editable — if the discovered set is wrong,
-operators edit the file and re-run the pipeline without phase 0.
-"""
-
-from __future__ import annotations
-
-import json
-import logging
-from collections.abc import Sequence
-from dataclasses import dataclass, field
-from pathlib import Path
-from typing import Any
-
-from .config import VocabularyConfig
-from .frames import FrameProvider, null_provider, to_video_block
-from .prompts import load as load_prompt
-from .reader import EpisodeRecord
-from .vlm_client import VlmClient
-
-logger = logging.getLogger(__name__)
-
-VOCABULARY_FILENAME = "canonical_vocabulary.json"
-
-
-@dataclass
-class Vocabulary:
-    """Canonical phrasings shared across every episode of one dataset.
-
-    Both lists are strict: per-episode subtask + memory generation pick
-    from these strings only; the downstream policy then has a small,
-    repeatable target distribution to learn instead of thousands of
-    LLM paraphrases.
-    """
-
-    subtasks: tuple[str, ...]
-    """Imperative subtask labels — what the low-level policy is conditioned
-    on. Verb-first, telegraphic, consistent object nouns. Example:
-    ``("move to blue cube", "grasp blue cube", "lift blue cube",
-       "place blue cube in box", "retract arm")``.
-    """
-
-    memory_milestones: tuple[str, ...]
-    """First-person past-tense milestone sentences — building blocks for
-    the running memory string. Example: ``("I picked up the blue cube.",
-    "I placed the blue cube in the green box.")``. Each milestone maps
-    1:1 onto a completed subtask phase; ``memory_at_step_k`` is the
-    concatenation of milestones for completed phases.
-    """
-
-    def to_json(self) -> dict[str, list[str]]:
-        return {
-            "subtasks": list(self.subtasks),
-            "memory_milestones": list(self.memory_milestones),
-        }
-
-    @classmethod
-    def from_json(cls, payload: dict[str, Any]) -> Vocabulary:
-        subtasks = tuple(
-            str(s).strip() for s in (payload.get("subtasks") or []) if str(s).strip()
-        )
-        memory_milestones = tuple(
-            str(s).strip() for s in (payload.get("memory_milestones") or []) if str(s).strip()
-        )
-        return cls(subtasks=subtasks, memory_milestones=memory_milestones)
-
-    def is_empty(self) -> bool:
-        return not self.subtasks and not self.memory_milestones
-
-
-def vocabulary_path(root: Path) -> Path:
-    """Return the canonical on-disk location for the vocabulary file."""
-    return root / "meta" / VOCABULARY_FILENAME
-
-
-def load_vocabulary(root: Path) -> Vocabulary | None:
-    """Read ``<root>/meta/canonical_vocabulary.json`` if present.
-
-    Returns ``None`` when the file does not exist — callers fall back to
-    free-form (unconstrained) subtask + memory generation, preserving the
-    pipeline's behaviour on datasets that never ran phase 0.
-    """
-    path = vocabulary_path(root)
-    if not path.exists():
-        return None
-    try:
-        payload = json.loads(path.read_text(encoding="utf-8"))
-    except (OSError, json.JSONDecodeError) as exc:
-        logger.warning("could not read %s: %s — proceeding without vocabulary", path, exc)
-        return None
-    if not isinstance(payload, dict):
-        logger.warning("%s is not a JSON object — ignoring", path)
-        return None
-    vocab = Vocabulary.from_json(payload)
-    if vocab.is_empty():
-        return None
-    return vocab
-
-
-def save_vocabulary(root: Path, vocab: Vocabulary) -> Path:
-    """Atomically persist ``vocab`` to ``<root>/meta/canonical_vocabulary.json``."""
-    path = vocabulary_path(root)
-    path.parent.mkdir(parents=True, exist_ok=True)
-    tmp = path.with_suffix(path.suffix + ".tmp")
-    tmp.write_text(
-        json.dumps(vocab.to_json(), indent=2, ensure_ascii=False) + "\n",
-        encoding="utf-8",
-    )
-    tmp.replace(path)
-    return path
-
-
-@dataclass
-class VocabularyDiscoveryModule:
-    """Derive a dataset-level canonical vocabulary from sample episodes.
-
-    Phase 0 of the executor: pulls ``config.sample_episodes`` episode
-    videos, packs them into one Qwen-VL multi-video prompt, and asks the
-    model to enumerate the small set of canonical subtask labels +
-    memory milestones that recur across them. The output is persisted
-    to ``meta/canonical_vocabulary.json`` and consumed by phase 1.
-    """
-
-    vlm: VlmClient
-    config: VocabularyConfig
-    frame_provider: FrameProvider = field(default_factory=null_provider)
-
-    @property
-    def enabled(self) -> bool:
-        return self.config.enabled
-
-    def discover(
-        self,
-        records: Sequence[EpisodeRecord],
-        *,
-        existing: Vocabulary | None = None,
-    ) -> Vocabulary | None:
-        """Run vocabulary discovery against the first N sample episodes.
-
-        ``existing`` short-circuits the VLM call when ``config.reuse_existing``
-        is True and an on-disk vocabulary is already present — keeps re-runs
-        cheap and lets operators hand-edit the file without it getting
-        overwritten.
-        """
-        if existing is not None and self.config.reuse_existing:
-            logger.info(
-                "vocabulary: reusing existing (%d subtasks, %d memory milestones)",
-                len(existing.subtasks),
-                len(existing.memory_milestones),
-            )
-            return existing
-
-        sample = list(records[: max(1, int(self.config.sample_episodes))])
-        if not sample:
-            return None
-
-        task_hint = next((r.episode_task for r in sample if r.episode_task), "")
-        prompt = load_prompt("module_0_vocabulary").format(
-            episode_task=task_hint or "(unspecified)",
-            n_episodes=len(sample),
-        )
-        # Pack one video block per sample episode so the VLM sees the
-        # variation across episodes (different starting poses, different
-        # object placements) rather than overfitting to one trajectory.
-        content: list[dict[str, Any]] = []
-        for record in sample:
-            video_frames = self.frame_provider.video_for_episode(
-                record, int(self.config.max_video_frames_per_episode)
-            )
-            if video_frames:
-                content.extend(to_video_block(video_frames))
-        content.append({"type": "text", "text": prompt})
-        messages = [{"role": "user", "content": content}]
-
-        result = self.vlm.generate_json([messages])[0]
-        if not isinstance(result, dict):
-            logger.warning("vocabulary: VLM did not return a JSON object — skipping")
-            return None
-
-        vocab = Vocabulary.from_json(result)
-        if vocab.is_empty():
-            logger.warning("vocabulary: VLM returned an empty vocabulary — skipping")
-            return None
-        logger.info(
-            "vocabulary: discovered %d subtask labels + %d memory milestones from %d episodes",
-            len(vocab.subtasks),
-            len(vocab.memory_milestones),
-            len(sample),
-        )
-        return vocab
diff --git a/src/lerobot/scripts/lerobot_annotate.py b/src/lerobot/scripts/lerobot_annotate.py
index 52309b827..7fee1f052 100644
--- a/src/lerobot/scripts/lerobot_annotate.py
+++ b/src/lerobot/scripts/lerobot_annotate.py
@@ -40,7 +40,6 @@ from lerobot.annotations.steerable_pipeline.modules import (
 )
 from lerobot.annotations.steerable_pipeline.validator import StagingValidator
 from lerobot.annotations.steerable_pipeline.vlm_client import make_vlm_client
-from lerobot.annotations.steerable_pipeline.vocabulary import VocabularyDiscoveryModule
 from lerobot.annotations.steerable_pipeline.writer import LanguageColumnsWriter
 from lerobot.configs import parser
 
@@ -89,9 +88,6 @@ def annotate(cfg: AnnotationPipelineConfig) -> None:
         vlm=vlm, config=cfg.interjections, seed=cfg.seed, frame_provider=frame_provider
     )
     vqa = GeneralVqaModule(vlm=vlm, config=cfg.vqa, seed=cfg.seed, frame_provider=frame_provider)
-    vocabulary = VocabularyDiscoveryModule(
-        vlm=vlm, config=cfg.vocabulary, frame_provider=frame_provider
-    )
     writer = LanguageColumnsWriter()
     validator = StagingValidator(
         dataset_camera_keys=tuple(getattr(frame_provider, "camera_keys", []) or []) or None,
@@ -102,7 +98,6 @@ def annotate(cfg: AnnotationPipelineConfig) -> None:
         plan=plan,
         interjections=interjections,
         vqa=vqa,
-        vocabulary=vocabulary,
         writer=writer,
         validator=validator,
     )
diff --git a/tests/annotations/test_vocabulary.py b/tests/annotations/test_vocabulary.py
deleted file mode 100644
index 7b820834d..000000000
--- a/tests/annotations/test_vocabulary.py
+++ /dev/null
@@ -1,412 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Vocabulary-discovery phase (phase 0) tests."""
-
-from __future__ import annotations
-
-import json
-from pathlib import Path
-
-from lerobot.annotations.steerable_pipeline.config import (
-    PlanConfig,
-    VocabularyConfig,
-)
-from lerobot.annotations.steerable_pipeline.modules import PlanSubtasksMemoryModule
-from lerobot.annotations.steerable_pipeline.reader import iter_episodes
-from lerobot.annotations.steerable_pipeline.staging import EpisodeStaging
-from lerobot.annotations.steerable_pipeline.vocabulary import (
-    Vocabulary,
-    VocabularyDiscoveryModule,
-    load_vocabulary,
-    save_vocabulary,
-    vocabulary_path,
-)
-
-from ._helpers import make_canned_responder
-
-
-_CANONICAL_SUBTASKS = (
-    "grasp blue cube",
-    "place blue cube in box",
-    "retract arm",
-)
-_CANONICAL_MEMORY = (
-    "I picked up the blue cube.",
-    "I placed the blue cube in the box.",
-)
-
-
-# ---------------------------------------------------------------------------
-# Vocabulary dataclass + on-disk round-trip
-# ---------------------------------------------------------------------------
-
-
-def test_vocabulary_roundtrip(tmp_path: Path) -> None:
-    vocab = Vocabulary(
-        subtasks=_CANONICAL_SUBTASKS, memory_milestones=_CANONICAL_MEMORY
-    )
-    save_path = save_vocabulary(tmp_path, vocab)
-    assert save_path == vocabulary_path(tmp_path)
-    assert save_path.exists()
-
-    loaded = load_vocabulary(tmp_path)
-    assert loaded is not None
-    assert loaded.subtasks == _CANONICAL_SUBTASKS
-    assert loaded.memory_milestones == _CANONICAL_MEMORY
-
-
-def test_vocabulary_load_missing_returns_none(tmp_path: Path) -> None:
-    assert load_vocabulary(tmp_path) is None
-
-
-def test_vocabulary_load_malformed_returns_none(tmp_path: Path) -> None:
-    path = vocabulary_path(tmp_path)
-    path.parent.mkdir(parents=True, exist_ok=True)
-    path.write_text("{ not valid json", encoding="utf-8")
-    assert load_vocabulary(tmp_path) is None
-
-
-def test_vocabulary_load_empty_payload_returns_none(tmp_path: Path) -> None:
-    path = vocabulary_path(tmp_path)
-    path.parent.mkdir(parents=True, exist_ok=True)
-    path.write_text(json.dumps({"subtasks": [], "memory_milestones": []}), encoding="utf-8")
-    assert load_vocabulary(tmp_path) is None
-
-
-# ---------------------------------------------------------------------------
-# Discovery module
-# ---------------------------------------------------------------------------
-
-
-def test_vocabulary_discovery_calls_vlm_and_returns_vocab(
-    fixture_dataset_root: Path,
-) -> None:
-    vlm = make_canned_responder(
-        {
-            "canonical vocabulary": {
-                "subtasks": list(_CANONICAL_SUBTASKS),
-                "memory_milestones": list(_CANONICAL_MEMORY),
-            }
-        }
-    )
-    module = VocabularyDiscoveryModule(vlm=vlm, config=VocabularyConfig(sample_episodes=2))
-    records = list(iter_episodes(fixture_dataset_root))
-    vocab = module.discover(records)
-    assert vocab is not None
-    assert vocab.subtasks == _CANONICAL_SUBTASKS
-    assert vocab.memory_milestones == _CANONICAL_MEMORY
-
-
-def test_vocabulary_discovery_reuses_existing(fixture_dataset_root: Path) -> None:
-    """``reuse_existing=True`` short-circuits the VLM call entirely."""
-
-    def _explode(_messages):  # pragma: no cover - must not be called
-        raise AssertionError("VLM should not be invoked when reusing existing vocabulary")
-
-    from lerobot.annotations.steerable_pipeline.vlm_client import StubVlmClient
-
-    vlm = StubVlmClient(responder=_explode)
-    module = VocabularyDiscoveryModule(
-        vlm=vlm, config=VocabularyConfig(reuse_existing=True)
-    )
-    records = list(iter_episodes(fixture_dataset_root))
-    existing = Vocabulary(subtasks=("a", "b"), memory_milestones=("I a.",))
-    vocab = module.discover(records, existing=existing)
-    assert vocab is existing
-
-
-def test_vocabulary_discovery_empty_payload_returns_none(
-    fixture_dataset_root: Path,
-) -> None:
-    vlm = make_canned_responder({"canonical vocabulary": {"subtasks": [], "memory_milestones": []}})
-    module = VocabularyDiscoveryModule(vlm=vlm, config=VocabularyConfig())
-    records = list(iter_episodes(fixture_dataset_root))
-    assert module.discover(records) is None
-
-
-# ---------------------------------------------------------------------------
-# PlanSubtasksMemoryModule consumes the vocabulary
-# ---------------------------------------------------------------------------
-
-
-def test_plan_module_inlines_vocab_into_subtask_prompt(
-    fixture_dataset_root: Path, tmp_path: Path
-) -> None:
-    captured: list[str] = []
-
-    def responder(messages):
-        # Find the last user text block and stash it for inspection.
-        for message in messages:
-            content = message.get("content")
-            if isinstance(content, list):
-                for block in content:
-                    if isinstance(block, dict) and block.get("type") == "text":
-                        captured.append(block.get("text", ""))
-        # Return canned subtasks; pick the first two canonical strings so
-        # the validator accepts them.
-        return {
-            "subtasks": [
-                {"text": "grasp blue cube", "start": 0.0, "end": 0.4},
-                {"text": "place blue cube in box", "start": 0.4, "end": 0.9},
-            ]
-        }
-
-    from lerobot.annotations.steerable_pipeline.vlm_client import StubVlmClient
-
-    vlm = StubVlmClient(responder=responder)
-    vocab = Vocabulary(subtasks=_CANONICAL_SUBTASKS, memory_milestones=_CANONICAL_MEMORY)
-    module = PlanSubtasksMemoryModule(
-        vlm=vlm,
-        config=PlanConfig(n_task_rephrasings=0),
-        vocabulary=vocab,
-    )
-    record = next(iter_episodes(fixture_dataset_root))
-    staging = EpisodeStaging(tmp_path / "stage", record.episode_index)
-    module.run_episode(record, staging)
-    # The subtask prompt (and the memory prompt) carries the canonical
-    # bullet list so the VLM can't paraphrase them away.
-    assert any("Canonical subtask labels:" in t for t in captured)
-    assert any("grasp blue cube" in t for t in captured)
-
-
-def test_plan_module_accepts_article_only_difference(
-    fixture_dataset_root: Path, tmp_path: Path
-) -> None:
-    """Articles like 'the'/'a'/'an' are stripped during validation."""
-    from lerobot.annotations.steerable_pipeline.vlm_client import StubVlmClient
-
-    def responder(_messages):
-        return {
-            "subtasks": [
-                # Same canonical phrase modulo "the" — should be accepted.
-                {"text": "grasp the blue cube", "start": 0.0, "end": 0.4},
-            ]
-        }
-
-    vlm = StubVlmClient(responder=responder)
-    vocab = Vocabulary(subtasks=_CANONICAL_SUBTASKS, memory_milestones=_CANONICAL_MEMORY)
-    module = PlanSubtasksMemoryModule(
-        vlm=vlm,
-        config=PlanConfig(n_task_rephrasings=0),
-        vocabulary=vocab,
-    )
-    record = next(iter_episodes(fixture_dataset_root))
-    staging = EpisodeStaging(tmp_path / "stage", record.episode_index)
-    module.run_episode(record, staging)
-    rows = staging.read("plan")
-    subtask_texts = [r["content"] for r in rows if r["style"] == "subtask"]
-    assert subtask_texts == ["grasp blue cube"]
-
-
-def test_plan_module_retries_when_subtask_off_vocab(
-    fixture_dataset_root: Path, tmp_path: Path
-) -> None:
-    """One-shot retry replaces an off-vocab paraphrase with the canonical form."""
-    from lerobot.annotations.steerable_pipeline.vlm_client import StubVlmClient
-
-    call_count = {"n": 0}
-
-    def responder(messages):
-        call_count["n"] += 1
-        # First call: returns an off-vocab paraphrase.
-        if call_count["n"] == 1:
-            return {
-                "subtasks": [
-                    # paraphrase, not in vocab
-                    {"text": "pick up blue cube", "start": 0.0, "end": 0.4},
-                ]
-            }
-        # Second call (the retry): should contain the correction prompt;
-        # respond with the canonical phrase exactly.
-        last_user_text = ""
-        for message in messages:
-            content = message.get("content")
-            if isinstance(content, str):
-                last_user_text = content
-            elif isinstance(content, list):
-                for block in content:
-                    if isinstance(block, dict) and block.get("type") == "text":
-                        last_user_text = block.get("text", "")
-        assert "NOT in the canonical vocabulary" in last_user_text
-        return {
-            "subtasks": [
-                {"text": "grasp blue cube", "start": 0.0, "end": 0.4},
-            ]
-        }
-
-    vlm = StubVlmClient(responder=responder)
-    vocab = Vocabulary(subtasks=_CANONICAL_SUBTASKS, memory_milestones=_CANONICAL_MEMORY)
-    module = PlanSubtasksMemoryModule(
-        vlm=vlm,
-        config=PlanConfig(n_task_rephrasings=0),
-        vocabulary=vocab,
-    )
-    record = next(iter_episodes(fixture_dataset_root))
-    staging = EpisodeStaging(tmp_path / "stage", record.episode_index)
-    module.run_episode(record, staging)
-    rows = staging.read("plan")
-    subtask_texts = [r["content"] for r in rows if r["style"] == "subtask"]
-    assert subtask_texts == ["grasp blue cube"]
-    # The retry must have fired exactly once.
-    assert call_count["n"] == 2
-
-
-def test_plan_module_drops_off_vocab_subtask_after_retry(
-    fixture_dataset_root: Path, tmp_path: Path
-) -> None:
-    """If the VLM stays off-vocab even after the retry, the bad span is dropped."""
-    from lerobot.annotations.steerable_pipeline.vlm_client import StubVlmClient
-
-    call_count = {"n": 0}
-
-    def responder(_messages):
-        call_count["n"] += 1
-        # Both calls return the same off-vocab span — the model can't
-        # be corrected. The second call also returns one in-vocab span
-        # so the episode isn't empty; this lets us check that the
-        # off-vocab span is dropped without affecting the in-vocab one.
-        if call_count["n"] == 1:
-            return {
-                "subtasks": [
-                    {"text": "perform a fancy macarena dance", "start": 0.0, "end": 0.4},
-                    {"text": "grasp blue cube", "start": 0.4, "end": 0.9},
-                ]
-            }
-        return {
-            "subtasks": [
-                {"text": "perform a fancy macarena dance", "start": 0.0, "end": 0.4},
-                {"text": "grasp blue cube", "start": 0.4, "end": 0.9},
-            ]
-        }
-
-    vlm = StubVlmClient(responder=responder)
-    vocab = Vocabulary(subtasks=_CANONICAL_SUBTASKS, memory_milestones=_CANONICAL_MEMORY)
-    module = PlanSubtasksMemoryModule(
-        vlm=vlm,
-        config=PlanConfig(n_task_rephrasings=0),
-        vocabulary=vocab,
-    )
-    record = next(iter_episodes(fixture_dataset_root))
-    staging = EpisodeStaging(tmp_path / "stage", record.episode_index)
-    module.run_episode(record, staging)
-    rows = staging.read("plan")
-    subtask_texts = [r["content"] for r in rows if r["style"] == "subtask"]
-    # Retry fired exactly once; bad span dropped, good span kept.
-    assert call_count["n"] == 2
-    assert subtask_texts == ["grasp blue cube"]
-
-
-def test_plan_module_bumps_collocated_subtasks_to_distinct_frames(
-    fixture_dataset_root: Path, tmp_path: Path
-) -> None:
-    """Two subtasks whose starts snap to the same frame get split onto two frames.
-
-    Without this guard, both spans would emit ``style=subtask`` rows at the
-    identical persistent timestamp; the training-time renderer's
-    ``active_at(t, style=subtask)`` then raises an ambiguity error.
-    """
-    from lerobot.annotations.steerable_pipeline.vlm_client import StubVlmClient
-
-    def responder(_messages):
-        # Two canonical labels with starts within one frame of each other —
-        # both snap to the same source frame, so the dedupe pass must bump
-        # the later one to the next frame.
-        return {
-            "subtasks": [
-                {"text": "grasp blue cube", "start": 0.40, "end": 0.42},
-                {"text": "place blue cube in box", "start": 0.41, "end": 0.50},
-            ]
-        }
-
-    vlm = StubVlmClient(responder=responder)
-    vocab = Vocabulary(subtasks=_CANONICAL_SUBTASKS, memory_milestones=_CANONICAL_MEMORY)
-    module = PlanSubtasksMemoryModule(
-        vlm=vlm,
-        config=PlanConfig(n_task_rephrasings=0),
-        vocabulary=vocab,
-    )
-    record = next(iter_episodes(fixture_dataset_root))
-    staging = EpisodeStaging(tmp_path / "stage", record.episode_index)
-    module.run_episode(record, staging)
-    rows = staging.read("plan")
-    subtask_rows = [r for r in rows if r["style"] == "subtask"]
-    # Both subtasks present, both on distinct timestamps.
-    assert len(subtask_rows) == 2
-    timestamps = [r["timestamp"] for r in subtask_rows]
-    assert len(set(timestamps)) == 2, f"subtask timestamps collide: {timestamps}"
-    # Order preserved: the chronologically earlier span keeps the earlier
-    # frame, the later one was bumped onto the next available frame.
-    assert subtask_rows[0]["content"] == "grasp blue cube"
-    assert subtask_rows[1]["content"] == "place blue cube in box"
-    assert subtask_rows[1]["timestamp"] > subtask_rows[0]["timestamp"]
-
-
-def test_plan_module_empty_when_all_off_vocab_after_retry(
-    fixture_dataset_root: Path, tmp_path: Path
-) -> None:
-    """All-off-vocab spans → episode comes out empty (no silent fuzzy snap)."""
-    from lerobot.annotations.steerable_pipeline.vlm_client import StubVlmClient
-
-    def responder(_messages):
-        # Returns the same off-vocab spans on both attempts.
-        return {
-            "subtasks": [
-                {"text": "make a smoothie", "start": 0.0, "end": 0.4},
-                {"text": "consult the wizard", "start": 0.4, "end": 0.9},
-            ]
-        }
-
-    vlm = StubVlmClient(responder=responder)
-    vocab = Vocabulary(subtasks=_CANONICAL_SUBTASKS, memory_milestones=_CANONICAL_MEMORY)
-    module = PlanSubtasksMemoryModule(
-        vlm=vlm,
-        config=PlanConfig(n_task_rephrasings=0),
-        vocabulary=vocab,
-    )
-    record = next(iter_episodes(fixture_dataset_root))
-    staging = EpisodeStaging(tmp_path / "stage", record.episode_index)
-    module.run_episode(record, staging)
-    rows = staging.read("plan")
-    subtask_texts = [r["content"] for r in rows if r["style"] == "subtask"]
-    # No subtask gets fabricated — better to leave the episode empty
-    # so the operator notices the vocabulary gap than to silently
-    # warp the labels.
-    assert subtask_texts == []
-
-
-def test_plan_module_without_vocab_passes_through(
-    fixture_dataset_root: Path, tmp_path: Path
-) -> None:
-    """No vocabulary configured → original free-form behavior is preserved."""
-    from lerobot.annotations.steerable_pipeline.vlm_client import StubVlmClient
-
-    def responder(_messages):
-        return {
-            "subtasks": [
-                {"text": "any free-form text the VLM wants", "start": 0.0, "end": 1.0},
-            ]
-        }
-
-    vlm = StubVlmClient(responder=responder)
-    module = PlanSubtasksMemoryModule(
-        vlm=vlm, config=PlanConfig(n_task_rephrasings=0)
-    )
-    record = next(iter_episodes(fixture_dataset_root))
-    staging = EpisodeStaging(tmp_path / "stage", record.episode_index)
-    module.run_episode(record, staging)
-    rows = staging.read("plan")
-    subtask_texts = [r["content"] for r in rows if r["style"] == "subtask"]
-    assert subtask_texts == ["any free-form text the VLM wants"]

From 98a519e7f266acd9e19bdf420b201a696785186e Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Tue, 2 Jun 2026 12:08:15 +0200
Subject: [PATCH 10/45] fix(annotate): default frame provider to video keys,
 not image keys

VideoFrameProvider derived its default camera and camera list from
meta.camera_keys, which mixes image- and video-stored cameras. The
clip/decode paths read videos/<key>/from_timestamp, which only exists
for video keys, so an image-stored camera sorted first (e.g.
observation.images.wrist) crashed the plan phase with a KeyError.

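Restrict the list and default to meta.video_keys. Add a regression test
and point the example job at the dataset's actual video camera. Skip
bandit B607 (ffmpeg/git are intentionally resolved via PATH).

The example job also moves to pepijn223/robocasa_smoke_2atomic_v3,
swaps --plan.n_task_rephrasings=30 for
--plan.task_aug_axes.enabled=true and
--plan.action_records.enabled=true, and lowers --vqa.K from 3 to 1.
The ffmpeg argument lists in frames.py and test_frames.py are only
reflowed to one argument per line; the arguments are unchanged.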

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 examples/annotations/run_hf_job.py            | 11 +++---
 pyproject.toml                                |  2 +-
 .../annotations/steerable_pipeline/frames.py  | 39 ++++++++++++-------
 tests/annotations/test_frames.py              | 39 +++++++++++++++++--
 4 files changed, 68 insertions(+), 23 deletions(-)

diff --git a/examples/annotations/run_hf_job.py b/examples/annotations/run_hf_job.py
index dcc9435ce..01ef58f4d 100644
--- a/examples/annotations/run_hf_job.py
+++ b/examples/annotations/run_hf_job.py
@@ -36,8 +36,8 @@ CMD = (
     "export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0 && "
     "export VLLM_VIDEO_BACKEND=pyav && "
     "lerobot-annotate "
-    "--repo_id=imstevenpmwork/super_poulain_draft "
-    "--dest_repo_id=pepijn223/super_poulain_vocab "
+    "--repo_id=pepijn223/robocasa_smoke_2atomic_v3 "
+    "--dest_repo_id=pepijn223/robocasa_smoke_2atomic_v3_ann "
     "--push_to_hub=true "
     "--vlm.backend=openai "
     "--vlm.model_id=Qwen/Qwen3.6-35B-A3B-FP8 "
@@ -52,17 +52,18 @@ CMD = (
     "--vlm.temperature=0.7 "
     "--executor.episode_parallelism=16 "
     "--vlm.chat_template_kwargs='{\"enable_thinking\": false}' "
-    "--vlm.camera_key=observation.images.wrist "
+    "--vlm.camera_key=observation.images.robot0_agentview_right "
     # Phase 1 — plan module (subtasks + plan + memory + task_aug).
     "--plan.frames_per_second=1.0 "
     "--plan.use_video_url=true "
     "--plan.use_video_url_fps=1.0 "
     "--plan.derive_task_from_video=always "
-    "--plan.n_task_rephrasings=30 "
+    "--plan.task_aug_axes.enabled=true "
+    "--plan.action_records.enabled=true "
     # Phase 2 — interjections + speech.
     "--interjections.max_interjections_per_episode=6 "
     # Phase 4 — general VQA.
-    "--vqa.K=3 "
+    "--vqa.K=1 "
     "--vqa.vqa_emission_hz=1.0"
 )
 
diff --git a/pyproject.toml b/pyproject.toml
index d29800a3b..5a329e56d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -401,7 +401,7 @@ exclude_dirs = [
     "benchmarks",
     "src/lerobot/datasets/push_dataset_to_hub",
 ]
-skips = ["B101", "B311", "B404", "B603", "B615"]
+skips = ["B101", "B311", "B404", "B603", "B607", "B615"]
 
 [tool.typos]
 default.extend-ignore-re = [
diff --git a/src/lerobot/annotations/steerable_pipeline/frames.py b/src/lerobot/annotations/steerable_pipeline/frames.py
index 112f50ce6..804dae109 100644
--- a/src/lerobot/annotations/steerable_pipeline/frames.py
+++ b/src/lerobot/annotations/steerable_pipeline/frames.py
@@ -151,13 +151,15 @@ class VideoFrameProvider:
         from lerobot.datasets.dataset_metadata import LeRobotDatasetMetadata  # noqa: PLC0415
 
         self._meta = LeRobotDatasetMetadata(repo_id="local", root=self.root)
-        # ``camera_keys`` covers both image- and video-stored cameras and is
-        # always defined on the metadata (``[]`` in the worst case), so it is
-        # the single source we need here.
-        keys = list(self._meta.camera_keys)
-        # Last-resort fallback: if metadata didn't surface anything but the
-        # caller explicitly named a camera (``--vlm.camera_key=...``), trust
-        # them — the key is by definition known to exist on the dataset.
+        # Only ``video_keys`` are decodable here: the clip/decode paths read
+        # ``videos/<key>/from_timestamp`` from episode metadata, which exists
+        # only for video-stored cameras. Image-stored cameras (also in
+        # ``camera_keys``) would KeyError, so restrict the list — and the
+        # default — to video keys.
+        keys = list(self._meta.video_keys)
+        # Last-resort fallback: if metadata didn't surface any video keys but
+        # the caller explicitly named a camera (``--vlm.camera_key=...``),
+        # trust them — the key is by definition known to exist on the dataset.
         if not keys and self.camera_key:
             keys = [self.camera_key]
         self._camera_keys = keys
@@ -338,8 +340,7 @@ class VideoFrameProvider:
                 self._warned_decode_fail = True
         if not already_warned:
             logger.warning(
-                "VideoFrameProvider._decode failed for episode=%s camera=%s "
-                "video_path=%s backends=%s: %s",
+                "VideoFrameProvider._decode failed for episode=%s camera=%s video_path=%s backends=%s: %s",
                 episode_index,
                 camera_key,
                 video_path,
@@ -383,11 +384,21 @@ def _decode_frames_ffmpeg(video_path: Path, timestamps: list[float]) -> list[Any
     for ts in timestamps:
         proc = subprocess.run(
             [
-                "ffmpeg", "-nostdin", "-loglevel", "error",
-                "-ss", f"{max(ts, 0.0):.3f}",
-                "-i", str(video_path),
-                "-frames:v", "1",
-                "-f", "image2pipe", "-vcodec", "png", "pipe:1",
+                "ffmpeg",
+                "-nostdin",
+                "-loglevel",
+                "error",
+                "-ss",
+                f"{max(ts, 0.0):.3f}",
+                "-i",
+                str(video_path),
+                "-frames:v",
+                "1",
+                "-f",
+                "image2pipe",
+                "-vcodec",
+                "png",
+                "pipe:1",
             ],
             capture_output=True,
             check=True,
diff --git a/tests/annotations/test_frames.py b/tests/annotations/test_frames.py
index 07b8b2c33..c8ed51ed5 100644
--- a/tests/annotations/test_frames.py
+++ b/tests/annotations/test_frames.py
@@ -45,6 +45,33 @@ from lerobot.annotations.steerable_pipeline.frames import (  # noqa: E402
 )
 
 
+class _FakeMeta:
+    """Minimal metadata stub exposing ``video_keys`` / ``camera_keys``."""
+
+    def __init__(self, video_keys: list[str], image_keys: list[str]) -> None:
+        self.video_keys = video_keys
+        self.camera_keys = [*video_keys, *image_keys]
+
+
+def test_default_camera_key_skips_image_only_cameras(tmp_path: Path, monkeypatch) -> None:
+    """The default camera must be a *video* key — image-stored cameras have no
+    ``videos/<key>/from_timestamp`` and would KeyError in the clip/decode path.
+
+    Regression: a dataset whose first ``camera_keys`` entry was an image-stored
+    camera (e.g. ``observation.images.wrist``) crashed at clip extraction.
+    """
+    fake = _FakeMeta(
+        video_keys=["observation.images.robot0_agentview_right"],
+        image_keys=["observation.images.wrist"],
+    )
+    import lerobot.datasets.dataset_metadata as meta_mod
+
+    monkeypatch.setattr(meta_mod, "LeRobotDatasetMetadata", lambda *a, **k: fake, raising=True)
+    provider = VideoFrameProvider(root=tmp_path)
+    assert provider.camera_key == "observation.images.robot0_agentview_right"
+    assert "observation.images.wrist" not in provider.camera_keys
+
+
 def test_video_for_episode_is_a_method_of_videoframeprovider():
     """``video_for_episode`` must be a bound method, not nested dead code."""
     assert callable(getattr(VideoFrameProvider, "video_for_episode", None))
@@ -81,9 +108,15 @@ def sample_video(tmp_path: Path) -> Path:
     out = tmp_path / "sample.mp4"
     subprocess.run(
         [
-            "ffmpeg", "-y", "-f", "lavfi",
-            "-i", "testsrc=duration=3:size=160x120:rate=10",
-            "-pix_fmt", "yuv420p", str(out),
+            "ffmpeg",
+            "-y",
+            "-f",
+            "lavfi",
+            "-i",
+            "testsrc=duration=3:size=160x120:rate=10",
+            "-pix_fmt",
+            "yuv420p",
+            str(out),
         ],
         check=True,
         capture_output=True,

From c5042a6850902f8b29730cebdc140016d6ce327f Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Tue, 2 Jun 2026 14:34:48 +0200
Subject: [PATCH 11/45] fix(annotate): stop action records + augmentation from
 corrupting RoboCasa labels
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three compounding bugs made RoboCasa annotation produce off-task
subtasks ('move stove to stove with left arm') and drifting
augmentations ('wander around the kitchen' for 'Navigate to the stove').

1. action_records.replace_subtask_text now defaults False.
   Overwriting the VLM's subtask text with a reconstruction of
   hallucinated {verb,object,arm,grasp,dest} fields is high-risk:
   navigation / non-manipulation tasks don't fit the schema and render
   to nonsense. Records are now additive by default (emit_record_row),
   never silently replacing subtask text. Flip replace_subtask_text on
   only for manipulation datasets verified to render cleanly.

2. _render_action_record_to_subtask_text drops a degenerate
   destination that just echoes the object (verb=move object=stove
   destination=stove -> 'move stove' instead of 'move stove to stove').
   Also routes 'navigate' through the 'to <dest>' preposition family.

3. module_1_task_aug_axes.txt hardened: variants MUST preserve the
   goal/destination. Explicitly forbids 'Navigate to the stove' ->
   'wander around the kitchen'. Only wording / arm / orientation /
   grasp may vary; verb meaning, object, and destination are fixed.

examples/annotations/run_hf_job.py — corrected for RoboCasa:
  * derive_task_from_video=off (was =always). The dataset task string
    is authoritative and is what eval conditions on; =always threw it
    away, re-derived a hallucinated task from the video, and poisoned
    every downstream subtask/plan row. THIS was the dominant cause.
  * n_task_rephrasings=0 + task_aug_axes.enabled=true flag removed —
    RoboCasa eval uses exact task strings, so augmentation is
    unused/harmful.
  * action_records.enabled=true flag removed — manipulation schema
    doesn't fit atomic / navigation tasks.
  * plan_max_steps=6 to keep atomic-task decomposition tight.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 examples/annotations/run_hf_job.py            | 25 +++++++++++++++---
 .../annotations/steerable_pipeline/config.py  | 26 ++++++++++++-------
 .../modules/plan_subtasks_memory.py           |  9 ++++++-
 .../prompts/module_1_task_aug_axes.txt        | 13 +++++++---
 4 files changed, 56 insertions(+), 17 deletions(-)

diff --git a/examples/annotations/run_hf_job.py b/examples/annotations/run_hf_job.py
index 01ef58f4d..f669593e9 100644
--- a/examples/annotations/run_hf_job.py
+++ b/examples/annotations/run_hf_job.py
@@ -53,13 +53,30 @@ CMD = (
     "--executor.episode_parallelism=16 "
     "--vlm.chat_template_kwargs='{\"enable_thinking\": false}' "
     "--vlm.camera_key=observation.images.robot0_agentview_right "
-    # Phase 1 — plan module (subtasks + plan + memory + task_aug).
+    # Phase 1 — plan module (subtasks + plan + memory).
     "--plan.frames_per_second=1.0 "
     "--plan.use_video_url=true "
     "--plan.use_video_url_fps=1.0 "
-    "--plan.derive_task_from_video=always "
-    "--plan.task_aug_axes.enabled=true "
-    "--plan.action_records.enabled=true "
+    # IMPORTANT for RoboCasa: the dataset's task string ("Navigate to the
+    # stove", "Pick the mug...") is authoritative and is what eval uses.
+    # ``derive_task_from_video=off`` keeps that canonical task driving
+    # subtask generation. Do NOT use ``always`` here — it throws the real
+    # task away, asks the VLM "what is this video about?" with no hint,
+    # and the hallucinated task then poisons every subtask + plan row.
+    "--plan.derive_task_from_video=off "
+    # NO task augmentation for RoboCasa: eval conditions on the exact task
+    # strings, so synthetic rephrasings are unused at best and (when they
+    # drift, e.g. "wander around the kitchen") harmful. 0 rephrasings +
+    # axes disabled = the policy only ever sees the canonical task.
+    "--plan.n_task_rephrasings=0 "
+    # action_records OFF: the structured {verb,object,arm,grasp,dest}
+    # schema is a manipulation schema; RoboCasa navigation / atomic tasks
+    # don't fit it and the VLM hallucinates (e.g. "move stove to stove").
+    # Leave off unless annotating long composite manipulation tasks you've
+    # verified render cleanly (and even then replace_subtask_text stays
+    # off by default so records are additive, never overwriting subtasks).
+    # Keep subtask decomposition tight for atomic tasks:
+    "--plan.plan_max_steps=6 "
     # Phase 2 — interjections + speech.
     "--interjections.max_interjections_per_episode=6 "
     # Phase 4 — general VQA.
diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py
index c60e58fee..a3c1306be 100644
--- a/src/lerobot/annotations/steerable_pipeline/config.py
+++ b/src/lerobot/annotations/steerable_pipeline/config.py
@@ -94,12 +94,18 @@ class ActionRecordsConfig:
 
     A deterministic Python template then renders the record back to
     canonical subtask text (e.g. ``pick blue cube with left arm using
-    pinch grip``). When ``replace_subtask_text=True`` (default), the
-    rendered text REPLACES the VLM's free-form subtask text — eliminating
-    cross-episode phrasing drift. When ``emit_record_row=True``
-    (default), the structured record is also emitted as a row with
-    ``style="action_record"`` so downstream consumers can train on the
-    typed schema directly.
+    pinch grip``). When ``replace_subtask_text=True``, the rendered text
+    REPLACES the VLM's free-form subtask text. This is OFF by default:
+    the structured fields are easy for the VLM to hallucinate on tasks
+    that don't fit the manipulation schema (e.g. navigation tasks yield
+    nonsense like ``move stove to stove``), and silently overwriting the
+    subtask text with a reconstruction is high-risk. Leave it off to keep
+    the original VLM subtask text and treat the record as additive
+    metadata; only flip it on for datasets you've verified render
+    cleanly. When ``emit_record_row=True`` (default), the structured
+    record is also emitted as a row with ``style="action_record"`` so
+    downstream consumers can train on the typed schema directly —
+    without touching the subtask text.
 
     Cost: one extra VLM call per subtask. For an 8-subtask episode this
     means ~8x more VLM calls in the plan module — still cheap relative
@@ -110,9 +116,11 @@ class ActionRecordsConfig:
 
     # When True, replace the VLM-generated subtask text with the
     # deterministic template's rendering of the structured record.
-    # Strongly recommended — it's the whole point of the structured
-    # intermediate. Set False to keep both representations side by side.
-    replace_subtask_text: bool = True
+    # OFF by default — see class docstring. Overwriting good subtask
+    # text with a reconstruction of hallucinated structured fields is
+    # high-risk (navigation / non-manipulation tasks render to
+    # nonsense). Keep records additive (``emit_record_row``) instead.
+    replace_subtask_text: bool = False
 
     # When True, emit a separate row with ``style="action_record"`` and
     # ``content=json.dumps(record)`` at the subtask's start timestamp.
diff --git a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
index f58ec2c91..6ef5352b0 100644
--- a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
+++ b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
@@ -424,6 +424,13 @@ class PlanSubtasksMemoryModule:
         if not verb:
             return ""
 
+        # Drop a degenerate destination that just echoes the object — the
+        # VLM sometimes fills both with the same noun (e.g. navigation:
+        # ``verb=move object=stove destination=stove`` → "move stove to
+        # stove"). Treat that as "no meaningful destination".
+        if dest and obj and dest.strip().lower() == obj.strip().lower():
+            dest = ""
+
         parts: list[str] = [verb]
         if obj:
             parts.append(obj)
@@ -431,7 +438,7 @@ class PlanSubtasksMemoryModule:
             # Pick a sensible preposition per verb family.
             if verb in {"place", "put", "drop", "insert", "pour", "dump"}:
                 parts.append(f"in {dest}")
-            elif verb in {"move", "transport", "reach"}:
+            elif verb in {"move", "transport", "reach", "navigate"}:
                 parts.append(f"to {dest}")
             else:
                 parts.append(f"at {dest}")
diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_task_aug_axes.txt b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_task_aug_axes.txt
index d8cd13104..8b19a0a8e 100644
--- a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_task_aug_axes.txt
+++ b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_task_aug_axes.txt
@@ -37,9 +37,16 @@ Axes and target counts:
     orientation, grasp_method) appear in the original task.
 
 Hard rules:
-- Each variant MUST preserve the core action and the target object.
-  Do not change which object is involved, the destination, or the
-  high-level action.
+- Each variant MUST preserve the core action, the target object, AND
+  the goal / destination. Do not change which object is involved, where
+  it goes, or the high-level action. "Navigate to the stove" may become
+  "go to the stove" or "head over to the stove" — it must NEVER become
+  "wander around the kitchen", "explore the room", or anything that
+  drops or generalises the stove destination. If you cannot vary the
+  wording without changing the goal, emit fewer variants.
+- Only the FIVE listed elements (wording, arm, orientation, grasp
+  method, or a combination) may be varied or omitted. The verb's
+  meaning, the object, and the destination are fixed.
 - Each variant is plain prose, no markdown, no quotes, no list numbers.
 - Each variant must be DISTINCT from every other variant in the entire
   output, both within and across axes. Near-duplicates are not allowed.

From 7454b4c993745ff0ab8cf73b63662e0ed4082859 Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Tue, 2 Jun 2026 14:42:36 +0200
Subject: [PATCH 12/45] annotate: remove action-record subtask-text replacement
 entirely

Drops the replace_subtask_text option and the
_render_action_record_to_subtask_text renderer. Action records are now
strictly additive: when action_records.enabled=True the module emits
style='action_record' rows (the typed {verb,object,arm,grasp,dest,
mistake} schema) and NEVER rewrites the subtask text the policy
conditions on.

The render-back-to-text path was the source of corrupted subtasks
(navigation tasks produced 'move stove to stove', manipulation tasks
got spurious 'with left arm using pinch grip' suffixes). Reconstructing
natural-language subtasks from hallucinated structured fields is
inherently fragile, so the capability is removed rather than guarded.

Removed:
  * ActionRecordsConfig.replace_subtask_text field
  * PlanSubtasksMemoryModule._render_action_record_to_subtask_text
  * the span['text'] = canonical_text overwrite in run_episode

Updated docstrings + run_hf_job.py comment accordingly. emit_record_row
(default True) is now the feature's only output.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 examples/annotations/run_hf_job.py            |  8 +-
 .../annotations/steerable_pipeline/config.py  | 40 ++++------
 .../modules/plan_subtasks_memory.py           | 76 +++----------------
 3 files changed, 27 insertions(+), 97 deletions(-)

diff --git a/examples/annotations/run_hf_job.py b/examples/annotations/run_hf_job.py
index f669593e9..8ce22c28f 100644
--- a/examples/annotations/run_hf_job.py
+++ b/examples/annotations/run_hf_job.py
@@ -71,10 +71,10 @@ CMD = (
     "--plan.n_task_rephrasings=0 "
     # action_records OFF: the structured {verb,object,arm,grasp,dest}
     # schema is a manipulation schema; RoboCasa navigation / atomic tasks
-    # don't fit it and the VLM hallucinates (e.g. "move stove to stove").
-    # Leave off unless annotating long composite manipulation tasks you've
-    # verified render cleanly (and even then replace_subtask_text stays
-    # off by default so records are additive, never overwriting subtasks).
+    # don't fit it and the VLM hallucinates. When on, records are purely
+    # additive (emitted as style="action_record" rows) and never touch
+    # the subtask text — useful only for long composite manipulation
+    # tasks. Leave off for RoboCasa atomic / navigation.
     # Keep subtask decomposition tight for atomic tasks:
     "--plan.plan_max_steps=6 "
     # Phase 2 — interjections + speech.
diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py
index a3c1306be..f84fdaa08 100644
--- a/src/lerobot/annotations/steerable_pipeline/config.py
+++ b/src/lerobot/annotations/steerable_pipeline/config.py
@@ -92,20 +92,16 @@ class ActionRecordsConfig:
           "mistake": "<short text>" | null,
         }
 
-    A deterministic Python template then renders the record back to
-    canonical subtask text (e.g. ``pick blue cube with left arm using
-    pinch grip``). When ``replace_subtask_text=True``, the rendered text
-    REPLACES the VLM's free-form subtask text. This is OFF by default:
-    the structured fields are easy for the VLM to hallucinate on tasks
-    that don't fit the manipulation schema (e.g. navigation tasks yield
-    nonsense like ``move stove to stove``), and silently overwriting the
-    subtask text with a reconstruction is high-risk. Leave it off to keep
-    the original VLM subtask text and treat the record as additive
-    metadata; only flip it on for datasets you've verified render
-    cleanly. When ``emit_record_row=True`` (default), the structured
-    record is also emitted as a row with ``style="action_record"`` so
-    downstream consumers can train on the typed schema directly —
-    without touching the subtask text.
+    The record is emitted as a separate row with ``style="action_record"``
+    (``content=json.dumps(record)``) at the subtask's start timestamp.
+    It is PURELY ADDITIVE — it never touches the VLM's subtask text.
+    Downstream training can consume the typed schema directly (e.g.
+    auxiliary supervision on verb / arm / grasp classification heads)
+    while the subtask string the policy conditions on stays exactly what
+    the subtask module produced. (Reconstructing subtask text from these
+    fields was too easy for the VLM to hallucinate on tasks that don't
+    fit the manipulation schema — navigation tasks yielded nonsense like
+    ``move stove to stove`` — so that path was removed.)
 
     Cost: one extra VLM call per subtask. For an 8-subtask episode this
     means ~8x more VLM calls in the plan module — still cheap relative
@@ -114,18 +110,10 @@ class ActionRecordsConfig:
 
     enabled: bool = False
 
-    # When True, replace the VLM-generated subtask text with the
-    # deterministic template's rendering of the structured record.
-    # OFF by default — see class docstring. Overwriting good subtask
-    # text with a reconstruction of hallucinated structured fields is
-    # high-risk (navigation / non-manipulation tasks render to
-    # nonsense). Keep records additive (``emit_record_row``) instead.
-    replace_subtask_text: bool = False
-
-    # When True, emit a separate row with ``style="action_record"`` and
-    # ``content=json.dumps(record)`` at the subtask's start timestamp.
-    # Lets downstream training consume the typed schema directly (e.g.
-    # auxiliary supervision on verb/arm/grasp classification heads).
+    # When True (default), emit a separate row with ``style="action_record"``
+    # and ``content=json.dumps(record)`` at the subtask's start timestamp.
+    # This is the only output of the feature — set ``enabled=False`` to
+    # skip the extra VLM calls entirely.
     emit_record_row: bool = True
 
     # Frame sampling for the per-subtask VLM call (similar to the
diff --git a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
index 6ef5352b0..5e66f67be 100644
--- a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
+++ b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
@@ -124,28 +124,24 @@ class PlanSubtasksMemoryModule:
         subtask_spans = self._generate_subtasks(record, task=effective_task)
 
         # ----------------------------------------------------------------
-        # Phase 1a + 1b: structured per-subtask action records
+        # Phase 1a: structured per-subtask action records (additive)
         # ----------------------------------------------------------------
         # When enabled, for every subtask span we ask the VLM for a typed
         # ActionRecord (verb / object / arm / grasp_type / destination /
-        # mistake). A deterministic Python template renders the record
-        # back to canonical subtask text. The render replaces the
-        # free-form subtask text (cleaner conditioning) and the typed
-        # record is emitted as a separate row for downstream use.
+        # mistake) and emit it as a separate ``style="action_record"``
+        # row for downstream use. This is purely additive — it never
+        # touches the VLM's subtask text (reconstructing subtask text
+        # from these fields was too easy to hallucinate on tasks that
+        # don't fit the manipulation schema).
         records_cfg = self.config.action_records
         action_records: list[dict[str, Any] | None] = [None] * len(subtask_spans)
         if records_cfg.enabled and subtask_spans:
             for i, span in enumerate(subtask_spans):
                 rec = self._extract_action_record(record, span, effective_task)
-                if rec is None:
-                    continue
-                action_records[i] = rec
-                if records_cfg.replace_subtask_text:
-                    canonical_text = self._render_action_record_to_subtask_text(rec)
-                    if canonical_text:
-                        span["text"] = canonical_text
+                if rec is not None:
+                    action_records[i] = rec
 
-        # subtask rows (may now reflect canonical-rendered text)
+        # subtask rows
         for i, span in enumerate(subtask_spans):
             rows.append(
                 {
@@ -396,60 +392,6 @@ class PlanSubtasksMemoryModule:
             "mistake": mistake,
         }
 
-    @staticmethod
-    def _render_action_record_to_subtask_text(record: dict[str, Any]) -> str:
-        """Deterministic template: ``ActionRecord`` → canonical subtask text.
-
-        Mirrors the authoring guidance in ``module_1_subtasks.txt``:
-        imperative, drop articles / adverbs, use canonical object nouns,
-        append arm / grasp clauses only when present.
-
-        Examples (record → rendered text)::
-
-            {verb=pick, object=blue cube}
-                → "pick blue cube"
-            {verb=pick, object=blue cube, arm=left, grasp_type=pinch}
-                → "pick blue cube with left arm using pinch grip"
-            {verb=place, object=blue cube, destination=green box}
-                → "place blue cube in green box"
-            {verb=move, object=mug, destination=stove}
-                → "move mug to stove"
-        """
-        verb = (record.get("verb") or "").strip().lower()
-        obj = (record.get("object") or "").strip()
-        arm = (record.get("arm") or "").strip().lower() if record.get("arm") else ""
-        grasp = (record.get("grasp_type") or "").strip().lower() if record.get("grasp_type") else ""
-        dest = (record.get("destination") or "").strip() if record.get("destination") else ""
-
-        if not verb:
-            return ""
-
-        # Drop a degenerate destination that just echoes the object — the
-        # VLM sometimes fills both with the same noun (e.g. navigation:
-        # ``verb=move object=stove destination=stove`` → "move stove to
-        # stove"). Treat that as "no meaningful destination".
-        if dest and obj and dest.strip().lower() == obj.strip().lower():
-            dest = ""
-
-        parts: list[str] = [verb]
-        if obj:
-            parts.append(obj)
-        if dest:
-            # Pick a sensible preposition per verb family.
-            if verb in {"place", "put", "drop", "insert", "pour", "dump"}:
-                parts.append(f"in {dest}")
-            elif verb in {"move", "transport", "reach", "navigate"}:
-                parts.append(f"to {dest}")
-            else:
-                parts.append(f"at {dest}")
-        if arm == "both":
-            parts.append("with both arms")
-        elif arm in {"left", "right"}:
-            parts.append(f"with {arm} arm")
-        if grasp:
-            parts.append(f"using {grasp} grip")
-        return " ".join(parts)
-
     # ------------------------------------------------------------------
     # Structured 5-axis task augmentation (EgoMimic-style taxonomy)
     # ------------------------------------------------------------------

From ba5d4c5cd824f9b23df525bf94f41638c7ea43ee Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Tue, 2 Jun 2026 15:08:25 +0200
Subject: [PATCH 13/45] annotate: kill subtask hallucination + single-camera
 grounding
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two fixes for 'subtasks describe actions not in the video' plus a way
to focus the whole pipeline on one camera.

ANTI-HALLUCINATION
  1. _episode_video_block: when use_video_url is set but clip extraction
     fails, FALL BACK to embedded frames instead of returning an empty
     block. An empty block left the VLM with zero visual grounding, so
     it invented subtasks from the task text alone — the likely root
     cause of hallucinated steps. Now logs a warning and embeds frames.
  2. module_1_subtasks.txt gains a GROUNDING preamble (overrides all
     other rules): label only motion visible in specific frames; never
     invent/anticipate/pad; max_steps is a CEILING not a target; atomic
     demos may be exactly ONE subtask; the VIDEO is ground truth, not
     the instruction text.

SINGLE-CAMERA GROUNDING
  * New VqaConfig.restrict_to_default_camera (default False). When True,
    the VQA module grounds on only the --vlm.camera_key stream instead
    of iterating every camera — matching the plan / interjection
    modules, which already use that single camera. Now the whole
    pipeline can focus on one view (e.g. observation.images.base).

run_hf_job.py updated:
  * use_video_url=false + frames_per_second=2.0 — embed frames directly
    (most reliable; no silent text-only failure mode) with dense
    grounding.
  * vqa.restrict_to_default_camera=true — VQA on the single camera too.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 examples/annotations/run_hf_job.py            | 15 ++++++++++---
 .../annotations/steerable_pipeline/config.py  |  9 ++++++++
 .../steerable_pipeline/modules/general_vqa.py | 14 ++++++++++++-
 .../modules/plan_subtasks_memory.py           | 21 ++++++++++++++-----
 .../prompts/module_1_subtasks.txt             | 15 +++++++++++++
 5 files changed, 65 insertions(+), 9 deletions(-)

diff --git a/examples/annotations/run_hf_job.py b/examples/annotations/run_hf_job.py
index 8ce22c28f..5e1b11d4b 100644
--- a/examples/annotations/run_hf_job.py
+++ b/examples/annotations/run_hf_job.py
@@ -54,9 +54,14 @@ CMD = (
     "--vlm.chat_template_kwargs='{\"enable_thinking\": false}' "
     "--vlm.camera_key=observation.images.robot0_agentview_right "
     # Phase 1 — plan module (subtasks + plan + memory).
-    "--plan.frames_per_second=1.0 "
-    "--plan.use_video_url=true "
-    "--plan.use_video_url_fps=1.0 "
+    # Embed decoded frames directly (use_video_url=false) rather than
+    # handing the server a file:// clip. The embedded path is more
+    # reliable: if clip extraction ever fails, the video_url path would
+    # silently send NO video and the VLM would hallucinate subtasks from
+    # the task text alone. 2 fps gives dense visual grounding so the VLM
+    # labels what actually happens.
+    "--plan.frames_per_second=2.0 "
+    "--plan.use_video_url=false "
     # IMPORTANT for RoboCasa: the dataset's task string ("Navigate to the
     # stove", "Pick the mug...") is authoritative and is what eval uses.
     # ``derive_task_from_video=off`` keeps that canonical task driving
@@ -80,6 +85,10 @@ CMD = (
     # Phase 2 — interjections + speech.
     "--interjections.max_interjections_per_episode=6 "
     # Phase 4 — general VQA.
+    # Ground VQA on the SAME single camera as plan/interjections
+    # (--vlm.camera_key) instead of iterating every camera. The whole
+    # pipeline then focuses on one view, e.g. observation.images.base.
+    "--vqa.restrict_to_default_camera=true "
     "--vqa.K=1 "
     "--vqa.vqa_emission_hz=1.0"
 )
diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py
index f84fdaa08..1cecfa772 100644
--- a/src/lerobot/annotations/steerable_pipeline/config.py
+++ b/src/lerobot/annotations/steerable_pipeline/config.py
@@ -219,6 +219,15 @@ class VqaConfig:
     precision for more (noisier) VQA frames."""
     question_types: tuple[str, ...] = ("bbox", "keypoint", "count", "attribute", "spatial")
 
+    # Camera restriction. By default VQA iterates EVERY camera the
+    # dataset declares (one VQA pair per camera per emission tick). Set
+    # ``restrict_to_default_camera=True`` to ground VQA on only the
+    # single ``--vlm.camera_key`` stream — the same camera the plan /
+    # interjection modules use — so the whole pipeline focuses on one
+    # view. Use this when you want every annotation grounded on, e.g.,
+    # ``observation.images.base`` and nothing else.
+    restrict_to_default_camera: bool = False
+
 
 @dataclass
 class VlmConfig:
diff --git a/src/lerobot/annotations/steerable_pipeline/modules/general_vqa.py b/src/lerobot/annotations/steerable_pipeline/modules/general_vqa.py
index adabff731..1e5ad8838 100644
--- a/src/lerobot/annotations/steerable_pipeline/modules/general_vqa.py
+++ b/src/lerobot/annotations/steerable_pipeline/modules/general_vqa.py
@@ -180,8 +180,20 @@ class GeneralVqaModule:
         Defaults to every camera the provider exposes. Datasets with no
         cameras (or test/null providers) yield an empty list, which makes
         ``run_episode`` a no-op.
+
+        When ``config.restrict_to_default_camera`` is set, VQA grounds on
+        only the provider's default camera (the single ``--vlm.camera_key``
+        stream), matching the plan / interjection modules so the whole
+        pipeline focuses on one view.
         """
-        return list(getattr(self.frame_provider, "camera_keys", []) or [])
+        all_cameras = list(getattr(self.frame_provider, "camera_keys", []) or [])
+        if getattr(self.config, "restrict_to_default_camera", False):
+            default = getattr(self.frame_provider, "camera_key", None)
+            if default and default in all_cameras:
+                return [default]
+            if default:
+                return [default]
+        return all_cameras
 
     def _build_messages(
         self,
diff --git a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
index 5e66f67be..c46b20bac 100644
--- a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
+++ b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
@@ -443,16 +443,27 @@ class PlanSubtasksMemoryModule:
         return flat
 
     def _episode_video_block(self, record: EpisodeRecord) -> list[dict[str, Any]]:
-        """Same video block ``_generate_subtasks`` builds — extracted helper."""
+        """Same video block ``_generate_subtasks`` builds — extracted helper.
+
+        Always returns a block that actually carries the video. When
+        ``use_video_url`` is set we try the server-side ``video_url``
+        path first, but if clip extraction fails we FALL BACK to
+        decoding + embedding frames rather than returning an empty
+        block — an empty block would leave the VLM with no visual
+        grounding at all and it would hallucinate subtasks purely from
+        the task text.
+        """
         if not record.frame_timestamps:
             return []
         if self.config.use_video_url and isinstance(self.frame_provider, VideoFrameProvider):
             cache_dir = Path(self.frame_provider.root) / ".annotate_staging" / ".video_clips"
             clip = self.frame_provider.episode_clip_path(record, cache_dir)
-            return (
-                to_video_url_block(f"file://{clip}", fps=self.config.use_video_url_fps)
-                if clip is not None
-                else []
+            if clip is not None:
+                return to_video_url_block(f"file://{clip}", fps=self.config.use_video_url_fps)
+            logger.warning(
+                "episode %d: video_url clip extraction failed — falling back to "
+                "embedded frames so the VLM still sees the demonstration",
+                record.episode_index,
             )
         episode_duration = record.frame_timestamps[-1] - record.frame_timestamps[0]
         target_count = max(1, int(round(episode_duration * self.config.frames_per_second)))
diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt
index a49096682..4ea7407e6 100644
--- a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt
+++ b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt
@@ -6,6 +6,21 @@ You are shown the entire demonstration as a single video. Watch the
 whole clip, then segment it into a list of consecutive atomic subtasks
 the robot performs.
 
+GROUNDING — read this first, it overrides everything below:
+- Label ONLY what the robot actually does in the video. Every subtask
+  you emit must correspond to motion you can SEE in specific frames.
+- Do NOT invent, anticipate, or pad. If the robot only does one thing
+  (e.g. it just navigates to a location and the clip ends), emit
+  EXACTLY ONE subtask. Many demonstrations are a single atomic skill.
+- ``max_steps`` below is a hard CEILING, not a target. Emitting fewer
+  subtasks than the ceiling is not just allowed, it is expected for
+  short / atomic demonstrations. One correct subtask is far better
+  than several invented ones.
+- If the video does not clearly show the action implied by the task,
+  describe what you actually see — do NOT fabricate the task's steps
+  from the instruction text. The instruction tells you the goal; the
+  VIDEO is the ground truth for what happened.
+
 Authoring rules — Hi Robot atom granularity, pi0.7-style short prompts:
 
 - Each subtask = one COMPOSITE atomic skill the low-level policy can

From dcd368e1f80702718e98a9bd6b230acae28c04ce Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Tue, 2 Jun 2026 15:12:46 +0200
Subject: [PATCH 14/45] annotate: multi-call subtask quality chain (describe ->
 segment -> verify)

The single-call 'watch video -> emit subtask JSON' pattern makes the
VLM commit to structured output before reasoning about what it saw, so
it pattern-matches the task text and hallucinates steps. Split it into
an opt-in multi-call chain that grounds first and prunes last.

New PlanConfig flags (both default False -> single-call unchanged):
  * subtask_describe_first: a grounding pass narrates ONLY what is
    visible in the video (no subtask JSON yet). That description is
    injected into the segmentation prompt via a new {observation_block}
    placeholder, so the model segments its own grounded observations
    instead of the instruction text. +1 VLM call/episode.
  * subtask_verify: after segmentation, an adversarial pass re-watches
    the video and drops any candidate subtask it cannot see. The prompt
    instructs it to only PRUNE (never add/rewrite/move); this is not
    enforced in code — the returned spans are just re-cleaned. Fails
    open (keeps un-verified spans if the call returns nothing usable).
    +1 VLM call/episode.

Implementation:
  * _generate_subtasks now orchestrates describe -> segment -> verify.
  * Factored span cleaning into _clean_spans (shared by segment + verify
    outputs); added _describe_episode and _verify_subtasks helpers.
  * New prompts module_1_subtask_describe.txt (returns {description})
    and module_1_subtask_verify.txt (returns pruned {subtasks}).
  * module_1_subtasks.txt gains an {observation_block} slot immediately
    before the GROUNDING preamble.

run_hf_job.py enables both for the RoboCasa run (3 VLM calls/episode
for subtasks). Combined with single-camera grounding and the
embedded-frame path, this is the high-quality configuration.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 examples/annotations/run_hf_job.py            |   7 ++
 .../annotations/steerable_pipeline/config.py  |  16 +++
 .../modules/plan_subtasks_memory.py           | 105 +++++++++++++++++-
 .../prompts/module_1_subtask_describe.txt     |  27 +++++
 .../prompts/module_1_subtask_verify.txt       |  33 ++++++
 .../prompts/module_1_subtasks.txt             |   2 +-
 6 files changed, 183 insertions(+), 7 deletions(-)
 create mode 100644 src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtask_describe.txt
 create mode 100644 src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtask_verify.txt

diff --git a/examples/annotations/run_hf_job.py b/examples/annotations/run_hf_job.py
index 5e1b11d4b..421b263da 100644
--- a/examples/annotations/run_hf_job.py
+++ b/examples/annotations/run_hf_job.py
@@ -82,6 +82,13 @@ CMD = (
     # tasks. Leave off for RoboCasa atomic / navigation.
     # Keep subtask decomposition tight for atomic tasks:
     "--plan.plan_max_steps=6 "
+    # Multi-call quality chain (3 VLM calls/episode for subtasks):
+    #   1. describe-first: narrate ONLY what is visible before segmenting
+    #      — the strongest fix for subtasks invented from the task text.
+    #   2. (segment)
+    #   3. verify: re-watch and prune any subtask not actually seen.
+    "--plan.subtask_describe_first=true "
+    "--plan.subtask_verify=true "
     # Phase 2 — interjections + speech.
     "--interjections.max_interjections_per_episode=6 "
     # Phase 4 — general VQA.
diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py
index 1cecfa772..9a0dd4232 100644
--- a/src/lerobot/annotations/steerable_pipeline/config.py
+++ b/src/lerobot/annotations/steerable_pipeline/config.py
@@ -51,6 +51,22 @@ class PlanConfig:
     min_subtask_seconds: float = 1.5
     plan_max_steps: int = 8
 
+    # Multi-call subtask quality chain (opt-in, more VLM calls, higher
+    # quality). Both off by default → single-call behaviour unchanged.
+    #
+    # ``subtask_describe_first``: run a grounding pass that narrates ONLY
+    # what is visible in the video (no subtask JSON yet), then inject that
+    # description into the segmentation prompt. Forces the model to
+    # observe before committing to structured output — the strongest
+    # lever against subtasks invented from the task text. +1 VLM call/ep.
+    subtask_describe_first: bool = False
+    # ``subtask_verify``: after segmentation, re-watch the video and drop
+    # any proposed subtask that can't be verified as visible. Prunes
+    # hallucinations; can only remove subtasks, never add/rewrite them.
+    # Fail-open (keeps un-verified spans if the verify call returns
+    # nothing). +1 VLM call/ep.
+    subtask_verify: bool = False
+
     # When True (and backend supports it, e.g. ``openai``), the ``plan``
     # module sends a ``video_url`` block pointing at a per-episode mp4
     # subclip and lets the server sample frames at ``use_video_url_fps``.
diff --git a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
index c46b20bac..1ba9b142b 100644
--- a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
+++ b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
@@ -521,20 +521,65 @@ class PlanSubtasksMemoryModule:
         staging.write("plan", new_rows)
 
     def _generate_subtasks(self, record: EpisodeRecord, *, task: str | None = None) -> list[dict[str, Any]]:
+        """Generate subtask spans, optionally via a multi-call quality chain.
+
+        Single call (default): watch video → emit subtask JSON.
+
+        Multi-call (opt-in, higher quality, more VLM calls):
+          1. ``subtask_describe_first`` — a grounding pass that narrates
+             ONLY what is visible (no JSON commitment to subtasks yet);
+             its description is injected into the segmentation prompt so
+             the model segments its own grounded observations instead of
+             pattern-matching the task text.
+          2. segmentation — emit subtask JSON (as before).
+          3. ``subtask_verify`` — an adversarial pass that re-watches the
+             video and drops any proposed subtask it cannot actually see,
+             pruning hallucinations.
+        """
         if record.row_count == 0 or not record.frame_timestamps:
             return []
         episode_duration = record.frame_timestamps[-1] - record.frame_timestamps[0]
+        effective_task = task if task is not None else record.episode_task
+
+        # ---- Pass 1 (optional): grounding description ----------------
+        observation_block = ""
+        if getattr(self.config, "subtask_describe_first", False):
+            description = self._describe_episode(record, effective_task)
+            if description:
+                observation_block = (
+                    "You watched this video and described, chronologically, "
+                    "ONLY what the robot actually does:\n"
+                    f'"""{description}"""\n\n'
+                    "Segment THAT grounded description (cross-checked against "
+                    "the video) into atomic subtasks. Do not introduce any "
+                    "action that is not in your description above.\n\n"
+                )
+
+        # ---- Pass 2: segmentation ------------------------------------
         prompt = load_prompt("module_1_subtasks").format(
-            episode_task=(task if task is not None else record.episode_task),
+            episode_task=effective_task,
             min_subtask_seconds=self.config.min_subtask_seconds,
             max_steps=self.config.plan_max_steps,
             episode_duration=f"{episode_duration:.3f}",
+            observation_block=observation_block,
         )
-        messages = self._video_message(record, prompt)
-        spans = self._vlm_field(messages, "subtasks")
+        spans = self._vlm_field(self._video_message(record, prompt), "subtasks")
+        cleaned = self._clean_spans(spans, record)
+        if not cleaned:
+            return []
+
+        # ---- Pass 3 (optional): verification / pruning ---------------
+        if getattr(self.config, "subtask_verify", False):
+            cleaned = self._verify_subtasks(record, effective_task, cleaned)
+
+        return cleaned
+
+    def _clean_spans(
+        self, spans: Any, record: EpisodeRecord
+    ) -> list[dict[str, Any]]:
+        """Clamp / sort / dedupe raw VLM subtask spans into valid rows."""
         if not spans:
             return []
-        # clamp to [t0, t_last] and sort
         t0 = record.frame_timestamps[0]
         t_last = record.frame_timestamps[-1]
         cleaned: list[dict[str, Any]] = []
@@ -553,8 +598,56 @@ class PlanSubtasksMemoryModule:
                 continue
             cleaned.append({"text": text, "start": start, "end": end})
         cleaned.sort(key=lambda s: s["start"])
-        cleaned = self._dedupe_starts_to_distinct_frames(cleaned, record)
-        return cleaned
+        return self._dedupe_starts_to_distinct_frames(cleaned, record)
+
+    def _describe_episode(self, record: EpisodeRecord, task: str) -> str:
+        """Grounding pass: free-form chronological description of the video."""
+        prompt = load_prompt("module_1_subtask_describe").format(episode_task=task)
+        text = self._vlm_field(self._video_message(record, prompt), "description")
+        return text.strip() if isinstance(text, str) and text.strip() else ""
+
+    def _verify_subtasks(
+        self,
+        record: EpisodeRecord,
+        task: str,
+        spans: list[dict[str, Any]],
+    ) -> list[dict[str, Any]]:
+        """Adversarial pass: drop proposed subtasks not visible in the video.
+
+        Keeps the original span on a verified ``text`` match (the verify
+        prompt is told not to rewrite text), so verification can only
+        PRUNE — never invent or mutate. If the verify call fails or
+        returns nothing parseable, the un-verified spans are kept (fail
+        open: better to keep a possibly-good label than silently drop
+        everything on a transient VLM hiccup).
+        """
+        import json  # noqa: PLC0415
+
+        subtasks_json = json.dumps(
+            {"subtasks": [{"text": s["text"], "start": round(s["start"], 3), "end": round(s["end"], 3)} for s in spans]},
+            indent=2,
+        )
+        prompt = load_prompt("module_1_subtask_verify").format(
+            episode_task=task, subtasks_json=subtasks_json
+        )
+        kept_raw = self._vlm_field(self._video_message(record, prompt), "subtasks")
+        kept = self._clean_spans(kept_raw, record)
+        if not kept:
+            logger.info(
+                "episode %d: verify pass returned nothing — keeping the %d "
+                "un-verified subtask(s) (fail-open)",
+                record.episode_index,
+                len(spans),
+            )
+            return spans
+        if len(kept) < len(spans):
+            logger.info(
+                "episode %d: verify pass pruned %d -> %d subtask(s)",
+                record.episode_index,
+                len(spans),
+                len(kept),
+            )
+        return kept
 
     @staticmethod
     def _dedupe_starts_to_distinct_frames(
diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtask_describe.txt b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtask_describe.txt
new file mode 100644
index 000000000..6b709e41d
--- /dev/null
+++ b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtask_describe.txt
@@ -0,0 +1,27 @@
+You are watching a teleoperated robot demonstration from a single
+camera. The user asked the robot to: "{episode_task}"
+
+This is an OBSERVATION pass. Watch the entire clip and describe, in
+chronological order, ONLY what the robot physically does — the concrete
+motions, approaches, contacts, grasps, releases, and relocations you can
+actually SEE in the frames.
+
+Hard rules:
+- Describe only motion visible in the video. Do NOT use the task
+  instruction to guess steps that aren't shown. The instruction is the
+  goal; the video is ground truth.
+- Do NOT segment into named subtasks yet and do NOT output JSON beyond
+  the single field below. Just narrate what happens.
+- Give an approximate timestamp (in seconds) for each distinct event,
+  e.g. "0.0-1.4s: the base drives forward toward the stove".
+- Do NOT invent objects, grasps, destinations, or steps. If the robot
+  only does one thing (e.g. it just navigates and the clip ends), say
+  exactly that and nothing more.
+- Be concrete and literal. "the gripper closes on the mug" — not "the
+  robot prepares to make coffee".
+
+Output strictly valid JSON:
+
+  {{
+    "description": "<chronological, timestamped description of ONLY what is visible>"
+  }}
diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtask_verify.txt b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtask_verify.txt
new file mode 100644
index 000000000..e52dc0aeb
--- /dev/null
+++ b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtask_verify.txt
@@ -0,0 +1,33 @@
+You previously segmented a teleoperated robot demonstration into these
+candidate subtasks (JSON):
+
+{subtasks_json}
+
+The user's task was: "{episode_task}"
+
+This is a VERIFICATION pass. Re-watch the video. For EACH candidate
+subtask, decide whether the robot can ACTUALLY be seen performing that
+action within its [start, end] time window.
+
+Rules:
+- KEEP a subtask only if its action is clearly visible in the video in
+  roughly that time window.
+- DROP any subtask whose action you cannot see, that describes
+  something not actually present in the video, that was inferred from
+  the task instruction rather than observed, or that duplicates another
+  kept subtask.
+- Do NOT add new subtasks. Do NOT rewrite the text of kept subtasks.
+  Do NOT change the start/end timestamps of kept subtasks.
+- It is correct and expected to return FEWER subtasks than you were
+  given — even just one — if that is all the video supports. Returning
+  zero is allowed if none can be verified.
+
+Output strictly valid JSON of the SAME shape, containing only the kept
+subtasks in chronological order:
+
+  {{
+    "subtasks": [
+      {{"text": "<kept verbatim>", "start": <float>, "end": <float>}},
+      ...
+    ]
+  }}
diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt
index 4ea7407e6..e1c8f822e 100644
--- a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt
+++ b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt
@@ -6,7 +6,7 @@ You are shown the entire demonstration as a single video. Watch the
 whole clip, then segment it into a list of consecutive atomic subtasks
 the robot performs.
 
-GROUNDING — read this first, it overrides everything below:
+{observation_block}GROUNDING — read this first, it overrides everything below:
 - Label ONLY what the robot actually does in the video. Every subtask
   you emit must correspond to motion you can SEE in specific frames.
 - Do NOT invent, anticipate, or pad. If the robot only does one thing

From 1fe1463ae0610b490b0cea9c28f777dcb90ceaab Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Tue, 2 Jun 2026 15:13:50 +0200
Subject: [PATCH 15/45] annotate: enable subtask describe->segment->verify
 chain by default

Flip PlanConfig.subtask_describe_first and subtask_verify defaults
False -> True. Every subtask annotation now runs the 3-call grounding
+ pruning chain by default, since the single-call path reliably
hallucinates steps from the task text. Costs 2 extra VLM calls/episode;
disable with --plan.subtask_describe_first=false /
--plan.subtask_verify=false on easy datasets where fewer calls matter
more than label fidelity.

run_hf_job.py: drop the now-redundant explicit flags, leave a note that
the chain is default-on and how to opt out.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 examples/annotations/run_hf_job.py                 | 11 ++++-------
 .../annotations/steerable_pipeline/config.py       | 14 ++++++++++----
 2 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/examples/annotations/run_hf_job.py b/examples/annotations/run_hf_job.py
index 421b263da..ade582861 100644
--- a/examples/annotations/run_hf_job.py
+++ b/examples/annotations/run_hf_job.py
@@ -82,13 +82,10 @@ CMD = (
     # tasks. Leave off for RoboCasa atomic / navigation.
     # Keep subtask decomposition tight for atomic tasks:
     "--plan.plan_max_steps=6 "
-    # Multi-call quality chain (3 VLM calls/episode for subtasks):
-    #   1. describe-first: narrate ONLY what is visible before segmenting
-    #      — the strongest fix for subtasks invented from the task text.
-    #   2. (segment)
-    #   3. verify: re-watch and prune any subtask not actually seen.
-    "--plan.subtask_describe_first=true "
-    "--plan.subtask_verify=true "
+    # NOTE: the multi-call subtask quality chain (describe -> segment ->
+    # verify, 3 VLM calls/episode) is ON BY DEFAULT now. Pass
+    # --plan.subtask_describe_first=false / --plan.subtask_verify=false to
+    # disable on datasets you've verified are easy and want fewer calls.
     # Phase 2 — interjections + speech.
     "--interjections.max_interjections_per_episode=6 "
     # Phase 4 — general VQA.
diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py
index 9a0dd4232..18867f701 100644
--- a/src/lerobot/annotations/steerable_pipeline/config.py
+++ b/src/lerobot/annotations/steerable_pipeline/config.py
@@ -51,21 +51,27 @@ class PlanConfig:
     min_subtask_seconds: float = 1.5
     plan_max_steps: int = 8
 
-    # Multi-call subtask quality chain (opt-in, more VLM calls, higher
-    # quality). Both off by default → single-call behaviour unchanged.
+    # Multi-call subtask quality chain. ON by default — the single-call
+    # 'watch video -> emit subtask JSON' pattern makes the VLM commit to
+    # structured output before reasoning about the video, so it
+    # pattern-matches the task text and hallucinates steps. The chain
+    # costs 2 extra VLM calls/episode (3 total for subtasks) but is the
+    # difference between trustworthy and fabricated labels. Set either to
+    # False to trade quality for fewer calls on datasets you've verified
+    # are easy.
     #
     # ``subtask_describe_first``: run a grounding pass that narrates ONLY
     # what is visible in the video (no subtask JSON yet), then inject that
     # description into the segmentation prompt. Forces the model to
     # observe before committing to structured output — the strongest
     # lever against subtasks invented from the task text. +1 VLM call/ep.
-    subtask_describe_first: bool = False
+    subtask_describe_first: bool = True
     # ``subtask_verify``: after segmentation, re-watch the video and drop
     # any proposed subtask that can't be verified as visible. Prunes
     # hallucinations; can only remove subtasks, never add/rewrite them.
     # Fail-open (keeps un-verified spans if the verify call returns
     # nothing). +1 VLM call/ep.
-    subtask_verify: bool = False
+    subtask_verify: bool = True
 
     # When True (and backend supports it, e.g. ``openai``), the ``plan``
     # module sends a ``video_url`` block pointing at a per-episode mp4

From 799d0e3bccb4e092b4cd05a2488e6cf658b5106b Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Tue, 2 Jun 2026 15:34:34 +0200
Subject: [PATCH 16/45] annotate: stitch subtasks to full-episode coverage
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The verify pass prunes subtasks, which could leave the first subtask
starting after t0 or leave gaps between spans — so the subtask timeline
no longer tiled the episode and frames fell through with no active
subtask label.

New deterministic post-step (no VLM call), default on via
PlanConfig.subtask_full_coverage:
  * first subtask start pulled back to the episode's first frame t0
    (idle / approach before the first labelled action folds into it)
  * each subtask end snapped to the next subtask start (gaps closed)
  * last subtask end extended to the last frame t_last

Runs after segment + verify in _generate_subtasks. Starts other than
the first are left as the VLM/verify produced them (already frame-
snapped + distinct), so the cover is contiguous and non-overlapping.
Disable with --plan.subtask_full_coverage=false if a consumer wants
sparse subtasks.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../annotations/steerable_pipeline/config.py  | 10 +++++
 .../modules/plan_subtasks_memory.py           | 43 +++++++++++++++++++
 2 files changed, 53 insertions(+)

diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py
index 18867f701..0389363b8 100644
--- a/src/lerobot/annotations/steerable_pipeline/config.py
+++ b/src/lerobot/annotations/steerable_pipeline/config.py
@@ -73,6 +73,16 @@ class PlanConfig:
     # nothing). +1 VLM call/ep.
     subtask_verify: bool = True
 
+    # ``subtask_full_coverage``: deterministic post-step (no VLM call)
+    # that stitches the surviving subtask spans into a contiguous cover
+    # of the whole episode — first subtask pulled back to t0, each span's
+    # end snapped to the next span's start, last span extended to t_last.
+    # Without it the verify pass (which prunes spans) can leave the
+    # subtask timeline starting late or full of gaps, so frames fall
+    # through with no active subtask. On by default; disable only if a
+    # downstream consumer genuinely wants sparse (non-tiling) subtasks.
+    subtask_full_coverage: bool = True
+
     # When True (and backend supports it, e.g. ``openai``), the ``plan``
     # module sends a ``video_url`` block pointing at a per-episode mp4
     # subclip and lets the server sample frames at ``use_video_url_fps``.
diff --git a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
index 1ba9b142b..b117c5657 100644
--- a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
+++ b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
@@ -571,9 +571,52 @@ class PlanSubtasksMemoryModule:
         # ---- Pass 3 (optional): verification / pruning ---------------
         if getattr(self.config, "subtask_verify", False):
             cleaned = self._verify_subtasks(record, effective_task, cleaned)
+            if not cleaned:
+                return []
+
+        # ---- Full-episode coverage stitch ----------------------------
+        # The VLM (especially after the verify pass prunes spans) can
+        # leave the first subtask starting after t0 or leave gaps between
+        # spans, so the subtask timeline no longer tiles the whole
+        # episode and frames fall through with no active subtask. Stitch
+        # the surviving spans into a contiguous cover of [t0, t_last].
+        if getattr(self.config, "subtask_full_coverage", True):
+            cleaned = self._stitch_full_coverage(cleaned, record)
 
         return cleaned
 
+    def _stitch_full_coverage(
+        self, spans: list[dict[str, Any]], record: EpisodeRecord
+    ) -> list[dict[str, Any]]:
+        """Make subtask spans tile the full episode with no gaps.
+
+        * The first subtask starts at the episode's first frame ``t0``
+          (any idle / approach before the first labelled action is folded
+          into it), so every early frame has an active subtask.
+        * Each subtask's ``end`` is snapped to the next subtask's
+          ``start`` (gaps between spans are closed), and the final
+          subtask's ``end`` extends to the last frame ``t_last``.
+
+        Starts are otherwise left as the (already frame-snapped, distinct)
+        values the VLM + verify produced — only the FIRST start is pulled
+        back to ``t0``, which can't collide with a later span because it
+        was already the earliest. Purely deterministic; runs after the
+        VLM passes.
+        """
+        if not spans or not record.frame_timestamps:
+            return spans
+        t0 = float(record.frame_timestamps[0])
+        t_last = float(record.frame_timestamps[-1])
+        spans = sorted(spans, key=lambda s: float(s["start"]))
+        spans[0]["start"] = t0
+        for i in range(len(spans) - 1):
+            spans[i]["end"] = float(spans[i + 1]["start"])
+        spans[-1]["end"] = t_last
+        for s in spans:
+            if float(s["end"]) < float(s["start"]):
+                s["end"] = float(s["start"])
+        return spans
+
     def _clean_spans(
         self, spans: Any, record: EpisodeRecord
     ) -> list[dict[str, Any]]:

From 79f9a84407a081cde635fe289e2ffd99142481d7 Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Tue, 2 Jun 2026 15:36:23 +0200
Subject: [PATCH 17/45] annotate: make full-episode subtask coverage
 unconditional
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove the subtask_full_coverage config flag. Stitching subtask spans
into a contiguous full-episode cover is now always applied in
_generate_subtasks — a sparse / gap-ridden subtask timeline is never
desirable for conditioning, so there's no reason to make it optional.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../annotations/steerable_pipeline/config.py      | 15 ++++++---------
 .../modules/plan_subtasks_memory.py               |  9 +++++----
 2 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py
index 0389363b8..09c935e66 100644
--- a/src/lerobot/annotations/steerable_pipeline/config.py
+++ b/src/lerobot/annotations/steerable_pipeline/config.py
@@ -73,15 +73,12 @@ class PlanConfig:
     # nothing). +1 VLM call/ep.
     subtask_verify: bool = True
 
-    # ``subtask_full_coverage``: deterministic post-step (no VLM call)
-    # that stitches the surviving subtask spans into a contiguous cover
-    # of the whole episode — first subtask pulled back to t0, each span's
-    # end snapped to the next span's start, last span extended to t_last.
-    # Without it the verify pass (which prunes spans) can leave the
-    # subtask timeline starting late or full of gaps, so frames fall
-    # through with no active subtask. On by default; disable only if a
-    # downstream consumer genuinely wants sparse (non-tiling) subtasks.
-    subtask_full_coverage: bool = True
+    # NOTE: subtask spans are ALWAYS stitched into a contiguous
+    # full-episode cover (first subtask pulled back to t0, gaps closed,
+    # last span extended to t_last) by ``_stitch_full_coverage``, a
+    # deterministic post-step of ``_generate_subtasks``. This is not
+    # configurable — a sparse / gap-ridden subtask timeline is never
+    # desirable for conditioning, so it is unconditional.
 
     # When True (and backend supports it, e.g. ``openai``), the ``plan``
     # module sends a ``video_url`` block pointing at a per-episode mp4
diff --git a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
index b117c5657..4ffef49c1 100644
--- a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
+++ b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
@@ -578,10 +578,11 @@ class PlanSubtasksMemoryModule:
         # The VLM (especially after the verify pass prunes spans) can
         # leave the first subtask starting after t0 or leave gaps between
         # spans, so the subtask timeline no longer tiles the whole
-        # episode and frames fall through with no active subtask. Stitch
-        # the surviving spans into a contiguous cover of [t0, t_last].
-        if getattr(self.config, "subtask_full_coverage", True):
-            cleaned = self._stitch_full_coverage(cleaned, record)
+        # episode and frames fall through with no active subtask. Always
+        # stitch the surviving spans into a contiguous cover of
+        # [t0, t_last] — there is no scenario where a sparse, gap-ridden
+        # subtask timeline is desirable for conditioning.
+        cleaned = self._stitch_full_coverage(cleaned, record)
 
         return cleaned
 

From 1fb46ab30057f312d0c885183ecebaa9824ef702 Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Tue, 2 Jun 2026 16:02:25 +0200
Subject: [PATCH 18/45] annotate: cap embedded-frame budget to fit VLM context
 (fix 32k overflow)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Switching the plan module to embedded frames (use_video_url=false)
exposed a context overflow: at frames_per_second=2.0 with the old
max_video_frames=128 default, a 480x640 episode embeds ~128 frames ≈
33-39k vision tokens, over the model's 32768 context — every plan call
died with 'Input length exceeds maximum context length' (HTTP 400),
crashing the whole annotation job.

The video_url path never hit this because the server downsampled; the
embedded path sends every sampled frame, so the frame count is a hard
token budget.

Fix:
  * config default max_video_frames 128 -> 32 (~8-10k vision tokens,
    comfortable headroom for the prompt + describe/verify passes).
    Frames are still sampled UNIFORMLY across the whole episode, so
    longer episodes are subsampled, not truncated — full temporal
    coverage preserved, just coarser density.
  * run_hf_job.py: frames_per_second 2.0 -> 1.0, explicit
    --plan.max_video_frames=32, with a comment explaining the token
    budget and the 'do not raise toward 128 with embedded frames' rule.

Only the plan module embeds the full episode; VQA (1 frame/tick) and
interjections (4-frame window) were never at risk.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 examples/annotations/run_hf_job.py                 | 14 +++++++++++---
 .../annotations/steerable_pipeline/config.py       | 14 ++++++++++++--
 2 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/examples/annotations/run_hf_job.py b/examples/annotations/run_hf_job.py
index ade582861..e83a56db8 100644
--- a/examples/annotations/run_hf_job.py
+++ b/examples/annotations/run_hf_job.py
@@ -58,10 +58,18 @@ CMD = (
     # handing the server a file:// clip. The embedded path is more
     # reliable: if clip extraction ever fails, the video_url path would
     # silently send NO video and the VLM would hallucinate subtasks from
-    # the task text alone. 2 fps gives dense visual grounding so the VLM
-    # labels what actually happens.
-    "--plan.frames_per_second=2.0 "
+    # the task text alone.
+    #
+    # CONTEXT BUDGET: with embedded frames, each frame is ~250-320 vision
+    # tokens. The model's context is 32768 (see --max-model-len). 32
+    # frames sampled uniformly across the episode (~8-10k tokens) fits
+    # comfortably alongside the prompt and the describe/verify passes.
+    # Do NOT raise max_video_frames toward 128 with embedded frames — that
+    # is ~33-39k tokens and overflows the context (BadRequestError 400,
+    # "Input length exceeds maximum context length").
     "--plan.use_video_url=false "
+    "--plan.frames_per_second=1.0 "
+    "--plan.max_video_frames=32 "
     # IMPORTANT for RoboCasa: the dataset's task string ("Navigate to the
     # stove", "Pick the mug...") is authoritative and is what eval uses.
     # ``derive_task_from_video=off`` keeps that canonical task driving
diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py
index 09c935e66..37371a7fb 100644
--- a/src/lerobot/annotations/steerable_pipeline/config.py
+++ b/src/lerobot/annotations/steerable_pipeline/config.py
@@ -44,9 +44,19 @@ class PlanConfig:
     derive_task_from_video: str = "if_short"
     derive_task_min_words: int = 3
 
-    # Frame sampling for the subtask-decomposition prompt.
+    # Frame sampling for the subtask-decomposition prompt. Frames are
+    # sampled uniformly across the whole episode up to ``max_video_frames``
+    # (so longer episodes are subsampled, not truncated).
+    #
+    # ``max_video_frames`` is a HARD context-budget cap. With the embedded-
+    # frame path (use_video_url=false), every frame becomes ~250-320 vision
+    # tokens, so 128 frames ≈ 33-39k tokens — over a 32k-context VLM. 32
+    # frames (~8-10k tokens) leaves ample room for the prompt + the
+    # describe / verify passes. Raise only if your serving context is
+    # larger AND your episodes need finer temporal resolution; if you hit
+    # "Input length exceeds maximum context length", lower this.
     frames_per_second: float = 1.0
-    max_video_frames: int = 128
+    max_video_frames: int = 32
 
     min_subtask_seconds: float = 1.5
     plan_max_steps: int = 8

From cd128cbbd5d811ba625f1c86c9dbe8f21ed5b734 Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Tue, 2 Jun 2026 16:10:49 +0200
Subject: [PATCH 19/45] annotate: add verb-scoped disambiguation rules to
 subtask prompt
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adopt the one prompt technique Scale's dense-captioning study found
reliably positive: targeted, verb-scoped, visually-grounded
disambiguation rules. Their lesson was that such a rule must fire ONLY
on the spatial situation it names (their narrow 'Stack vs Put' rule
helped; an over-broad directional 'Scoop' rule bled into other verbs
and hurt), so each rule here is phrased visually and scoped to one
confusable pair:
  * stack-vs-put (on top of an object vs on a surface)
  * insert-vs-put (fitted slot vs surface)
  * pick-up/retrieve-vs-put (decide by which way the OBJECT moves:
    gripper closes + object moves with hand = pick up; gripper opens +
    object stays = put — directly targets Scale's dominant
    direction-flip failure)
  * pour-vs-put (tilt + flow vs untilted move)

This is the highest-confidence, lowest-risk change from the Scale
findings; our pipeline already aligns with their 'avoid' list (no
temporal tokens, no overlays, no fancy sampling, no sequential context
injection, uniform sampling, describe-don't-predict framing).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../prompts/module_1_subtasks.txt               | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt
index e1c8f822e..e6a5260a7 100644
--- a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt
+++ b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt
@@ -85,6 +85,23 @@ Authoring rules — Hi Robot atom granularity, pi0.7-style short prompts:
 - Every subtask's [start_time, end_time] must lie within
   [0.0, {episode_duration}] seconds.
 
+SPECIAL CASES — verb disambiguation (each rule is narrowly visual and
+fires ONLY on the spatial situation it names; it must not change how you
+label any other situation):
+- STACK vs PUT: if an object is placed ON TOP OF another specific object
+  (not on a flat table / shelf / counter), use "stack ... on ...", not
+  "put". "stack blue book on green book", NOT "put blue book on table".
+- INSERT vs PUT: if an object goes INTO a fitted slot / hole / socket /
+  receptacle (push-fit), use "insert ... into ...", not "put".
+- RETRIEVE/PICK-UP vs PUT (direction): watch the gripper. If it CLOSES
+  on the object and the object moves WITH the hand, it is "pick up" /
+  "retrieve" (object leaves its location). If the gripper OPENS and the
+  object stays where the hand left it, it is "put" / "place" (object
+  arrives at a location). Decide by which way the object moves, not by
+  where the hand ends up.
+- POUR vs PUT: only use "pour" when the source is tilted and contents
+  flow out; moving a full container without tilting is "put"/"place".
+
 Output strictly valid JSON of shape:
 
   {{

From 3236c6ee4a877dbfff277b51eb192763f18b4f03 Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Tue, 2 Jun 2026 16:16:26 +0200
Subject: [PATCH 20/45] examples(annotate): switch run_hf_job to Qwen3.6-27B
 (dense VLM)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Swap the annotation VLM from Qwen3.6-35B-A3B (sparse MoE, ~3B active)
to Qwen3.6-27B (dense, 27B all-active). Per Scale's dense-captioning
study, model capacity is the #1 lever and the dominant failure is
visual grounding — both helped by ~9x more active params. Qwen3.6-27B
is a vision-language model (vision encoder, image + video), same family
so the chat template / video handling / enable_thinking=false flag are
unchanged, and at 27B dense it still fits one H200 per server, so the
two-parallel-server layout (TP=1, one per GPU) is preserved — no
throughput-layout change, just a much stronger model.

Kept: parallel_servers=2, num_gpus=2, max-model-len 32768 (the 32-frame
embedded budget is ~10k tokens, well under), gpu-mem 0.8.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 examples/annotations/run_hf_job.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/annotations/run_hf_job.py b/examples/annotations/run_hf_job.py
index e83a56db8..cbae22796 100644
--- a/examples/annotations/run_hf_job.py
+++ b/examples/annotations/run_hf_job.py
@@ -1,10 +1,10 @@
 #!/usr/bin/env python
-"""Launch ``lerobot-annotate`` on a Hugging Face job (vllm + Qwen3.6 MoE).
+"""Launch ``lerobot-annotate`` on a Hugging Face job (vllm + Qwen3.6-27B VLM).
 
 Spawns one ``h200x2`` job that:
 
   1. installs this branch of ``lerobot`` plus the annotation extras,
-  2. boots two vllm servers (one per GPU) with Qwen3.6-35B-A3B-FP8,
+  2. boots two vllm servers (one per GPU) with Qwen3.6-27B (dense VLM),
   3. runs the plan / interjections / vqa modules across the dataset
      in free-form mode (each episode generates its own subtasks +
      memory),
@@ -40,10 +40,10 @@ CMD = (
     "--dest_repo_id=pepijn223/robocasa_smoke_2atomic_v3_ann "
     "--push_to_hub=true "
     "--vlm.backend=openai "
-    "--vlm.model_id=Qwen/Qwen3.6-35B-A3B-FP8 "
+    "--vlm.model_id=Qwen/Qwen3.6-27B "
     "--vlm.parallel_servers=2 "
     "--vlm.num_gpus=2 "
-    '--vlm.serve_command="vllm serve Qwen/Qwen3.6-35B-A3B-FP8 '
+    '--vlm.serve_command="vllm serve Qwen/Qwen3.6-27B '
     "--tensor-parallel-size 1 --max-model-len 32768 "
     '--gpu-memory-utilization 0.8 --uvicorn-log-level warning --port {port}" '
     "--vlm.serve_ready_timeout_s=1800 "

From 518e1913371ec33f859eb829f50d866bb7fb6f22 Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Tue, 2 Jun 2026 16:26:14 +0200
Subject: [PATCH 21/45] annotate: windowed subtask generation for constant
 temporal density
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Long episodes no longer get sparse subtasks. Previously a long episode
was subsampled to max_video_frames=32 across its whole duration (~1
frame/4s for a 2-min clip). New opt-in windowing keeps a CONSTANT
frames_per_second density by splitting the episode into fixed-length
windows and running the subtask chain per window.

New PlanConfig.subtask_window_seconds (default 0.0 = off). When > 0 and
the episode is longer than one window:
  * episode is split into consecutive [w0, w1] windows of this length
  * each window's frames are sampled at frames_per_second (so a 32s
    window at 1 fps = 32 frames, filling but not exceeding the per-call
    context budget)
  * the full describe -> segment -> verify chain runs PER window, in
    window-relative time [0, L]; spans are offset back to absolute
  * all windows' spans are merged, frame-snap-deduped, and stitched into
    one contiguous whole-episode cover

Implementation:
  * _episode_video_block / _video_message / _describe_episode /
    _verify_subtasks gain an optional window=(w0,w1); when set they
    embed frames sampled in that absolute range at frames_per_second
    (video_url path skipped — it's whole-episode).
  * _clean_spans gains bounds= (override clamp range, for window-relative
    spans) and dedupe= (skip frame-snap until the merged absolute set).
  * new _generate_subtasks_windowed + _subtasks_for_window orchestrate
    the loop; _generate_subtasks branches to them when window_s > 0.

run_hf_job.py: --plan.subtask_window_seconds=32 (32s windows at 1 fps).
Cost scales with episode length (chain calls × ceil(duration/window)).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 examples/annotations/run_hf_job.py            |   6 +
 .../annotations/steerable_pipeline/config.py  |  13 ++
 .../modules/plan_subtasks_memory.py           | 185 ++++++++++++++++--
 3 files changed, 187 insertions(+), 17 deletions(-)

diff --git a/examples/annotations/run_hf_job.py b/examples/annotations/run_hf_job.py
index cbae22796..86575f72f 100644
--- a/examples/annotations/run_hf_job.py
+++ b/examples/annotations/run_hf_job.py
@@ -70,6 +70,12 @@ CMD = (
     "--plan.use_video_url=false "
     "--plan.frames_per_second=1.0 "
     "--plan.max_video_frames=32 "
+    # Constant 1 fps density via windowing: episodes longer than 32s are
+    # split into 32-second windows (each 32 frames @ 1 fps, fits context),
+    # so long episodes get MORE subtasks instead of a sparser whole-episode
+    # view. describe->segment->verify runs per window; spans are merged +
+    # stitched to a contiguous whole-episode cover. 0 disables.
+    "--plan.subtask_window_seconds=32 "
     # IMPORTANT for RoboCasa: the dataset's task string ("Navigate to the
     # stove", "Pick the mug...") is authoritative and is what eval uses.
     # ``derive_task_from_video=off`` keeps that canonical task driving
diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py
index 37371a7fb..414824cfb 100644
--- a/src/lerobot/annotations/steerable_pipeline/config.py
+++ b/src/lerobot/annotations/steerable_pipeline/config.py
@@ -58,6 +58,19 @@ class PlanConfig:
     frames_per_second: float = 1.0
     max_video_frames: int = 32
 
+    # Windowed subtask generation for CONSTANT temporal density. When > 0
+    # and an episode is longer than this many seconds, the plan module
+    # processes the episode in consecutive windows of this length, each
+    # sampled at ``frames_per_second``, instead of subsampling the whole
+    # episode to ``max_video_frames`` (which makes long episodes sparse).
+    # The describe -> segment -> verify chain runs per window; results are
+    # offset to absolute time, merged, and stitched into a contiguous
+    # whole-episode cover. Cost scales with episode length (≈ chain calls
+    # × ceil(duration / window)). Set to roughly
+    # ``max_video_frames / frames_per_second`` (e.g. 32s at 1 fps) so each
+    # window fills — but never exceeds —
+    subtask_window_seconds: float = 0.0
+
     min_subtask_seconds: float = 1.5
     plan_max_steps: int = 8
 
diff --git a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
index 4ffef49c1..991ee3a3b 100644
--- a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
+++ b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
@@ -272,9 +272,14 @@ class PlanSubtasksMemoryModule:
         """One-shot text-only user message wrapped for ``generate_json``."""
         return [{"role": "user", "content": [{"type": "text", "text": text}]}]
 
-    def _video_message(self, record: EpisodeRecord, prompt: str) -> list[dict[str, Any]]:
-        """User message combining the episode video block with ``prompt``."""
-        content = [*self._episode_video_block(record), {"type": "text", "text": prompt}]
+    def _video_message(
+        self,
+        record: EpisodeRecord,
+        prompt: str,
+        window: tuple[float, float] | None = None,
+    ) -> list[dict[str, Any]]:
+        """User message combining the (optionally windowed) video block with ``prompt``."""
+        content = [*self._episode_video_block(record, window=window), {"type": "text", "text": prompt}]
         return [{"role": "user", "content": content}]
 
     def _derive_task_from_video(self, record: EpisodeRecord) -> str | None:
@@ -442,8 +447,10 @@ class PlanSubtasksMemoryModule:
                 flat.append(key)
         return flat
 
-    def _episode_video_block(self, record: EpisodeRecord) -> list[dict[str, Any]]:
-        """Same video block ``_generate_subtasks`` builds — extracted helper.
+    def _episode_video_block(
+        self, record: EpisodeRecord, window: tuple[float, float] | None = None
+    ) -> list[dict[str, Any]]:
+        """Video block for the segmentation / describe / verify prompts.
 
         Always returns a block that actually carries the video. When
         ``use_video_url`` is set we try the server-side ``video_url``
@@ -452,9 +459,29 @@ class PlanSubtasksMemoryModule:
         block — an empty block would leave the VLM with no visual
         grounding at all and it would hallucinate subtasks purely from
         the task text.
+
+        When ``window=(w0, w1)`` is given (windowed subtask generation,
+        ``subtask_window_seconds > 0``), embed frames sampled at the FIXED
+        ``frames_per_second`` rate within ``[w0, w1]`` — constant temporal
+        density regardless of episode length, so long episodes are split
+        into windows rather than subsampled to a sparse 32-frame whole-
+        episode view. The ``video_url`` path is skipped for windows (it is
+        a whole-episode clip). ``max_video_frames`` still caps each window
+        as a context-budget safety net.
         """
         if not record.frame_timestamps:
             return []
+        if window is not None:
+            w0, w1 = float(window[0]), float(window[1])
+            dur = max(0.0, w1 - w0)
+            n = max(1, int(round(dur * self.config.frames_per_second)) + 1)
+            n = min(n, self.config.max_video_frames)
+            if n <= 1 or dur <= 0.0:
+                timestamps = [0.5 * (w0 + w1)]
+            else:
+                step = dur / (n - 1)
+                timestamps = [w0 + i * step for i in range(n)]
+            return to_video_block(self.frame_provider.frames_at(record, timestamps))
         if self.config.use_video_url and isinstance(self.frame_provider, VideoFrameProvider):
             cache_dir = Path(self.frame_provider.root) / ".annotate_staging" / ".video_clips"
             clip = self.frame_provider.episode_clip_path(record, cache_dir)
@@ -541,6 +568,17 @@ class PlanSubtasksMemoryModule:
         episode_duration = record.frame_timestamps[-1] - record.frame_timestamps[0]
         effective_task = task if task is not None else record.episode_task
 
+        # ---- Windowed path (constant temporal density) ---------------
+        # When ``subtask_window_seconds > 0`` and the episode is longer
+        # than one window, process the episode in fixed-length windows so
+        # the VLM always sees ``frames_per_second`` density (instead of a
+        # sparse 32-frame whole-episode view). Each window runs the full
+        # describe -> segment -> verify chain on its own frames; results
+        # are merged + stitched into a contiguous whole-episode cover.
+        window_s = float(getattr(self.config, "subtask_window_seconds", 0.0) or 0.0)
+        if window_s > 0.0 and episode_duration > window_s:
+            return self._generate_subtasks_windowed(record, effective_task, window_s)
+
         # ---- Pass 1 (optional): grounding description ----------------
         observation_block = ""
         if getattr(self.config, "subtask_describe_first", False):
@@ -586,6 +624,91 @@ class PlanSubtasksMemoryModule:
 
         return cleaned
 
+    def _generate_subtasks_windowed(
+        self, record: EpisodeRecord, task: str, window_s: float
+    ) -> list[dict[str, Any]]:
+        """Subtask generation in fixed-length windows at constant fps.
+
+        Splits ``[t0, t_last]`` into consecutive windows of ``window_s``
+        seconds, runs the describe -> segment -> verify chain on each
+        window's own frames (sampled at ``frames_per_second``), offsets
+        each window's spans back to absolute episode time, then merges +
+        stitches into a contiguous whole-episode cover.
+        """
+        t0 = float(record.frame_timestamps[0])
+        t_last = float(record.frame_timestamps[-1])
+        all_spans: list[dict[str, Any]] = []
+        w0 = t0
+        n_windows = 0
+        while w0 < t_last - 1e-6:
+            w1 = min(w0 + window_s, t_last)
+            all_spans.extend(self._subtasks_for_window(record, task, w0, w1))
+            n_windows += 1
+            w0 = w1
+        logger.info(
+            "episode %d: windowed subtask gen over %d window(s) of %.1fs -> %d raw spans",
+            record.episode_index,
+            n_windows,
+            window_s,
+            len(all_spans),
+        )
+        # Merge across windows: clamp to the absolute episode, sort, and
+        # frame-snap to distinct starts (handles any boundary collisions).
+        cleaned = self._clean_spans(all_spans, record)
+        if not cleaned:
+            return []
+        return self._stitch_full_coverage(cleaned, record)
+
+    def _subtasks_for_window(
+        self, record: EpisodeRecord, task: str, w0: float, w1: float
+    ) -> list[dict[str, Any]]:
+        """Run describe -> segment -> verify on one ``[w0, w1]`` window.
+
+        The model works in window-RELATIVE time ``[0, L]`` (it perceives
+        the window as a clip starting at 0); spans are offset back to
+        absolute ``[w0, w1]`` before returning.
+        """
+        window = (w0, w1)
+        win_len = max(0.0, w1 - w0)
+
+        observation_block = ""
+        if getattr(self.config, "subtask_describe_first", False):
+            description = self._describe_episode(record, task, window=window)
+            if description:
+                observation_block = (
+                    "You watched this video clip and described, chronologically, "
+                    "ONLY what the robot actually does:\n"
+                    f'"""{description}"""\n\n'
+                    "Segment THAT grounded description (cross-checked against "
+                    "the clip) into atomic subtasks. Do not introduce any "
+                    "action that is not in your description above.\n\n"
+                )
+
+        prompt = load_prompt("module_1_subtasks").format(
+            episode_task=task,
+            min_subtask_seconds=self.config.min_subtask_seconds,
+            max_steps=self.config.plan_max_steps,
+            episode_duration=f"{win_len:.3f}",
+            observation_block=observation_block,
+        )
+        spans = self._vlm_field(self._video_message(record, prompt, window=window), "subtasks")
+        # Window-relative clamp; no frame-snap dedupe yet (done on the
+        # merged absolute set).
+        cleaned = self._clean_spans(spans, record, bounds=(0.0, win_len), dedupe=False)
+        if not cleaned:
+            return []
+
+        if getattr(self.config, "subtask_verify", False):
+            cleaned = self._verify_subtasks(record, task, cleaned, window=window)
+            if not cleaned:
+                return []
+
+        # Offset window-relative spans back to absolute episode time.
+        for s in cleaned:
+            s["start"] = w0 + float(s["start"])
+            s["end"] = w0 + float(s["end"])
+        return cleaned
+
     def _stitch_full_coverage(
         self, spans: list[dict[str, Any]], record: EpisodeRecord
     ) -> list[dict[str, Any]]:
@@ -619,13 +742,28 @@ class PlanSubtasksMemoryModule:
         return spans
 
     def _clean_spans(
-        self, spans: Any, record: EpisodeRecord
+        self,
+        spans: Any,
+        record: EpisodeRecord,
+        bounds: tuple[float, float] | None = None,
+        dedupe: bool = True,
     ) -> list[dict[str, Any]]:
-        """Clamp / sort / dedupe raw VLM subtask spans into valid rows."""
+        """Clamp / sort / (optionally) dedupe raw VLM subtask spans into valid rows.
+
+        ``bounds`` overrides the clamp range — pass the window's
+        ``(w_lo, w_hi)`` when cleaning window-relative spans, or leave
+        ``None`` to clamp to the whole episode ``[t0, t_last]``.
+        ``dedupe`` runs the frame-snap distinct-start step; skip it for
+        window-relative spans (frame snapping is done once on the merged,
+        absolute-time set).
+        """
         if not spans:
             return []
-        t0 = record.frame_timestamps[0]
-        t_last = record.frame_timestamps[-1]
+        if bounds is not None:
+            lo, hi = float(bounds[0]), float(bounds[1])
+        else:
+            lo = record.frame_timestamps[0]
+            hi = record.frame_timestamps[-1]
         cleaned: list[dict[str, Any]] = []
         for span in spans:
             try:
@@ -634,20 +772,24 @@ class PlanSubtasksMemoryModule:
                 text = str(span["text"]).strip()
             except (KeyError, ValueError, TypeError):
                 continue
-            start = max(t0, min(start, t_last))
-            end = max(t0, min(end, t_last))
+            start = max(lo, min(start, hi))
+            end = max(lo, min(end, hi))
             if end < start:
                 start, end = end, start
             if not text:
                 continue
             cleaned.append({"text": text, "start": start, "end": end})
         cleaned.sort(key=lambda s: s["start"])
-        return self._dedupe_starts_to_distinct_frames(cleaned, record)
+        if dedupe:
+            return self._dedupe_starts_to_distinct_frames(cleaned, record)
+        return cleaned
 
-    def _describe_episode(self, record: EpisodeRecord, task: str) -> str:
-        """Grounding pass: free-form chronological description of the video."""
+    def _describe_episode(
+        self, record: EpisodeRecord, task: str, window: tuple[float, float] | None = None
+    ) -> str:
+        """Grounding pass: free-form chronological description of the (windowed) video."""
         prompt = load_prompt("module_1_subtask_describe").format(episode_task=task)
-        text = self._vlm_field(self._video_message(record, prompt), "description")
+        text = self._vlm_field(self._video_message(record, prompt, window=window), "description")
         return text.strip() if isinstance(text, str) and text.strip() else ""
 
     def _verify_subtasks(
@@ -655,6 +797,7 @@ class PlanSubtasksMemoryModule:
         record: EpisodeRecord,
         task: str,
         spans: list[dict[str, Any]],
+        window: tuple[float, float] | None = None,
     ) -> list[dict[str, Any]]:
         """Adversarial pass: drop proposed subtasks not visible in the video.
 
@@ -674,8 +817,16 @@ class PlanSubtasksMemoryModule:
         prompt = load_prompt("module_1_subtask_verify").format(
             episode_task=task, subtasks_json=subtasks_json
         )
-        kept_raw = self._vlm_field(self._video_message(record, prompt), "subtasks")
-        kept = self._clean_spans(kept_raw, record)
+        kept_raw = self._vlm_field(self._video_message(record, prompt, window=window), "subtasks")
+        # Windowed verify: the video is sampled from the absolute window
+        # ``[w0, w1]`` but the model perceives it as a clip starting at 0,
+        # so proposed + returned times are window-RELATIVE in ``[0, L]``.
+        # Clamp to that relative range and skip the absolute frame-snap
+        # dedupe (done once later on the merged absolute-time set).
+        clamp = (0.0, float(window[1] - window[0])) if window is not None else None
+        kept = self._clean_spans(
+            kept_raw, record, bounds=clamp, dedupe=window is None
+        )
         if not kept:
             logger.info(
                 "episode %d: verify pass returned nothing — keeping the %d "

From 53c7b4c69ac0c4366b5d67fa0328cbbacc87dc07 Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Tue, 2 Jun 2026 17:38:18 +0200
Subject: [PATCH 22/45] annotate: ruff lint + format pass
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Quality-gate fixes after the main merge:
  * UP037: drop redundant quotes from PlanConfig forward-ref annotations
    (action_records / task_aug_axes) — safe under 'from __future__ import
    annotations'.
  * ruff format applied to config.py, executor.py, general_vqa.py,
    plan_subtasks_memory.py, validator.py, lerobot_annotate.py.

No behavior change.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../annotations/steerable_pipeline/config.py  | 42 +++++++++++++------
 .../steerable_pipeline/executor.py            |  4 +-
 .../steerable_pipeline/modules/general_vqa.py |  4 +-
 .../modules/plan_subtasks_memory.py           | 31 +++++++-------
 .../steerable_pipeline/validator.py           | 14 ++-----
 src/lerobot/scripts/lerobot_annotate.py       |  9 ++--
 6 files changed, 54 insertions(+), 50 deletions(-)

diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py
index 414824cfb..63fbe83b0 100644
--- a/src/lerobot/annotations/steerable_pipeline/config.py
+++ b/src/lerobot/annotations/steerable_pipeline/config.py
@@ -116,7 +116,7 @@ class PlanConfig:
     # that record back to canonical subtask text — reducing the VLM's
     # "creative" surface to just the perception step. See
     # ``ActionRecordsConfig`` for details. Off by default (back-compat).
-    action_records: "ActionRecordsConfig" = field(default_factory=lambda: ActionRecordsConfig())
+    action_records: ActionRecordsConfig = field(default_factory=lambda: ActionRecordsConfig())
 
     # Structured 5-axis augmentation taxonomy for the t=0 task variants
     # (replaces the free-form ``n_task_rephrasings`` flow when enabled).
@@ -124,7 +124,7 @@ class PlanConfig:
     # free-form rephrasings, the VLM produces variants along named
     # axes (synonym / omit_arm / omit_orientation / omit_grasp_method /
     # combined). Off by default (back-compat).
-    task_aug_axes: "TaskAugAxesConfig" = field(default_factory=lambda: TaskAugAxesConfig())
+    task_aug_axes: TaskAugAxesConfig = field(default_factory=lambda: TaskAugAxesConfig())
 
 
 @dataclass
@@ -136,12 +136,12 @@ class ActionRecordsConfig:
     subtask to extract a typed record::
 
         {
-          "verb": "pick" | "place" | "press" | ...,    # closed vocabulary
-          "object": "<canonical_object_name>",
-          "arm": "left" | "right" | "both" | null,
-          "grasp_type": "pinch" | "wrap" | "hook" | ... | null,
-          "destination": "<canonical_destination>" | null,
-          "mistake": "<short text>" | null,
+            "verb": "pick" | "place" | "press" | ...,  # closed vocabulary
+            "object": "<canonical_object_name>",
+            "arm": "left" | "right" | "both" | null,
+            "grasp_type": "pinch" | "wrap" | "hook" | ... | null,
+            "destination": "<canonical_destination>" | null,
+            "mistake": "<short text>" | null,
         }
 
     The record is emitted as a separate row with ``style="action_record"``
@@ -176,16 +176,34 @@ class ActionRecordsConfig:
     # exactly one. Override per-dataset (e.g. ``["pick", "place", "open",
     # "close"]`` for door-only manipulation) for tighter constraint.
     verb_vocabulary: tuple[str, ...] = (
-        "pick", "place", "push", "pull", "open", "close", "turn",
-        "press", "lift", "insert", "pour", "move", "reach", "grasp",
-        "release", "wipe", "dump",
+        "pick",
+        "place",
+        "push",
+        "pull",
+        "open",
+        "close",
+        "turn",
+        "press",
+        "lift",
+        "insert",
+        "pour",
+        "move",
+        "reach",
+        "grasp",
+        "release",
+        "wipe",
+        "dump",
     )
 
     # Closed grasp-type vocabulary. ``null`` is always allowed (no
     # contact / unclear). Adjust per-hardware (e.g. drop ``hook`` /
     # ``key`` for parallel-jaw grippers).
     grasp_vocabulary: tuple[str, ...] = (
-        "pinch", "wrap", "hook", "key", "lateral",
+        "pinch",
+        "wrap",
+        "hook",
+        "key",
+        "lateral",
     )
 
 
diff --git a/src/lerobot/annotations/steerable_pipeline/executor.py b/src/lerobot/annotations/steerable_pipeline/executor.py
index 355e25460..4b7eb687d 100644
--- a/src/lerobot/annotations/steerable_pipeline/executor.py
+++ b/src/lerobot/annotations/steerable_pipeline/executor.py
@@ -238,9 +238,7 @@ class Executor:
         prompt path is reused.
         """
         if not self.plan.enabled or not self.interjections.enabled:
-            return PhaseResult(
-                name="plan_update", episodes_processed=0, episodes_skipped=len(records)
-            )
+            return PhaseResult(name="plan_update", episodes_processed=0, episodes_skipped=len(records))
         processed = 0
         for record in records:
             staging = EpisodeStaging(staging_dir, record.episode_index)
diff --git a/src/lerobot/annotations/steerable_pipeline/modules/general_vqa.py b/src/lerobot/annotations/steerable_pipeline/modules/general_vqa.py
index 1e5ad8838..579007912 100644
--- a/src/lerobot/annotations/steerable_pipeline/modules/general_vqa.py
+++ b/src/lerobot/annotations/steerable_pipeline/modules/general_vqa.py
@@ -206,9 +206,7 @@ class GeneralVqaModule:
             episode_task=record.episode_task,
             question_type=question_type,
         )
-        images = self.frame_provider.frames_at(
-            record, [frame_timestamp], camera_key=camera_key
-        )
+        images = self.frame_provider.frames_at(record, [frame_timestamp], camera_key=camera_key)
         content = [*to_image_blocks(images), {"type": "text", "text": prompt}]
         return [{"role": "user", "content": content}]
 
diff --git a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
index 991ee3a3b..0d9c1a7dd 100644
--- a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
+++ b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
@@ -172,9 +172,7 @@ class PlanSubtasksMemoryModule:
         # "what's still left" at inference time.
         for span in subtask_spans:
             boundary_t = snap_to_frame(span["start"], record.frame_timestamps)
-            plan_text = self._generate_plan(
-                record, subtask_spans, refresh_t=boundary_t, task=effective_task
-            )
+            plan_text = self._generate_plan(record, subtask_spans, refresh_t=boundary_t, task=effective_task)
             if plan_text is not None:
                 rows.append(
                     {
@@ -336,7 +334,9 @@ class PlanSubtasksMemoryModule:
         if not frames:
             logger.debug(
                 "action_record: no frames at span %.2f-%.2f for ep %s; skipping",
-                start_t, end_t, record.episode_index,
+                start_t,
+                end_t,
+                record.episode_index,
             )
             return None
 
@@ -811,12 +811,15 @@ class PlanSubtasksMemoryModule:
         import json  # noqa: PLC0415
 
         subtasks_json = json.dumps(
-            {"subtasks": [{"text": s["text"], "start": round(s["start"], 3), "end": round(s["end"], 3)} for s in spans]},
+            {
+                "subtasks": [
+                    {"text": s["text"], "start": round(s["start"], 3), "end": round(s["end"], 3)}
+                    for s in spans
+                ]
+            },
             indent=2,
         )
-        prompt = load_prompt("module_1_subtask_verify").format(
-            episode_task=task, subtasks_json=subtasks_json
-        )
+        prompt = load_prompt("module_1_subtask_verify").format(episode_task=task, subtasks_json=subtasks_json)
         kept_raw = self._vlm_field(self._video_message(record, prompt, window=window), "subtasks")
         # Windowed verify: the video is sampled from the absolute window
         # ``[w0, w1]`` but the model perceives it as a clip starting at 0,
@@ -824,9 +827,7 @@ class PlanSubtasksMemoryModule:
         # Clamp to that relative range and skip the absolute frame-snap
         # dedupe (done once later on the merged absolute-time set).
         clamp = (0.0, float(window[1] - window[0])) if window is not None else None
-        kept = self._clean_spans(
-            kept_raw, record, bounds=clamp, dedupe=window is None
-        )
+        kept = self._clean_spans(kept_raw, record, bounds=clamp, dedupe=window is None)
         if not kept:
             logger.info(
                 "episode %d: verify pass returned nothing — keeping the %d "
@@ -927,17 +928,13 @@ class PlanSubtasksMemoryModule:
         if not subtask_spans:
             return None
         remaining = [
-            s
-            for s in subtask_spans
-            if refresh_t is None or float(s.get("start", 0.0)) >= float(refresh_t)
+            s for s in subtask_spans if refresh_t is None or float(s.get("start", 0.0)) >= float(refresh_t)
         ]
         if not remaining:
             # Past the last subtask boundary on a late refresh — nothing
             # left to plan; emit None so the caller skips the row.
             return None
-        return "\n".join(
-            f"{i}. {span.get('text', '').strip()}" for i, span in enumerate(remaining, start=1)
-        )
+        return "\n".join(f"{i}. {span.get('text', '').strip()}" for i, span in enumerate(remaining, start=1))
 
     def _generate_memory(
         self,
diff --git a/src/lerobot/annotations/steerable_pipeline/validator.py b/src/lerobot/annotations/steerable_pipeline/validator.py
index a3c3d51f9..203e3f157 100644
--- a/src/lerobot/annotations/steerable_pipeline/validator.py
+++ b/src/lerobot/annotations/steerable_pipeline/validator.py
@@ -137,9 +137,7 @@ class StagingValidator:
         persistent: list[dict[str, Any]] = []
         for row in all_rows:
             self._check_column_routing(row, report, record.episode_index)
-            self._check_camera_field(
-                row, report, record.episode_index, self.dataset_camera_keys
-            )
+            self._check_camera_field(row, report, record.episode_index, self.dataset_camera_keys)
             if column_for_style(row.get("style")) == LANGUAGE_PERSISTENT:
                 persistent.append(row)
             else:
@@ -166,15 +164,9 @@ class StagingValidator:
         try:
             validate_camera_field(style, camera)
         except ValueError as exc:
-            report.add_error(
-                f"ep={episode_index} module={row.get('_module')}: {exc}"
-            )
+            report.add_error(f"ep={episode_index} module={row.get('_module')}: {exc}")
             return
-        if (
-            is_view_dependent_style(style)
-            and dataset_camera_keys
-            and camera not in dataset_camera_keys
-        ):
+        if is_view_dependent_style(style) and dataset_camera_keys and camera not in dataset_camera_keys:
             report.add_error(
                 f"ep={episode_index} module={row.get('_module')}: camera {camera!r} on style "
                 f"{style!r} is not one of the dataset's video keys {sorted(dataset_camera_keys)!r}"
diff --git a/src/lerobot/scripts/lerobot_annotate.py b/src/lerobot/scripts/lerobot_annotate.py
index 7fee1f052..4c18b7937 100644
--- a/src/lerobot/scripts/lerobot_annotate.py
+++ b/src/lerobot/scripts/lerobot_annotate.py
@@ -64,9 +64,7 @@ def annotate(cfg: AnnotationPipelineConfig) -> None:
     logger.info("annotate: root=%s", root)
 
     vlm = make_vlm_client(cfg.vlm)
-    frame_provider = make_frame_provider(
-        root, camera_key=cfg.vlm.camera_key, video_backend=cfg.video_backend
-    )
+    frame_provider = make_frame_provider(root, camera_key=cfg.vlm.camera_key, video_backend=cfg.video_backend)
     # Surface the resolved cameras up front so a silent vqa-module no-op
     # is obvious in job output rather than discovered post-hoc by counting
     # parquet rows.
@@ -168,7 +166,10 @@ def _push_to_hub(root: Path, cfg: AnnotationPipelineConfig) -> None:
             if isinstance(ds_version, str) and ds_version.startswith("v"):
                 version_tag = ds_version
         except Exception as exc:  # noqa: BLE001
-            print(f"[lerobot-annotate] could not read codebase_version from info.json ({exc}); falling back to {version_tag}", flush=True)
+            print(
+                f"[lerobot-annotate] could not read codebase_version from info.json ({exc}); falling back to {version_tag}",
+                flush=True,
+            )
     revision = getattr(commit_info, "oid", None)
     tag_kwargs = {
         "repo_id": repo_id,

From 1417fd69b242e750b412ee2746b52cd63e8660ed Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Tue, 2 Jun 2026 17:41:46 +0200
Subject: [PATCH 23/45] docs(annotate): prettier format annotation_pipeline.mdx

Quality-gate fix: the markdown prettier hook re-aligned the table columns
in the annotation pipeline doc. No content change.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 docs/source/annotation_pipeline.mdx | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/docs/source/annotation_pipeline.mdx b/docs/source/annotation_pipeline.mdx
index 9d6e66231..05e4d103d 100644
--- a/docs/source/annotation_pipeline.mdx
+++ b/docs/source/annotation_pipeline.mdx
@@ -11,15 +11,15 @@ A vocabulary-discovery phase derives a small canonical wording, then three
 modules write into a per-episode staging tree, then a single writer
 rewrites the data shards in place:
 
-| Style / atom                                | Column                | Module         |
-| ------------------------------------------- | --------------------- | -------------- |
-| `subtask` (Pi0.7-style "how, not what")     | `language_persistent` | `plan`         |
-| `plan` (initial + refresh on interjection)  | `language_persistent` | `plan`         |
-| `memory` (MEM-style compression)            | `language_persistent` | `plan`         |
-| `task_aug` (rephrasings of canonical task)  | `language_persistent` | `plan`         |
-| `interjection`                              | `language_events`     | `interjections`|
-| speech tool-call atom (`style=null`, `say`) | `language_events`     | `interjections`|
-| `vqa` (user / assistant pair)               | `language_events`     | `vqa`          |
+| Style / atom                                | Column                | Module          |
+| ------------------------------------------- | --------------------- | --------------- |
+| `subtask` (Pi0.7-style "how, not what")     | `language_persistent` | `plan`          |
+| `plan` (initial + refresh on interjection)  | `language_persistent` | `plan`          |
+| `memory` (MEM-style compression)            | `language_persistent` | `plan`          |
+| `task_aug` (rephrasings of canonical task)  | `language_persistent` | `plan`          |
+| `interjection`                              | `language_events`     | `interjections` |
+| speech tool-call atom (`style=null`, `say`) | `language_events`     | `interjections` |
+| `vqa` (user / assistant pair)               | `language_events`     | `vqa`           |
 
 The `plan` module is constrained to a **canonical vocabulary** discovered
 once per dataset by the `vocabulary` module (phase 0). It watches a few

From 4c86332fe39db3c7e16834d001c5e4208e0373e3 Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Tue, 2 Jun 2026 18:00:41 +0200
Subject: [PATCH 24/45] feat(annotate): add plan toggle, drop subtask verify
 pass, 4xH200 job

- PlanConfig.emit_plan (default True): keep subtasks + memory but skip
  the per-boundary "plan" rows and their VLM call when False.
- Remove the subtask_verify pass entirely: pruning dropped legitimate
  subtasks and the stitch step already guarantees full-episode coverage.
  Deletes _verify_subtasks, both call sites, and the now-unused
  module_1_subtask_verify prompt.
- run_hf_job example: 4xH200 (4 vllm servers), emit_plan=false, vqa off,
  plan_max_steps 6 -> 10, dataset switched to robocasa_pretrain_human300_v4.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 examples/annotations/run_hf_job.py            |  39 +++---
 .../annotations/steerable_pipeline/config.py  |  28 ++---
 .../modules/plan_subtasks_memory.py           | 119 +++++-------------
 .../prompts/module_1_subtask_verify.txt       |  33 -----
 4 files changed, 56 insertions(+), 163 deletions(-)
 delete mode 100644 src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtask_verify.txt

diff --git a/examples/annotations/run_hf_job.py b/examples/annotations/run_hf_job.py
index 86575f72f..6af40a268 100644
--- a/examples/annotations/run_hf_job.py
+++ b/examples/annotations/run_hf_job.py
@@ -1,10 +1,10 @@
 #!/usr/bin/env python
 """Launch ``lerobot-annotate`` on a Hugging Face job (vllm + Qwen3.6-27B VLM).
 
-Spawns one ``h200x2`` job that:
+Spawns one ``h200x4`` job that:
 
   1. installs this branch of ``lerobot`` plus the annotation extras,
-  2. boots two vllm servers (one per GPU) with Qwen3.6-27B (dense VLM),
+  2. boots four vllm servers (one per GPU) with Qwen3.6-27B (dense VLM),
   3. runs the plan / interjections / vqa modules across the dataset
      in free-form mode (each episode generates its own subtasks +
      memory),
@@ -36,13 +36,13 @@ CMD = (
     "export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0 && "
     "export VLLM_VIDEO_BACKEND=pyav && "
     "lerobot-annotate "
-    "--repo_id=pepijn223/robocasa_smoke_2atomic_v3 "
-    "--dest_repo_id=pepijn223/robocasa_smoke_2atomic_v3_ann "
+    "--repo_id=pepijn223/robocasa_pretrain_human300_v4 "
+    "--dest_repo_id=pepijn223/robocasa_pretrain_human300_v4_annotated5 "
     "--push_to_hub=true "
     "--vlm.backend=openai "
     "--vlm.model_id=Qwen/Qwen3.6-27B "
-    "--vlm.parallel_servers=2 "
-    "--vlm.num_gpus=2 "
+    "--vlm.parallel_servers=4 "
+    "--vlm.num_gpus=4 "
     '--vlm.serve_command="vllm serve Qwen/Qwen3.6-27B '
     "--tensor-parallel-size 1 --max-model-len 32768 "
     '--gpu-memory-utilization 0.8 --uvicorn-log-level warning --port {port}" '
@@ -63,7 +63,7 @@ CMD = (
     # CONTEXT BUDGET: with embedded frames, each frame is ~250-320 vision
     # tokens. The model's context is 32768 (see --max-model-len). 32
     # frames sampled uniformly across the episode (~8-10k tokens) fits
-    # comfortably alongside the prompt and the describe/verify passes.
+    # comfortably alongside the prompt and the describe pass.
     # Do NOT raise max_video_frames toward 128 with embedded frames — that
     # is ~33-39k tokens and overflows the context (BadRequestError 400,
     # "Input length exceeds maximum context length").
@@ -73,7 +73,7 @@ CMD = (
     # Constant 1 fps density via windowing: episodes longer than 32s are
     # split into 32-second windows (each 32 frames @ 1 fps, fits context),
     # so long episodes get MORE subtasks instead of a sparser whole-episode
-    # view. describe->segment->verify runs per window; spans are merged +
+    # view. describe->segment runs per window; spans are merged +
     # stitched to a contiguous whole-episode cover. 0 disables.
     "--plan.subtask_window_seconds=32 "
     # IMPORTANT for RoboCasa: the dataset's task string ("Navigate to the
@@ -95,26 +95,23 @@ CMD = (
     # the subtask text — useful only for long composite manipulation
     # tasks. Leave off for RoboCasa atomic / navigation.
     # Keep subtask decomposition tight for atomic tasks:
-    "--plan.plan_max_steps=6 "
-    # NOTE: the multi-call subtask quality chain (describe -> segment ->
-    # verify, 3 VLM calls/episode) is ON BY DEFAULT now. Pass
-    # --plan.subtask_describe_first=false / --plan.subtask_verify=false to
-    # disable on datasets you've verified are easy and want fewer calls.
+    "--plan.plan_max_steps=10 "
+    # Only annotate subtasks + memory — skip the numbered "plan" rows
+    # (and their per-boundary VLM call). Flip to true to re-enable plan.
+    "--plan.emit_plan=false "
+    # NOTE: the grounding pass (describe -> segment, +1 VLM call/episode)
+    # is ON BY DEFAULT. Pass --plan.subtask_describe_first=false to disable
+    # on datasets you've verified are easy and want fewer calls.
     # Phase 2 — interjections + speech.
     "--interjections.max_interjections_per_episode=6 "
-    # Phase 4 — general VQA.
-    # Ground VQA on the SAME single camera as plan/interjections
-    # (--vlm.camera_key) instead of iterating every camera. The whole
-    # pipeline then focuses on one view, e.g. observation.images.base.
-    "--vqa.restrict_to_default_camera=true "
-    "--vqa.K=1 "
-    "--vqa.vqa_emission_hz=1.0"
+    # Phase 4 — general VQA: DISABLED for this run.
+    "--vqa.enabled=false"
 )
 
 job = run_job(
     image="vllm/vllm-openai:latest",
     command=["bash", "-c", CMD],
-    flavor="h200x2",
+    flavor="h200x4",
     secrets={"HF_TOKEN": token},
     timeout="2h",
 )
diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py
index 63fbe83b0..cdcf38072 100644
--- a/src/lerobot/annotations/steerable_pipeline/config.py
+++ b/src/lerobot/annotations/steerable_pipeline/config.py
@@ -74,27 +74,19 @@ class PlanConfig:
     min_subtask_seconds: float = 1.5
     plan_max_steps: int = 8
 
-    # Multi-call subtask quality chain. ON by default — the single-call
-    # 'watch video -> emit subtask JSON' pattern makes the VLM commit to
-    # structured output before reasoning about the video, so it
-    # pattern-matches the task text and hallucinates steps. The chain
-    # costs 2 extra VLM calls/episode (3 total for subtasks) but is the
-    # difference between trustworthy and fabricated labels. Set either to
-    # False to trade quality for fewer calls on datasets you've verified
-    # are easy.
-    #
     # ``subtask_describe_first``: run a grounding pass that narrates ONLY
     # what is visible in the video (no subtask JSON yet), then inject that
-    # description into the segmentation prompt. Forces the model to
-    # observe before committing to structured output — the strongest
-    # lever against subtasks invented from the task text. +1 VLM call/ep.
+    # description into the segmentation prompt. Forces the model to observe
+    # before committing to structured output — the strongest lever against
+    # subtasks invented from the task text. ON by default; +1 VLM call/ep.
+    # Set False to trade quality for fewer calls on easy datasets.
     subtask_describe_first: bool = True
-    # ``subtask_verify``: after segmentation, re-watch the video and drop
-    # any proposed subtask that can't be verified as visible. Prunes
-    # hallucinations; can only remove subtasks, never add/rewrite them.
-    # Fail-open (keeps un-verified spans if the verify call returns
-    # nothing). +1 VLM call/ep.
-    subtask_verify: bool = True
+
+    # Emit ``style="plan"`` rows (the numbered still-todo list re-emitted at
+    # every subtask boundary). Set False to keep only subtasks + memory and
+    # skip the plan rows entirely — saves one ``_generate_plan`` VLM call per
+    # subtask boundary. Subtask and memory generation are unaffected.
+    emit_plan: bool = True
 
     # NOTE: subtask spans are ALWAYS stitched into a contiguous
     # full-episode cover (first subtask pulled back to t0, gaps closed,
diff --git a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
index 0d9c1a7dd..fecd42d3a 100644
--- a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
+++ b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
@@ -170,19 +170,22 @@ class PlanSubtasksMemoryModule:
         # contains exactly the subtasks that started at or after the
         # current span. Saves the runtime from having to derive
         # "what's still left" at inference time.
-        for span in subtask_spans:
-            boundary_t = snap_to_frame(span["start"], record.frame_timestamps)
-            plan_text = self._generate_plan(record, subtask_spans, refresh_t=boundary_t, task=effective_task)
-            if plan_text is not None:
-                rows.append(
-                    {
-                        "role": "assistant",
-                        "content": plan_text,
-                        "style": "plan",
-                        "timestamp": float(boundary_t),
-                        "tool_calls": None,
-                    }
+        if self.config.emit_plan:
+            for span in subtask_spans:
+                boundary_t = snap_to_frame(span["start"], record.frame_timestamps)
+                plan_text = self._generate_plan(
+                    record, subtask_spans, refresh_t=boundary_t, task=effective_task
                 )
+                if plan_text is not None:
+                    rows.append(
+                        {
+                            "role": "assistant",
+                            "content": plan_text,
+                            "style": "plan",
+                            "timestamp": float(boundary_t),
+                            "tool_calls": None,
+                        }
+                    )
         # memory rows at every subtask boundary except the very first start
         prior_memory = ""
         for i, span in enumerate(subtask_spans[1:], start=1):
@@ -450,7 +453,7 @@ class PlanSubtasksMemoryModule:
     def _episode_video_block(
         self, record: EpisodeRecord, window: tuple[float, float] | None = None
     ) -> list[dict[str, Any]]:
-        """Video block for the segmentation / describe / verify prompts.
+        """Video block for the segmentation / describe prompts.
 
         Always returns a block that actually carries the video. When
         ``use_video_url`` is set we try the server-side ``video_url``
@@ -514,6 +517,8 @@ class PlanSubtasksMemoryModule:
         (the previous version told the model "an interjection happened"
         without telling it what the user said).
         """
+        if not self.config.emit_plan:
+            return
         existing = staging.read("plan")
         # Pass the episode's last frame timestamp so the final subtask
         # span is closed (otherwise its ``end`` equals its ``start``,
@@ -559,9 +564,6 @@ class PlanSubtasksMemoryModule:
              the model segments its own grounded observations instead of
              pattern-matching the task text.
           2. segmentation — emit subtask JSON (as before).
-          3. ``subtask_verify`` — an adversarial pass that re-watches the
-             video and drops any proposed subtask it cannot actually see,
-             pruning hallucinations.
         """
         if record.row_count == 0 or not record.frame_timestamps:
             return []
@@ -573,8 +575,8 @@ class PlanSubtasksMemoryModule:
         # than one window, process the episode in fixed-length windows so
         # the VLM always sees ``frames_per_second`` density (instead of a
         # sparse 32-frame whole-episode view). Each window runs the full
-        # describe -> segment -> verify chain on its own frames; results
-        # are merged + stitched into a contiguous whole-episode cover.
+        # describe -> segment chain on its own frames; results are merged +
+        # stitched into a contiguous whole-episode cover.
         window_s = float(getattr(self.config, "subtask_window_seconds", 0.0) or 0.0)
         if window_s > 0.0 and episode_duration > window_s:
             return self._generate_subtasks_windowed(record, effective_task, window_s)
@@ -606,18 +608,11 @@ class PlanSubtasksMemoryModule:
         if not cleaned:
             return []
 
-        # ---- Pass 3 (optional): verification / pruning ---------------
-        if getattr(self.config, "subtask_verify", False):
-            cleaned = self._verify_subtasks(record, effective_task, cleaned)
-            if not cleaned:
-                return []
-
         # ---- Full-episode coverage stitch ----------------------------
-        # The VLM (especially after the verify pass prunes spans) can
-        # leave the first subtask starting after t0 or leave gaps between
-        # spans, so the subtask timeline no longer tiles the whole
-        # episode and frames fall through with no active subtask. Always
-        # stitch the surviving spans into a contiguous cover of
+        # The VLM can leave the first subtask starting after t0 or leave
+        # gaps between spans, so the subtask timeline no longer tiles the
+        # whole episode and frames fall through with no active subtask.
+        # Always stitch the surviving spans into a contiguous cover of
         # [t0, t_last] — there is no scenario where a sparse, gap-ridden
         # subtask timeline is desirable for conditioning.
         cleaned = self._stitch_full_coverage(cleaned, record)
@@ -630,8 +625,8 @@ class PlanSubtasksMemoryModule:
         """Subtask generation in fixed-length windows at constant fps.
 
         Splits ``[t0, t_last]`` into consecutive windows of ``window_s``
-        seconds, runs the describe -> segment -> verify chain on each
-        window's own frames (sampled at ``frames_per_second``), offsets
+        seconds, runs the describe -> segment chain on each window's own
+        frames (sampled at ``frames_per_second``), offsets
         each window's spans back to absolute episode time, then merges +
         stitches into a contiguous whole-episode cover.
         """
@@ -662,7 +657,7 @@ class PlanSubtasksMemoryModule:
     def _subtasks_for_window(
         self, record: EpisodeRecord, task: str, w0: float, w1: float
     ) -> list[dict[str, Any]]:
-        """Run describe -> segment -> verify on one ``[w0, w1]`` window.
+        """Run describe -> segment on one ``[w0, w1]`` window.
 
         The model works in window-RELATIVE time ``[0, L]`` (it perceives
         the window as a clip starting at 0); spans are offset back to
@@ -698,11 +693,6 @@ class PlanSubtasksMemoryModule:
         if not cleaned:
             return []
 
-        if getattr(self.config, "subtask_verify", False):
-            cleaned = self._verify_subtasks(record, task, cleaned, window=window)
-            if not cleaned:
-                return []
-
         # Offset window-relative spans back to absolute episode time.
         for s in cleaned:
             s["start"] = w0 + float(s["start"])
@@ -722,7 +712,7 @@ class PlanSubtasksMemoryModule:
           subtask's ``end`` extends to the last frame ``t_last``.
 
         Starts are otherwise left as the (already frame-snapped, distinct)
-        values the VLM + verify produced — only the FIRST start is pulled
+        values the VLM produced — only the FIRST start is pulled
         back to ``t0``, which can't collide with a later span because it
         was already the earliest. Purely deterministic; runs after the
         VLM passes.
@@ -792,59 +782,6 @@ class PlanSubtasksMemoryModule:
         text = self._vlm_field(self._video_message(record, prompt, window=window), "description")
         return text.strip() if isinstance(text, str) and text.strip() else ""
 
-    def _verify_subtasks(
-        self,
-        record: EpisodeRecord,
-        task: str,
-        spans: list[dict[str, Any]],
-        window: tuple[float, float] | None = None,
-    ) -> list[dict[str, Any]]:
-        """Adversarial pass: drop proposed subtasks not visible in the video.
-
-        Keeps the original span on a verified ``text`` match (the verify
-        prompt is told not to rewrite text), so verification can only
-        PRUNE — never invent or mutate. If the verify call fails or
-        returns nothing parseable, the un-verified spans are kept (fail
-        open: better to keep a possibly-good label than silently drop
-        everything on a transient VLM hiccup).
-        """
-        import json  # noqa: PLC0415
-
-        subtasks_json = json.dumps(
-            {
-                "subtasks": [
-                    {"text": s["text"], "start": round(s["start"], 3), "end": round(s["end"], 3)}
-                    for s in spans
-                ]
-            },
-            indent=2,
-        )
-        prompt = load_prompt("module_1_subtask_verify").format(episode_task=task, subtasks_json=subtasks_json)
-        kept_raw = self._vlm_field(self._video_message(record, prompt, window=window), "subtasks")
-        # Windowed verify: the video is sampled from the absolute window
-        # ``[w0, w1]`` but the model perceives it as a clip starting at 0,
-        # so proposed + returned times are window-RELATIVE in ``[0, L]``.
-        # Clamp to that relative range and skip the absolute frame-snap
-        # dedupe (done once later on the merged absolute-time set).
-        clamp = (0.0, float(window[1] - window[0])) if window is not None else None
-        kept = self._clean_spans(kept_raw, record, bounds=clamp, dedupe=window is None)
-        if not kept:
-            logger.info(
-                "episode %d: verify pass returned nothing — keeping the %d "
-                "un-verified subtask(s) (fail-open)",
-                record.episode_index,
-                len(spans),
-            )
-            return spans
-        if len(kept) < len(spans):
-            logger.info(
-                "episode %d: verify pass pruned %d -> %d subtask(s)",
-                record.episode_index,
-                len(spans),
-                len(kept),
-            )
-        return kept
-
     @staticmethod
     def _dedupe_starts_to_distinct_frames(
         spans: list[dict[str, Any]], record: EpisodeRecord
diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtask_verify.txt b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtask_verify.txt
deleted file mode 100644
index e52dc0aeb..000000000
--- a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtask_verify.txt
+++ /dev/null
@@ -1,33 +0,0 @@
-You previously segmented a teleoperated robot demonstration into these
-candidate subtasks (JSON):
-
-{subtasks_json}
-
-The user's task was: "{episode_task}"
-
-This is a VERIFICATION pass. Re-watch the video. For EACH candidate
-subtask, decide whether the robot can ACTUALLY be seen performing that
-action within its [start, end] time window.
-
-Rules:
-- KEEP a subtask only if its action is clearly visible in the video in
-  roughly that time window.
-- DROP any subtask whose action you cannot see, that describes
-  something not actually present in the video, that was inferred from
-  the task instruction rather than observed, or that duplicates another
-  kept subtask.
-- Do NOT add new subtasks. Do NOT rewrite the text of kept subtasks.
-  Do NOT change the start/end timestamps of kept subtasks.
-- It is correct and expected to return FEWER subtasks than you were
-  given — even just one — if that is all the video supports. Returning
-  zero is allowed if none can be verified.
-
-Output strictly valid JSON of the SAME shape, containing only the kept
-subtasks in chronological order:
-
-  {{
-    "subtasks": [
-      {{"text": "<kept verbatim>", "start": <float>, "end": <float>}},
-      ...
-    ]
-  }}

From 906b585826c0e34235a347554abfb94e0d65484b Mon Sep 17 00:00:00 2001
From: Khalil Meftah <khalil.meftah@huggingface.co>
Date: Tue, 2 Jun 2026 19:25:13 +0200
Subject: [PATCH 25/45] fix(datasets): default `private` to `None` in
 `push_to_hub` to respect Hub org visibility settings (#3713)

---
 src/lerobot/configs/dataset.py          | 4 ++--
 src/lerobot/datasets/lerobot_dataset.py | 5 +++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/lerobot/configs/dataset.py b/src/lerobot/configs/dataset.py
index d5c6fa312..c40c0fae2 100644
--- a/src/lerobot/configs/dataset.py
+++ b/src/lerobot/configs/dataset.py
@@ -41,8 +41,8 @@ class DatasetRecordConfig:
     video: bool = True
     # Upload dataset to Hugging Face hub.
     push_to_hub: bool = True
-    # Upload on private repository on the Hugging Face hub.
-    private: bool = False
+    # If True, upload as private; if None, defer to the org default on the Hub (only affects orgs).
+    private: bool | None = None
     # Add tags to your dataset on the hub.
     tags: list[str] | None = None
     # Number of subprocesses handling the saving of frames as PNG. Set to 0 to use threads only;
diff --git a/src/lerobot/datasets/lerobot_dataset.py b/src/lerobot/datasets/lerobot_dataset.py
index 9734bcc74..d0dcf087d 100644
--- a/src/lerobot/datasets/lerobot_dataset.py
+++ b/src/lerobot/datasets/lerobot_dataset.py
@@ -524,7 +524,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
         license: str | None = "apache-2.0",
         tag_version: bool = True,
         push_videos: bool = True,
-        private: bool = False,
+        private: bool | None = None,
         allow_patterns: list[str] | str | None = None,
         upload_large_folder: bool = False,
         **card_kwargs,
@@ -543,7 +543,8 @@ class LeRobotDataset(torch.utils.data.Dataset):
             tag_version: If ``True``, create a Git tag for the current codebase
                 version.
             push_videos: If ``False``, skip uploading the ``videos/`` directory.
-            private: If ``True``, create a private repository.
+            private: If ``True``, create a private repository. If ``None``
+                (default), defer to the org default on the Hub (only affects orgs).
             allow_patterns: Glob pattern(s) restricting which files to upload.
             upload_large_folder: If ``True``, use ``upload_large_folder`` instead
                 of ``upload_folder`` for very large datasets.

From 19fe315971079e116b917ed4d482bfbf3c93bbbe Mon Sep 17 00:00:00 2001
From: Haoming Song <haomingsong24@gmail.com>
Date: Wed, 3 Jun 2026 17:46:35 +0800
Subject: [PATCH 26/45] fix(train): enable relative action overrides for
 pretrained processors (#3711)

* fix(train): enable relative action overrides for pretrained processors
Keep pretrained processor pipelines when use_relative_actions is enabled and
apply relative/absolute action processor settings through overrides. Rename the
relative action processor registry key to relative_actions_processor.

* fix(config): reject rename_map without pretrained checkpoint

Fail fast when rename_map is set during fresh initialization, since fresh
configs derive feature names from the current dataset and no rename is applied.

---------

Co-authored-by: Pepijn <138571049+pkooij@users.noreply.github.com>
---
 src/lerobot/configs/train.py                  |  6 ++++
 .../processor/relative_action_processor.py    |  2 +-
 src/lerobot/scripts/lerobot_train.py          | 29 ++++++++-----------
 3 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/src/lerobot/configs/train.py b/src/lerobot/configs/train.py
index 55498d3ac..bac1a946b 100644
--- a/src/lerobot/configs/train.py
+++ b/src/lerobot/configs/train.py
@@ -177,6 +177,12 @@ class TrainPipelineConfig(HubMixin):
             )
 
         active_cfg = self.trainable_config
+        if self.rename_map and active_cfg.pretrained_path is None:
+            raise ValueError(
+                "`rename_map` requires a pretrained policy checkpoint. "
+                "Fresh initialization derives feature names from the current dataset, so no rename is applied."
+            )
+
         if not self.job_name:
             if self.env is None:
                 self.job_name = f"{active_cfg.type}"
diff --git a/src/lerobot/processor/relative_action_processor.py b/src/lerobot/processor/relative_action_processor.py
index e1e65acb1..5b1039291 100644
--- a/src/lerobot/processor/relative_action_processor.py
+++ b/src/lerobot/processor/relative_action_processor.py
@@ -81,7 +81,7 @@ def to_absolute_actions(actions: Tensor, state: Tensor, mask: Sequence[bool]) ->
     return actions
 
 
-@ProcessorStepRegistry.register("delta_actions_processor")
+@ProcessorStepRegistry.register("relative_actions_processor")
 @dataclass
 class RelativeActionsProcessorStep(ProcessorStep):
     """Converts absolute actions to relative actions (action -= state) for masked dimensions.
diff --git a/src/lerobot/scripts/lerobot_train.py b/src/lerobot/scripts/lerobot_train.py
index 463668eb2..4ddef3105 100644
--- a/src/lerobot/scripts/lerobot_train.py
+++ b/src/lerobot/scripts/lerobot_train.py
@@ -292,19 +292,8 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None):
 
     active_cfg = cfg.trainable_config
     processor_pretrained_path = active_cfg.pretrained_path
-    if (
-        getattr(active_cfg, "use_relative_actions", False)
-        and processor_pretrained_path is not None
-        and not cfg.resume
-    ):
-        logging.warning(
-            "use_relative_actions=true with pretrained processors can skip relative transforms if "
-            "the checkpoint processors do not define them. Building processors from current policy config."
-        )
-        processor_pretrained_path = None
 
     processor_kwargs = {}
-    postprocessor_kwargs = {}
     if (processor_pretrained_path and not cfg.resume) or not processor_pretrained_path:
         processor_kwargs["dataset_stats"] = dataset.meta.stats
 
@@ -312,24 +301,31 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None):
         processor_kwargs["dataset_meta"] = dataset.meta
 
     if not cfg.is_reward_model_training and processor_pretrained_path is not None:
-        processor_kwargs["preprocessor_overrides"] = {
+        preprocessor_overrides = {
             "device_processor": {"device": device.type},
             "normalizer_processor": {
                 "stats": dataset.meta.stats,
                 "features": {**policy.config.input_features, **policy.config.output_features},
                 "norm_map": policy.config.normalization_mapping,
             },
+            "rename_observations_processor": {"rename_map": cfg.rename_map},
         }
-        processor_kwargs["preprocessor_overrides"]["rename_observations_processor"] = {
-            "rename_map": cfg.rename_map
-        }
-        postprocessor_kwargs["postprocessor_overrides"] = {
+        postprocessor_overrides = {
             "unnormalizer_processor": {
                 "stats": dataset.meta.stats,
                 "features": policy.config.output_features,
                 "norm_map": policy.config.normalization_mapping,
             },
         }
+        if getattr(active_cfg, "use_relative_actions", False):
+            preprocessor_overrides["relative_actions_processor"] = {
+                "enabled": True,
+                "exclude_joints": getattr(active_cfg, "relative_exclude_joints", []),
+                "action_names": getattr(active_cfg, "action_feature_names", None),
+            }
+            postprocessor_overrides["absolute_actions_processor"] = {"enabled": True}
+        processor_kwargs["preprocessor_overrides"] = preprocessor_overrides
+        processor_kwargs["postprocessor_overrides"] = postprocessor_overrides
 
     if cfg.is_reward_model_training:
         preprocessor, postprocessor = make_reward_pre_post_processors(
@@ -341,7 +337,6 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None):
             policy_cfg=cfg.policy,
             pretrained_path=processor_pretrained_path,
             **processor_kwargs,
-            **postprocessor_kwargs,
         )
 
     if is_main_process:

From 741c2d0a3939ba54d6cf47f29a7fbbba3766a600 Mon Sep 17 00:00:00 2001
From: Nikodem Bartnik <39432165+NikodemBartnik@users.noreply.github.com>
Date: Wed, 3 Jun 2026 14:22:05 +0200
Subject: [PATCH 27/45] Docs/add lelab (#3707)

* first text draft (no images)

* simplified docs

* fix formatting

* add youtube video

* add a tip about compatibility

* fix broken link
---
 docs/source/_toctree.yml |  2 ++
 docs/source/lelab.mdx    | 29 +++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+)
 create mode 100644 docs/source/lelab.mdx

diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index a216548d8..ce36fad5d 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -9,6 +9,8 @@
 - sections:
   - local: il_robots
     title: Imitation Learning for Robots
+  - local: lelab
+    title: LeLab - LeRobot GUI
   - local: bring_your_own_policies
     title: Adding a Policy
   - local: integrate_hardware
diff --git a/docs/source/lelab.mdx b/docs/source/lelab.mdx
new file mode 100644
index 000000000..a9f28e57b
--- /dev/null
+++ b/docs/source/lelab.mdx
@@ -0,0 +1,29 @@
+# LeLab - LeRobot GUI
+
+LeLab is a graphical user interface built on top of the LeRobot library, designed to make robotics accessible without needing to memorize CLI commands. From a single app you can configure your robot, teleoperate it, collect datasets, train policies locally or on cloud GPUs via HF Jobs, and deploy trained models back onto your robot. It's the easiest way to go from an unboxed SO-101 to a working policy, and a great companion for anyone learning the LeRobot workflow. Source code and issues live on GitHub: [huggingface/leLab](https://github.com/huggingface/leLab).
+
+> [!TIP]
+> For now, LeLab is compatible only with the SO-ARM101.
+
+<Youtube id="VqyKUuW9V1g" />
+
+### Installation
+
+Requires [`uv`](https://docs.astral.sh/uv/getting-started/installation/). Install and launch in one command:
+
+```bash
+uv tool install git+https://github.com/huggingface/leLab.git && lelab
+```
+
+After install, run `lelab` from your terminal anytime to start the app.
+
+### Features
+
+- **Add robots** — Select arm type (leader/follower), calibrate each joint from the middle position, and attach cameras.
+- **Teleoperation** — Control the follower arm with the leader and see a live 3D visualization of the arms.
+- **Dataset recording** — Define a task description, number of episodes, and episode/reset durations. Press spacebar to advance between episodes. 30+ episodes recommended.
+- **Local training** — Train a policy directly on your own machine with a selected dataset, policy type, batch size, and step count.
+- **Cloud training with HF Jobs** — Train on powerful GPUs via [HF Jobs](https://huggingface.co/docs/huggingface_hub/en/guides/jobs) with transparent pricing. Run `hf auth login` first. See the [Compute HW Guide](hardware_guide) for hardware/batch size tips.
+- **Training visualization** — Watch progress live in the app, with checkpoints saved automatically.
+- **Run trained policies** — Pick any model from your jobs list and run inference on your robot with one click.
+- **Use community datasets** — Provide any Hugging Face dataset ID to train on datasets you didn't record yourself.

From d1b1c5c8cff5e1f637495e1667a1d6c7c5258f3b Mon Sep 17 00:00:00 2001
From: Jaimin <jaimin9999@gmail.com>
Date: Wed, 3 Jun 2026 08:48:19 -0400
Subject: [PATCH 28/45] docs: fix broken dataset script paths (datasets/v30 ->
 scripts) (#3695)

The docs pointed at src/lerobot/datasets/v30/, which does not exist.
Both scripts actually live in src/lerobot/scripts/:

- convert_dataset_v21_to_v30.py
- augment_dataset_quantile_stats.py

Updated the four references (one python -m module path and three
file-path invocations) to the correct location, matching each
script's own usage docstring.
---
 docs/source/lerobot-dataset-v3.mdx  | 2 +-
 docs/source/molmoact2.mdx           | 2 +-
 docs/source/pi05.mdx                | 2 +-
 docs/source/porting_datasets_v3.mdx | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/source/lerobot-dataset-v3.mdx b/docs/source/lerobot-dataset-v3.mdx
index c23677d8c..21cb232d3 100644
--- a/docs/source/lerobot-dataset-v3.mdx
+++ b/docs/source/lerobot-dataset-v3.mdx
@@ -275,7 +275,7 @@ A converter aggregates per‑episode files into larger shards and writes episode
 pip install "https://github.com/huggingface/lerobot/archive/33cad37054c2b594ceba57463e8f11ee374fa93c.zip"
 
 # Convert an existing v2.1 dataset hosted on the Hub:
-python -m lerobot.datasets.v30.convert_dataset_v21_to_v30 --repo-id=<HF_USER/DATASET_ID>
+python -m lerobot.scripts.convert_dataset_v21_to_v30 --repo-id=<HF_USER/DATASET_ID>
 ```
 
 **What it does**
diff --git a/docs/source/molmoact2.mdx b/docs/source/molmoact2.mdx
index ddd178acd..c6ae24e9e 100644
--- a/docs/source/molmoact2.mdx
+++ b/docs/source/molmoact2.mdx
@@ -238,7 +238,7 @@ your dataset has not been converted with quantile statistics, you can add them
 with:
 
 ```bash
-python src/lerobot/datasets/v30/augment_dataset_quantile_stats.py \
+python src/lerobot/scripts/augment_dataset_quantile_stats.py \
   --repo-id=your_dataset
 ```
 
diff --git a/docs/source/pi05.mdx b/docs/source/pi05.mdx
index f99ad3286..127a2adc7 100644
--- a/docs/source/pi05.mdx
+++ b/docs/source/pi05.mdx
@@ -91,7 +91,7 @@ lerobot-train \
 If your dataset is not converted with `quantiles`, you can convert it with the following command:
 
 ```bash
-python src/lerobot/datasets/v30/augment_dataset_quantile_stats.py \
+python src/lerobot/scripts/augment_dataset_quantile_stats.py \
     --repo-id=your_dataset \
 ```
 
diff --git a/docs/source/porting_datasets_v3.mdx b/docs/source/porting_datasets_v3.mdx
index 46793265e..b2c3c15a0 100644
--- a/docs/source/porting_datasets_v3.mdx
+++ b/docs/source/porting_datasets_v3.mdx
@@ -300,7 +300,7 @@ This replaces the old episode-per-file structure with efficient, optimally-sized
 If you have existing datasets in v2.1 format, use the migration tool:
 
 ```bash
-python src/lerobot/datasets/v30/convert_dataset_v21_to_v30.py \
+python src/lerobot/scripts/convert_dataset_v21_to_v30.py \
     --repo-id your_id/existing_dataset
 ```
 

From b9246ef61bc22afa716ca969034c273189a0cc43 Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Wed, 3 Jun 2026 15:56:53 +0200
Subject: [PATCH 29/45] tests(annotations): guard on the 'dataset' extra so
 base fast-test tier skips cleanly
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fast Pytest Tests failed at COLLECTION in the base '--extra test' tier
with 'ModuleNotFoundError: No module named datasets': tests/annotations/
conftest.py imported the fixture dataset builder (-> lerobot.datasets ->
the HF 'datasets' lib + pandas/pyarrow), which only ship under the
'dataset' extra, so the whole annotations package crashed.

Fix uses the repo's proven module-level guard pattern (see
tests/datasets/test_language.py), NOT a conftest-level importorskip —
verified empirically that pytest.importorskip raised during conftest
*import* is treated as a collection ERROR (exit 1), while module-level
importorskip is a clean SKIP.

  * conftest.py: import build_annotation_dataset LAZILY inside the
    fixtures so the conftest itself imports cleanly in every tier.
  * test_modules / test_validator / test_writer / test_pipeline_recipe_
    render: add module-level pytest.importorskip('datasets') +
    ('pandas') before the pyarrow / lerobot.* imports (# noqa: E402 to
    match the existing convention). pyarrow-importing modules place the
    guard before the pyarrow import.
  * tests/scripts/test_lerobot_annotate.py: same guard (its _push_to_hub
    path imports lerobot.datasets).

Result:
  - base / hardware / viz tiers (no dataset extra): annotation tests
    skip cleanly; the rest of the suite runs -> exit 0.
  - dataset tier: datasets present -> guards pass through -> annotation
    tests run with the stub VLM. The pipeline modules import only
    stdlib + relative + lerobot.datasets (no module-level datatrove /
    vllm / openai), so they import fine there.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 tests/annotations/conftest.py                 | 12 ++++++++-
 tests/annotations/test_modules.py             | 20 +++++++++-----
 .../test_pipeline_recipe_render.py            | 26 ++++++++++++-------
 tests/annotations/test_validator.py           | 16 +++++++++---
 tests/annotations/test_writer.py              | 15 ++++++++---
 tests/scripts/test_lerobot_annotate.py        |  7 +++++
 6 files changed, 72 insertions(+), 24 deletions(-)

diff --git a/tests/annotations/conftest.py b/tests/annotations/conftest.py
index 8134c79a5..198e90319 100644
--- a/tests/annotations/conftest.py
+++ b/tests/annotations/conftest.py
@@ -26,12 +26,20 @@ from pathlib import Path
 
 import pytest
 
-from tests.fixtures.dataset_factories import build_annotation_dataset
+# NOTE: ``build_annotation_dataset`` pulls in ``lerobot.datasets`` (-> the HF
+# ``datasets`` library + ``pandas``), which only ship under the ``dataset``
+# extra. It is imported LAZILY inside the fixtures below so this conftest
+# imports cleanly in dependency tiers without that extra (e.g. the base
+# ``--extra test`` fast-test tier). The annotation test modules guard
+# themselves with a module-level ``pytest.importorskip("datasets")`` so
+# their collection is skipped — never erroring — when the extra is absent.
 
 
 @pytest.fixture
 def fixture_dataset_root(tmp_path: Path) -> Path:
     """A tiny dataset with two episodes, 12 frames each at 10 fps."""
+    from tests.fixtures.dataset_factories import build_annotation_dataset
+
     return build_annotation_dataset(
         tmp_path / "ds",
         episode_specs=[
@@ -44,6 +52,8 @@ def fixture_dataset_root(tmp_path: Path) -> Path:
 
 @pytest.fixture
 def single_episode_root(tmp_path: Path) -> Path:
+    from tests.fixtures.dataset_factories import build_annotation_dataset
+
     return build_annotation_dataset(
         tmp_path / "ds_one",
         episode_specs=[(0, 30, "Pour water from the bottle into the cup.")],
diff --git a/tests/annotations/test_modules.py b/tests/annotations/test_modules.py
index 73685a079..189481169 100644
--- a/tests/annotations/test_modules.py
+++ b/tests/annotations/test_modules.py
@@ -22,21 +22,29 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any
 
-from lerobot.annotations.steerable_pipeline.config import (
+import pytest
+
+# ``lerobot.annotations`` imports pull in ``lerobot.datasets`` (-> the HF
+# ``datasets`` library), which only ships under the ``dataset`` extra. Skip
+# this module in tiers without it instead of erroring at import.
+pytest.importorskip("datasets", reason="datasets is required (install lerobot[dataset])")
+pytest.importorskip("pandas", reason="pandas is required (install lerobot[dataset])")
+
+from lerobot.annotations.steerable_pipeline.config import (  # noqa: E402
     InterjectionsConfig,
     PlanConfig,
     VqaConfig,
 )
-from lerobot.annotations.steerable_pipeline.modules import (
+from lerobot.annotations.steerable_pipeline.modules import (  # noqa: E402
     GeneralVqaModule,
     InterjectionsAndSpeechModule,
     PlanSubtasksMemoryModule,
 )
-from lerobot.annotations.steerable_pipeline.reader import iter_episodes
-from lerobot.annotations.steerable_pipeline.staging import EpisodeStaging
-from lerobot.annotations.steerable_pipeline.vlm_client import StubVlmClient
+from lerobot.annotations.steerable_pipeline.reader import iter_episodes  # noqa: E402
+from lerobot.annotations.steerable_pipeline.staging import EpisodeStaging  # noqa: E402
+from lerobot.annotations.steerable_pipeline.vlm_client import StubVlmClient  # noqa: E402
 
-from ._helpers import make_canned_responder
+from ._helpers import make_canned_responder  # noqa: E402
 
 
 @dataclass
diff --git a/tests/annotations/test_pipeline_recipe_render.py b/tests/annotations/test_pipeline_recipe_render.py
index 6d0f6a29f..43a616934 100644
--- a/tests/annotations/test_pipeline_recipe_render.py
+++ b/tests/annotations/test_pipeline_recipe_render.py
@@ -19,26 +19,34 @@ from __future__ import annotations
 
 from pathlib import Path
 
-import pyarrow.parquet as pq
+import pytest
 
-from lerobot.annotations.steerable_pipeline.config import (
+# ``pyarrow`` and the ``lerobot.datasets`` chain (-> the HF ``datasets``
+# library) only ship under the ``dataset`` extra. Skip this module in
+# tiers without it instead of erroring at import.
+pytest.importorskip("datasets", reason="datasets is required (install lerobot[dataset])")
+pytest.importorskip("pandas", reason="pandas is required (install lerobot[dataset])")
+
+import pyarrow.parquet as pq  # noqa: E402
+
+from lerobot.annotations.steerable_pipeline.config import (  # noqa: E402
     AnnotationPipelineConfig,
     InterjectionsConfig,
     PlanConfig,
     VqaConfig,
 )
-from lerobot.annotations.steerable_pipeline.executor import Executor
-from lerobot.annotations.steerable_pipeline.modules import (
+from lerobot.annotations.steerable_pipeline.executor import Executor  # noqa: E402
+from lerobot.annotations.steerable_pipeline.modules import (  # noqa: E402
     GeneralVqaModule,
     InterjectionsAndSpeechModule,
     PlanSubtasksMemoryModule,
 )
-from lerobot.annotations.steerable_pipeline.validator import StagingValidator
-from lerobot.annotations.steerable_pipeline.writer import LanguageColumnsWriter
-from lerobot.configs.recipe import MessageTurn, TrainingRecipe
-from lerobot.datasets.language_render import render_sample
+from lerobot.annotations.steerable_pipeline.validator import StagingValidator  # noqa: E402
+from lerobot.annotations.steerable_pipeline.writer import LanguageColumnsWriter  # noqa: E402
+from lerobot.configs.recipe import MessageTurn, TrainingRecipe  # noqa: E402
+from lerobot.datasets.language_render import render_sample  # noqa: E402
 
-from ._helpers import make_canned_responder
+from ._helpers import make_canned_responder  # noqa: E402
 
 
 def _build_pr1_style_blend_recipe() -> TrainingRecipe:
diff --git a/tests/annotations/test_validator.py b/tests/annotations/test_validator.py
index c01d862cf..6b421cc98 100644
--- a/tests/annotations/test_validator.py
+++ b/tests/annotations/test_validator.py
@@ -20,10 +20,18 @@ from __future__ import annotations
 import json
 from pathlib import Path
 
-from lerobot.annotations.steerable_pipeline.reader import iter_episodes
-from lerobot.annotations.steerable_pipeline.staging import EpisodeStaging
-from lerobot.annotations.steerable_pipeline.validator import StagingValidator
-from lerobot.annotations.steerable_pipeline.writer import speech_atom
+import pytest
+
+# ``lerobot.annotations`` imports pull in ``lerobot.datasets`` (-> the HF
+# ``datasets`` library), which only ships under the ``dataset`` extra. Skip
+# this module in tiers without it instead of erroring at import.
+pytest.importorskip("datasets", reason="datasets is required (install lerobot[dataset])")
+pytest.importorskip("pandas", reason="pandas is required (install lerobot[dataset])")
+
+from lerobot.annotations.steerable_pipeline.reader import iter_episodes  # noqa: E402
+from lerobot.annotations.steerable_pipeline.staging import EpisodeStaging  # noqa: E402
+from lerobot.annotations.steerable_pipeline.validator import StagingValidator  # noqa: E402
+from lerobot.annotations.steerable_pipeline.writer import speech_atom  # noqa: E402
 
 
 def _validate(root: Path, staging_dir: Path):
diff --git a/tests/annotations/test_writer.py b/tests/annotations/test_writer.py
index 29f14c0e8..22dfbcb29 100644
--- a/tests/annotations/test_writer.py
+++ b/tests/annotations/test_writer.py
@@ -20,12 +20,19 @@ from __future__ import annotations
 import json
 from pathlib import Path
 
-import pyarrow.parquet as pq
 import pytest
 
-from lerobot.annotations.steerable_pipeline.reader import iter_episodes
-from lerobot.annotations.steerable_pipeline.staging import EpisodeStaging
-from lerobot.annotations.steerable_pipeline.writer import (
+# ``pyarrow`` and the ``lerobot.annotations`` -> ``lerobot.datasets`` chain
+# (-> the HF ``datasets`` library) only ship under the ``dataset`` extra.
+# Skip this module in tiers without it instead of erroring at import.
+pytest.importorskip("datasets", reason="datasets is required (install lerobot[dataset])")
+pytest.importorskip("pandas", reason="pandas is required (install lerobot[dataset])")
+
+import pyarrow.parquet as pq  # noqa: E402
+
+from lerobot.annotations.steerable_pipeline.reader import iter_episodes  # noqa: E402
+from lerobot.annotations.steerable_pipeline.staging import EpisodeStaging  # noqa: E402
+from lerobot.annotations.steerable_pipeline.writer import (  # noqa: E402
     LanguageColumnsWriter,
     speech_atom,
 )
diff --git a/tests/scripts/test_lerobot_annotate.py b/tests/scripts/test_lerobot_annotate.py
index c98ee7cb3..9f80d2e8c 100644
--- a/tests/scripts/test_lerobot_annotate.py
+++ b/tests/scripts/test_lerobot_annotate.py
@@ -3,6 +3,13 @@
 import json
 from types import SimpleNamespace
 
+import pytest
+
+# ``lerobot.scripts.lerobot_annotate`` (and the ``_push_to_hub`` path it
+# exercises) imports ``lerobot.datasets``, which only ships under the
+# ``dataset`` extra. Skip in tiers without it instead of erroring.
+pytest.importorskip("datasets", reason="datasets is required (install lerobot[dataset])")
+
 
 def test_push_to_hub_tags_uploaded_dataset_revision(tmp_path, monkeypatch):
     from lerobot.scripts.lerobot_annotate import _push_to_hub

From 273a8fc33505b9c2b197fd398a1d9dbb16a32376 Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Wed, 3 Jun 2026 16:09:22 +0200
Subject: [PATCH 30/45] deps(annotations): drop hard vllm dependency to unblock
 CI torch/torchcodec resolution
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fast Pytest 'dataset' tier failed collecting
tests/datasets/test_video_decoder_cache.py with 'Could not load
libtorchcodec ... undefined symbol: torch_dtype_float4_e2m1fn_x2' — a
torch/torchcodec ABI mismatch.

Root cause: the annotations extra's vllm hard-pins an older torch
(via xformers/xgrammar -> torch 2.8). uv resolves a SINGLE unified lock
across all extras, so vllm capped torch to 2.8 for every tier —
including dataset, whose torchcodec 0.11.1 needs torch 2.11. The
result was torch 2.8 + torchcodec 0.11.1 installed together -> ABI break.
(main has no vllm, so it resolves torch 2.11 + torchcodec 0.11.1 cleanly.)

Fix: remove vllm from the annotations extra. It is not needed by
the shipped workflow — examples/annotations/run_hf_job.py gets vllm from
the vllm/vllm-openai image and talks to it over the OpenAI-compatible
API (--vlm.backend=openai), and vlm_client._make_vllm_client imports vllm
lazily. For the in-process --vlm.backend=vllm path, install vllm
separately (the ImportError now says so).

After the fix uv resolves torch 2.11.0 + torchcodec 0.11.1 (matching
main); uv lock --check is clean. The annotations extra still provides
datasets / transformers / openai.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 pyproject.toml                                |   12 +-
 .../steerable_pipeline/vlm_client.py          |    6 +-
 uv.lock                                       | 1248 +++--------------
 3 files changed, 214 insertions(+), 1052 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index f9baedeb9..86599aa31 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -231,7 +231,17 @@ annotations = [
     "lerobot[dataset]",
     "lerobot[transformers-dep]",
     "openai>=1.40,<2.0",
-    "vllm>=0.6.0,<1.0.0; sys_platform == 'linux'",
+    # NOTE: ``vllm`` is intentionally NOT a hard dependency here. vLLM
+    # hard-pins an older torch (via xformers/xgrammar), and because uv
+    # resolves a single unified lock across all extras, including it would
+    # cap ``torch`` for every other extra too (e.g. forcing torch 2.8 while
+    # ``torchcodec`` in the ``dataset`` extra needs torch 2.11 -> ABI break
+    # in CI). vLLM is also not needed by the shipped workflow: the HF Jobs
+    # launcher (``examples/annotations/run_hf_job.py``) gets it from the
+    # ``vllm/vllm-openai`` image and talks to it over the OpenAI-compatible
+    # API (``--vlm.backend=openai``), and ``vlm_client._make_vllm_client``
+    # imports vllm lazily with an actionable error. To use the in-process
+    # ``--vlm.backend=vllm`` locally, ``pip install vllm`` separately.
 ]
 
 # Development
diff --git a/src/lerobot/annotations/steerable_pipeline/vlm_client.py b/src/lerobot/annotations/steerable_pipeline/vlm_client.py
index b5cf2c1b7..8aa7d01c6 100644
--- a/src/lerobot/annotations/steerable_pipeline/vlm_client.py
+++ b/src/lerobot/annotations/steerable_pipeline/vlm_client.py
@@ -216,7 +216,11 @@ def _make_vllm_client(config: VlmConfig) -> VlmClient:
         from vllm import LLM, SamplingParams  # type: ignore[import-not-found]
     except ImportError as exc:
         raise ImportError(
-            "vllm is required for backend='vllm'. Install with `pip install lerobot[annotations]`."
+            "vllm is required for backend='vllm'. Install it separately with "
+            "`pip install vllm` (it is not a hard dependency of the "
+            "``annotations`` extra because it pins an older torch). The HF "
+            "Jobs launcher uses the vllm/vllm-openai image + backend='openai' "
+            "instead."
         ) from exc
     # Workaround for cuDNN 9.x + torch 2.8 conv3d regression that surfaces
     # as CUDNN_STATUS_NOT_INITIALIZED in Qwen-VL vision-tower patch
diff --git a/uv.lock b/uv.lock
index 4d43d18d3..6ca254026 100644
--- a/uv.lock
+++ b/uv.lock
@@ -68,8 +68,8 @@ dependencies = [
     { name = "psutil" },
     { name = "pyyaml" },
     { name = "safetensors" },
-    { name = "torch", version = "2.8.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" },
     { name = "torch", version = "2.11.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux'" },
+    { name = "torch", version = "2.11.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/ca/14/787e5498cd062640f0f3d92ef4ae4063174f76f9afd29d13fc52a319daae/accelerate-1.13.0.tar.gz", hash = "sha256:d631b4e0f5b3de4aff2d7e9e6857d164810dfc3237d54d017f075122d057b236", size = 402835, upload-time = "2026-03-04T19:34:12.359Z" }
 wheels = [
@@ -323,15 +323,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/bd/46/d3ec57ad500f598d1554bd14ce4df615960549ab2844961bc4e1f5fbd174/ast_serialize-0.3.0-cp39-abi3-win_arm64.whl", hash = "sha256:0dd00da29985f15f50dc35728b7e1e7c84507bccfea1d9914738530f1c72238a", size = 1077165, upload-time = "2026-04-30T23:24:46.377Z" },
 ]
 
-[[package]]
-name = "astor"
-version = "0.8.1"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/5a/21/75b771132fee241dfe601d39ade629548a9626d1d39f333fde31bc46febe/astor-0.8.1.tar.gz", hash = "sha256:6a6effda93f4e1ce9f618779b2dd1d9d84f1e32812c23a29b3fff6fd7f63fa5e", size = 35090, upload-time = "2019-12-10T01:50:35.51Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/c3/88/97eef84f48fa04fbd6750e62dcceafba6c63c81b7ac1420856c8dcc0a3f9/astor-0.8.1-py2.py3-none-any.whl", hash = "sha256:070a54e890cefb5b3739d19f30f5a5ec840ffc9c50ffa7d23cc9fc1a38ebbfc5", size = 27488, upload-time = "2019-12-10T01:50:33.628Z" },
-]
-
 [[package]]
 name = "asttokens"
 version = "3.0.1"
@@ -429,54 +420,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/1a/39/47f9197bdd44df24d67ac8893641e16f386c984a0619ef2ee4c51fbbc019/beautifulsoup4-4.14.3-py3-none-any.whl", hash = "sha256:0918bfe44902e6ad8d57732ba310582e98da931428d231a5ecb9e7c703a735bb", size = 107721, upload-time = "2025-11-30T15:08:24.087Z" },
 ]
 
-[[package]]
-name = "blake3"
-version = "1.0.8"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/75/aa/abcd75e9600987a0bc6cfe9b6b2ff3f0e2cb08c170addc6e76035b5c4cb3/blake3-1.0.8.tar.gz", hash = "sha256:513cc7f0f5a7c035812604c2c852a0c1468311345573de647e310aca4ab165ba", size = 117308, upload-time = "2025-10-14T06:47:48.83Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/ee/7d/85a4c0782f613de23d114a7a78fcce270f75b193b3ff3493a0de24ba104a/blake3-1.0.8-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:269f255b110840e52b6ce9db02217e39660ebad3e34ddd5bca8b8d378a77e4e1", size = 371296, upload-time = "2025-10-14T06:45:49.674Z" },
-    { url = "https://files.pythonhosted.org/packages/e3/20/488475254976ed93fab57c67aa80d3b40df77f7d9db6528c9274bff53e08/blake3-1.0.8-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:66ca28a673025c40db3eba21a9cac52f559f83637efa675b3f6bd8683f0415f3", size = 374516, upload-time = "2025-10-14T06:45:51.23Z" },
-    { url = "https://files.pythonhosted.org/packages/7b/21/2a1c47fedb77fb396512677ec6d46caf42ac6e9a897db77edd0a2a46f7bb/blake3-1.0.8-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bcb04966537777af56c1f399b35525aa70a1225816e121ff95071c33c0f7abca", size = 447911, upload-time = "2025-10-14T06:45:52.637Z" },
-    { url = "https://files.pythonhosted.org/packages/cb/7d/db0626df16029713e7e61b67314c4835e85c296d82bd907c21c6ea271da2/blake3-1.0.8-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e5b5da177d62cc4b7edf0cea08fe4dec960c9ac27f916131efa890a01f747b93", size = 505420, upload-time = "2025-10-14T06:45:54.445Z" },
-    { url = "https://files.pythonhosted.org/packages/5b/55/6e737850c2d58a6d9de8a76dad2ae0f75b852a23eb4ecb07a0b165e6e436/blake3-1.0.8-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:38209b10482c97e151681ea3e91cc7141f56adbbf4820a7d701a923124b41e6a", size = 394189, upload-time = "2025-10-14T06:45:55.719Z" },
-    { url = "https://files.pythonhosted.org/packages/5b/94/eafaa5cdddadc0c9c603a6a6d8339433475e1a9f60c8bb9c2eed2d8736b6/blake3-1.0.8-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:504d1399b7fb91dfe5c25722d2807990493185faa1917456455480c36867adb5", size = 388001, upload-time = "2025-10-14T06:45:57.067Z" },
-    { url = "https://files.pythonhosted.org/packages/17/81/735fa00d13de7f68b25e1b9cb36ff08c6f165e688d85d8ec2cbfcdedccc5/blake3-1.0.8-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:c84af132aa09abeadf9a0118c8fb26f4528f3f42c10ef8be0fcf31c478774ec4", size = 550302, upload-time = "2025-10-14T06:45:58.657Z" },
-    { url = "https://files.pythonhosted.org/packages/0e/c6/d1fe8bdea4a6088bd54b5a58bc40aed89a4e784cd796af7722a06f74bae7/blake3-1.0.8-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a25db3d36b55f5ed6a86470155cc749fc9c5b91c949b8d14f48658f9d960d9ec", size = 554211, upload-time = "2025-10-14T06:46:00.269Z" },
-    { url = "https://files.pythonhosted.org/packages/77/57/e8a85fa261894bf7ce7af928ff3408aab60287ab8d58b55d13a3f700b619/blake3-1.0.8-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:19fc6f2b7edab8acff6895fc6e38c19bd79f4c089e21153020c75dfc7397d52d", size = 370994, upload-time = "2025-10-14T06:46:07.398Z" },
-    { url = "https://files.pythonhosted.org/packages/62/cd/765b76bb48b8b294fea94c9008b0d82b4cfa0fa2f3c6008d840d01a597e4/blake3-1.0.8-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4f54cff7f15d91dc78a63a2dd02a3dccdc932946f271e2adb4130e0b4cf608ba", size = 374372, upload-time = "2025-10-14T06:46:08.698Z" },
-    { url = "https://files.pythonhosted.org/packages/36/7a/32084eadbb28592bb07298f0de316d2da586c62f31500a6b1339a7e7b29b/blake3-1.0.8-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e7e12a777f6b798eb8d06f875d6e108e3008bd658d274d8c676dcf98e0f10537", size = 447627, upload-time = "2025-10-14T06:46:10.002Z" },
-    { url = "https://files.pythonhosted.org/packages/a7/f4/3788a1d86e17425eea147e28d7195d7053565fc279236a9fd278c2ec495e/blake3-1.0.8-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ddfc59b0176fb31168f08d5dd536e69b1f4f13b5a0f4b0c3be1003efd47f9308", size = 507536, upload-time = "2025-10-14T06:46:11.614Z" },
-    { url = "https://files.pythonhosted.org/packages/fe/01/4639cba48513b94192681b4da472cdec843d3001c5344d7051ee5eaef606/blake3-1.0.8-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a2336d5b2a801a7256da21150348f41610a6c21dae885a3acb1ebbd7333d88d8", size = 394105, upload-time = "2025-10-14T06:46:12.808Z" },
-    { url = "https://files.pythonhosted.org/packages/21/ae/6e55c19c8460fada86cd1306a390a09b0c5a2e2e424f9317d2edacea439f/blake3-1.0.8-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e4072196547484c95a5a09adbb952e9bb501949f03f9e2a85e7249ef85faaba8", size = 386928, upload-time = "2025-10-14T06:46:16.284Z" },
-    { url = "https://files.pythonhosted.org/packages/ee/6c/05b7a5a907df1be53a8f19e7828986fc6b608a44119641ef9c0804fbef15/blake3-1.0.8-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:0eab3318ec02f8e16fe549244791ace2ada2c259332f0c77ab22cf94dfff7130", size = 550003, upload-time = "2025-10-14T06:46:17.791Z" },
-    { url = "https://files.pythonhosted.org/packages/b4/03/f0ea4adfedc1717623be6460b3710fcb725ca38082c14274369803f727e1/blake3-1.0.8-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:a33b9a1fb6d1d559a8e0d04b041e99419a6bb771311c774f6ff57ed7119c70ed", size = 553857, upload-time = "2025-10-14T06:46:19.088Z" },
-    { url = "https://files.pythonhosted.org/packages/13/da/722cebca11238f3b24d3cefd2361c9c9ea47cfa0ad9288eeb4d1e0b7cf93/blake3-1.0.8-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ef153c5860d5bf1cc71aece69b28097d2a392913eb323d6b52555c875d0439fc", size = 370441, upload-time = "2025-10-14T06:46:26.29Z" },
-    { url = "https://files.pythonhosted.org/packages/2e/d5/2f7440c8e41c0af995bad3a159e042af0f4ed1994710af5b4766ca918f65/blake3-1.0.8-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e8ae3689f0c7bfa6ce6ae45cab110e4c3442125c4c23b28f1f097856de26e4d1", size = 374312, upload-time = "2025-10-14T06:46:27.451Z" },
-    { url = "https://files.pythonhosted.org/packages/a6/6c/fb6a7812e60ce3e110bcbbb11f167caf3e975c589572c41e1271f35f2c41/blake3-1.0.8-cp313-cp313t-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3fb83532f7456ddeb68dae1b36e1f7c52f9cb72852ac01159bbcb1a12b0f8be0", size = 447007, upload-time = "2025-10-14T06:46:29.056Z" },
-    { url = "https://files.pythonhosted.org/packages/13/3b/c99b43fae5047276ea9d944077c190fc1e5f22f57528b9794e21f7adedc6/blake3-1.0.8-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6ae7754c7d96e92a70a52e07c732d594cf9924d780f49fffd3a1e9235e0f5ba7", size = 507323, upload-time = "2025-10-14T06:46:30.661Z" },
-    { url = "https://files.pythonhosted.org/packages/fc/bb/ba90eddd592f8c074a0694cb0a744b6bd76bfe67a14c2b490c8bdfca3119/blake3-1.0.8-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4bacaae75e98dee3b7da6c5ee3b81ee21a3352dd2477d6f1d1dbfd38cdbf158a", size = 393449, upload-time = "2025-10-14T06:46:31.805Z" },
-    { url = "https://files.pythonhosted.org/packages/25/ed/58a2acd0b9e14459cdaef4344db414d4a36e329b9720921b442a454dd443/blake3-1.0.8-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9456c829601d72852d8ba0af8dae0610f7def1d59f5942efde1e2ef93e8a8b57", size = 386844, upload-time = "2025-10-14T06:46:33.195Z" },
-    { url = "https://files.pythonhosted.org/packages/4a/04/fed09845b18d90862100c8e48308261e2f663aab25d3c71a6a0bdda6618b/blake3-1.0.8-cp313-cp313t-musllinux_1_1_aarch64.whl", hash = "sha256:497ef8096ec4ac1ffba9a66152cee3992337cebf8ea434331d8fd9ce5423d227", size = 549550, upload-time = "2025-10-14T06:46:35.23Z" },
-    { url = "https://files.pythonhosted.org/packages/d6/65/1859fddfabc1cc72548c2269d988819aad96d854e25eae00531517925901/blake3-1.0.8-cp313-cp313t-musllinux_1_1_x86_64.whl", hash = "sha256:511133bab85ff60ed143424ce484d08c60894ff7323f685d7a6095f43f0c85c3", size = 553805, upload-time = "2025-10-14T06:46:36.532Z" },
-    { url = "https://files.pythonhosted.org/packages/49/fa/b913eb9cc4af708c03e01e6b88a8bb3a74833ba4ae4b16b87e2829198e06/blake3-1.0.8-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a47939f04b89c5c6ff1e51e883e5efab1ea1bf01a02f4d208d216dddd63d0dd8", size = 370654, upload-time = "2025-10-14T06:46:43.907Z" },
-    { url = "https://files.pythonhosted.org/packages/7f/4f/245e0800c33b99c8f2b570d9a7199b51803694913ee4897f339648502933/blake3-1.0.8-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:73e0b4fa25f6e3078526a592fb38fca85ef204fd02eced6731e1cdd9396552d4", size = 374693, upload-time = "2025-10-14T06:46:45.186Z" },
-    { url = "https://files.pythonhosted.org/packages/a2/a6/8cb182c8e482071dbdfcc6ec0048271fd48bcb78782d346119ff54993700/blake3-1.0.8-cp314-cp314-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4b0543c57eb9d6dac9d4bced63e9f7f7b546886ac04cec8da3c3d9c8f30cbbb7", size = 447673, upload-time = "2025-10-14T06:46:46.358Z" },
-    { url = "https://files.pythonhosted.org/packages/06/b7/1cbbb5574d2a9436d1b15e7eb5b9d82e178adcaca71a97b0fddaca4bfe3a/blake3-1.0.8-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ed972ebd553c0c25363459e9fc71a38c045d8419e365b59acd8cd791eff13981", size = 507233, upload-time = "2025-10-14T06:46:48.109Z" },
-    { url = "https://files.pythonhosted.org/packages/9c/45/b55825d90af353b3e26c653bab278da9d6563afcf66736677f9397e465be/blake3-1.0.8-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3bafdec95dfffa3f6571e529644744e280337df15ddd9728f224ba70c5779b23", size = 393852, upload-time = "2025-10-14T06:46:49.511Z" },
-    { url = "https://files.pythonhosted.org/packages/34/73/9058a1a457dd20491d1b37de53d6876eff125e1520d9b2dd7d0acbc88de2/blake3-1.0.8-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2d78f06f3fb838b34c330e2987090376145cbe5944d8608a0c4779c779618f7b", size = 386442, upload-time = "2025-10-14T06:46:51.205Z" },
-    { url = "https://files.pythonhosted.org/packages/30/6d/561d537ffc17985e276e08bf4513f1c106f1fdbef571e782604dc4e44070/blake3-1.0.8-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:dd03ff08d1b6e4fdda1cd03826f971ae8966ef6f683a8c68aa27fb21904b5aa9", size = 549929, upload-time = "2025-10-14T06:46:52.494Z" },
-    { url = "https://files.pythonhosted.org/packages/03/2f/dbe20d2c57f1a67c63be4ba310bcebc707b945c902a0bde075d2a8f5cd5c/blake3-1.0.8-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:4e02a3c499e35bf51fc15b2738aca1a76410804c877bcd914752cac4f71f052a", size = 553750, upload-time = "2025-10-14T06:46:54.194Z" },
-    { url = "https://files.pythonhosted.org/packages/11/33/503b37220a3e2e31917ef13722efd00055af51c5e88ae30974c733d7ece6/blake3-1.0.8-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:88d527c247f9609dc1d45a08fd243e39f0d5300d54c57e048de24d4fa9240ebb", size = 370220, upload-time = "2025-10-14T06:47:02.573Z" },
-    { url = "https://files.pythonhosted.org/packages/3e/df/fe817843adf59516c04d44387bd643b422a3b0400ea95c6ede6a49920737/blake3-1.0.8-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:506a47897a11ebe8f3cdeb52f1365d6a2f83959e98ccb0c830f8f73277d4d358", size = 373454, upload-time = "2025-10-14T06:47:03.784Z" },
-    { url = "https://files.pythonhosted.org/packages/d1/4d/90a2a623575373dfc9b683f1bad1bf017feafa5a6d65d94fb09543050740/blake3-1.0.8-cp314-cp314t-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e5122a61b3b004bbbd979bdf83a3aaab432da3e2a842d7ddf1c273f2503b4884", size = 447102, upload-time = "2025-10-14T06:47:04.958Z" },
-    { url = "https://files.pythonhosted.org/packages/93/ff/4e8ce314f60115c4c657b1fdbe9225b991da4f5bcc5d1c1f1d151e2f39d6/blake3-1.0.8-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0171e85d56dec1219abdae5f49a0ed12cb3f86a454c29160a64fd8a8166bba37", size = 506791, upload-time = "2025-10-14T06:47:06.82Z" },
-    { url = "https://files.pythonhosted.org/packages/44/88/2963a1f18aab52bdcf35379b2b48c34bbc462320c37e76960636b8602c36/blake3-1.0.8-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:003f61e8c41dd9931edddf1cc6a1bb680fb2ac0ad15493ef4a1df9adc59ce9df", size = 393717, upload-time = "2025-10-14T06:47:09.085Z" },
-    { url = "https://files.pythonhosted.org/packages/45/d1/a848ed8e8d4e236b9b16381768c9ae99d92890c24886bb4505aa9c3d2033/blake3-1.0.8-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2c3151955efb09ba58cd3e1263521e15e9e3866a40d6bd3556d86fc968e8f95", size = 386150, upload-time = "2025-10-14T06:47:10.363Z" },
-    { url = "https://files.pythonhosted.org/packages/96/09/e3eb5d60f97c01de23d9f434e6e1fc117efb466eaa1f6ddbbbcb62580d6e/blake3-1.0.8-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:5eb25bca3cee2e0dd746a214784fb36be6a43640c01c55b6b4e26196e72d076c", size = 549120, upload-time = "2025-10-14T06:47:11.713Z" },
-    { url = "https://files.pythonhosted.org/packages/14/ad/3d9661c710febb8957dd685fdb3e5a861aa0ac918eda3031365ce45789e2/blake3-1.0.8-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:ab4e1dea4fa857944944db78e8f20d99ee2e16b2dea5a14f514fb0607753ac83", size = 553264, upload-time = "2025-10-14T06:47:13.317Z" },
-]
-
 [[package]]
 name = "bleach"
 version = "6.3.0"
@@ -494,39 +437,6 @@ css = [
     { name = "tinycss2" },
 ]
 
-[[package]]
-name = "cachetools"
-version = "7.1.4"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/f4/8b/0d3945a13955303b81272f759a0331e54c5c793da455e6f5706b89d2639c/cachetools-7.1.4.tar.gz", hash = "sha256:437f55a4e0c1b01a4f3077cc470e6991d47430970e36fbcb77e2be0df4fc1cd6", size = 40085, upload-time = "2026-05-21T22:40:43.376Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/8c/7b/1fc1c09cc0756cf25861a3be10565915953876da48bb228fb9a672b20a42/cachetools-7.1.4-py3-none-any.whl", hash = "sha256:323dc4127934744db5b54eb4924482d7edafbf9554e820d1531c2e08c0e4ef54", size = 16761, upload-time = "2026-05-21T22:40:41.845Z" },
-]
-
-[[package]]
-name = "cbor2"
-version = "6.1.1"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/be/db/810437bcfe13cf5e09b68bad1ce57c8fa04ca9272c68946bbf2f4fa522c8/cbor2-6.1.1.tar.gz", hash = "sha256:6f0644869e0fdcd6f3874330b8f1cebd009f33191de43acf609dc2409cd362c4", size = 86297, upload-time = "2026-05-14T10:57:42.231Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/13/3a/ae0df2f8e4f8fac9212a3a9684a6213b6ba3190cd7762d78e5bd5043dddb/cbor2-6.1.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:a409b0b6de923f68f5e35287f25ec654fc68135991e41ae9a1c500ddd982c1fb", size = 453919, upload-time = "2026-05-14T10:56:55.468Z" },
-    { url = "https://files.pythonhosted.org/packages/87/4c/f5b3feb35e942998f60545199ff9c4c80d552a8b783d07f7ff70e78e8b1f/cbor2-6.1.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:911b34263f39300dd8ec6b78f247b257caba0bbcd278bd2421a54d45595ff602", size = 467302, upload-time = "2026-05-14T10:56:56.76Z" },
-    { url = "https://files.pythonhosted.org/packages/17/6d/a0472d99d9a38728498c9bcb4c65687383a948b0152e0bd7a20c1a87c949/cbor2-6.1.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:596418d033cff6eb0de9cb4ae63dd91c80e68d4ed01e1d0c61ad51709acc8ed2", size = 521305, upload-time = "2026-05-14T10:56:58.484Z" },
-    { url = "https://files.pythonhosted.org/packages/c0/28/1d8cdb754def050e0d0674a556540d4a26bab0d7cfc3e11df14f2e4a2830/cbor2-6.1.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ce0e9a33d7ee2c8f47ae216be68a3a0a4d6d9832594a69e34be070cf6d13a9d8", size = 534365, upload-time = "2026-05-14T10:56:59.85Z" },
-    { url = "https://files.pythonhosted.org/packages/b7/a5/653193249a64ca46def52798e8f10ddbc918f11818a977b2aa7248062520/cbor2-6.1.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:559025ad8e1f9f5d019a40dc8f14f43c111c11207b4dde852e943a3002b43ec0", size = 453218, upload-time = "2026-05-14T10:57:07.6Z" },
-    { url = "https://files.pythonhosted.org/packages/9f/79/bdcb9d43ed537abaa89e662d6340244207ec85b6e66e3bd7f40856c3a5d4/cbor2-6.1.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:a6690f7df210386866e120475183132df98f77bf6df624097f66e3214e775084", size = 466244, upload-time = "2026-05-14T10:57:09.297Z" },
-    { url = "https://files.pythonhosted.org/packages/9c/44/fe0543996d53538c074f8ee18f7391b5458c528b1717740d750a9e472e1d/cbor2-6.1.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f4898b5463a567775a05310407dbea5b4a8d7ae8e81337ae9084f5fe226938ff", size = 520804, upload-time = "2026-05-14T10:57:10.682Z" },
-    { url = "https://files.pythonhosted.org/packages/cd/83/577bbafef3bc887d654a73f3f4ab11e1bd5320abd9108bfc51fbea1498a8/cbor2-6.1.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:bf3ef1fae6f14081a15f178e933ab846d3181f059ee4090975518b71f58bb09f", size = 533598, upload-time = "2026-05-14T10:57:12.098Z" },
-    { url = "https://files.pythonhosted.org/packages/c3/7d/08644318380306e0809ecc4756e67fb684b5e78a938ca9ff1c8c7f57fe73/cbor2-6.1.1-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:350beaac7a6049fe0a48309d7acd24611ab1176b4db1515f7fbcad20f5c09821", size = 453010, upload-time = "2026-05-14T10:57:19.593Z" },
-    { url = "https://files.pythonhosted.org/packages/81/ff/43ef5f16a1a97ef4575c407d077d9355c01dfc54b1b1b8c5329b793c436b/cbor2-6.1.1-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:74bf0c3f48d215d49a99eb253fef6c00c19033339da22da4c29b53fe854093b8", size = 465110, upload-time = "2026-05-14T10:57:20.981Z" },
-    { url = "https://files.pythonhosted.org/packages/c8/61/3069cee66bc4bedb95dce49b5e90d07e6c1ddf712435facf84ce0353da4a/cbor2-6.1.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:a731277d123cee9c87e649077376f694892e4a2c3b0b1cb97132205c620947d8", size = 520269, upload-time = "2026-05-14T10:57:22.514Z" },
-    { url = "https://files.pythonhosted.org/packages/f0/70/4b2ac02e0aa09419c13c434ce535cf508f08d5c411c6912d760c480ed8e6/cbor2-6.1.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:16e6df5a4971c2006805669be472a43bb382d0f3464c2236634b4e93095d7dd6", size = 532515, upload-time = "2026-05-14T10:57:24.289Z" },
-    { url = "https://files.pythonhosted.org/packages/c6/28/780af53231e1a6afc36f2b922ff587a9e1a25df7756628101a6070a9312f/cbor2-6.1.1-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:fd9d300ad983b860fbfb0ab148ddd3a379be25430bb141ad41344adc1c0792c1", size = 446311, upload-time = "2026-05-14T10:57:31.507Z" },
-    { url = "https://files.pythonhosted.org/packages/a1/5d/cc298ed16745995cf21caeec52213d157be8d5bfb405ee8ed420ffb5e038/cbor2-6.1.1-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:b8594563ccfd56f2bb56cdd8445f7a1f00d3065d84ea06f8e361da765abee08f", size = 459640, upload-time = "2026-05-14T10:57:32.967Z" },
-    { url = "https://files.pythonhosted.org/packages/f1/37/e4d95459d48e8a739c086249884b27458541df5a7fc149debdb0e0c7becb/cbor2-6.1.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:8df2a530b45c7769ed43c02e3f7c9841ed4990887e1c29858b08363a35067bf5", size = 511667, upload-time = "2026-05-14T10:57:34.465Z" },
-    { url = "https://files.pythonhosted.org/packages/40/e8/32e529bd938c71456d38d7c6a62d0d75399e720553d6514a467fee9b004d/cbor2-6.1.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:d63181b5b213ab72eed01e62bfa4c994fe7de68433d12548d54156411ba0aac4", size = 527195, upload-time = "2026-05-14T10:57:36.09Z" },
-]
-
 [[package]]
 name = "certifi"
 version = "2026.4.22"
@@ -942,21 +852,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/60/97/891a0971e1e4a8c5d2b20bbe0e524dc04548d2307fee33cdeba148fd4fc7/comm-0.2.3-py3-none-any.whl", hash = "sha256:c615d91d75f7f04f095b30d1c1711babd43bdc6419c1be9886a85f2f4e489417", size = 7294, upload-time = "2025-07-25T14:02:02.896Z" },
 ]
 
-[[package]]
-name = "compressed-tensors"
-version = "0.11.0"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "frozendict", marker = "sys_platform == 'linux'" },
-    { name = "pydantic", marker = "sys_platform == 'linux'" },
-    { name = "torch", version = "2.8.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" },
-    { name = "transformers", marker = "sys_platform == 'linux'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/b8/99/3fdabfc95609d6efdf02fa7f1ed0245524cb1209d3d4a17109d3205d2eed/compressed_tensors-0.11.0.tar.gz", hash = "sha256:95ddf19699f775df6494dd864e5f52e8a24f8015496520190c1a22c6cfc44b1f", size = 187566, upload-time = "2025-08-19T18:59:31.854Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/d2/81/e3073017a8f5c75169e79108eda209e6089e3f96c9f197d307cbda7df71c/compressed_tensors-0.11.0-py3-none-any.whl", hash = "sha256:e1cbc46e1ae032b7ceea915fe18c8d2de5a54d3a50a607969b6bdfe703b6cb83", size = 179951, upload-time = "2025-08-19T18:59:29.308Z" },
-]
-
 [[package]]
 name = "contourpy"
 version = "1.3.3"
@@ -1107,6 +1002,26 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/61/e8/cb8e80d6f9f55b99588625062822bf946cf03ed06315df4bd8397f5632a1/coverage-7.14.0-py3-none-any.whl", hash = "sha256:8de5b61163aee3d05c8a2beab6f47913df7981dad1baf82c414d99158c286ab1", size = 211764, upload-time = "2026-05-10T18:02:29.538Z" },
 ]
 
+[[package]]
+name = "cuda-bindings"
+version = "12.9.7"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "cuda-pathfinder", marker = "sys_platform == 'linux'" },
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/32/45/557d4ed1fa54f0c7db8aee083229f624990d69f7d00f55477eed5c7e169a/cuda_bindings-12.9.7-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0666d3c082ef8f4b2d670950589373550e9f3bf564d635dd883f24a0b40402ff", size = 7071026, upload-time = "2026-05-27T18:44:13.356Z" },
+    { url = "https://files.pythonhosted.org/packages/91/97/e3c6e58ece26a053419ba0a18444b5443cfc64451bbf37f84e8143b8bdca/cuda_bindings-12.9.7-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4c7ef48c5e13ae90f3b2ecfb72f8e99ac43c8f4c43e67e1325b8aae331453687", size = 7611059, upload-time = "2026-05-27T18:44:15.252Z" },
+    { url = "https://files.pythonhosted.org/packages/eb/7b/f1575e41e1a17dc2f2a408b2e8e864c9324e41e3e23f6401e5efc54c152a/cuda_bindings-12.9.7-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:266379e4942051f544a8e7ea1a30ead8d7e8199b6b30fcdc8917cae2bf614e61", size = 6978549, upload-time = "2026-05-27T18:44:18.839Z" },
+    { url = "https://files.pythonhosted.org/packages/9d/dc/62d62eb4f91eb721bcf46da51b13e9872ccd8fa7e60eb8ba7b7baeac72c6/cuda_bindings-12.9.7-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:59cf4a37b0d662ba15037c9ceebe1a306ebf2c01a8235a09be13cd07094fdb74", size = 7457675, upload-time = "2026-05-27T18:44:20.637Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/77/94d9b85f26add6fe9c9cb7c4ec3b96bc598f7ea5cfbd7490cc0a36adf5be/cuda_bindings-12.9.7-cp313-cp313t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2dbcd4801954eb3508f4dc2fa0d0c8eb93eb3f45326fd61be2731418c371e7a0", size = 6870886, upload-time = "2026-05-27T18:44:24.164Z" },
+    { url = "https://files.pythonhosted.org/packages/04/dd/3ec34b569e1b990b11276feba306bf8f446656cc38e8ed0f49b5facfeffa/cuda_bindings-12.9.7-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3747ea132642416786a8e31bf229032df3a7856911ae5426a7be53d032df183d", size = 7345663, upload-time = "2026-05-27T18:44:26.333Z" },
+    { url = "https://files.pythonhosted.org/packages/68/e4/075052d42872cf8162da53f14447a4b8abc004c3750e4b724ee502428da0/cuda_bindings-12.9.7-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:775960ac9e530717f3b48e165cc6f68684fa9a4141764fd923e4c1a9820acc73", size = 7060090, upload-time = "2026-05-27T18:44:30.281Z" },
+    { url = "https://files.pythonhosted.org/packages/ec/cd/3289c810a4d45e5364a3387a74b4c9b6f6f57ee96ae0e5b537cc61dec242/cuda_bindings-12.9.7-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3c47ec1a7a441d91aab32339951df7a1be53451121a12c094bba51467717a35a", size = 7504419, upload-time = "2026-05-27T18:44:31.992Z" },
+    { url = "https://files.pythonhosted.org/packages/11/43/472a6281c3d94e71687e27c657a8f60718d3579b4d94c41deea503165f8a/cuda_bindings-12.9.7-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:00a833d399b31071fab4cf3de2929840ae462dc4848116eeff033d09219e7116", size = 6899146, upload-time = "2026-05-27T18:44:35.556Z" },
+    { url = "https://files.pythonhosted.org/packages/2b/13/10c1d0b32a9da65142d213e0733d748457fb3fd066aee4317335266f15c6/cuda_bindings-12.9.7-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:11aeafa2b33995f890086b3fb0f062075176d956e9b6a6fe1a699dddc413f6ad", size = 7369087, upload-time = "2026-05-27T18:44:37.359Z" },
+]
+
 [[package]]
 name = "cuda-pathfinder"
 version = "1.5.4"
@@ -1116,22 +1031,46 @@ wheels = [
 ]
 
 [[package]]
-name = "cupy-cuda12x"
-version = "14.1.0"
+name = "cuda-toolkit"
+version = "12.8.1"
 source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "cuda-pathfinder", marker = "sys_platform == 'linux'" },
-    { name = "numpy", marker = "sys_platform == 'linux'" },
-]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/c4/11/c8ee77e4f285530527f3c2eb4a0df444056f5bd53451385d04e64d676cb5/cupy_cuda12x-14.1.0-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:0ef0fcdfea1a1c650c217f34223553e0914df140d82ecd457c047e849025c21c", size = 144383804, upload-time = "2026-05-23T01:10:11.429Z" },
-    { url = "https://files.pythonhosted.org/packages/a4/0b/802bbda4b40957c3c151f8338436bbea225a9bd3290bed5488e32131d641/cupy_cuda12x-14.1.0-cp312-cp312-manylinux2014_x86_64.whl", hash = "sha256:cf33564a11c966de724a45bb84b30ca6fa08d0a3d5696316af230ba4177bfbde", size = 133516920, upload-time = "2026-05-23T01:10:16.586Z" },
-    { url = "https://files.pythonhosted.org/packages/dc/69/57ac08d11fa2826644898009acc81c29d2628d2beb30a95cc057d4390da3/cupy_cuda12x-14.1.0-cp313-cp313-manylinux2014_aarch64.whl", hash = "sha256:44c12a92216fbc65c6f4ebc055c801ab43eaf8c9316bd1a9ef9b91ad8c1d57db", size = 143920077, upload-time = "2026-05-23T01:10:26.873Z" },
-    { url = "https://files.pythonhosted.org/packages/23/a1/6b83754040db8f2e3dfefa7671144f4c2f105adbe9da76ce76a61a5c5df8/cupy_cuda12x-14.1.0-cp313-cp313-manylinux2014_x86_64.whl", hash = "sha256:e9f2006077082ef04cec7ac6a897730b21fe51ebbd2b020bef503ccf5ccd7b4d", size = 133071609, upload-time = "2026-05-23T01:10:32.129Z" },
-    { url = "https://files.pythonhosted.org/packages/ba/ec/9a1ebe6a16c906f7c866d346716dbfb26506075609b95acf20a1a6781b9c/cupy_cuda12x-14.1.0-cp314-cp314-manylinux2014_aarch64.whl", hash = "sha256:cc78d1f0ebc1c4e77e81c6d293cbc2402c5bf4bd6d86f586e5816fe7f69b3c91", size = 143788622, upload-time = "2026-05-23T01:10:42.234Z" },
-    { url = "https://files.pythonhosted.org/packages/59/ff/831460649d69cc2c22b634b1027771e56406c119b432abacfd18226e15e0/cupy_cuda12x-14.1.0-cp314-cp314-manylinux2014_x86_64.whl", hash = "sha256:f6e50046ef5923243273852f97812d35dba5726c9d32390e1c89530a03a3491f", size = 132406357, upload-time = "2026-05-23T01:10:48.877Z" },
-    { url = "https://files.pythonhosted.org/packages/86/50/6dab7e3e10fc711c07703be460a34edb87ac1841fa10c247c22cf8b5f144/cupy_cuda12x-14.1.0-cp314-cp314t-manylinux2014_aarch64.whl", hash = "sha256:3a30f8f08135295ffa23378268c848eacb2d204af26ed46194540c741de2b38f", size = 144093044, upload-time = "2026-05-23T01:11:06.559Z" },
-    { url = "https://files.pythonhosted.org/packages/58/5c/e5aa39d6c1a3189c456323cb8a44ce3dbb2d2d516b606433dd9dfe6a10dd/cupy_cuda12x-14.1.0-cp314-cp314t-manylinux2014_x86_64.whl", hash = "sha256:1670cfcf2ba4fc7b0eff39851c1618e1ba0b92259e49fd68249339530022b27c", size = 132635331, upload-time = "2026-05-23T01:11:11.984Z" },
+    { url = "https://files.pythonhosted.org/packages/d4/c8/7dce3a0b15b42a3b58e7d96eb22a687d3bf2c44e01d149a6874629cd9938/cuda_toolkit-12.8.1-py2.py3-none-any.whl", hash = "sha256:adc7906af4ecbf9a352f9dca5734eceb21daec281ccfcf5675e1d2f724fc2cba", size = 2283, upload-time = "2025-08-13T02:03:07.842Z" },
+]
+
+[package.optional-dependencies]
+cublas = [
+    { name = "nvidia-cublas-cu12", marker = "sys_platform == 'linux'" },
+]
+cudart = [
+    { name = "nvidia-cuda-runtime-cu12", marker = "sys_platform == 'linux'" },
+]
+cufft = [
+    { name = "nvidia-cufft-cu12", marker = "sys_platform == 'linux'" },
+]
+cufile = [
+    { name = "nvidia-cufile-cu12", marker = "sys_platform == 'linux'" },
+]
+cupti = [
+    { name = "nvidia-cuda-cupti-cu12", marker = "sys_platform == 'linux'" },
+]
+curand = [
+    { name = "nvidia-curand-cu12", marker = "sys_platform == 'linux'" },
+]
+cusolver = [
+    { name = "nvidia-cusolver-cu12", marker = "sys_platform == 'linux'" },
+]
+cusparse = [
+    { name = "nvidia-cusparse-cu12", marker = "sys_platform == 'linux'" },
+]
+nvjitlink = [
+    { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux'" },
+]
+nvrtc = [
+    { name = "nvidia-cuda-nvrtc-cu12", marker = "sys_platform == 'linux'" },
+]
+nvtx = [
+    { name = "nvidia-nvtx-cu12", marker = "sys_platform == 'linux'" },
 ]
 
 [[package]]
@@ -1231,28 +1170,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/07/6c/aa3f2f849e01cb6a001cd8554a88d4c77c5c1a31c95bdf1cf9301e6d9ef4/defusedxml-0.7.1-py2.py3-none-any.whl", hash = "sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61", size = 25604, upload-time = "2021-03-08T10:59:24.45Z" },
 ]
 
-[[package]]
-name = "depyf"
-version = "0.19.0"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "astor", marker = "sys_platform == 'linux'" },
-    { name = "dill", marker = "sys_platform == 'linux'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/19/38/69157d711be575f1b9cf3177b64ef4ade44373fc02839f183fdd98ec2dd6/depyf-0.19.0.tar.gz", hash = "sha256:afed0916b32d141cc90fa6220df01885eda442ca43b297d5050eeb90b4a5cb44", size = 6171405, upload-time = "2025-04-20T08:07:41.224Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/28/4d/1192acbcdc5e843f5e5d51f6e8788f2b60a9fe0b578ac385ded67a0b0b26/depyf-0.19.0-py3-none-any.whl", hash = "sha256:040b35fc0997d49df024b7d094f2a7836f91e9ed02f49982dd37e70aa3285ad5", size = 39034, upload-time = "2025-04-20T08:07:37.036Z" },
-]
-
-[[package]]
-name = "detect-installer"
-version = "0.1.0"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/5f/ce/6897d812825e9d4c53e3c7112726e800cc5231b013b2223bf64f653ff362/detect_installer-0.1.0.tar.gz", hash = "sha256:00ad7ba0a36e3cf7d08a40d3643011746dbc112597c7d475cc91c416710ca4e7", size = 3049, upload-time = "2026-02-23T10:40:22.567Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/cc/34/8cc73273414405086c58852916e4031812a6a30fe04c057e37ad99397b7f/detect_installer-0.1.0-py3-none-any.whl", hash = "sha256:034fb20fd665c36e6ba52b8821525ea07fb4f7f938cac459df889fb33801528a", size = 4539, upload-time = "2026-02-23T10:40:23.807Z" },
-]
-
 [[package]]
 name = "diffusers"
 version = "0.35.2"
@@ -1281,15 +1198,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/1e/77/dc8c558f7593132cf8fefec57c4f60c83b16941c574ac5f619abb3ae7933/dill-0.4.1-py3-none-any.whl", hash = "sha256:1e1ce33e978ae97fcfcff5638477032b801c46c7c65cf717f95fbc2248f79a9d", size = 120019, upload-time = "2026-01-19T02:36:55.663Z" },
 ]
 
-[[package]]
-name = "diskcache"
-version = "5.6.3"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/3f/21/1c1ffc1a039ddcc459db43cc108658f32c57d271d7289a2794e401d0fdb6/diskcache-5.6.3.tar.gz", hash = "sha256:2c3a3fa2743d8535d832ec61c2054a1641f41775aa7c556758a109941e33e4fc", size = 67916, upload-time = "2023-08-31T06:12:00.316Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/3f/27/4570e78fc0bf5ea0ca45eb1de3818a23787af9b390c0b0a0033a1b8236f9/diskcache-5.6.3-py3-none-any.whl", hash = "sha256:5e31b2d5fbad117cc363ebaf6b689474db18a1f6438bc82358b024abd4c2ca19", size = 45550, upload-time = "2023-08-31T06:11:58.822Z" },
-]
-
 [[package]]
 name = "distlib"
 version = "0.4.0"
@@ -1374,15 +1282,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/c5/37/15603079854394f16e3833a7b50696c1f3cbf30a2243a119f64f18a16f36/dm_tree-0.1.9-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e1f5d1e96b3a7de22b25b13a5eb30f41f8cf9c02dd4479a24920de99e780903c", size = 153052, upload-time = "2025-01-30T20:45:35.907Z" },
 ]
 
-[[package]]
-name = "dnspython"
-version = "2.8.0"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/8c/8b/57666417c0f90f08bcafa776861060426765fdb422eb10212086fb811d26/dnspython-2.8.0.tar.gz", hash = "sha256:181d3c6996452cb1189c4046c61599b84a5a86e099562ffde77d26984ff26d0f", size = 368251, upload-time = "2025-09-07T18:58:00.022Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/ba/5a/18ad964b0086c6e62e2e7500f7edc89e3faa45033c71c1893d34eed2b2de/dnspython-2.8.0-py3-none-any.whl", hash = "sha256:01d9bbc4a2d76bf0db7c1f729812ded6d912bd318d3b1cf81d30c0f845dbf3af", size = 331094, upload-time = "2025-09-07T18:57:58.071Z" },
-]
-
 [[package]]
 name = "docopt"
 version = "0.6.2"
@@ -1479,19 +1378,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/64/cb/809f0c3e4e7bfe78c6dd468631896a8866c3ba853e3c855cc3fa58fae660/eiquadprog-1.2.9-0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:416f4b584ea30072f166b2a6a3e0a63a2a260a378f9bcbd2dfc9cde13b810a50", size = 118538, upload-time = "2025-02-17T19:00:16.297Z" },
 ]
 
-[[package]]
-name = "email-validator"
-version = "2.3.0"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "dnspython", marker = "sys_platform == 'linux'" },
-    { name = "idna", marker = "sys_platform == 'linux'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/f5/22/900cb125c76b7aaa450ce02fd727f452243f2e91a61af068b40adba60ea9/email_validator-2.3.0.tar.gz", hash = "sha256:9fc05c37f2f6cf439ff414f8fc46d917929974a82244c20eb10231ba60c54426", size = 51238, upload-time = "2025-08-26T13:09:06.831Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/de/15/545e2b6cf2e3be84bc1ed85613edd75b8aea69807a71c26f4ca6a9258e82/email_validator-2.3.0-py3-none-any.whl", hash = "sha256:80f13f623413e6b197ae73bb10bf4eb0908faf509ad8362c5edeb0be7fd450b4", size = 35604, upload-time = "2025-08-26T13:09:05.858Z" },
-]
-
 [[package]]
 name = "etils"
 version = "1.14.0"
@@ -1562,111 +1448,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/5a/ff/2e4eca3ade2c22fe1dea7043b8ee9dabe47753349eb1b56a202de8af6349/fastapi-0.136.1-py3-none-any.whl", hash = "sha256:a6e9d7eeada96c93a4d69cb03836b44fa34e2854accb7244a1ece36cd4781c3f", size = 117683, upload-time = "2026-04-23T16:49:42.437Z" },
 ]
 
-[package.optional-dependencies]
-standard = [
-    { name = "email-validator", marker = "sys_platform == 'linux'" },
-    { name = "fastapi-cli", extra = ["standard"], marker = "sys_platform == 'linux'" },
-    { name = "fastar", marker = "sys_platform == 'linux'" },
-    { name = "httpx", marker = "sys_platform == 'linux'" },
-    { name = "jinja2", marker = "sys_platform == 'linux'" },
-    { name = "pydantic-extra-types", marker = "sys_platform == 'linux'" },
-    { name = "pydantic-settings", marker = "sys_platform == 'linux'" },
-    { name = "python-multipart", marker = "sys_platform == 'linux'" },
-    { name = "uvicorn", extra = ["standard"], marker = "sys_platform == 'linux'" },
-]
-
-[[package]]
-name = "fastapi-cli"
-version = "0.0.24"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "rich-toolkit", marker = "sys_platform == 'linux'" },
-    { name = "typer", marker = "sys_platform == 'linux'" },
-    { name = "uvicorn", extra = ["standard"], marker = "sys_platform == 'linux'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/6e/58/74797ae9e4610cfa0c6b34c8309096d3b20bb29be3b8b5fbf1004d10fa5f/fastapi_cli-0.0.24.tar.gz", hash = "sha256:1afc9c9e21d7ebc8a3ca5e31790cd8d837742be7e4f8b9236e99cb3451f0de00", size = 19043, upload-time = "2026-02-24T10:45:10.476Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/c7/4b/68f9fe268e535d79c76910519530026a4f994ce07189ac0dded45c6af825/fastapi_cli-0.0.24-py3-none-any.whl", hash = "sha256:4a1f78ed798f106b4fee85ca93b85d8fe33c0a3570f775964d37edb80b8f0edc", size = 12304, upload-time = "2026-02-24T10:45:09.552Z" },
-]
-
-[package.optional-dependencies]
-standard = [
-    { name = "fastapi-cloud-cli", marker = "sys_platform == 'linux'" },
-    { name = "uvicorn", extra = ["standard"], marker = "sys_platform == 'linux'" },
-]
-
-[[package]]
-name = "fastapi-cloud-cli"
-version = "0.18.0"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "detect-installer", marker = "sys_platform == 'linux'" },
-    { name = "fastar", marker = "sys_platform == 'linux'" },
-    { name = "httpx", marker = "sys_platform == 'linux'" },
-    { name = "pydantic", extra = ["email"], marker = "sys_platform == 'linux'" },
-    { name = "rich-toolkit", marker = "sys_platform == 'linux'" },
-    { name = "rignore", marker = "sys_platform == 'linux'" },
-    { name = "sentry-sdk", marker = "sys_platform == 'linux'" },
-    { name = "typer", marker = "sys_platform == 'linux'" },
-    { name = "uvicorn", extra = ["standard"], marker = "sys_platform == 'linux'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/7f/1d/57221a834b0f62dfa510c2b3db6e9b682cfbc280cef41919a8811ce1ff89/fastapi_cloud_cli-0.18.0.tar.gz", hash = "sha256:95f7a79200e3a90a005e068a4d8ede49d4b04accb095ccd4fd47da998fc28c74", size = 53320, upload-time = "2026-05-22T09:53:54.462Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/78/1e/1d54aabf71c003e89e73df92c3dfded311228e68db7cea5db90b3e0ef2b5/fastapi_cloud_cli-0.18.0-py3-none-any.whl", hash = "sha256:1f136fc651b0b6e2f4a9679e23c56e1c3be3405e74469c14ba6e2d5b87fdc113", size = 37087, upload-time = "2026-05-22T09:53:53.001Z" },
-]
-
-[[package]]
-name = "fastar"
-version = "0.11.0"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/03/0f/0aeb3fc50046617702acc0078b277b58367fd62eb727b9ec733ae0e8bbcc/fastar-0.11.0.tar.gz", hash = "sha256:aa7f100f7313c03fdb20f1385927ba95671071ba308ad0c1763fef295e1895ce", size = 70238, upload-time = "2026-04-13T17:11:17.143Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/ab/69/9816d69ac8265c9e50456637a487ccfb7a9c566efd9dbcd673df9c2558c2/fastar-0.11.0-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:bd2f05666d4df7e14885b5c38fefd92a785917387513d33d837ff42ec143a22f", size = 863950, upload-time = "2026-04-13T17:09:11.506Z" },
-    { url = "https://files.pythonhosted.org/packages/5b/0d/f88daad53aff2e754b6b5ff2a7113f72447a34f6ef17cc23ca99988117b7/fastar-0.11.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c1e6e74aba1ae77ca4aedcaf1697cd413319f4c88a5ccbe5b42c709517c5097e", size = 760737, upload-time = "2026-04-13T17:07:55.958Z" },
-    { url = "https://files.pythonhosted.org/packages/2f/a6/82ef4ecd969d50d92ed3ed9dbd8fe77faa24be5e5736f716edc9f4ce8d62/fastar-0.11.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:38ef77fe940bbc9b37a98bd838727f844b11731cd39358a2640ff864fb385086", size = 757603, upload-time = "2026-04-13T17:08:10.623Z" },
-    { url = "https://files.pythonhosted.org/packages/03/35/50249f0d827251f8ac511495e2eacccebda80a00a0ad73e9615b8113b84f/fastar-0.11.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8955e61b32d6aff82c983217abf80933fd823b0e727586fc72f08043d996fd59", size = 923952, upload-time = "2026-04-13T17:08:25.526Z" },
-    { url = "https://files.pythonhosted.org/packages/7b/d8/faee41659e9c379d906d24eaee6d6833ac8cfef0a5df480e5c2a8d3efb33/fastar-0.11.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:483532442cdb08fbff0169510224eae0836f2f672cea6aacb52847d90fefdc46", size = 816574, upload-time = "2026-04-13T17:08:56.076Z" },
-    { url = "https://files.pythonhosted.org/packages/22/47/0448ea7992b997dad2bf004bfd98eca74b5858630eae080b50c7b17d9ddc/fastar-0.11.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ef5a6071121e05d8287fc75bccb054bcbac8bb0501200a0c0a8feeace5303ea4", size = 819382, upload-time = "2026-04-13T17:09:26.66Z" },
-    { url = "https://files.pythonhosted.org/packages/33/ef/0d63eb43586831b7a6f8b22c4d77125a7c594423af1f4f090fa9541b9b40/fastar-0.11.0-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:e45e598af5afe8412197d4786efd6cf29be02e7d3d4f6a3461149eae5d7e94f1", size = 885254, upload-time = "2026-04-13T17:08:40.9Z" },
-    { url = "https://files.pythonhosted.org/packages/01/25/edd584675d69e49a165052c3ee886df1c5d574f3e7d813c990306387c623/fastar-0.11.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2e160919b1c47ddb8538e7e8eb4cd527281b40f0bf75110a75993838ef61f286", size = 971239, upload-time = "2026-04-13T17:10:12.997Z" },
-    { url = "https://files.pythonhosted.org/packages/a5/37/e8bb24f506ba2b08fbaf36c5800e843bd4d542954e9331f00418e2d23349/fastar-0.11.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:4bb4dc0fc8f7a6807febcebce8a2f3626ba4955a9263d81ecc630aad83be84c0", size = 1035185, upload-time = "2026-04-13T17:10:30.207Z" },
-    { url = "https://files.pythonhosted.org/packages/9a/bf/be753736296338149ee4cb3e92e2b5423d6ba17c7b951d15218fd7e99bbf/fastar-0.11.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:4ec95af56aa173f6e320e1183001bf108ba59beaf13edd1fc8200648db203588", size = 1072191, upload-time = "2026-04-13T17:10:47.072Z" },
-    { url = "https://files.pythonhosted.org/packages/d2/cd/a81c1aaafb5a22ce57c98ae22f39c89413ed53e4ee6e1b1444b0bd666a6c/fastar-0.11.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:136cf342735464091c39dc3708168f9fdeb9ebea40b1ead937c61afaf46143d9", size = 1028054, upload-time = "2026-04-13T17:11:04.293Z" },
-    { url = "https://files.pythonhosted.org/packages/e5/2b/d11d84bdd5e0e377771b955755771e3460b290da5809cb78c1b735ee2228/fastar-0.11.0-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:881247e6b6eaea59fc6569f9b61447aa6b9fc2ee864e048b4643d69c52745805", size = 863054, upload-time = "2026-04-13T17:09:13.048Z" },
-    { url = "https://files.pythonhosted.org/packages/25/39/d3f428b318fa940b1b6e785b8d54fc895dfb5d5b945ef8d5442ffa904fb2/fastar-0.11.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:863b7929845c9fec92ef6c8d59579cf46af5136655e5342f8df5cebe46cab06c", size = 760247, upload-time = "2026-04-13T17:07:57.396Z" },
-    { url = "https://files.pythonhosted.org/packages/9e/04/03949aee82aabb8ede06ac5a4a5579ffaf98a8fe59ce958494508ff15513/fastar-0.11.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:96b4a57df12bf3211662627a3ea29d62ecb314a2434a0d0843f9fc23e47536e5", size = 756512, upload-time = "2026-04-13T17:08:12.415Z" },
-    { url = "https://files.pythonhosted.org/packages/3f/0c/2ca1ae0a3828ca51047962d932b80daca2522db73e8cb9d040cb6ebe28d5/fastar-0.11.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ceef1c2c4df7b7b8ebd3f5d718bbf457b9bbdf25ce0bd07870211ec4fbd9aff4", size = 922183, upload-time = "2026-04-13T17:08:27.187Z" },
-    { url = "https://files.pythonhosted.org/packages/65/68/7fe808b1f73a68e686f25434f538c6dc10ef4dfb3db0ace22cd861744bf8/fastar-0.11.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b8e545918441910a779659d4759ad0eef349e935fbdb4668a666d3681567eb05", size = 816394, upload-time = "2026-04-13T17:08:57.657Z" },
-    { url = "https://files.pythonhosted.org/packages/1f/17/07d086080f8a83b8d7966955e29bcdbd6a060f5bd949dc9d5abd3658cead/fastar-0.11.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:28095bb8f821e85fc2764e1a55f03e5e2876dee2abe7cd0ee9420d929905d643", size = 818983, upload-time = "2026-04-13T17:09:28.46Z" },
-    { url = "https://files.pythonhosted.org/packages/fb/e2/2c4edf0910af2e814ff6d65b77a91196d472ca8a9fb2033bd983f6856caa/fastar-0.11.0-cp313-cp313-manylinux_2_31_riscv64.whl", hash = "sha256:0fafb95ecbe70f666a5e9b35dd63974ccdc9bb3d99ccdbd4014a823ec3e659b5", size = 884689, upload-time = "2026-04-13T17:08:42.763Z" },
-    { url = "https://files.pythonhosted.org/packages/fa/ba/04fdcbd6558e60de4ced3b55230fac47675d181252582b2fcec3c74608e5/fastar-0.11.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:af48fed039b94016629dcdad1c95c90c486326dd068de2b0a4df419ee09b6821", size = 970677, upload-time = "2026-04-13T17:10:15.124Z" },
-    { url = "https://files.pythonhosted.org/packages/df/b3/2b860a9658550167dbd5824c85e88d0b4b912bf493e42a6322544d6e483d/fastar-0.11.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:74cd96163f39b8638ab4e8d49708ca887959672a22871d8170d01f067319533b", size = 1034026, upload-time = "2026-04-13T17:10:32.318Z" },
-    { url = "https://files.pythonhosted.org/packages/b7/9b/fa42ea1188b144bac4b1b60753dfd449974a4d5eda132029ee7711569f94/fastar-0.11.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:4e8b993cb5613bab495ed482810bedc0986633fcb9a3b55c37ec88e0d6714f6a", size = 1071147, upload-time = "2026-04-13T17:10:48.833Z" },
-    { url = "https://files.pythonhosted.org/packages/95/c8/d2e501556dca9f1fbc9246111a31792fb49ad908fa4927f34938a97a3604/fastar-0.11.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:dfe39d91fc28e37e06162d94afe01050220edb7df554acb5b702b5503e564816", size = 1028377, upload-time = "2026-04-13T17:11:06.374Z" },
-    { url = "https://files.pythonhosted.org/packages/dc/5e/9395c7353d079cb4f5be0f7982ce0dc9f2e7dec5fd175eef466729d6023a/fastar-0.11.0-cp314-cp314-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:7c371f1d4386c699018bb64eb2fa785feacf32785559049d2bb72fe4af023f53", size = 864378, upload-time = "2026-04-13T17:09:14.611Z" },
-    { url = "https://files.pythonhosted.org/packages/fa/ba/1e4f67148223ff219612b6281a6000357abbcc2417964fa5c83f11d68fce/fastar-0.11.0-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cad7fa41e3e66554387481c1a09365e4638becd322904932674159d5f4046728", size = 760921, upload-time = "2026-04-13T17:07:59.138Z" },
-    { url = "https://files.pythonhosted.org/packages/0f/82/09d11fb6d12f17993ffaf32ffd30c3c121a11e2966e84f19fb6f66430118/fastar-0.11.0-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:cf36652fa71b83761717c9899b98732498f8a2cb6327ff16bbf07f6be85c3437", size = 757012, upload-time = "2026-04-13T17:08:14.186Z" },
-    { url = "https://files.pythonhosted.org/packages/52/1f/5aeeacc4cb65615e2c9292cd9c5b0cd6fb6d2e6ee472ca6adc6c1b1b22ef/fastar-0.11.0-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f68ff8c17833053da4841720e95edde80ce45bb994b6b7d51418dddaac70ee47", size = 924510, upload-time = "2026-04-13T17:08:28.741Z" },
-    { url = "https://files.pythonhosted.org/packages/bb/1a/1e5bdabbeaf2e856928956292609f2ff6a650f94480fb8afaca30229e483/fastar-0.11.0-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4563ed37a12ea1cdc398af8571258d24b988bf342b7b3bf5451bd5891243280c", size = 816602, upload-time = "2026-04-13T17:08:59.461Z" },
-    { url = "https://files.pythonhosted.org/packages/87/24/f960147910da3bed41a3adfcb026e17d5f50f4cf467a3324237a7088f61a/fastar-0.11.0-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cee63c9875cba3b70dc44338c560facc5d6e763047dcc4a30501f9a68cf5f890", size = 819452, upload-time = "2026-04-13T17:09:29.926Z" },
-    { url = "https://files.pythonhosted.org/packages/cc/f4/3e77d7901d5707fd7f8a352e153c8ae09ea974e6fabad0b7c4eb9944b8d4/fastar-0.11.0-cp314-cp314-manylinux_2_31_riscv64.whl", hash = "sha256:bd76bfffae6d0a91f4ac4a612f721e7aec108db97dccdd120ae063cd66959f27", size = 885254, upload-time = "2026-04-13T17:08:44.285Z" },
-    { url = "https://files.pythonhosted.org/packages/47/01/1585edd5ec47782ae93cd94edf05828e0ab02ef00aec00aea4194a600464/fastar-0.11.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:8f5b707501ec01c1bc0518f741f01d322e50c9adc19a451aa24f67a2316e9397", size = 971496, upload-time = "2026-04-13T17:10:17.024Z" },
-    { url = "https://files.pythonhosted.org/packages/f1/e9/6874c9d1236ded565a0bed54b320ac9f165f287b1d89490fb70f9f323c81/fastar-0.11.0-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:37c0b5a88a657839aad98b0a6c9e4ac4c2c15d6b49c44ee3935c6b08e9d3e479", size = 1034685, upload-time = "2026-04-13T17:10:34.063Z" },
-    { url = "https://files.pythonhosted.org/packages/14/d8/4ab20613ce2983427aee958e39be878dba874aa227c530a845e32429c4f6/fastar-0.11.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:6c55f536c62a6efb180c1af0d5182948bff576bbfe6276e8e1359c9c7d2215d8", size = 1072675, upload-time = "2026-04-13T17:10:50.53Z" },
-    { url = "https://files.pythonhosted.org/packages/1f/ae/5ac3b7c20ce4b08f011dd2b979f96caabe64f9b10b157f211ea91bdfadca/fastar-0.11.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:3082eeca59e189b9039335862f4c2780c0c8871d656bfdf559db4414a105b251", size = 1029330, upload-time = "2026-04-13T17:11:08.138Z" },
-    { url = "https://files.pythonhosted.org/packages/2b/c3/38f1dac77ae0c71c37b176277c96d830796b8ce2fe69705f917829b53829/fastar-0.11.0-cp314-cp314t-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:bd3eca3bbfec84a614bcb4143b4ad4f784d0895babc26cfc88436af88ca23c7a", size = 864403, upload-time = "2026-04-13T17:09:16.58Z" },
-    { url = "https://files.pythonhosted.org/packages/6e/f0/e69c363bdb3e5a5848e937b662b5469581ee6682c51bc1c0556494773929/fastar-0.11.0-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ff86a967acb0d621dd24063dda090daa67bf4993b9570e97fe156de88a9006ca", size = 759480, upload-time = "2026-04-13T17:08:00.599Z" },
-    { url = "https://files.pythonhosted.org/packages/3b/29/4d8737590c2a6357d614d7cc7288e8f68e7e449680b8922997cc4349e65e/fastar-0.11.0-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:86eaf7c0e985d93a7734168be2fb232b2a8cca53e41431c2782d7c12b12c03b1", size = 756219, upload-time = "2026-04-13T17:08:15.699Z" },
-    { url = "https://files.pythonhosted.org/packages/bb/ec/400de7b3b7d48801908f19cf5462177104395799472671b3e8152b2b04ca/fastar-0.11.0-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:91f07b0b8eb67e2f177733a1f884edad7dfb9f8977ffef15927b20cb9604027d", size = 923669, upload-time = "2026-04-13T17:08:30.574Z" },
-    { url = "https://files.pythonhosted.org/packages/5d/01/8926c53da923fed7ab4b96e7fbf7f73b663beb4f02095b654d6fab46f9ad/fastar-0.11.0-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f85c896885eb4abf1a635d54dea22cac6ae48d04fc2ea26ae652fcf1febe1220", size = 815729, upload-time = "2026-04-13T17:09:01.204Z" },
-    { url = "https://files.pythonhosted.org/packages/89/f0/5fef4c7946e352651b504b1a4235dac3505e7cfd24020788ab50552e84bf/fastar-0.11.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:075c07095c8de4b774ba8f28b9c0a02b1a2cd254da50cbe464dd3bb2432e9158", size = 819812, upload-time = "2026-04-13T17:09:31.907Z" },
-    { url = "https://files.pythonhosted.org/packages/b3/c8/0ebc3298b4a45e7bddc50b169ae6a6f5b80c939394d4befe6e60de535ee7/fastar-0.11.0-cp314-cp314t-manylinux_2_31_riscv64.whl", hash = "sha256:07f028933820c65750baf3383b807ecce1cd9385cf00ce192b79d263ad6b856c", size = 884074, upload-time = "2026-04-13T17:08:45.802Z" },
-    { url = "https://files.pythonhosted.org/packages/ae/9f/7baa4cdff8d6fbca41fa5c764b48a941fed8a9ec6c4cc92de65895a28299/fastar-0.11.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:039f875efa0f01fa43c20bf4e2fc7305489c61d0ac76eda991acfba7820a0e63", size = 969450, upload-time = "2026-04-13T17:10:18.667Z" },
-    { url = "https://files.pythonhosted.org/packages/d4/dc/1ebbfb58a47056ba866494f19efbcdd2ba2897096b94f36e796594b4d05b/fastar-0.11.0-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:fff12452a9a5c6814a012445f26365541cc3d99dcca61f09762e6a389f7a32ea", size = 1033775, upload-time = "2026-04-13T17:10:36.165Z" },
-    { url = "https://files.pythonhosted.org/packages/c2/5f/ce4e3914066f08c99eb8c32952cc07c1a013e81b1db1b0f598130bf6b974/fastar-0.11.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:2bf733e09f942b6fa876efe30a90508d1f4caef5630c00fb2a84fba355873712", size = 1072158, upload-time = "2026-04-13T17:10:52.497Z" },
-    { url = "https://files.pythonhosted.org/packages/03/2a/6bca72992c84151c387cc6558f3867f5ebe5fb3684ee6fa9b76280ba4b8e/fastar-0.11.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:d1531fa848fdd3677d2dce0a4b436ea64d9ae38fb8babe2ddbc180dd153cb7a3", size = 1028577, upload-time = "2026-04-13T17:11:09.934Z" },
-]
-
 [[package]]
 name = "fastjsonschema"
 version = "2.21.2"
@@ -1700,8 +1481,8 @@ version = "2.8.3"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "einops", marker = "platform_machine != 'arm64' or sys_platform != 'darwin'" },
-    { name = "torch", version = "2.8.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" },
     { name = "torch", version = "2.11.0", source = { registry = "https://pypi.org/simple" }, marker = "(platform_machine != 'arm64' and sys_platform == 'darwin') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
+    { name = "torch", version = "2.11.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/3b/b2/8d76c41ad7974ee264754709c22963447f7f8134613fd9ce80984ed0dab7/flash_attn-2.8.3.tar.gz", hash = "sha256:1e71dd64a9e0280e0447b8a0c2541bad4bf6ac65bdeaa2f90e51a9e57de0370d", size = 8447812, upload-time = "2025-08-15T08:28:12.911Z" }
 
@@ -1763,15 +1544,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/cf/58/8acf1b3e91c58313ce5cb67df61001fc9dcd21be4fadb76c1a2d540e09ed/fqdn-1.5.1-py3-none-any.whl", hash = "sha256:3a179af3761e4df6eb2e026ff9e1a3033d3587bf980a0b1b2e1e5d08d7358014", size = 9121, upload-time = "2021-03-11T07:16:28.351Z" },
 ]
 
-[[package]]
-name = "frozendict"
-version = "2.4.7"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/90/b2/2a3d1374b7780999d3184e171e25439a8358c47b481f68be883c14086b4c/frozendict-2.4.7.tar.gz", hash = "sha256:e478fb2a1391a56c8a6e10cc97c4a9002b410ecd1ac28c18d780661762e271bd", size = 317082, upload-time = "2025-11-11T22:40:14.251Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/38/74/f94141b38a51a553efef7f510fc213894161ae49b88bffd037f8d2a7cb2f/frozendict-2.4.7-py3-none-any.whl", hash = "sha256:972af65924ea25cf5b4d9326d549e69a9a4918d8a76a9d3a7cd174d98b237550", size = 16264, upload-time = "2025-11-11T22:40:12.836Z" },
-]
-
 [[package]]
 name = "frozenlist"
 version = "1.8.0"
@@ -1884,21 +1656,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/da/71/ae30dadffc90b9006d77af76b393cb9dfbfc9629f339fc1574a1c52e6806/future-1.0.0-py3-none-any.whl", hash = "sha256:929292d34f5872e70396626ef385ec22355a1fae8ad29e1a734c3e43f9fbc216", size = 491326, upload-time = "2024-02-21T11:52:35.956Z" },
 ]
 
-[[package]]
-name = "gguf"
-version = "0.19.0"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "numpy", marker = "sys_platform == 'linux'" },
-    { name = "pyyaml", marker = "sys_platform == 'linux'" },
-    { name = "requests", marker = "sys_platform == 'linux'" },
-    { name = "tqdm", marker = "sys_platform == 'linux'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/48/ae/17f1308ae45cd7b08ebb521747d5b23f4efc4d172038a4e228dd5106c3ff/gguf-0.19.0.tar.gz", hash = "sha256:dbadcd6cc7ccd44256f2229fe7c2dff5e8aa5cf0612ab987fd2b1a57e428923f", size = 111220, upload-time = "2026-05-06T13:04:03.667Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/b3/bb/d71d6da82763528c2c2ed6b59a9d6142c6595545a4c448e2085d155e88c2/gguf-0.19.0-py3-none-any.whl", hash = "sha256:70bcd10edfe697fb2dad6e40af2234b9d8ece9a41a99761405121ebda1c3c1cd", size = 118475, upload-time = "2026-05-06T13:04:02.588Z" },
-]
-
 [[package]]
 name = "gitdb"
 version = "4.0.12"
@@ -2444,15 +2201,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" },
 ]
 
-[[package]]
-name = "interegular"
-version = "0.3.3"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/dc/9d/8b6dde58a028a3962ce17e84d5fe73758df61378e00ef8ac3d85da34b0ff/interegular-0.3.3.tar.gz", hash = "sha256:d9b697b21b34884711399ba0f0376914b81899ce670032486d0d048344a76600", size = 24705, upload-time = "2024-01-06T23:01:22.372Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/c4/01/72d6472f80651673716d1deda2a5bbb633e563ecf94f4479da5519d69d25/interegular-0.3.3-py37-none-any.whl", hash = "sha256:b0c07007d48c89d6d19f7204972d369b2a77222722e126b6aa63aa721dc3b19c", size = 23635, upload-time = "2024-01-06T23:01:20.829Z" },
-]
-
 [[package]]
 name = "ipykernel"
 version = "6.31.0"
@@ -3131,10 +2879,10 @@ dependencies = [
     { name = "setuptools", version = "79.0.1", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux'" },
     { name = "setuptools", version = "80.10.2", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux'" },
     { name = "termcolor" },
-    { name = "torch", version = "2.8.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" },
     { name = "torch", version = "2.11.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux'" },
-    { name = "torchvision", version = "0.23.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" },
+    { name = "torch", version = "2.11.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" },
     { name = "torchvision", version = "0.26.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux'" },
+    { name = "torchvision", version = "0.26.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" },
     { name = "tqdm" },
 ]
 
@@ -3219,7 +2967,6 @@ annotations = [
     { name = "pyarrow" },
     { name = "torchcodec", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'AMD64' and sys_platform == 'linux') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'arm64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or sys_platform == 'win32'" },
     { name = "transformers" },
-    { name = "vllm", marker = "sys_platform == 'linux'" },
 ]
 async = [
     { name = "contourpy" },
@@ -3736,7 +3483,6 @@ requires-dist = [
     { name = "torchvision", marker = "sys_platform == 'linux'", specifier = ">=0.22.0,<0.27.0", index = "https://download.pytorch.org/whl/cu128" },
     { name = "tqdm", specifier = ">=4.66.0,<5.0.0" },
     { name = "transformers", marker = "extra == 'transformers-dep'", specifier = ">=5.4.0,<5.6.0" },
-    { name = "vllm", marker = "sys_platform == 'linux' and extra == 'annotations'", specifier = ">=0.6.0,<1.0.0" },
     { name = "wandb", marker = "extra == 'training'", specifier = ">=0.24.0,<0.25.0" },
 ]
 provides-extras = ["dataset", "training", "hardware", "viz", "core-scripts", "evaluation", "dataset-viz", "av-dep", "pygame-dep", "placo-dep", "transformers-dep", "grpcio-dep", "can-dep", "peft-dep", "scipy-dep", "diffusers-dep", "qwen-vl-utils-dep", "matplotlib-dep", "pyserial-dep", "deepdiff-dep", "pynput-dep", "pyzmq-dep", "motorbridge-dep", "motorbridge-smart-servo-dep", "feetech", "dynamixel", "damiao", "robstride", "openarms", "gamepad", "hopejr", "lekiwi", "unitree-g1", "reachy2", "rebot", "kinematics", "intelrealsense", "phone", "diffusion", "wallx", "pi", "molmoact2", "smolvla", "multi-task-dit", "groot", "sarm", "robometer", "topreward", "xvla", "eo1", "hilserl", "async", "peft", "annotations", "dev", "notebook", "test", "video-benchmark", "aloha", "pusht", "libero", "metaworld", "all"]
@@ -3801,16 +3547,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/ce/62/b40b382fa0c66fee1478073eb8db352a4a6beda4a1adccf1df911d8c289c/librt-0.11.0-cp314-cp314t-win_arm64.whl", hash = "sha256:dee008f20b542e3cd162ba338a7f9ec0f6d23d395f66fe8aeeec3c9d067ea253", size = 102572, upload-time = "2026-05-10T18:17:06.809Z" },
 ]
 
-[[package]]
-name = "llguidance"
-version = "0.7.30"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/bf/38/d1ef3ae08d8d857e5e0690c5b1e07bf7eb4a1cae5881d87215826dc6cadb/llguidance-0.7.30.tar.gz", hash = "sha256:e93bf75f2b6e48afb86a5cee23038746975e1654672bf5ba0ae75f7d4d4a2248", size = 1055528, upload-time = "2025-06-23T00:23:49.247Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/9c/5b/6a166564b14f9f805f0ea01ec233a84f55789cb7eeffe1d6224ccd0e6cdd/llguidance-0.7.30-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:af8741c867e4bc7e42f7cdc68350c076b4edd0ca10ecefbde75f15a9f6bc25d0", size = 14867038, upload-time = "2025-06-23T00:23:39.571Z" },
-    { url = "https://files.pythonhosted.org/packages/af/80/5a40b9689f17612434b820854cba9b8cabd5142072c491b5280fe5f7a35e/llguidance-0.7.30-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9edc409b9decd6cffba5f5bf3b4fbd7541f95daa8cbc9510cbf96c6ab1ffc153", size = 15004926, upload-time = "2025-06-23T00:23:43.965Z" },
-]
-
 [[package]]
 name = "llvmlite"
 version = "0.44.0"
@@ -3823,21 +3559,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/d8/e1/12c5f20cb9168fb3464a34310411d5ad86e4163c8ff2d14a2b57e5cc6bac/llvmlite-0.44.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:aa0097052c32bf721a4efc03bd109d335dfa57d9bffb3d4c24cc680711b8b4fc", size = 41184245, upload-time = "2025-01-20T11:14:31.731Z" },
 ]
 
-[[package]]
-name = "lm-format-enforcer"
-version = "0.11.3"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "interegular", marker = "sys_platform == 'linux'" },
-    { name = "packaging", marker = "sys_platform == 'linux'" },
-    { name = "pydantic", marker = "sys_platform == 'linux'" },
-    { name = "pyyaml", marker = "sys_platform == 'linux'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/84/d5/41cd417ba7dfdbbcfe46cebf81fb3dfd7c591b89897560ad05bb410a465d/lm_format_enforcer-0.11.3.tar.gz", hash = "sha256:e68081c108719cce284a9bcc889709b26ffb085a1945b5eba3a12cfa96d528da", size = 40258, upload-time = "2025-08-24T19:37:47.527Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/a0/ef/11292bb0b85cf4c93447cab5a29f64576ed14d3ab4280e35ddd23486594a/lm_format_enforcer-0.11.3-py3-none-any.whl", hash = "sha256:cf586350875def1ae7a8fba84fcbbfc8371424b6c9d05c1fcba70aa233fbf06f", size = 45418, upload-time = "2025-08-24T19:37:46.325Z" },
-]
-
 [[package]]
 name = "lxml"
 version = "6.1.0"
@@ -4132,34 +3853,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/29/e1/c758357e82c2deb2401a7214fbfbc6ddf09b5453371dee1b7a2da0aab274/metaworld-3.0.0-py3-none-any.whl", hash = "sha256:f1dd9f8a1bcceab34a5f3c20113724dd90d21984ae89df98c21c842eb3ece137", size = 36660341, upload-time = "2025-06-14T01:44:32.171Z" },
 ]
 
-[[package]]
-name = "mistral-common"
-version = "1.11.2"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "jsonschema", marker = "sys_platform == 'linux'" },
-    { name = "numpy", marker = "sys_platform == 'linux'" },
-    { name = "pillow", marker = "sys_platform == 'linux'" },
-    { name = "pydantic", marker = "sys_platform == 'linux'" },
-    { name = "pydantic-extra-types", extra = ["pycountry"], marker = "sys_platform == 'linux'" },
-    { name = "requests", marker = "sys_platform == 'linux'" },
-    { name = "tiktoken", marker = "sys_platform == 'linux'" },
-    { name = "typing-extensions", marker = "sys_platform == 'linux'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/c2/eb/12167a1bea9714582e5b4f539f9c019323363e314a499c72855ff0e5ad43/mistral_common-1.11.2.tar.gz", hash = "sha256:79f68fc2d1190f28637f40e053f919c8c2697e00b2aa679ddee562a95183f4ad", size = 6357845, upload-time = "2026-05-04T19:47:40.413Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/47/f0/6a5d604b972e442b9d36c117d01788feddad099e4965699e3516ee6fefc3/mistral_common-1.11.2-py3-none-any.whl", hash = "sha256:ebb42062cd705a0aa2bc69b4cde2b83d446ae58150b7e29322c90cb08fcfca6c", size = 6531968, upload-time = "2026-05-04T19:47:37.718Z" },
-]
-
-[package.optional-dependencies]
-audio = [
-    { name = "soundfile", marker = "sys_platform == 'linux'" },
-    { name = "soxr", marker = "sys_platform == 'linux'" },
-]
-image = [
-    { name = "opencv-python-headless", marker = "sys_platform == 'linux'" },
-]
-
 [[package]]
 name = "mistune"
 version = "3.2.1"
@@ -4252,54 +3945,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c", size = 536198, upload-time = "2023-03-07T16:47:09.197Z" },
 ]
 
-[[package]]
-name = "msgpack"
-version = "1.1.2"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/4d/f2/bfb55a6236ed8725a96b0aa3acbd0ec17588e6a2c3b62a93eb513ed8783f/msgpack-1.1.2.tar.gz", hash = "sha256:3b60763c1373dd60f398488069bcdc703cd08a711477b5d480eecc9f9626f47e", size = 173581, upload-time = "2025-10-08T09:15:56.596Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/f2/60/a064b0345fc36c4c3d2c743c82d9100c40388d77f0b48b2f04d6041dbec1/msgpack-1.1.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c63eea553c69ab05b6747901b97d620bb2a690633c77f23feb0c6a947a8a7b8f", size = 417131, upload-time = "2025-10-08T09:15:05.136Z" },
-    { url = "https://files.pythonhosted.org/packages/65/92/a5100f7185a800a5d29f8d14041f61475b9de465ffcc0f3b9fba606e4505/msgpack-1.1.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:372839311ccf6bdaf39b00b61288e0557916c3729529b301c52c2d88842add42", size = 427556, upload-time = "2025-10-08T09:15:06.837Z" },
-    { url = "https://files.pythonhosted.org/packages/f5/87/ffe21d1bf7d9991354ad93949286f643b2bb6ddbeab66373922b44c3b8cc/msgpack-1.1.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2929af52106ca73fcb28576218476ffbb531a036c2adbcf54a3664de124303e9", size = 404920, upload-time = "2025-10-08T09:15:08.179Z" },
-    { url = "https://files.pythonhosted.org/packages/ff/41/8543ed2b8604f7c0d89ce066f42007faac1eaa7d79a81555f206a5cdb889/msgpack-1.1.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:be52a8fc79e45b0364210eef5234a7cf8d330836d0a64dfbb878efa903d84620", size = 415013, upload-time = "2025-10-08T09:15:09.83Z" },
-    { url = "https://files.pythonhosted.org/packages/d3/68/93180dce57f684a61a88a45ed13047558ded2be46f03acb8dec6d7c513af/msgpack-1.1.2-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1fdf7d83102bf09e7ce3357de96c59b627395352a4024f6e2458501f158bf999", size = 412721, upload-time = "2025-10-08T09:15:16.567Z" },
-    { url = "https://files.pythonhosted.org/packages/5d/ba/459f18c16f2b3fc1a1ca871f72f07d70c07bf768ad0a507a698b8052ac58/msgpack-1.1.2-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fac4be746328f90caa3cd4bc67e6fe36ca2bf61d5c6eb6d895b6527e3f05071e", size = 424657, upload-time = "2025-10-08T09:15:17.825Z" },
-    { url = "https://files.pythonhosted.org/packages/38/f8/4398c46863b093252fe67368b44edc6c13b17f4e6b0e4929dbf0bdb13f23/msgpack-1.1.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:fffee09044073e69f2bad787071aeec727183e7580443dfeb8556cbf1978d162", size = 402668, upload-time = "2025-10-08T09:15:19.003Z" },
-    { url = "https://files.pythonhosted.org/packages/28/ce/698c1eff75626e4124b4d78e21cca0b4cc90043afb80a507626ea354ab52/msgpack-1.1.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5928604de9b032bc17f5099496417f113c45bc6bc21b5c6920caf34b3c428794", size = 419040, upload-time = "2025-10-08T09:15:20.183Z" },
-    { url = "https://files.pythonhosted.org/packages/8e/a9/3536e385167b88c2cc8f4424c49e28d49a6fc35206d4a8060f136e71f94c/msgpack-1.1.2-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:99e2cb7b9031568a2a5c73aa077180f93dd2e95b4f8d3b8e14a73ae94a9e667e", size = 411885, upload-time = "2025-10-08T09:15:27.22Z" },
-    { url = "https://files.pythonhosted.org/packages/2f/40/dc34d1a8d5f1e51fc64640b62b191684da52ca469da9cd74e84936ffa4a6/msgpack-1.1.2-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:180759d89a057eab503cf62eeec0aa61c4ea1200dee709f3a8e9397dbb3b6931", size = 419658, upload-time = "2025-10-08T09:15:28.4Z" },
-    { url = "https://files.pythonhosted.org/packages/3b/ef/2b92e286366500a09a67e03496ee8b8ba00562797a52f3c117aa2b29514b/msgpack-1.1.2-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:04fb995247a6e83830b62f0b07bf36540c213f6eac8e851166d8d86d83cbd014", size = 403290, upload-time = "2025-10-08T09:15:29.764Z" },
-    { url = "https://files.pythonhosted.org/packages/78/90/e0ea7990abea5764e4655b8177aa7c63cdfa89945b6e7641055800f6c16b/msgpack-1.1.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:8e22ab046fa7ede9e36eeb4cfad44d46450f37bb05d5ec482b02868f451c95e2", size = 415234, upload-time = "2025-10-08T09:15:31.022Z" },
-    { url = "https://files.pythonhosted.org/packages/fc/6b/62e85ff7193663fbea5c0254ef32f0c77134b4059f8da89b958beb7696f3/msgpack-1.1.2-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5559d03930d3aa0f3aacb4c42c776af1a2ace2611871c84a75afe436695e6245", size = 435242, upload-time = "2025-10-08T09:15:37.647Z" },
-    { url = "https://files.pythonhosted.org/packages/c1/47/5c74ecb4cc277cf09f64e913947871682ffa82b3b93c8dad68083112f412/msgpack-1.1.2-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:70c5a7a9fea7f036b716191c29047374c10721c389c21e9ffafad04df8c52c90", size = 432509, upload-time = "2025-10-08T09:15:38.794Z" },
-    { url = "https://files.pythonhosted.org/packages/24/a4/e98ccdb56dc4e98c929a3f150de1799831c0a800583cde9fa022fa90602d/msgpack-1.1.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:f2cb069d8b981abc72b41aea1c580ce92d57c673ec61af4c500153a626cb9e20", size = 415957, upload-time = "2025-10-08T09:15:40.238Z" },
-    { url = "https://files.pythonhosted.org/packages/da/28/6951f7fb67bc0a4e184a6b38ab71a92d9ba58080b27a77d3e2fb0be5998f/msgpack-1.1.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:d62ce1f483f355f61adb5433ebfd8868c5f078d1a52d042b0a998682b4fa8c27", size = 422910, upload-time = "2025-10-08T09:15:41.505Z" },
-]
-
-[[package]]
-name = "msgspec"
-version = "0.21.1"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/e3/60/f79b9b013a16fa3a58350c9295ddc6789f2e335f36ea61ed10a21b215364/msgspec-0.21.1.tar.gz", hash = "sha256:2313508e394b0d208f8f56892ca9b2799e2561329de9763b19619595a6c0f72c", size = 319193, upload-time = "2026-04-12T21:44:50.394Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/8a/37/655101799590bcc5fddb2bd3fe0e6194e816c2d1da7c361725f5eb89a910/msgspec-0.21.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:846758412e9518252b2ac9bffd6f0e54d9ff614f5f9488df7749f81ff5c80920", size = 218871, upload-time = "2026-04-12T21:44:09.917Z" },
-    { url = "https://files.pythonhosted.org/packages/b5/d1/d4cd9fe89c7d400d7a18f86ccc94daa3f0927f53558846fcb60791dce5d6/msgspec-0.21.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:21995e74b5c598c2e004110ad66ec7f1b8c20bf2bcf3b2de8fd9a3094422d3ff", size = 225025, upload-time = "2026-04-12T21:44:11.191Z" },
-    { url = "https://files.pythonhosted.org/packages/24/bf/e20549e602b9edccadeeff98760345a416f9cce846a657e8b18e3396b212/msgspec-0.21.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6129f0cca52992e898fd5344187f7c8127b63d810b2fd73e36fca73b4c6475ee", size = 222672, upload-time = "2026-04-12T21:44:12.481Z" },
-    { url = "https://files.pythonhosted.org/packages/b4/68/04d7a8f0f786545cf9b8c280c57aa6befb5977af6e884b8b54191cbe44b3/msgspec-0.21.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ef3ec2296248d1f8b9231acb051b6d471dfde8f21819e86c9adaaa9f42918521", size = 227303, upload-time = "2026-04-12T21:44:13.709Z" },
-    { url = "https://files.pythonhosted.org/packages/ca/d9/9e9d7d7e5061b47540d03d640fab9b3965ba7ae49c1b2154861c8f007518/msgspec-0.21.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:48943e278b3854c2f89f955ddc6f9f430d3f0784b16e47d10604ee0463cd21f5", size = 218880, upload-time = "2026-04-12T21:44:20.028Z" },
-    { url = "https://files.pythonhosted.org/packages/74/66/2bb344f34abb4b57e60c7c9c761994e0417b9718ec1460bf00c296f2a7ea/msgspec-0.21.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a9aa659ebb0101b1cbc31461212b87e341d961f0ab0772aaf068a99e001ec4aa", size = 225050, upload-time = "2026-04-12T21:44:21.577Z" },
-    { url = "https://files.pythonhosted.org/packages/1a/84/7c1e412f76092277bf760cef12b7979d03314d259ab5b5cafde5d0c1722d/msgspec-0.21.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f7b27d1a8ead2b6f5b0c4f2d07b8be1ccfcc041c8a0e704781edebe3ae13c484", size = 222713, upload-time = "2026-04-12T21:44:22.83Z" },
-    { url = "https://files.pythonhosted.org/packages/4e/27/0bba04b2b4ef05f3d068429410bc71d2cea925f1596a8f41152cccd5edb8/msgspec-0.21.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:38fe93e86b61328fe544cb7fd871fad5a27c8734bfda90f65e5dbe288ae50f61", size = 227259, upload-time = "2026-04-12T21:44:24.11Z" },
-    { url = "https://files.pythonhosted.org/packages/85/7d/1e29a319d678d6cb962ae5bdf32a6858ebdf38f73bc654c0e9c742a0c2c8/msgspec-0.21.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:68604db36b3b4dd9bf160e436e12798a4738848144cea1aca1cb984011eb160f", size = 219866, upload-time = "2026-04-12T21:44:31.104Z" },
-    { url = "https://files.pythonhosted.org/packages/25/1f/cca084ca2572810fff12ea9dbdcbe39eac048f40daf4a9077b49fcbe8cee/msgspec-0.21.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3d6b9dc50948eaf65df54d2fd0ff66e6d8c32f116037209ee861810eb9b676cb", size = 224993, upload-time = "2026-04-12T21:44:32.649Z" },
-    { url = "https://files.pythonhosted.org/packages/71/94/d2120fc9d419a89a3a7c13e5b7078798c4b392a96a02a6e2b3ce43a8766c/msgspec-0.21.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:52c5e21930942302394429c5a582ce7e6b62c7f983b3760834c2ce107e0dd6df", size = 223535, upload-time = "2026-04-12T21:44:33.839Z" },
-    { url = "https://files.pythonhosted.org/packages/75/17/42418b66a3ad972a89bab73dd78b79cc6282bb488a25e73c853cee7443b9/msgspec-0.21.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:abbb39d65681fa24ed394e01af3d59d869068324f900c61d06062b7fb9980f2f", size = 227222, upload-time = "2026-04-12T21:44:35.093Z" },
-    { url = "https://files.pythonhosted.org/packages/a7/08/673a7bb05e5702dc787ddd3011195b509f9867927970da59052211929987/msgspec-0.21.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f60800e6299b798142dc40b0644da77ceac5ea0568be58228417eae14135c847", size = 226281, upload-time = "2026-04-12T21:44:42.181Z" },
-    { url = "https://files.pythonhosted.org/packages/7d/45/86508cf57283e9070b3c447e3ab25b792a7a0855a3ea4e0c6d111ac34c97/msgspec-0.21.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5f8e9dfcd98419cf7568808470c4317a3fb30bef0e3715b568730a2b272a20d7", size = 229863, upload-time = "2026-04-12T21:44:43.442Z" },
-    { url = "https://files.pythonhosted.org/packages/2c/62/e7c9367cd08d590559faacd711edbae36840342843e669440363f33c7d36/msgspec-0.21.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:92d89dfad13bd1ea640dc3e37e724ed380da1030b272bdf5ecafb983c3ad7c75", size = 230445, upload-time = "2026-04-12T21:44:44.806Z" },
-    { url = "https://files.pythonhosted.org/packages/42/b4/c0f54632103846b658a10930025f4de41c8724b5e4805a5f3b395586cb7e/msgspec-0.21.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:0d03867786e5d7ba25d666df4b11320c27170f4aeafcb8e3a8b0a50a4fb742ca", size = 231822, upload-time = "2026-04-12T21:44:46.343Z" },
-]
-
 [[package]]
 name = "mujoco"
 version = "3.8.1"
@@ -4706,6 +4351,7 @@ name = "nvidia-cublas-cu12"
 version = "12.8.4.1"
 source = { registry = "https://pypi.org/simple" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/29/99/db44d685f0e257ff0e213ade1964fc459b4a690a73293220e98feb3307cf/nvidia_cublas_cu12-12.8.4.1-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:b86f6dd8935884615a0683b663891d43781b819ac4f2ba2b0c9604676af346d0", size = 590537124, upload-time = "2025-03-07T01:43:53.556Z" },
     { url = "https://files.pythonhosted.org/packages/dc/61/e24b560ab2e2eaeb3c839129175fb330dfcfc29e5203196e5541a4c44682/nvidia_cublas_cu12-12.8.4.1-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:8ac4e771d5a348c551b2a426eda6193c19aa630236b418086020df5ba9667142", size = 594346921, upload-time = "2025-03-07T01:44:31.254Z" },
 ]
 
@@ -4714,6 +4360,7 @@ name = "nvidia-cuda-cupti-cu12"
 version = "12.8.90"
 source = { registry = "https://pypi.org/simple" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/d5/1f/b3bd73445e5cb342727fd24fe1f7b748f690b460acadc27ea22f904502c8/nvidia_cuda_cupti_cu12-12.8.90-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:4412396548808ddfed3f17a467b104ba7751e6b58678a4b840675c56d21cf7ed", size = 9533318, upload-time = "2025-03-07T01:40:10.421Z" },
     { url = "https://files.pythonhosted.org/packages/f8/02/2adcaa145158bf1a8295d83591d22e4103dbfd821bcaf6f3f53151ca4ffa/nvidia_cuda_cupti_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ea0cb07ebda26bb9b29ba82cda34849e73c166c18162d3913575b0c9db9a6182", size = 10248621, upload-time = "2025-03-07T01:40:21.213Z" },
 ]
 
@@ -4723,6 +4370,7 @@ version = "12.8.93"
 source = { registry = "https://pypi.org/simple" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/05/6b/32f747947df2da6994e999492ab306a903659555dddc0fbdeb9d71f75e52/nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:a7756528852ef889772a84c6cd89d41dfa74667e24cca16bb31f8f061e3e9994", size = 88040029, upload-time = "2025-03-07T01:42:13.562Z" },
+    { url = "https://files.pythonhosted.org/packages/eb/d1/e50d0acaab360482034b84b6e27ee83c6738f7d32182b987f9c7a4e32962/nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:fc1fec1e1637854b4c0a65fb9a8346b51dd9ee69e61ebaccc82058441f15bce8", size = 43106076, upload-time = "2025-03-07T01:41:59.817Z" },
 ]
 
 [[package]]
@@ -4730,18 +4378,20 @@ name = "nvidia-cuda-runtime-cu12"
 version = "12.8.90"
 source = { registry = "https://pypi.org/simple" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/7c/75/f865a3b236e4647605ea34cc450900854ba123834a5f1598e160b9530c3a/nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:52bf7bbee900262ffefe5e9d5a2a69a30d97e2bc5bb6cc866688caa976966e3d", size = 965265, upload-time = "2025-03-07T01:39:43.533Z" },
     { url = "https://files.pythonhosted.org/packages/0d/9b/a997b638fcd068ad6e4d53b8551a7d30fe8b404d6f1804abf1df69838932/nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:adade8dcbd0edf427b7204d480d6066d33902cab2a4707dcfc48a2d0fd44ab90", size = 954765, upload-time = "2025-03-07T01:40:01.615Z" },
 ]
 
 [[package]]
 name = "nvidia-cudnn-cu12"
-version = "9.10.2.21"
+version = "9.19.0.56"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "nvidia-cublas-cu12", marker = "(platform_machine == 'AMD64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
+    { name = "nvidia-cublas-cu12", marker = "sys_platform == 'linux'" },
 ]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/ba/51/e123d997aa098c61d029f76663dedbfb9bc8dcf8c60cbd6adbe42f76d049/nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:949452be657fa16687d0930933f032835951ef0892b37d2d53824d1a84dc97a8", size = 706758467, upload-time = "2025-06-06T21:54:08.597Z" },
+    { url = "https://files.pythonhosted.org/packages/09/b8/277c51962ee46fa3e5b203ac5f76107c650f781d6891e681e28e6f3e9fe6/nvidia_cudnn_cu12-9.19.0.56-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:08caaf27fe556aca82a3ee3b5aa49a77e7de0cfcb7ff4e5c29da426387a8267e", size = 656910700, upload-time = "2026-02-03T20:40:25.508Z" },
+    { url = "https://files.pythonhosted.org/packages/c5/41/65225d42fba06fb3dd3972485ea258e7dd07a40d6e01c95da6766ad87354/nvidia_cudnn_cu12-9.19.0.56-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:ac6ad90a075bb33a94f2b4cf4622eac13dd4dc65cf6dd9c7572a318516a36625", size = 657906812, upload-time = "2026-02-03T20:44:12.638Z" },
 ]
 
 [[package]]
@@ -4749,9 +4399,10 @@ name = "nvidia-cufft-cu12"
 version = "11.3.3.83"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine == 'AMD64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
+    { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux'" },
 ]
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/60/bc/7771846d3a0272026c416fbb7e5f4c1f146d6d80704534d0b187dd6f4800/nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:848ef7224d6305cdb2a4df928759dca7b1201874787083b6e7550dd6765ce69a", size = 193109211, upload-time = "2025-03-07T01:44:56.873Z" },
     { url = "https://files.pythonhosted.org/packages/1f/13/ee4e00f30e676b66ae65b4f08cb5bcbb8392c03f54f2d5413ea99a5d1c80/nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4d2dd21ec0b88cf61b62e6b43564355e5222e4a3fb394cac0db101f2dd0d4f74", size = 193118695, upload-time = "2025-03-07T01:45:27.821Z" },
 ]
 
@@ -4761,6 +4412,7 @@ version = "1.13.1.3"
 source = { registry = "https://pypi.org/simple" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/bb/fe/1bcba1dfbfb8d01be8d93f07bfc502c93fa23afa6fd5ab3fc7c1df71038a/nvidia_cufile_cu12-1.13.1.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1d069003be650e131b21c932ec3d8969c1715379251f8d23a1860554b1cb24fc", size = 1197834, upload-time = "2025-03-07T01:45:50.723Z" },
+    { url = "https://files.pythonhosted.org/packages/1e/f5/5607710447a6fe9fd9b3283956fceeee8a06cda1d2f56ce31371f595db2a/nvidia_cufile_cu12-1.13.1.3-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:4beb6d4cce47c1a0f1013d72e02b0994730359e17801d395bdcbf20cfb3bb00a", size = 1120705, upload-time = "2025-03-07T01:45:41.434Z" },
 ]
 
 [[package]]
@@ -4768,6 +4420,7 @@ name = "nvidia-curand-cu12"
 version = "10.3.9.90"
 source = { registry = "https://pypi.org/simple" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/45/5e/92aa15eca622a388b80fbf8375d4760738df6285b1e92c43d37390a33a9a/nvidia_curand_cu12-10.3.9.90-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:dfab99248034673b779bc6decafdc3404a8a6f502462201f2f31f11354204acd", size = 63625754, upload-time = "2025-03-07T01:46:10.735Z" },
     { url = "https://files.pythonhosted.org/packages/fb/aa/6584b56dc84ebe9cf93226a5cde4d99080c8e90ab40f0c27bda7a0f29aa1/nvidia_curand_cu12-10.3.9.90-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:b32331d4f4df5d6eefa0554c565b626c7216f87a06a4f56fab27c3b68a830ec9", size = 63619976, upload-time = "2025-03-07T01:46:23.323Z" },
 ]
 
@@ -4776,11 +4429,12 @@ name = "nvidia-cusolver-cu12"
 version = "11.7.3.90"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "nvidia-cublas-cu12", marker = "(platform_machine == 'AMD64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
-    { name = "nvidia-cusparse-cu12", marker = "(platform_machine == 'AMD64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
-    { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine == 'AMD64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
+    { name = "nvidia-cublas-cu12", marker = "sys_platform == 'linux'" },
+    { name = "nvidia-cusparse-cu12", marker = "sys_platform == 'linux'" },
+    { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux'" },
 ]
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/c8/32/f7cd6ce8a7690544d084ea21c26e910a97e077c9b7f07bf5de623ee19981/nvidia_cusolver_cu12-11.7.3.90-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:db9ed69dbef9715071232caa9b69c52ac7de3a95773c2db65bdba85916e4e5c0", size = 267229841, upload-time = "2025-03-07T01:46:54.356Z" },
     { url = "https://files.pythonhosted.org/packages/85/48/9a13d2975803e8cf2777d5ed57b87a0b6ca2cc795f9a4f59796a910bfb80/nvidia_cusolver_cu12-11.7.3.90-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:4376c11ad263152bd50ea295c05370360776f8c3427b30991df774f9fb26c450", size = 267506905, upload-time = "2025-03-07T01:47:16.273Z" },
 ]
 
@@ -4789,9 +4443,10 @@ name = "nvidia-cusparse-cu12"
 version = "12.5.8.93"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine == 'AMD64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
+    { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux'" },
 ]
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/bc/f7/cd777c4109681367721b00a106f491e0d0d15cfa1fd59672ce580ce42a97/nvidia_cusparse_cu12-12.5.8.93-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:9b6c161cb130be1a07a27ea6923df8141f3c295852f4b260c65f18f3e0a091dc", size = 288117129, upload-time = "2025-03-07T01:47:40.407Z" },
     { url = "https://files.pythonhosted.org/packages/c2/f5/e1854cb2f2bcd4280c44736c93550cc300ff4b8c95ebe370d0aa7d2b473d/nvidia_cusparse_cu12-12.5.8.93-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1ec05d76bbbd8b61b06a80e1eaf8cf4959c3d4ce8e711b65ebd0443bb0ebb13b", size = 288216466, upload-time = "2025-03-07T01:48:13.779Z" },
 ]
 
@@ -4800,15 +4455,17 @@ name = "nvidia-cusparselt-cu12"
 version = "0.7.1"
 source = { registry = "https://pypi.org/simple" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/73/b9/598f6ff36faaece4b3c50d26f50e38661499ff34346f00e057760b35cc9d/nvidia_cusparselt_cu12-0.7.1-py3-none-manylinux2014_aarch64.whl", hash = "sha256:8878dce784d0fac90131b6817b607e803c36e629ba34dc5b433471382196b6a5", size = 283835557, upload-time = "2025-02-26T00:16:54.265Z" },
     { url = "https://files.pythonhosted.org/packages/56/79/12978b96bd44274fe38b5dde5cfb660b1d114f70a65ef962bcbbed99b549/nvidia_cusparselt_cu12-0.7.1-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f1bb701d6b930d5a7cea44c19ceb973311500847f81b634d802b7b539dc55623", size = 287193691, upload-time = "2025-02-26T00:15:44.104Z" },
 ]
 
 [[package]]
 name = "nvidia-nccl-cu12"
-version = "2.27.3"
+version = "2.28.9"
 source = { registry = "https://pypi.org/simple" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/5c/5b/4e4fff7bad39adf89f735f2bc87248c81db71205b62bcc0d5ca5b606b3c3/nvidia_nccl_cu12-2.27.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:adf27ccf4238253e0b826bce3ff5fa532d65fc42322c8bfdfaf28024c0fbe039", size = 322364134, upload-time = "2025-06-03T21:58:04.013Z" },
+    { url = "https://files.pythonhosted.org/packages/08/c4/120d2dfd92dff2c776d68f361ff8705fdea2ca64e20b612fab0fd3f581ac/nvidia_nccl_cu12-2.28.9-py3-none-manylinux_2_18_aarch64.whl", hash = "sha256:50a36e01c4a090b9f9c47d92cec54964de6b9fcb3362d0e19b8ffc6323c21b60", size = 296766525, upload-time = "2025-11-18T05:49:16.094Z" },
+    { url = "https://files.pythonhosted.org/packages/4a/4e/44dbb46b3d1b0ec61afda8e84837870f2f9ace33c564317d59b70bc19d3e/nvidia_nccl_cu12-2.28.9-py3-none-manylinux_2_18_x86_64.whl", hash = "sha256:485776daa8447da5da39681af455aa3b2c2586ddcf4af8772495e7c532c7e5ab", size = 296782137, upload-time = "2025-11-18T05:49:34.248Z" },
 ]
 
 [[package]]
@@ -4817,6 +4474,16 @@ version = "12.8.93"
 source = { registry = "https://pypi.org/simple" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/f6/74/86a07f1d0f42998ca31312f998bd3b9a7eff7f52378f4f270c8679c77fb9/nvidia_nvjitlink_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:81ff63371a7ebd6e6451970684f916be2eab07321b73c9d244dc2b4da7f73b88", size = 39254836, upload-time = "2025-03-07T01:49:55.661Z" },
+    { url = "https://files.pythonhosted.org/packages/2a/a2/8cee5da30d13430e87bf99bb33455d2724d0a4a9cb5d7926d80ccb96d008/nvidia_nvjitlink_cu12-12.8.93-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:adccd7161ace7261e01bb91e44e88da350895c270d23f744f0820c818b7229e7", size = 38386204, upload-time = "2025-03-07T01:49:43.612Z" },
+]
+
+[[package]]
+name = "nvidia-nvshmem-cu12"
+version = "3.4.5"
+source = { registry = "https://pypi.org/simple" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/1d/6a/03aa43cc9bd3ad91553a88b5f6fb25ed6a3752ae86ce2180221962bc2aa5/nvidia_nvshmem_cu12-3.4.5-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0b48363fc6964dede448029434c6abed6c5e37f823cb43c3bcde7ecfc0457e15", size = 138936938, upload-time = "2025-09-06T00:32:05.589Z" },
+    { url = "https://files.pythonhosted.org/packages/b5/09/6ea3ea725f82e1e76684f0708bbedd871fc96da89945adeba65c3835a64c/nvidia_nvshmem_cu12-3.4.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:042f2500f24c021db8a06c5eec2539027d57460e1c1a762055a6554f72c369bd", size = 139103095, upload-time = "2025-09-06T00:32:31.266Z" },
 ]
 
 [[package]]
@@ -4824,6 +4491,7 @@ name = "nvidia-nvtx-cu12"
 version = "12.8.90"
 source = { registry = "https://pypi.org/simple" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/10/c0/1b303feea90d296f6176f32a2a70b5ef230f9bdeb3a72bddb0dc922dc137/nvidia_nvtx_cu12-12.8.90-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d7ad891da111ebafbf7e015d34879f7112832fc239ff0d7d776b6cb685274615", size = 91161, upload-time = "2025-03-07T01:42:23.922Z" },
     { url = "https://files.pythonhosted.org/packages/a2/eb/86626c1bbc2edb86323022371c39aa48df6fd8b0a1647bc274577f72e90b/nvidia_nvtx_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5b17e2001cc0d751a5bc2c6ec6d26ad95913324a4adb86788c944f8ce9ba441f", size = 89954, upload-time = "2025-03-07T01:42:44.131Z" },
 ]
 
@@ -4921,26 +4589,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/1d/2a/7dd3d207ec669cacc1f186fd856a0f61dbc255d24f6fdc1a6715d6051b0f/openai-1.109.1-py3-none-any.whl", hash = "sha256:6bcaf57086cf59159b8e27447e4e7dd019db5d29a438072fbd49c290c7e65315", size = 948627, upload-time = "2025-09-24T13:00:50.754Z" },
 ]
 
-[[package]]
-name = "openai-harmony"
-version = "0.0.8"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "pydantic", marker = "sys_platform == 'linux'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/3e/92/2d038d096f29179c7c9571b431f9e739f87a487121901725e23fe338dd9d/openai_harmony-0.0.8.tar.gz", hash = "sha256:6e43f98e6c242fa2de6f8ea12eab24af63fa2ed3e89c06341fb9d92632c5cbdf", size = 284777, upload-time = "2025-11-05T19:07:06.727Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/d3/d2/ce6953ca87db9cae3e775024184da7d1c5cb88cead19a2d75b42f00a959c/openai_harmony-0.0.8-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e4f709815924ec325b9a890e6ab2bbb0ceec8e319a4e257328eb752cf36b2efc", size = 2948463, upload-time = "2025-11-05T19:06:48.17Z" },
-    { url = "https://files.pythonhosted.org/packages/fa/4c/b553c9651662d6ce102ca7f3629d268b23df1abe5841e24bed81e8a8e949/openai_harmony-0.0.8-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5cfcfd963b50a41fc656c84d3440ca6eecdccd6c552158ce790b8f2e33dfb5a9", size = 2704083, upload-time = "2025-11-05T19:06:50.205Z" },
-    { url = "https://files.pythonhosted.org/packages/9b/af/4eec8f9ab9c27bcdb444460c72cf43011d176fc44c79d6e113094ca1e152/openai_harmony-0.0.8-cp38-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a3a16972aa1cee38ea958470cd04ac9a2d5ac38fdcf77ab686611246220c158", size = 2959765, upload-time = "2025-11-05T19:06:53.62Z" },
-    { url = "https://files.pythonhosted.org/packages/11/3c/33f3374e4624e0e776f6b13b73c45a7ead7f9c4529f8369ed5bfcaa30cac/openai_harmony-0.0.8-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b4d5cfa168e74d08f8ba6d58a7e49bc7daef4d58951ec69b66b0d56f4927a68d", size = 3427031, upload-time = "2025-11-05T19:06:51.829Z" },
-    { url = "https://files.pythonhosted.org/packages/25/3f/1a192b93bb47c6b44cd98ba8cc1d3d2a9308f1bb700c3017e6352da11bda/openai_harmony-0.0.8-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c007d277218a50db8839e599ed78e0fffe5130f614c3f6d93ae257f282071a29", size = 2953260, upload-time = "2025-11-05T19:06:55.406Z" },
-    { url = "https://files.pythonhosted.org/packages/5b/f8/93b582cad3531797c3db7c2db5400fd841538ccddfd9f5e3df61be99a630/openai_harmony-0.0.8-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:8565d4f5a0638da1bffde29832ed63c9e695c558611053add3b2dc0b56c92dbc", size = 3127044, upload-time = "2025-11-05T19:06:59.553Z" },
-    { url = "https://files.pythonhosted.org/packages/1d/10/4327dbf87f75ae813405fd9a9b4a5cde63d506ffed0a096a440a4cabd89c/openai_harmony-0.0.8-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:cbaa3bda75ef0d8836e1f8cc84af62f971b1d756d740efc95c38c3e04c0bfde2", size = 2932931, upload-time = "2025-11-05T19:07:01.437Z" },
-    { url = "https://files.pythonhosted.org/packages/8a/c8/1774eec4f6f360ef57618fb8f52e3d3af245b2491bd0297513aa09eec04b/openai_harmony-0.0.8-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:772922a9bd24e133950fad71eb1550836f415a88e8c77870e12d0c3bd688ddc2", size = 2996140, upload-time = "2025-11-05T19:07:03.438Z" },
-    { url = "https://files.pythonhosted.org/packages/60/c3/3d1e01e2dba517a91760e4a03e4f20ffc75039a6fe584d0e6f9b5c78fd15/openai_harmony-0.0.8-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:007b0476a1f331f8130783f901f1da6f5a7057af1a4891f1b6a31dec364189b5", size = 3205080, upload-time = "2025-11-05T19:07:05.078Z" },
-]
-
 [[package]]
 name = "opencv-python"
 version = "4.13.0.92"
@@ -4986,18 +4634,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/12/27/fb8d7338b4d551900fa3e580acbe7a0cf655d940e164cb5c00ec31961094/orderly_set-5.5.0-py3-none-any.whl", hash = "sha256:46f0b801948e98f427b412fcabb831677194c05c3b699b80de260374baa0b1e7", size = 13068, upload-time = "2025-07-10T20:10:54.377Z" },
 ]
 
-[[package]]
-name = "outlines-core"
-version = "0.2.11"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/1a/d3/e04e9145f8f806723dec9b9e5227ad695a3efcd3ced7794cf7c22b15df5e/outlines_core-0.2.11.tar.gz", hash = "sha256:dfce56f717ff5083e54cbcfdb66cad243365437fccbb5509adaa7e31e030f1d8", size = 197263, upload-time = "2025-05-19T10:12:51.719Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/92/c7/a65d1fddf49830ebc41422294eacde35286d9f68994a8aa905cb14f5aade/outlines_core-0.2.11-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86df9740368866295077346440d911df4972da2b3f1f54b8125e6f329e8a8891", size = 2287677, upload-time = "2025-05-19T10:12:24.24Z" },
-    { url = "https://files.pythonhosted.org/packages/23/79/8795aed8be9b77dd69d78e7cfbfcf28c179e6b08da6e56bbbf48a09fe55f/outlines_core-0.2.11-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:96ce4dd78f106799be4a0a5795cefd1352806162973756a4b6fce4bb6eddd7e4", size = 2113000, upload-time = "2025-05-19T10:12:25.446Z" },
-    { url = "https://files.pythonhosted.org/packages/87/96/7dcdc5198844145ab35528f9f93a58c3d47b87e54d0f79357c631d7b7a9a/outlines_core-0.2.11-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:daef6eaaf8c3403455ab5cbf265cb5c6838df571eb7c4b23cddac19cfc701726", size = 2287320, upload-time = "2025-05-19T10:12:35.515Z" },
-    { url = "https://files.pythonhosted.org/packages/4d/68/b420b6a3beaadbf8e9f2a82132120027efd6424634013fbeca8c2fed7467/outlines_core-0.2.11-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:76b2512417c68863f8f227a080e87f755682dfd895e23b021121318be11da579", size = 2112861, upload-time = "2025-05-19T10:12:36.742Z" },
-]
-
 [[package]]
 name = "packaging"
 version = "25.0"
@@ -5072,15 +4708,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/99/5d/8268b644392ee874ee82a635cd0df1773de230bde356c38de28e298392cc/parso-0.8.7-py2.py3-none-any.whl", hash = "sha256:a8926eb2a1b915486941fdbd31e86a4baf88fe8c210f25f2f35ecec5b574ca1c", size = 107025, upload-time = "2026-05-01T23:12:58.867Z" },
 ]
 
-[[package]]
-name = "partial-json-parser"
-version = "0.2.1.1.post7"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/6a/6d/eed37d7ebc1e0bcd27b831c0cf1fe94881934316187c4b30d23f29ea0bd4/partial_json_parser-0.2.1.1.post7.tar.gz", hash = "sha256:86590e1ba6bcb6739a2dfc17d2323f028cb5884f4c6ce23db376999132c9a922", size = 10296, upload-time = "2025-11-17T07:27:41.202Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/42/32/658973117bf0fd82a24abbfb94fe73a5e86216e49342985e10acce54775a/partial_json_parser-0.2.1.1.post7-py3-none-any.whl", hash = "sha256:145119e5eabcf80cbb13844a6b50a85c68bf99d376f8ed771e2a3c3b03e653ae", size = 10877, upload-time = "2025-11-17T07:27:40.457Z" },
-]
-
 [[package]]
 name = "pathspec"
 version = "1.1.1"
@@ -5102,8 +4729,8 @@ dependencies = [
     { name = "psutil" },
     { name = "pyyaml" },
     { name = "safetensors" },
-    { name = "torch", version = "2.8.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" },
     { name = "torch", version = "2.11.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux'" },
+    { name = "torch", version = "2.11.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" },
     { name = "tqdm" },
     { name = "transformers" },
 ]
@@ -5282,19 +4909,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/8d/9b/d4b1e644385499c8346fa9b622a3f030dce14cd6ef8a1871c221a17a67e7/prometheus_client-0.25.0-py3-none-any.whl", hash = "sha256:d5aec89e349a6ec230805d0df882f3807f74fd6c1a2fa86864e3c2279059fed1", size = 64154, upload-time = "2026-04-09T19:53:41.324Z" },
 ]
 
-[[package]]
-name = "prometheus-fastapi-instrumentator"
-version = "7.1.0"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "prometheus-client", marker = "sys_platform == 'linux'" },
-    { name = "starlette", version = "0.52.1", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/69/6d/24d53033cf93826aa7857699a4450c1c67e5b9c710e925b1ed2b320c04df/prometheus_fastapi_instrumentator-7.1.0.tar.gz", hash = "sha256:be7cd61eeea4e5912aeccb4261c6631b3f227d8924542d79eaf5af3f439cbe5e", size = 20220, upload-time = "2025-03-19T19:35:05.351Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/27/72/0824c18f3bc75810f55dacc2dd933f6ec829771180245ae3cc976195dec0/prometheus_fastapi_instrumentator-7.1.0-py3-none-any.whl", hash = "sha256:978130f3c0bb7b8ebcc90d35516a6fe13e02d2eb358c8f83887cdef7020c31e9", size = 19296, upload-time = "2025-03-19T19:35:04.323Z" },
-]
-
 [[package]]
 name = "prompt-toolkit"
 version = "3.0.52"
@@ -5461,15 +5075,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0", size = 11842, upload-time = "2024-07-21T12:58:20.04Z" },
 ]
 
-[[package]]
-name = "py-cpuinfo"
-version = "9.0.0"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/37/a8/d832f7293ebb21690860d2e01d8115e5ff6f2ae8bbdc953f0eb0fa4bd2c7/py-cpuinfo-9.0.0.tar.gz", hash = "sha256:3cdbbf3fac90dc6f118bfd64384f309edeadd902d7c8fb17f02ffa1fc3f49690", size = 104716, upload-time = "2022-10-25T20:38:06.303Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/e0/a9/023730ba63db1e494a271cb018dcd361bd2c917ba7004c3e49d5daf795a2/py_cpuinfo-9.0.0-py3-none-any.whl", hash = "sha256:859625bc251f64e21f077d099d4162689c762b5d6a4c3c97553d56241c9674d5", size = 22335, upload-time = "2022-10-25T20:38:27.636Z" },
-]
-
 [[package]]
 name = "pyarrow"
 version = "24.0.0"
@@ -5513,103 +5118,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/51/be/6f79d55816d5c22557cf27533543d5d70dfe692adfbee4b99f2760674f38/pyarrow-24.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:c91d00057f23b8d353039520dc3a6c09d8608164c692e9f59a175a42b2ae0c19", size = 28131282, upload-time = "2026-04-21T10:51:16.815Z" },
 ]
 
-[[package]]
-name = "pybase64"
-version = "1.4.3"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/aa/b8/4ed5c7ad5ec15b08d35cc79ace6145d5c1ae426e46435f4987379439dfea/pybase64-1.4.3.tar.gz", hash = "sha256:c2ed274c9e0ba9c8f9c4083cfe265e66dd679126cd9c2027965d807352f3f053", size = 137272, upload-time = "2025-12-06T13:27:04.013Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/43/1b/9a8cab0042b464e9a876d5c65fe5127445a2436da36fda64899b119b1a1b/pybase64-1.4.3-cp312-cp312-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:f0b3f200c3e06316f6bebabd458b4e4bcd4c2ca26af7c0c766614d91968dee27", size = 68210, upload-time = "2025-12-06T13:23:18.813Z" },
-    { url = "https://files.pythonhosted.org/packages/62/f7/965b79ff391ad208b50e412b5d3205ccce372a2d27b7218ae86d5295b105/pybase64-1.4.3-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:bb632edfd132b3eaf90c39c89aa314beec4e946e210099b57d40311f704e11d4", size = 71599, upload-time = "2025-12-06T13:23:20.195Z" },
-    { url = "https://files.pythonhosted.org/packages/03/4b/a3b5175130b3810bbb8ccfa1edaadbd3afddb9992d877c8a1e2f274b476e/pybase64-1.4.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:356ef1d74648ce997f5a777cf8f1aefecc1c0b4fe6201e0ef3ec8a08170e1b54", size = 59922, upload-time = "2025-12-06T13:23:21.487Z" },
-    { url = "https://files.pythonhosted.org/packages/da/5d/c38d1572027fc601b62d7a407721688b04b4d065d60ca489912d6893e6cf/pybase64-1.4.3-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.whl", hash = "sha256:c48361f90db32bacaa5518419d4eb9066ba558013aaf0c7781620279ecddaeb9", size = 56712, upload-time = "2025-12-06T13:23:22.77Z" },
-    { url = "https://files.pythonhosted.org/packages/e7/d4/4e04472fef485caa8f561d904d4d69210a8f8fc1608ea15ebd9012b92655/pybase64-1.4.3-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:702bcaa16ae02139d881aeaef5b1c8ffb4a3fae062fe601d1e3835e10310a517", size = 59300, upload-time = "2025-12-06T13:23:24.543Z" },
-    { url = "https://files.pythonhosted.org/packages/86/e7/16e29721b86734b881d09b7e23dfd7c8408ad01a4f4c7525f3b1088e25ec/pybase64-1.4.3-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:53d0ffe1847b16b647c6413d34d1de08942b7724273dd57e67dcbdb10c574045", size = 60278, upload-time = "2025-12-06T13:23:25.608Z" },
-    { url = "https://files.pythonhosted.org/packages/b1/02/18515f211d7c046be32070709a8efeeef8a0203de4fd7521e6b56404731b/pybase64-1.4.3-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:9a1792e8b830a92736dae58f0c386062eb038dfe8004fb03ba33b6083d89cd43", size = 54817, upload-time = "2025-12-06T13:23:26.633Z" },
-    { url = "https://files.pythonhosted.org/packages/e7/be/14e29d8e1a481dbff151324c96dd7b5d2688194bb65dc8a00ca0e1ad1e86/pybase64-1.4.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1d468b1b1ac5ad84875a46eaa458663c3721e8be5f155ade356406848d3701f6", size = 58611, upload-time = "2025-12-06T13:23:27.684Z" },
-    { url = "https://files.pythonhosted.org/packages/b4/8a/a2588dfe24e1bbd742a554553778ab0d65fdf3d1c9a06d10b77047d142aa/pybase64-1.4.3-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:e97b7bdbd62e71898cd542a6a9e320d9da754ff3ebd02cb802d69087ee94d468", size = 52404, upload-time = "2025-12-06T13:23:28.714Z" },
-    { url = "https://files.pythonhosted.org/packages/27/fc/afcda7445bebe0cbc38cafdd7813234cdd4fc5573ff067f1abf317bb0cec/pybase64-1.4.3-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:b33aeaa780caaa08ffda87fc584d5eab61e3d3bbb5d86ead02161dc0c20d04bc", size = 68817, upload-time = "2025-12-06T13:23:30.079Z" },
-    { url = "https://files.pythonhosted.org/packages/d3/3a/87c3201e555ed71f73e961a787241a2438c2bbb2ca8809c29ddf938a3157/pybase64-1.4.3-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:1c0efcf78f11cf866bed49caa7b97552bc4855a892f9cc2372abcd3ed0056f0d", size = 57854, upload-time = "2025-12-06T13:23:31.17Z" },
-    { url = "https://files.pythonhosted.org/packages/fd/7d/931c2539b31a7b375e7d595b88401eeb5bd6c5ce1059c9123f9b608aaa14/pybase64-1.4.3-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:66e3791f2ed725a46593f8bd2761ff37d01e2cdad065b1dceb89066f476e50c6", size = 54333, upload-time = "2025-12-06T13:23:32.422Z" },
-    { url = "https://files.pythonhosted.org/packages/de/5e/537601e02cc01f27e9d75f440f1a6095b8df44fc28b1eef2cd739aea8cec/pybase64-1.4.3-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:72bb0b6bddadab26e1b069bb78e83092711a111a80a0d6b9edcb08199ad7299b", size = 56492, upload-time = "2025-12-06T13:23:33.515Z" },
-    { url = "https://files.pythonhosted.org/packages/96/97/2a2e57acf8f5c9258d22aba52e71f8050e167b29ed2ee1113677c1b600c1/pybase64-1.4.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5b3365dbcbcdb0a294f0f50af0c0a16b27a232eddeeb0bceeefd844ef30d2a23", size = 70974, upload-time = "2025-12-06T13:23:36.27Z" },
-    { url = "https://files.pythonhosted.org/packages/3a/50/b7170cb2c631944388fe2519507fe3835a4054a6a12a43f43781dae82be1/pybase64-1.4.3-cp313-cp313-android_21_arm64_v8a.whl", hash = "sha256:ea4b785b0607d11950b66ce7c328f452614aefc9c6d3c9c28bae795dc7f072e1", size = 33901, upload-time = "2025-12-06T13:23:40.951Z" },
-    { url = "https://files.pythonhosted.org/packages/5c/8d/20b68f11adfc4c22230e034b65c71392e3e338b413bf713c8945bd2ccfb3/pybase64-1.4.3-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:27fdff227a0c0e182e0ba37a99109645188978b920dfb20d8b9c17eeee370d0d", size = 30932, upload-time = "2025-12-06T13:23:43.348Z" },
-    { url = "https://files.pythonhosted.org/packages/f7/79/b1b550ac6bff51a4880bf6e089008b2e1ca16f2c98db5e039a08ac3ad157/pybase64-1.4.3-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:2a8204f1fdfec5aa4184249b51296c0de95445869920c88123978304aad42df1", size = 31394, upload-time = "2025-12-06T13:23:44.317Z" },
-    { url = "https://files.pythonhosted.org/packages/82/70/b5d7c5932bf64ee1ec5da859fbac981930b6a55d432a603986c7f509c838/pybase64-1.4.3-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:874fc2a3777de6baf6aa921a7aa73b3be98295794bea31bd80568a963be30767", size = 38078, upload-time = "2025-12-06T13:23:45.348Z" },
-    { url = "https://files.pythonhosted.org/packages/1c/c9/24b3b905cf75e23a9a4deaf203b35ffcb9f473ac0e6d8257f91a05dfce62/pybase64-1.4.3-cp313-cp313-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:1d45c8fe8fe82b65c36b227bb4a2cf623d9ada16bed602ce2d3e18c35285b72a", size = 68244, upload-time = "2025-12-06T13:23:49.026Z" },
-    { url = "https://files.pythonhosted.org/packages/f8/cd/d15b0c3e25e5859fab0416dc5b96d34d6bd2603c1c96a07bb2202b68ab92/pybase64-1.4.3-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:ad70c26ba091d8f5167e9d4e1e86a0483a5414805cdb598a813db635bd3be8b8", size = 71620, upload-time = "2025-12-06T13:23:50.081Z" },
-    { url = "https://files.pythonhosted.org/packages/0d/31/4ca953cc3dcde2b3711d6bfd70a6f4ad2ca95a483c9698076ba605f1520f/pybase64-1.4.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e98310b7c43145221e7194ac9fa7fffc84763c87bfc5e2f59f9f92363475bdc1", size = 59930, upload-time = "2025-12-06T13:23:51.68Z" },
-    { url = "https://files.pythonhosted.org/packages/60/55/e7f7bdcd0fd66e61dda08db158ffda5c89a306bbdaaf5a062fbe4e48f4a1/pybase64-1.4.3-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.whl", hash = "sha256:398685a76034e91485a28aeebcb49e64cd663212fd697b2497ac6dfc1df5e671", size = 56425, upload-time = "2025-12-06T13:23:52.732Z" },
-    { url = "https://files.pythonhosted.org/packages/cb/65/b592c7f921e51ca1aca3af5b0d201a98666d0a36b930ebb67e7c2ed27395/pybase64-1.4.3-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:7e46400a6461187ccb52ed75b0045d937529e801a53a9cd770b350509f9e4d50", size = 59327, upload-time = "2025-12-06T13:23:53.856Z" },
-    { url = "https://files.pythonhosted.org/packages/23/95/1613d2fb82dbb1548595ad4179f04e9a8451bfa18635efce18b631eabe3f/pybase64-1.4.3-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:1b62b9f2f291d94f5e0b76ab499790b7dcc78a009d4ceea0b0428770267484b6", size = 60294, upload-time = "2025-12-06T13:23:54.937Z" },
-    { url = "https://files.pythonhosted.org/packages/9d/73/40431f37f7d1b3eab4673e7946ff1e8f5d6bd425ec257e834dae8a6fc7b0/pybase64-1.4.3-cp313-cp313-manylinux_2_31_riscv64.whl", hash = "sha256:f30ceb5fa4327809dede614be586efcbc55404406d71e1f902a6fdcf322b93b2", size = 54858, upload-time = "2025-12-06T13:23:56.031Z" },
-    { url = "https://files.pythonhosted.org/packages/a7/84/f6368bcaf9f743732e002a9858646fd7a54f428490d427dd6847c5cfe89e/pybase64-1.4.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:0d5f18ed53dfa1d4cf8b39ee542fdda8e66d365940e11f1710989b3cf4a2ed66", size = 58629, upload-time = "2025-12-06T13:23:57.12Z" },
-    { url = "https://files.pythonhosted.org/packages/43/75/359532f9adb49c6b546cafc65c46ed75e2ccc220d514ba81c686fbd83965/pybase64-1.4.3-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:119d31aa4b58b85a8ebd12b63c07681a138c08dfc2fe5383459d42238665d3eb", size = 52448, upload-time = "2025-12-06T13:23:58.298Z" },
-    { url = "https://files.pythonhosted.org/packages/92/6c/ade2ba244c3f33ed920a7ed572ad772eb0b5f14480b72d629d0c9e739a40/pybase64-1.4.3-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:3cf0218b0e2f7988cf7d738a73b6a1d14f3be6ce249d7c0f606e768366df2cce", size = 68841, upload-time = "2025-12-06T13:23:59.886Z" },
-    { url = "https://files.pythonhosted.org/packages/a0/51/b345139cd236be382f2d4d4453c21ee6299e14d2f759b668e23080f8663f/pybase64-1.4.3-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:12f4ee5e988bc5c0c1106b0d8fc37fb0508f12dab76bac1b098cb500d148da9d", size = 57910, upload-time = "2025-12-06T13:24:00.994Z" },
-    { url = "https://files.pythonhosted.org/packages/1a/b8/9f84bdc4f1c4f0052489396403c04be2f9266a66b70c776001eaf0d78c1f/pybase64-1.4.3-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:937826bc7b6b95b594a45180e81dd4d99bd4dd4814a443170e399163f7ff3fb6", size = 54335, upload-time = "2025-12-06T13:24:02.046Z" },
-    { url = "https://files.pythonhosted.org/packages/d0/c7/be63b617d284de46578a366da77ede39c8f8e815ed0d82c7c2acca560fab/pybase64-1.4.3-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:88995d1460971ef80b13e3e007afbe4b27c62db0508bc7250a2ab0a0b4b91362", size = 56486, upload-time = "2025-12-06T13:24:03.141Z" },
-    { url = "https://files.pythonhosted.org/packages/5e/96/f252c8f9abd6ded3ef1ccd3cdbb8393a33798007f761b23df8de1a2480e6/pybase64-1.4.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:72326fe163385ed3e1e806dd579d47fde5d8a59e51297a60fc4e6cbc1b4fc4ed", size = 70978, upload-time = "2025-12-06T13:24:04.221Z" },
-    { url = "https://files.pythonhosted.org/packages/46/fc/cb64964c3b29b432f54d1bce5e7691d693e33bbf780555151969ffd95178/pybase64-1.4.3-cp313-cp313t-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:2e745f2ce760c6cf04d8a72198ef892015ddb89f6ceba489e383518ecbdb13ab", size = 72317, upload-time = "2025-12-06T13:24:11.129Z" },
-    { url = "https://files.pythonhosted.org/packages/0a/b7/fab2240da6f4e1ad46f71fa56ec577613cf5df9dce2d5b4cfaa4edd0e365/pybase64-1.4.3-cp313-cp313t-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:6fac217cd9de8581a854b0ac734c50fd1fa4b8d912396c1fc2fce7c230efe3a7", size = 75534, upload-time = "2025-12-06T13:24:12.433Z" },
-    { url = "https://files.pythonhosted.org/packages/91/3b/3e2f2b6e68e3d83ddb9fa799f3548fb7449765daec9bbd005a9fbe296d7f/pybase64-1.4.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:da1ee8fa04b283873de2d6e8fa5653e827f55b86bdf1a929c5367aaeb8d26f8a", size = 65399, upload-time = "2025-12-06T13:24:13.928Z" },
-    { url = "https://files.pythonhosted.org/packages/6b/08/476ac5914c3b32e0274a2524fc74f01cbf4f4af4513d054e41574eb018f6/pybase64-1.4.3-cp313-cp313t-manylinux2014_armv7l.manylinux_2_17_armv7l.whl", hash = "sha256:b0bf8e884ee822ca7b1448eeb97fa131628fe0ff42f60cae9962789bd562727f", size = 60487, upload-time = "2025-12-06T13:24:15.177Z" },
-    { url = "https://files.pythonhosted.org/packages/f1/b8/618a92915330cc9cba7880299b546a1d9dab1a21fd6c0292ee44a4fe608c/pybase64-1.4.3-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:1bf749300382a6fd1f4f255b183146ef58f8e9cb2f44a077b3a9200dfb473a77", size = 63959, upload-time = "2025-12-06T13:24:16.854Z" },
-    { url = "https://files.pythonhosted.org/packages/a5/52/af9d8d051652c3051862c442ec3861259c5cdb3fc69774bc701470bd2a59/pybase64-1.4.3-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:153a0e42329b92337664cfc356f2065248e6c9a1bd651bbcd6dcaf15145d3f06", size = 64874, upload-time = "2025-12-06T13:24:18.328Z" },
-    { url = "https://files.pythonhosted.org/packages/e4/51/5381a7adf1f381bd184d33203692d3c57cf8ae9f250f380c3fecbdbe554b/pybase64-1.4.3-cp313-cp313t-manylinux_2_31_riscv64.whl", hash = "sha256:86ee56ac7f2184ca10217ed1c655c1a060273e233e692e9086da29d1ae1768db", size = 58572, upload-time = "2025-12-06T13:24:19.417Z" },
-    { url = "https://files.pythonhosted.org/packages/e0/f0/578ee4ffce5818017de4fdf544e066c225bc435e73eb4793cde28a689d0b/pybase64-1.4.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:0e71a4db76726bf830b47477e7d830a75c01b2e9b01842e787a0836b0ba741e3", size = 63636, upload-time = "2025-12-06T13:24:20.497Z" },
-    { url = "https://files.pythonhosted.org/packages/b9/ad/8ae94814bf20159ea06310b742433e53d5820aa564c9fdf65bf2d79f8799/pybase64-1.4.3-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:2ba7799ec88540acd9861b10551d24656ca3c2888ecf4dba2ee0a71544a8923f", size = 56193, upload-time = "2025-12-06T13:24:21.559Z" },
-    { url = "https://files.pythonhosted.org/packages/d1/31/6438cfcc3d3f0fa84d229fa125c243d5094e72628e525dfefadf3bcc6761/pybase64-1.4.3-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:2860299e4c74315f5951f0cf3e72ba0f201c3356c8a68f95a3ab4e620baf44e9", size = 72655, upload-time = "2025-12-06T13:24:22.673Z" },
-    { url = "https://files.pythonhosted.org/packages/a3/0d/2bbc9e9c3fc12ba8a6e261482f03a544aca524f92eae0b4908c0a10ba481/pybase64-1.4.3-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:bb06015db9151f0c66c10aae8e3603adab6b6cd7d1f7335a858161d92fc29618", size = 62471, upload-time = "2025-12-06T13:24:23.8Z" },
-    { url = "https://files.pythonhosted.org/packages/2c/0b/34d491e7f49c1dbdb322ea8da6adecda7c7cd70b6644557c6e4ca5c6f7c7/pybase64-1.4.3-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:242512a070817272865d37c8909059f43003b81da31f616bb0c391ceadffe067", size = 58119, upload-time = "2025-12-06T13:24:24.994Z" },
-    { url = "https://files.pythonhosted.org/packages/ce/17/c21d0cde2a6c766923ae388fc1f78291e1564b0d38c814b5ea8a0e5e081c/pybase64-1.4.3-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:5d8277554a12d3e3eed6180ebda62786bf9fc8d7bb1ee00244258f4a87ca8d20", size = 60791, upload-time = "2025-12-06T13:24:26.046Z" },
-    { url = "https://files.pythonhosted.org/packages/92/b2/eaa67038916a48de12b16f4c384bcc1b84b7ec731b23613cb05f27673294/pybase64-1.4.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:f40b7ddd698fc1e13a4b64fbe405e4e0e1279e8197e37050e24154655f5f7c4e", size = 74701, upload-time = "2025-12-06T13:24:27.466Z" },
-    { url = "https://files.pythonhosted.org/packages/e4/80/4bd3dff423e5a91f667ca41982dc0b79495b90ec0c0f5d59aca513e50f8c/pybase64-1.4.3-cp314-cp314-android_24_arm64_v8a.whl", hash = "sha256:015bb586a1ea1467f69d57427abe587469392215f59db14f1f5c39b52fdafaf5", size = 33835, upload-time = "2025-12-06T13:24:31.767Z" },
-    { url = "https://files.pythonhosted.org/packages/e3/71/cf62b261d431857e8e054537a5c3c24caafa331de30daede7b2c6c558501/pybase64-1.4.3-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:8f183ac925a48046abe047360fe3a1b28327afb35309892132fe1915d62fb282", size = 30939, upload-time = "2025-12-06T13:24:34.001Z" },
-    { url = "https://files.pythonhosted.org/packages/24/3e/d12f92a3c1f7c6ab5d53c155bff9f1084ba997a37a39a4f781ccba9455f3/pybase64-1.4.3-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:30bf3558e24dcce4da5248dcf6d73792adfcf4f504246967e9db155be4c439ad", size = 31401, upload-time = "2025-12-06T13:24:35.11Z" },
-    { url = "https://files.pythonhosted.org/packages/9b/3d/9c27440031fea0d05146f8b70a460feb95d8b4e3d9ca8f45c972efb4c3d3/pybase64-1.4.3-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:a674b419de318d2ce54387dd62646731efa32b4b590907800f0bd40675c1771d", size = 38075, upload-time = "2025-12-06T13:24:36.53Z" },
-    { url = "https://files.pythonhosted.org/packages/db/26/b136a4b65e5c94ff06217f7726478df3f31ab1c777c2c02cf698e748183f/pybase64-1.4.3-cp314-cp314-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:b51204d349a4b208287a8aa5b5422be3baa88abf6cc8ff97ccbda34919bbc857", size = 68460, upload-time = "2025-12-06T13:24:41.735Z" },
-    { url = "https://files.pythonhosted.org/packages/68/6d/84ce50e7ee1ae79984d689e05a9937b2460d4efa1e5b202b46762fb9036c/pybase64-1.4.3-cp314-cp314-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:30f2fd53efecbdde4bdca73a872a68dcb0d1bf8a4560c70a3e7746df973e1ef3", size = 71688, upload-time = "2025-12-06T13:24:42.908Z" },
-    { url = "https://files.pythonhosted.org/packages/e3/57/6743e420416c3ff1b004041c85eb0ebd9c50e9cf05624664bfa1dc8b5625/pybase64-1.4.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0932b0c5cfa617091fd74f17d24549ce5de3628791998c94ba57be808078eeaf", size = 60040, upload-time = "2025-12-06T13:24:44.37Z" },
-    { url = "https://files.pythonhosted.org/packages/3b/68/733324e28068a89119af2921ce548e1c607cc5c17d354690fc51c302e326/pybase64-1.4.3-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.whl", hash = "sha256:acb61f5ab72bec808eb0d4ce8b87ec9f38d7d750cb89b1371c35eb8052a29f11", size = 56478, upload-time = "2025-12-06T13:24:45.815Z" },
-    { url = "https://files.pythonhosted.org/packages/b5/9e/f3f4aa8cfe3357a3cdb0535b78eb032b671519d3ecc08c58c4c6b72b5a91/pybase64-1.4.3-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:2bc2d5bc15168f5c04c53bdfe5a1e543b2155f456ed1e16d7edce9ce73842021", size = 59463, upload-time = "2025-12-06T13:24:46.938Z" },
-    { url = "https://files.pythonhosted.org/packages/aa/d1/53286038e1f0df1cf58abcf4a4a91b0f74ab44539c2547b6c31001ddd054/pybase64-1.4.3-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:8a7bc3cd23880bdca59758bcdd6f4ef0674f2393782763910a7466fab35ccb98", size = 60360, upload-time = "2025-12-06T13:24:48.039Z" },
-    { url = "https://files.pythonhosted.org/packages/00/9a/5cc6ce95db2383d27ff4d790b8f8b46704d360d701ab77c4f655bcfaa6a7/pybase64-1.4.3-cp314-cp314-manylinux_2_31_riscv64.whl", hash = "sha256:ad15acf618880d99792d71e3905b0e2508e6e331b76a1b34212fa0f11e01ad28", size = 54999, upload-time = "2025-12-06T13:24:49.547Z" },
-    { url = "https://files.pythonhosted.org/packages/64/e7/c3c1d09c3d7ae79e3aa1358c6d912d6b85f29281e47aa94fc0122a415a2f/pybase64-1.4.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:448158d417139cb4851200e5fee62677ae51f56a865d50cda9e0d61bda91b116", size = 58736, upload-time = "2025-12-06T13:24:50.641Z" },
-    { url = "https://files.pythonhosted.org/packages/db/d5/0baa08e3d8119b15b588c39f0d39fd10472f0372e3c54ca44649cbefa256/pybase64-1.4.3-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:9058c49b5a2f3e691b9db21d37eb349e62540f9f5fc4beabf8cbe3c732bead86", size = 52298, upload-time = "2025-12-06T13:24:51.791Z" },
-    { url = "https://files.pythonhosted.org/packages/00/87/fc6f11474a1de7e27cd2acbb8d0d7508bda3efa73dfe91c63f968728b2a3/pybase64-1.4.3-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:ce561724f6522907a66303aca27dce252d363fcd85884972d348f4403ba3011a", size = 69049, upload-time = "2025-12-06T13:24:53.253Z" },
-    { url = "https://files.pythonhosted.org/packages/69/9d/7fb5566f669ac18b40aa5fc1c438e24df52b843c1bdc5da47d46d4c1c630/pybase64-1.4.3-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:63316560a94ac449fe86cb8b9e0a13714c659417e92e26a5cbf085cd0a0c838d", size = 57952, upload-time = "2025-12-06T13:24:54.342Z" },
-    { url = "https://files.pythonhosted.org/packages/de/cc/ceb949232dbbd3ec4ee0190d1df4361296beceee9840390a63df8bc31784/pybase64-1.4.3-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:7ecd796f2ac0be7b73e7e4e232b8c16422014de3295d43e71d2b19fd4a4f5368", size = 54484, upload-time = "2025-12-06T13:24:55.774Z" },
-    { url = "https://files.pythonhosted.org/packages/a7/69/659f3c8e6a5d7b753b9c42a4bd9c42892a0f10044e9c7351a4148d413a33/pybase64-1.4.3-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:d01e102a12fb2e1ed3dc11611c2818448626637857ec3994a9cf4809dfd23477", size = 56542, upload-time = "2025-12-06T13:24:57Z" },
-    { url = "https://files.pythonhosted.org/packages/85/2c/29c9e6c9c82b72025f9676f9e82eb1fd2339ad038cbcbf8b9e2ac02798fc/pybase64-1.4.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:ebff797a93c2345f22183f454fd8607a34d75eca5a3a4a969c1c75b304cee39d", size = 71045, upload-time = "2025-12-06T13:24:58.179Z" },
-    { url = "https://files.pythonhosted.org/packages/43/04/8b15c34d3c2282f1c1b0850f1113a249401b618a382646a895170bc9b5e7/pybase64-1.4.3-cp314-cp314t-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:a5ae04ea114c86eb1da1f6e18d75f19e3b5ae39cb1d8d3cd87c29751a6a22780", size = 72474, upload-time = "2025-12-06T13:25:06.434Z" },
-    { url = "https://files.pythonhosted.org/packages/42/00/f34b4d11278f8fdc68bc38f694a91492aa318f7c6f1bd7396197ac0f8b12/pybase64-1.4.3-cp314-cp314t-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:1755b3dce3a2a5c7d17ff6d4115e8bee4a1d5aeae74469db02e47c8f477147da", size = 75706, upload-time = "2025-12-06T13:25:07.636Z" },
-    { url = "https://files.pythonhosted.org/packages/bb/5d/71747d4ad7fe16df4c4c852bdbdeb1f2cf35677b48d7c34d3011a7a6ad3a/pybase64-1.4.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:fb852f900e27ffc4ec1896817535a0fa19610ef8875a096b59f21d0aa42ff172", size = 65589, upload-time = "2025-12-06T13:25:08.809Z" },
-    { url = "https://files.pythonhosted.org/packages/49/b1/d1e82bd58805bb5a3a662864800bab83a83a36ba56e7e3b1706c708002a5/pybase64-1.4.3-cp314-cp314t-manylinux2014_armv7l.manylinux_2_17_armv7l.whl", hash = "sha256:9cf21ea8c70c61eddab3421fbfce061fac4f2fb21f7031383005a1efdb13d0b9", size = 60670, upload-time = "2025-12-06T13:25:10.04Z" },
-    { url = "https://files.pythonhosted.org/packages/15/67/16c609b7a13d1d9fc87eca12ba2dce5e67f949eeaab61a41bddff843cbb0/pybase64-1.4.3-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:afff11b331fdc27692fc75e85ae083340a35105cea1a3c4552139e2f0e0d174f", size = 64194, upload-time = "2025-12-06T13:25:11.48Z" },
-    { url = "https://files.pythonhosted.org/packages/3c/11/37bc724e42960f0106c2d33dc957dcec8f760c91a908cc6c0df7718bc1a8/pybase64-1.4.3-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:d9a5143df542c1ce5c1f423874b948c4d689b3f05ec571f8792286197a39ba02", size = 64984, upload-time = "2025-12-06T13:25:12.645Z" },
-    { url = "https://files.pythonhosted.org/packages/6e/66/b2b962a6a480dd5dae3029becf03ea1a650d326e39bf1c44ea3db78bb010/pybase64-1.4.3-cp314-cp314t-manylinux_2_31_riscv64.whl", hash = "sha256:d62e9861019ad63624b4a7914dff155af1cc5d6d79df3be14edcaedb5fdad6f9", size = 58750, upload-time = "2025-12-06T13:25:13.848Z" },
-    { url = "https://files.pythonhosted.org/packages/2b/15/9b6d711035e29b18b2e1c03d47f41396d803d06ef15b6c97f45b75f73f04/pybase64-1.4.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:84cfd4d92668ef5766cc42a9c9474b88960ac2b860767e6e7be255c6fddbd34a", size = 63816, upload-time = "2025-12-06T13:25:15.356Z" },
-    { url = "https://files.pythonhosted.org/packages/b4/21/e2901381ed0df62e2308380f30d9c4d87d6b74e33a84faed3478d33a7197/pybase64-1.4.3-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:60fc025437f9a7c2cc45e0c19ed68ed08ba672be2c5575fd9d98bdd8f01dd61f", size = 56348, upload-time = "2025-12-06T13:25:16.559Z" },
-    { url = "https://files.pythonhosted.org/packages/c4/16/3d788388a178a0407aa814b976fe61bfa4af6760d9aac566e59da6e4a8b4/pybase64-1.4.3-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:edc8446196f04b71d3af76c0bd1fe0a45066ac5bffecca88adb9626ee28c266f", size = 72842, upload-time = "2025-12-06T13:25:18.055Z" },
-    { url = "https://files.pythonhosted.org/packages/a6/63/c15b1f8bd47ea48a5a2d52a4ec61f037062932ea6434ab916107b58e861e/pybase64-1.4.3-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:e99f6fa6509c037794da57f906ade271f52276c956d00f748e5b118462021d48", size = 62651, upload-time = "2025-12-06T13:25:19.191Z" },
-    { url = "https://files.pythonhosted.org/packages/bd/b8/f544a2e37c778d59208966d4ef19742a0be37c12fc8149ff34483c176616/pybase64-1.4.3-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:d94020ef09f624d841aa9a3a6029df8cf65d60d7a6d5c8687579fa68bd679b65", size = 58295, upload-time = "2025-12-06T13:25:20.822Z" },
-    { url = "https://files.pythonhosted.org/packages/03/99/1fae8a3b7ac181e36f6e7864a62d42d5b1f4fa7edf408c6711e28fba6b4d/pybase64-1.4.3-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:f64ce70d89942a23602dee910dec9b48e5edf94351e1b378186b74fcc00d7f66", size = 60960, upload-time = "2025-12-06T13:25:22.099Z" },
-    { url = "https://files.pythonhosted.org/packages/9d/9e/cd4c727742345ad8384569a4466f1a1428f4e5cc94d9c2ab2f53d30be3fe/pybase64-1.4.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:8ea99f56e45c469818b9781903be86ba4153769f007ba0655fa3b46dc332803d", size = 74863, upload-time = "2025-12-06T13:25:23.442Z" },
-    { url = "https://files.pythonhosted.org/packages/fa/8f/43c3bb11ca9bacf81cb0b7a71500bb65b2eda6d5fe07433c09b543de97f3/pybase64-1.4.3-graalpy312-graalpy250_312_native-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:5c29a582b0ea3936d02bd6fe9bf674ab6059e6e45ab71c78404ab2c913224414", size = 43461, upload-time = "2025-12-06T13:26:28.906Z" },
-    { url = "https://files.pythonhosted.org/packages/2d/4c/2a5258329200be57497d3972b5308558c6de42e3749c6cc2aa1cbe34b25a/pybase64-1.4.3-graalpy312-graalpy250_312_native-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:b6b664758c804fa919b4f1257aa8cf68e95db76fc331de5f70bfc3a34655afe1", size = 36058, upload-time = "2025-12-06T13:26:30.092Z" },
-]
-
-[[package]]
-name = "pycountry"
-version = "26.2.16"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/de/1d/061b9e7a48b85cfd69f33c33d2ef784a531c359399ad764243399673c8f5/pycountry-26.2.16.tar.gz", hash = "sha256:5b6027d453fcd6060112b951dd010f01f168b51b4bf8a1f1fc8c95c8d94a0801", size = 7711342, upload-time = "2026-02-17T03:42:52.367Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/9c/42/7703bd45b62fecd44cd7d3495423097e2f7d28bc2e99e7c1af68892ab157/pycountry-26.2.16-py3-none-any.whl", hash = "sha256:115c4baf7cceaa30f59a4694d79483c9167dbce7a9de4d3d571c5f3ea77c305a", size = 8044600, upload-time = "2026-02-17T03:42:49.777Z" },
-]
-
 [[package]]
 name = "pycparser"
 version = "3.0"
@@ -5634,11 +5142,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/fd/7b/122376b1fd3c62c1ed9dc80c931ace4844b3c55407b6fb2d199377c9736f/pydantic-2.13.4-py3-none-any.whl", hash = "sha256:45a282cde31d808236fd7ea9d919b128653c8b38b393d1c4ab335c62924d9aba", size = 472262, upload-time = "2026-05-06T13:43:02.641Z" },
 ]
 
-[package.optional-dependencies]
-email = [
-    { name = "email-validator", marker = "sys_platform == 'linux'" },
-]
-
 [[package]]
 name = "pydantic-core"
 version = "2.46.4"
@@ -5714,38 +5217,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/fa/c3/7c8b240552251faf6b3a957db200fcfbbcec36763c050428b601e0c9b83b/pydantic_core-2.46.4-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:00c603d540afdd6b80eb39f078f33ebd46211f02f33e34a32d9f053bba711de0", size = 2147590, upload-time = "2026-05-06T13:39:29.883Z" },
 ]
 
-[[package]]
-name = "pydantic-extra-types"
-version = "2.11.1"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "pydantic", marker = "sys_platform == 'linux'" },
-    { name = "typing-extensions", marker = "sys_platform == 'linux'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/66/71/dba38ee2651f84f7842206adbd2233d8bbdb59fb85e9fa14232486a8c471/pydantic_extra_types-2.11.1.tar.gz", hash = "sha256:46792d2307383859e923d8fcefa82108b1a141f8a9c0198982b3832ab5ef1049", size = 172002, upload-time = "2026-03-16T08:08:03.92Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/17/c1/3226e6d7f5a4f736f38ac11a6fbb262d701889802595cdb0f53a885ac2e0/pydantic_extra_types-2.11.1-py3-none-any.whl", hash = "sha256:1722ea2bddae5628ace25f2aa685b69978ef533123e5638cfbddb999e0100ec1", size = 79526, upload-time = "2026-03-16T08:08:02.533Z" },
-]
-
-[package.optional-dependencies]
-pycountry = [
-    { name = "pycountry", marker = "sys_platform == 'linux'" },
-]
-
-[[package]]
-name = "pydantic-settings"
-version = "2.14.1"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "pydantic", marker = "sys_platform == 'linux'" },
-    { name = "python-dotenv", marker = "sys_platform == 'linux'" },
-    { name = "typing-inspection", marker = "sys_platform == 'linux'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/07/60/1d1e59c9c90d54591469ada7d268251f71c24bdb765f1a8a832cee8c6653/pydantic_settings-2.14.1.tar.gz", hash = "sha256:e874d3bec7e787b0c9958277956ed9b4dd5de6a80e162188fdaff7c5e26fd5fa", size = 235551, upload-time = "2026-05-08T13:40:06.542Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/ae/8d/f1af3832f5e6eb13ba94ee809e72b8ecb5eef226d27ee0bef7d963d943c7/pydantic_settings-2.14.1-py3-none-any.whl", hash = "sha256:6e3c7edfd8277687cdc598f56e5cff0e9bfff0910a3749deaa8d4401c3a2b9de", size = 60964, upload-time = "2026-05-08T13:40:04.958Z" },
-]
-
 [[package]]
 name = "pygame"
 version = "2.6.1"
@@ -6076,15 +5547,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/27/be/0631a861af4d1c875f096c07d34e9a63639560a717130e7a87cbc82b7e3f/python_json_logger-4.1.0-py3-none-any.whl", hash = "sha256:132994765cf75bf44554be9aa49b06ef2345d23661a96720262716438141b6b2", size = 15021, upload-time = "2026-03-29T04:39:55.266Z" },
 ]
 
-[[package]]
-name = "python-multipart"
-version = "0.0.29"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/4e/fe/70bd71a6738b09a0bdf6480ca6436b167469ca4578b2a0efbe390b4b0e70/python_multipart-0.0.29.tar.gz", hash = "sha256:643e93849196645e2dbdd81a0f8829a23123ad7f797a84a364c6fb3563f18904", size = 45678, upload-time = "2026-05-17T17:29:47.654Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/8f/cb/769cfc37177252872a45a71f3fbdde9d51b471a3f3c14bfe95dde3407386/python_multipart-0.0.29-py3-none-any.whl", hash = "sha256:2ddcc971cef266225f54f552d8fa10bcfbb1f14446caec199060daac59ff2d69", size = 29640, upload-time = "2026-05-17T17:29:45.69Z" },
-]
-
 [[package]]
 name = "python-xlib"
 version = "0.33"
@@ -6240,34 +5702,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/c4/43/80f67e0336cb2fc725f8e06f7fe35c1d0fe946f4d2b8b2175e797e07349e/qwen_vl_utils-0.0.14-py3-none-any.whl", hash = "sha256:5e28657bfd031e56bd447c5901b58ddfc3835285ed100f4c56580e0ade054e96", size = 8120, upload-time = "2025-09-23T09:38:56.297Z" },
 ]
 
-[[package]]
-name = "ray"
-version = "2.55.1"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "click", marker = "sys_platform == 'linux'" },
-    { name = "filelock", marker = "sys_platform == 'linux'" },
-    { name = "jsonschema", marker = "sys_platform == 'linux'" },
-    { name = "msgpack", marker = "sys_platform == 'linux'" },
-    { name = "packaging", marker = "sys_platform == 'linux'" },
-    { name = "protobuf", marker = "sys_platform == 'linux'" },
-    { name = "pyyaml", marker = "sys_platform == 'linux'" },
-    { name = "requests", marker = "sys_platform == 'linux'" },
-]
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/f1/13/0db535102d0256b350ca116d8987588aca1a1f9ebb4638e1e1ff88bbcef8/ray-2.55.1-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:26541f69bb55607ef8335baac75b2ed12ff2ce02d56313219b29eda003039221", size = 72910802, upload-time = "2026-04-22T20:10:04.382Z" },
-    { url = "https://files.pythonhosted.org/packages/4c/f8/fffadf3f4285eebd460e4d7f2ed1c0cd641ed89613c3f49eb881ee9fa7e2/ray-2.55.1-cp312-cp312-manylinux2014_x86_64.whl", hash = "sha256:263705f6bab29e7622a94f82da25fd7f9cead76cdf89a07aab28f79cdf8f9d95", size = 73765203, upload-time = "2026-04-22T20:10:10.495Z" },
-    { url = "https://files.pythonhosted.org/packages/c9/13/87deecc090c672e45a0cf6f5eef511de448b93f37ef18fd10eb8e8557a0d/ray-2.55.1-cp313-cp313-manylinux2014_aarch64.whl", hash = "sha256:b415d590e062f248907e0fe42994943f11726b7178fcf4b1cf5546721fb1a5f8", size = 72818676, upload-time = "2026-04-22T20:10:26.705Z" },
-    { url = "https://files.pythonhosted.org/packages/71/d7/fc95d3b8824c62105c64aa1b59c59600b581f608d78a2af753e010936dc9/ray-2.55.1-cp313-cp313-manylinux2014_x86_64.whl", hash = "sha256:1380e043eb57cde69b7e9199c6f2558ceeb8f0fc41c97d1d5e50ea042115f302", size = 73678908, upload-time = "2026-04-22T20:10:32.795Z" },
-    { url = "https://files.pythonhosted.org/packages/94/62/607a8859520ce350861425f11f8e15d66c15ee33e6aac812f9e2889b5df4/ray-2.55.1-cp314-cp314-manylinux2014_aarch64.whl", hash = "sha256:4e618d61e1b14b6fde9a586151f3fd9d435b0b85048b997bcaa7f4a533747b2b", size = 72814044, upload-time = "2026-04-22T20:10:46.985Z" },
-    { url = "https://files.pythonhosted.org/packages/04/5a/0699bef04a72d7dc54462960d07ef7a19cd8b1e09979880aba2b6d13cca2/ray-2.55.1-cp314-cp314-manylinux2014_x86_64.whl", hash = "sha256:156ed3e72ad95b645d2006cd71a8dddbcc89b56bfc00027f6225adf78bd9cb74", size = 73644244, upload-time = "2026-04-22T20:10:52.973Z" },
-]
-
-[package.optional-dependencies]
-cgraph = [
-    { name = "cupy-cuda12x", marker = "sys_platform == 'linux'" },
-]
-
 [[package]]
 name = "reachy2-sdk"
 version = "1.0.15"
@@ -6510,68 +5944,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/82/3b/64d4899d73f91ba49a8c18a8ff3f0ea8f1c1d75481760df8c68ef5235bf5/rich-15.0.0-py3-none-any.whl", hash = "sha256:33bd4ef74232fb73fe9279a257718407f169c09b78a87ad3d296f548e27de0bb", size = 310654, upload-time = "2026-04-12T08:24:02.83Z" },
 ]
 
-[[package]]
-name = "rich-toolkit"
-version = "0.19.10"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "click", marker = "sys_platform == 'linux'" },
-    { name = "rich", marker = "sys_platform == 'linux'" },
-    { name = "typing-extensions", marker = "sys_platform == 'linux'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/fa/02/32217f3657ae91a0ea7cf1d74ade78f44352f830d00c468f753ddb3d4980/rich_toolkit-0.19.10.tar.gz", hash = "sha256:dc2e8c515ef9fbb4894e62bd41a2d2960dd7c2f505b5084894604d5ccfee3f09", size = 198167, upload-time = "2026-05-21T10:11:42.397Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/35/84/a005adcb4d1e6846ba3d62768090c3b943e3f6d8dc5c47af64f33584c4a7/rich_toolkit-0.19.10-py3-none-any.whl", hash = "sha256:93a41f67a09aefe90379f1729495c2fee9ccbcc8cfda48e2ca2ae54a995e32b1", size = 33907, upload-time = "2026-05-21T10:11:43.578Z" },
-]
-
-[[package]]
-name = "rignore"
-version = "0.7.6"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/e5/f5/8bed2310abe4ae04b67a38374a4d311dd85220f5d8da56f47ae9361be0b0/rignore-0.7.6.tar.gz", hash = "sha256:00d3546cd793c30cb17921ce674d2c8f3a4b00501cb0e3dd0e82217dbeba2671", size = 57140, upload-time = "2025-11-05T21:41:21.968Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/4a/c8/dea564b36dedac8de21c18e1851789545bc52a0c22ece9843444d5608a6a/rignore-0.7.6-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bda49950d405aa8d0ebe26af807c4e662dd281d926530f03f29690a2e07d649a", size = 897821, upload-time = "2025-11-05T20:40:52.613Z" },
-    { url = "https://files.pythonhosted.org/packages/b3/2b/ee96db17ac1835e024c5d0742eefb7e46de60020385ac883dd3d1cde2c1f/rignore-0.7.6-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b5fd5ab3840b8c16851d327ed06e9b8be6459702a53e5ab1fc4073b684b3789e", size = 873963, upload-time = "2025-11-05T20:41:07.49Z" },
-    { url = "https://files.pythonhosted.org/packages/a5/8c/ad5a57bbb9d14d5c7e5960f712a8a0b902472ea3f4a2138cbf70d1777b75/rignore-0.7.6-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ced2a248352636a5c77504cb755dc02c2eef9a820a44d3f33061ce1bb8a7f2d2", size = 1169216, upload-time = "2025-11-05T20:41:23.73Z" },
-    { url = "https://files.pythonhosted.org/packages/80/e6/5b00bc2a6bc1701e6878fca798cf5d9125eb3113193e33078b6fc0d99123/rignore-0.7.6-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a04a3b73b75ddc12c9c9b21efcdaab33ca3832941d6f1d67bffd860941cd448a", size = 942942, upload-time = "2025-11-05T20:41:39.393Z" },
-    { url = "https://files.pythonhosted.org/packages/85/e5/7f99bd0cc9818a91d0e8b9acc65b792e35750e3bdccd15a7ee75e64efca4/rignore-0.7.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d24321efac92140b7ec910ac7c53ab0f0c86a41133d2bb4b0e6a7c94967f44dd", size = 959787, upload-time = "2025-11-05T20:42:09.765Z" },
-    { url = "https://files.pythonhosted.org/packages/55/54/2ffea79a7c1eabcede1926347ebc2a81bc6b81f447d05b52af9af14948b9/rignore-0.7.6-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:73c7aa109d41e593785c55fdaa89ad80b10330affa9f9d3e3a51fa695f739b20", size = 984245, upload-time = "2025-11-05T20:41:54.062Z" },
-    { url = "https://files.pythonhosted.org/packages/41/f7/e80f55dfe0f35787fa482aa18689b9c8251e045076c35477deb0007b3277/rignore-0.7.6-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1734dc49d1e9501b07852ef44421f84d9f378da9fbeda729e77db71f49cac28b", size = 1078647, upload-time = "2025-11-05T21:40:13.463Z" },
-    { url = "https://files.pythonhosted.org/packages/d4/cf/2c64f0b6725149f7c6e7e5a909d14354889b4beaadddaa5fff023ec71084/rignore-0.7.6-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:5719ea14ea2b652c0c0894be5dfde954e1853a80dea27dd2fbaa749618d837f5", size = 1139186, upload-time = "2025-11-05T21:40:31.27Z" },
-    { url = "https://files.pythonhosted.org/packages/75/95/a86c84909ccc24af0d094b50d54697951e576c252a4d9f21b47b52af9598/rignore-0.7.6-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:8e23424fc7ce35726854f639cb7968151a792c0c3d9d082f7f67e0c362cfecca", size = 1117604, upload-time = "2025-11-05T21:40:48.07Z" },
-    { url = "https://files.pythonhosted.org/packages/7f/5e/13b249613fd5d18d58662490ab910a9f0be758981d1797789913adb4e918/rignore-0.7.6-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3efdcf1dd84d45f3e2bd2f93303d9be103888f56dfa7c3349b5bf4f0657ec696", size = 1127725, upload-time = "2025-11-05T21:41:05.804Z" },
-    { url = "https://files.pythonhosted.org/packages/36/31/b65b837e39c3f7064c426754714ac633b66b8c2290978af9d7f513e14aa9/rignore-0.7.6-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c1ad295537041dc2ed4b540fb1a3906bd9ede6ccdad3fe79770cd89e04e3c73c", size = 897406, upload-time = "2025-11-05T20:40:53.854Z" },
-    { url = "https://files.pythonhosted.org/packages/ca/58/1970ce006c427e202ac7c081435719a076c478f07b3a23f469227788dc23/rignore-0.7.6-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f782dbd3a65a5ac85adfff69e5c6b101285ef3f845c3a3cae56a54bebf9fe116", size = 874050, upload-time = "2025-11-05T20:41:08.922Z" },
-    { url = "https://files.pythonhosted.org/packages/d4/00/eb45db9f90137329072a732273be0d383cb7d7f50ddc8e0bceea34c1dfdf/rignore-0.7.6-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:65cece3b36e5b0826d946494734c0e6aaf5a0337e18ff55b071438efe13d559e", size = 1167835, upload-time = "2025-11-05T20:41:24.997Z" },
-    { url = "https://files.pythonhosted.org/packages/f3/f1/6f1d72ddca41a64eed569680587a1236633587cc9f78136477ae69e2c88a/rignore-0.7.6-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d7e4bb66c13cd7602dc8931822c02dfbbd5252015c750ac5d6152b186f0a8be0", size = 941945, upload-time = "2025-11-05T20:41:40.628Z" },
-    { url = "https://files.pythonhosted.org/packages/48/6f/2f178af1c1a276a065f563ec1e11e7a9e23d4996fd0465516afce4b5c636/rignore-0.7.6-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:297e500c15766e196f68aaaa70e8b6db85fa23fdc075b880d8231fdfba738cd7", size = 959067, upload-time = "2025-11-05T20:42:11.09Z" },
-    { url = "https://files.pythonhosted.org/packages/5b/db/423a81c4c1e173877c7f9b5767dcaf1ab50484a94f60a0b2ed78be3fa765/rignore-0.7.6-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a07084211a8d35e1a5b1d32b9661a5ed20669970b369df0cf77da3adea3405de", size = 984438, upload-time = "2025-11-05T20:41:55.443Z" },
-    { url = "https://files.pythonhosted.org/packages/31/eb/c4f92cc3f2825d501d3c46a244a671eb737fc1bcf7b05a3ecd34abb3e0d7/rignore-0.7.6-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:181eb2a975a22256a1441a9d2f15eb1292839ea3f05606620bd9e1938302cf79", size = 1078365, upload-time = "2025-11-05T21:40:15.148Z" },
-    { url = "https://files.pythonhosted.org/packages/26/09/99442f02794bd7441bfc8ed1c7319e890449b816a7493b2db0e30af39095/rignore-0.7.6-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:7bbcdc52b5bf9f054b34ce4af5269df5d863d9c2456243338bc193c28022bd7b", size = 1139066, upload-time = "2025-11-05T21:40:32.771Z" },
-    { url = "https://files.pythonhosted.org/packages/2c/88/bcfc21e520bba975410e9419450f4b90a2ac8236b9a80fd8130e87d098af/rignore-0.7.6-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:f2e027a6da21a7c8c0d87553c24ca5cc4364def18d146057862c23a96546238e", size = 1118036, upload-time = "2025-11-05T21:40:49.646Z" },
-    { url = "https://files.pythonhosted.org/packages/e2/25/d37215e4562cda5c13312636393aea0bafe38d54d4e0517520a4cc0753ec/rignore-0.7.6-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ee4a18b82cbbc648e4aac1510066682fe62beb5dc88e2c67c53a83954e541360", size = 1127550, upload-time = "2025-11-05T21:41:07.648Z" },
-    { url = "https://files.pythonhosted.org/packages/ac/0f/348c829ea2d8d596e856371b14b9092f8a5dfbb62674ec9b3f67e4939a9d/rignore-0.7.6-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ce2268837c3600f82ab8db58f5834009dc638ee17103582960da668963bebc5", size = 899044, upload-time = "2025-11-05T20:40:55.336Z" },
-    { url = "https://files.pythonhosted.org/packages/f0/30/2e1841a19b4dd23878d73edd5d82e998a83d5ed9570a89675f140ca8b2ad/rignore-0.7.6-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:690a3e1b54bfe77e89c4bacb13f046e642f8baadafc61d68f5a726f324a76ab6", size = 874144, upload-time = "2025-11-05T20:41:10.195Z" },
-    { url = "https://files.pythonhosted.org/packages/c2/bf/0ce9beb2e5f64c30e3580bef09f5829236889f01511a125f98b83169b993/rignore-0.7.6-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:09d12ac7a0b6210c07bcd145007117ebd8abe99c8eeb383e9e4673910c2754b2", size = 1168062, upload-time = "2025-11-05T20:41:26.511Z" },
-    { url = "https://files.pythonhosted.org/packages/b9/8b/571c178414eb4014969865317da8a02ce4cf5241a41676ef91a59aab24de/rignore-0.7.6-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2a2b2b74a8c60203b08452479b90e5ce3dbe96a916214bc9eb2e5af0b6a9beb0", size = 942542, upload-time = "2025-11-05T20:41:41.838Z" },
-    { url = "https://files.pythonhosted.org/packages/19/62/7a3cf601d5a45137a7e2b89d10c05b5b86499190c4b7ca5c3c47d79ee519/rignore-0.7.6-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8fc5a531ef02131e44359419a366bfac57f773ea58f5278c2cdd915f7d10ea94", size = 958739, upload-time = "2025-11-05T20:42:12.463Z" },
-    { url = "https://files.pythonhosted.org/packages/5f/1f/4261f6a0d7caf2058a5cde2f5045f565ab91aa7badc972b57d19ce58b14e/rignore-0.7.6-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b7a1f77d9c4cd7e76229e252614d963442686bfe12c787a49f4fe481df49e7a9", size = 984138, upload-time = "2025-11-05T20:41:56.775Z" },
-    { url = "https://files.pythonhosted.org/packages/2b/bf/628dfe19c75e8ce1f45f7c248f5148b17dfa89a817f8e3552ab74c3ae812/rignore-0.7.6-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:ead81f728682ba72b5b1c3d5846b011d3e0174da978de87c61645f2ed36659a7", size = 1079299, upload-time = "2025-11-05T21:40:16.639Z" },
-    { url = "https://files.pythonhosted.org/packages/af/a5/be29c50f5c0c25c637ed32db8758fdf5b901a99e08b608971cda8afb293b/rignore-0.7.6-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:12ffd50f520c22ffdabed8cd8bfb567d9ac165b2b854d3e679f4bcaef11a9441", size = 1139618, upload-time = "2025-11-05T21:40:34.507Z" },
-    { url = "https://files.pythonhosted.org/packages/2a/40/3c46cd7ce4fa05c20b525fd60f599165e820af66e66f2c371cd50644558f/rignore-0.7.6-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:e5a16890fbe3c894f8ca34b0fcacc2c200398d4d46ae654e03bc9b3dbf2a0a72", size = 1117626, upload-time = "2025-11-05T21:40:51.494Z" },
-    { url = "https://files.pythonhosted.org/packages/8c/b9/aea926f263b8a29a23c75c2e0d8447965eb1879d3feb53cfcf84db67ed58/rignore-0.7.6-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:3abab3bf99e8a77488ef6c7c9a799fac22224c28fe9f25cc21aa7cc2b72bfc0b", size = 1128144, upload-time = "2025-11-05T21:41:09.169Z" },
-    { url = "https://files.pythonhosted.org/packages/71/30/054880b09c0b1b61d17eeb15279d8bf729c0ba52b36c3ada52fb827cbb3c/rignore-0.7.6-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bec3994665a44454df86deb762061e05cd4b61e3772f5b07d1882a8a0d2748d5", size = 897611, upload-time = "2025-11-05T20:40:56.475Z" },
-    { url = "https://files.pythonhosted.org/packages/1e/40/b2d1c169f833d69931bf232600eaa3c7998ba4f9a402e43a822dad2ea9f2/rignore-0.7.6-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:26cba2edfe3cff1dfa72bddf65d316ddebf182f011f2f61538705d6dbaf54986", size = 873875, upload-time = "2025-11-05T20:41:11.561Z" },
-    { url = "https://files.pythonhosted.org/packages/55/59/ca5ae93d83a1a60e44b21d87deb48b177a8db1b85e82fc8a9abb24a8986d/rignore-0.7.6-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ffa86694fec604c613696cb91e43892aa22e1fec5f9870e48f111c603e5ec4e9", size = 1167245, upload-time = "2025-11-05T20:41:28.29Z" },
-    { url = "https://files.pythonhosted.org/packages/a5/52/cf3dce392ba2af806cba265aad6bcd9c48bb2a6cb5eee448d3319f6e505b/rignore-0.7.6-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48efe2ed95aa8104145004afb15cdfa02bea5cdde8b0344afeb0434f0d989aa2", size = 941750, upload-time = "2025-11-05T20:41:43.111Z" },
-    { url = "https://files.pythonhosted.org/packages/ec/be/3f344c6218d779395e785091d05396dfd8b625f6aafbe502746fcd880af2/rignore-0.7.6-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8dcae43eb44b7f2457fef7cc87f103f9a0013017a6f4e62182c565e924948f21", size = 958896, upload-time = "2025-11-05T20:42:13.784Z" },
-    { url = "https://files.pythonhosted.org/packages/c9/34/d3fa71938aed7d00dcad87f0f9bcb02ad66c85d6ffc83ba31078ce53646a/rignore-0.7.6-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2cd649a7091c0dad2f11ef65630d30c698d505cbe8660dd395268e7c099cc99f", size = 983992, upload-time = "2025-11-05T20:41:58.022Z" },
-    { url = "https://files.pythonhosted.org/packages/24/a4/52a697158e9920705bdbd0748d59fa63e0f3233fb92e9df9a71afbead6ca/rignore-0.7.6-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:42de84b0289d478d30ceb7ae59023f7b0527786a9a5b490830e080f0e4ea5aeb", size = 1078181, upload-time = "2025-11-05T21:40:18.151Z" },
-    { url = "https://files.pythonhosted.org/packages/ac/65/aa76dbcdabf3787a6f0fd61b5cc8ed1e88580590556d6c0207960d2384bb/rignore-0.7.6-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:875a617e57b53b4acbc5a91de418233849711c02e29cc1f4f9febb2f928af013", size = 1139232, upload-time = "2025-11-05T21:40:35.966Z" },
-    { url = "https://files.pythonhosted.org/packages/08/44/31b31a49b3233c6842acc1c0731aa1e7fb322a7170612acf30327f700b44/rignore-0.7.6-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:8703998902771e96e49968105207719f22926e4431b108450f3f430b4e268b7c", size = 1117349, upload-time = "2025-11-05T21:40:53.013Z" },
-    { url = "https://files.pythonhosted.org/packages/e9/ae/1b199a2302c19c658cf74e5ee1427605234e8c91787cfba0015f2ace145b/rignore-0.7.6-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:602ef33f3e1b04c1e9a10a3c03f8bc3cef2d2383dcc250d309be42b49923cabc", size = 1127702, upload-time = "2025-11-05T21:41:10.881Z" },
-]
-
 [[package]]
 name = "robomimic"
 version = "0.2.0"
@@ -6586,8 +5958,8 @@ dependencies = [
     { name = "tensorboard", marker = "sys_platform == 'linux'" },
     { name = "tensorboardx", marker = "sys_platform == 'linux'" },
     { name = "termcolor", marker = "sys_platform == 'linux'" },
-    { name = "torch", version = "2.8.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" },
-    { name = "torchvision", version = "0.23.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" },
+    { name = "torch", version = "2.11.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" },
+    { name = "torchvision", version = "0.26.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" },
     { name = "tqdm", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/3d/c3/44b1d1ea4bcb4bbed43d19e09505f4142714451ded74020d4f679cdc89fb/robomimic-0.2.0.tar.gz", hash = "sha256:ee3bb5cf9c3e1feead6b57b43c5db738fd0a8e0c015fdf6419808af8fffdc463", size = 192919, upload-time = "2021-12-17T19:00:33.279Z" }
@@ -6836,24 +6208,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/1c/78/504fdd027da3b84ff1aecd9f6957e65f35134534ccc6da8628eb71e76d3f/send2trash-2.1.0-py3-none-any.whl", hash = "sha256:0da2f112e6d6bb22de6aa6daa7e144831a4febf2a87261451c4ad849fe9a873c", size = 17610, upload-time = "2026-01-14T06:27:35.218Z" },
 ]
 
-[[package]]
-name = "sentencepiece"
-version = "0.2.1"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/15/15/2e7a025fc62d764b151ae6d0f2a92f8081755ebe8d4a64099accc6f77ba6/sentencepiece-0.2.1.tar.gz", hash = "sha256:8138cec27c2f2282f4a34d9a016e3374cd40e5c6e9cb335063db66a0a3b71fad", size = 3228515, upload-time = "2025-08-12T07:00:51.718Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/89/fa/d3d5ebcba3cb9e6d3775a096251860c41a6bc53a1b9461151df83fe93255/sentencepiece-0.2.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:99f955df238021bf11f0fc37cdb54fd5e5b5f7fd30ecc3d93fb48b6815437167", size = 1316273, upload-time = "2025-08-12T06:59:44.476Z" },
-    { url = "https://files.pythonhosted.org/packages/04/88/14f2f4a2b922d8b39be45bf63d79e6cd3a9b2f248b2fcb98a69b12af12f5/sentencepiece-0.2.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0cdfecef430d985f1c2bcbfff3defd1d95dae876fbd0173376012d2d7d24044b", size = 1387881, upload-time = "2025-08-12T06:59:46.09Z" },
-    { url = "https://files.pythonhosted.org/packages/2c/d2/f552be5928105588f4f4d66ee37dd4c61460d8097e62d0e2e0eec41bc61d/sentencepiece-0.2.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d7b670879c370d350557edabadbad1f6561a9e6968126e6debca4029e5547820", size = 1316271, upload-time = "2025-08-12T06:59:58.109Z" },
-    { url = "https://files.pythonhosted.org/packages/96/df/0cfe748ace5485be740fed9476dee7877f109da32ed0d280312c94ec259f/sentencepiece-0.2.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c7f0fd2f2693309e6628aeeb2e2faf6edd221134dfccac3308ca0de01f8dab47", size = 1387882, upload-time = "2025-08-12T07:00:00.701Z" },
-    { url = "https://files.pythonhosted.org/packages/bc/85/c72fd1f3c7a6010544d6ae07f8ddb38b5e2a7e33bd4318f87266c0bbafbf/sentencepiece-0.2.1-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b81a24733726e3678d2db63619acc5a8dccd074f7aa7a54ecd5ca33ca6d2d596", size = 1315722, upload-time = "2025-08-12T07:00:12.989Z" },
-    { url = "https://files.pythonhosted.org/packages/4a/e8/661e5bd82a8aa641fd6c1020bd0e890ef73230a2b7215ddf9c8cd8e941c2/sentencepiece-0.2.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0a81799d0a68d618e89063fb423c3001a034c893069135ffe51fee439ae474d6", size = 1387452, upload-time = "2025-08-12T07:00:15.088Z" },
-    { url = "https://files.pythonhosted.org/packages/19/ad/d5c7075f701bd97971d7c2ac2904f227566f51ef0838dfbdfdccb58cd212/sentencepiece-0.2.1-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1855f57db07b51fb51ed6c9c452f570624d2b169b36f0f79ef71a6e6c618cd8b", size = 1316247, upload-time = "2025-08-12T07:00:26.435Z" },
-    { url = "https://files.pythonhosted.org/packages/fb/03/35fbe5f3d9a7435eebd0b473e09584bd3cc354ce118b960445b060d33781/sentencepiece-0.2.1-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:01e6912125cb45d3792f530a4d38f8e21bf884d6b4d4ade1b2de5cf7a8d2a52b", size = 1387894, upload-time = "2025-08-12T07:00:28.339Z" },
-    { url = "https://files.pythonhosted.org/packages/ef/23/195b2e7ec85ebb6a547969f60b723c7aca5a75800ece6cc3f41da872d14e/sentencepiece-0.2.1-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:010f025a544ef770bb395091d57cb94deb9652d8972e0d09f71d85d5a0816c8c", size = 1315721, upload-time = "2025-08-12T07:00:42.914Z" },
-    { url = "https://files.pythonhosted.org/packages/7e/aa/553dbe4178b5f23eb28e59393dddd64186178b56b81d9b8d5c3ff1c28395/sentencepiece-0.2.1-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:733e59ff1794d26db706cd41fc2d7ca5f6c64a820709cb801dc0ea31780d64ab", size = 1387458, upload-time = "2025-08-12T07:00:44.56Z" },
-]
-
 [[package]]
 name = "sentry-sdk"
 version = "2.59.0"
@@ -6867,44 +6221,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/bf/00/b8cc413748fb6383d1582e7cda51314f99743351c462a92dc690d5b5853b/sentry_sdk-2.59.0-py2.py3-none-any.whl", hash = "sha256:abcf65ee9a9d9cdebf9ad369782408ecca9c1c792686ef06ba34f5ab233527fe", size = 468432, upload-time = "2026-05-04T12:19:04.741Z" },
 ]
 
-[[package]]
-name = "setproctitle"
-version = "1.3.7"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/8d/48/49393a96a2eef1ab418b17475fb92b8fcfad83d099e678751b05472e69de/setproctitle-1.3.7.tar.gz", hash = "sha256:bc2bc917691c1537d5b9bca1468437176809c7e11e5694ca79a9ca12345dcb9e", size = 27002, upload-time = "2025-09-05T12:51:25.278Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/d0/99/71630546b9395b095f4082be41165d1078204d1696c2d9baade3de3202d0/setproctitle-1.3.7-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:2906b6c7959cdb75f46159bf0acd8cc9906cf1361c9e1ded0d065fe8f9039629", size = 32932, upload-time = "2025-09-05T12:49:39.271Z" },
-    { url = "https://files.pythonhosted.org/packages/50/22/cee06af4ffcfb0e8aba047bd44f5262e644199ae7527ae2c1f672b86495c/setproctitle-1.3.7-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6915964a6dda07920a1159321dcd6d94fc7fc526f815ca08a8063aeca3c204f1", size = 33736, upload-time = "2025-09-05T12:49:40.565Z" },
-    { url = "https://files.pythonhosted.org/packages/5c/00/a5949a8bb06ef5e7df214fc393bb2fb6aedf0479b17214e57750dfdd0f24/setproctitle-1.3.7-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:cff72899861c765bd4021d1ff1c68d60edc129711a2fdba77f9cb69ef726a8b6", size = 35605, upload-time = "2025-09-05T12:49:42.362Z" },
-    { url = "https://files.pythonhosted.org/packages/b0/3a/50caca532a9343828e3bf5778c7a84d6c737a249b1796d50dd680290594d/setproctitle-1.3.7-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:b7cb05bd446687ff816a3aaaf831047fc4c364feff7ada94a66024f1367b448c", size = 33143, upload-time = "2025-09-05T12:49:43.515Z" },
-    { url = "https://files.pythonhosted.org/packages/ca/14/b843a251296ce55e2e17c017d6b9f11ce0d3d070e9265de4ecad948b913d/setproctitle-1.3.7-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:3a57b9a00de8cae7e2a1f7b9f0c2ac7b69372159e16a7708aa2f38f9e5cc987a", size = 34434, upload-time = "2025-09-05T12:49:45.31Z" },
-    { url = "https://files.pythonhosted.org/packages/c8/b7/06145c238c0a6d2c4bc881f8be230bb9f36d2bf51aff7bddcb796d5eed67/setproctitle-1.3.7-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:d8828b356114f6b308b04afe398ed93803d7fca4a955dd3abe84430e28d33739", size = 32795, upload-time = "2025-09-05T12:49:46.419Z" },
-    { url = "https://files.pythonhosted.org/packages/87/ed/0a4f00315bc02510395b95eec3d4aa77c07192ee79f0baae77ea7b9603d8/setproctitle-1.3.7-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:0564a936ea687cd24dffcea35903e2a20962aa6ac20e61dd3a207652401492dd", size = 33284, upload-time = "2025-09-05T12:49:52.741Z" },
-    { url = "https://files.pythonhosted.org/packages/fc/e4/adf3c4c0a2173cb7920dc9df710bcc67e9bcdbf377e243b7a962dc31a51a/setproctitle-1.3.7-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a5d1cb3f81531f0eb40e13246b679a1bdb58762b170303463cb06ecc296f26d0", size = 34104, upload-time = "2025-09-05T12:49:54.416Z" },
-    { url = "https://files.pythonhosted.org/packages/52/4f/6daf66394152756664257180439d37047aa9a1cfaa5e4f5ed35e93d1dc06/setproctitle-1.3.7-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a7d159e7345f343b44330cbba9194169b8590cb13dae940da47aa36a72aa9929", size = 35982, upload-time = "2025-09-05T12:49:56.295Z" },
-    { url = "https://files.pythonhosted.org/packages/1b/62/f2c0595403cf915db031f346b0e3b2c0096050e90e0be658a64f44f4278a/setproctitle-1.3.7-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:0b5074649797fd07c72ca1f6bff0406f4a42e1194faac03ecaab765ce605866f", size = 33150, upload-time = "2025-09-05T12:49:58.025Z" },
-    { url = "https://files.pythonhosted.org/packages/a0/29/10dd41cde849fb2f9b626c846b7ea30c99c81a18a5037a45cc4ba33c19a7/setproctitle-1.3.7-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:61e96febced3f61b766115381d97a21a6265a0f29188a791f6df7ed777aef698", size = 34463, upload-time = "2025-09-05T12:49:59.424Z" },
-    { url = "https://files.pythonhosted.org/packages/71/3c/cedd8eccfaf15fb73a2c20525b68c9477518917c9437737fa0fda91e378f/setproctitle-1.3.7-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:047138279f9463f06b858e579cc79580fbf7a04554d24e6bddf8fe5dddbe3d4c", size = 32848, upload-time = "2025-09-05T12:50:01.107Z" },
-    { url = "https://files.pythonhosted.org/packages/52/09/f366eca0973cfbac1470068d1313fa3fe3de4a594683385204ec7f1c4101/setproctitle-1.3.7-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:c18246d88e227a5b16248687514f95642505000442165f4b7db354d39d0e4c29", size = 34490, upload-time = "2025-09-05T12:50:04.948Z" },
-    { url = "https://files.pythonhosted.org/packages/71/36/611fc2ed149fdea17c3677e1d0df30d8186eef9562acc248682b91312706/setproctitle-1.3.7-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7081f193dab22df2c36f9fc6d113f3793f83c27891af8fe30c64d89d9a37e152", size = 35267, upload-time = "2025-09-05T12:50:06.015Z" },
-    { url = "https://files.pythonhosted.org/packages/88/a4/64e77d0671446bd5a5554387b69e1efd915274686844bea733714c828813/setproctitle-1.3.7-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:9cc9b901ce129350637426a89cfd650066a4adc6899e47822e2478a74023ff7c", size = 37376, upload-time = "2025-09-05T12:50:07.484Z" },
-    { url = "https://files.pythonhosted.org/packages/89/bc/ad9c664fe524fb4a4b2d3663661a5c63453ce851736171e454fa2cdec35c/setproctitle-1.3.7-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:80e177eff2d1ec172188d0d7fd9694f8e43d3aab76a6f5f929bee7bf7894e98b", size = 33963, upload-time = "2025-09-05T12:50:09.056Z" },
-    { url = "https://files.pythonhosted.org/packages/ab/01/a36de7caf2d90c4c28678da1466b47495cbbad43badb4e982d8db8167ed4/setproctitle-1.3.7-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:23e520776c445478a67ee71b2a3c1ffdafbe1f9f677239e03d7e2cc635954e18", size = 35550, upload-time = "2025-09-05T12:50:10.791Z" },
-    { url = "https://files.pythonhosted.org/packages/dd/68/17e8aea0ed5ebc17fbf03ed2562bfab277c280e3625850c38d92a7b5fcd9/setproctitle-1.3.7-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:5fa1953126a3b9bd47049d58c51b9dac72e78ed120459bd3aceb1bacee72357c", size = 33727, upload-time = "2025-09-05T12:50:12.032Z" },
-    { url = "https://files.pythonhosted.org/packages/ab/26/8e3bb082992f19823d831f3d62a89409deb6092e72fc6940962983ffc94f/setproctitle-1.3.7-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:fcb966a6c57cf07cc9448321a08f3be6b11b7635be502669bc1d8745115d7e7f", size = 33180, upload-time = "2025-09-05T12:50:20.395Z" },
-    { url = "https://files.pythonhosted.org/packages/f1/af/ae692a20276d1159dd0cf77b0bcf92cbb954b965655eb4a69672099bb214/setproctitle-1.3.7-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:46178672599b940368d769474fe13ecef1b587d58bb438ea72b9987f74c56ea5", size = 34043, upload-time = "2025-09-05T12:50:22.454Z" },
-    { url = "https://files.pythonhosted.org/packages/34/b2/6a092076324dd4dac1a6d38482bedebbff5cf34ef29f58585ec76e47bc9d/setproctitle-1.3.7-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7f9e9e3ff135cbcc3edd2f4cf29b139f4aca040d931573102742db70ff428c17", size = 35892, upload-time = "2025-09-05T12:50:23.937Z" },
-    { url = "https://files.pythonhosted.org/packages/1c/1a/8836b9f28cee32859ac36c3df85aa03e1ff4598d23ea17ca2e96b5845a8f/setproctitle-1.3.7-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:14c7eba8d90c93b0e79c01f0bd92a37b61983c27d6d7d5a3b5defd599113d60e", size = 32898, upload-time = "2025-09-05T12:50:25.617Z" },
-    { url = "https://files.pythonhosted.org/packages/ef/22/8fabdc24baf42defb599714799d8445fe3ae987ec425a26ec8e80ea38f8e/setproctitle-1.3.7-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:9e64e98077fb30b6cf98073d6c439cd91deb8ebbf8fc62d9dbf52bd38b0c6ac0", size = 34308, upload-time = "2025-09-05T12:50:26.827Z" },
-    { url = "https://files.pythonhosted.org/packages/15/1b/b9bee9de6c8cdcb3b3a6cb0b3e773afdb86bbbc1665a3bfa424a4294fda2/setproctitle-1.3.7-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b91387cc0f02a00ac95dcd93f066242d3cca10ff9e6153de7ee07069c6f0f7c8", size = 32536, upload-time = "2025-09-05T12:50:28.5Z" },
-    { url = "https://files.pythonhosted.org/packages/21/9c/980b01f50d51345dd513047e3ba9e96468134b9181319093e61db1c47188/setproctitle-1.3.7-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:1403d2abfd32790b6369916e2313dffbe87d6b11dca5bbd898981bcde48e7a2b", size = 34744, upload-time = "2025-09-05T12:50:32.777Z" },
-    { url = "https://files.pythonhosted.org/packages/86/b4/82cd0c86e6d1c4538e1a7eb908c7517721513b801dff4ba3f98ef816a240/setproctitle-1.3.7-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e7c5bfe4228ea22373e3025965d1a4116097e555ee3436044f5c954a5e63ac45", size = 35589, upload-time = "2025-09-05T12:50:34.13Z" },
-    { url = "https://files.pythonhosted.org/packages/8a/4f/9f6b2a7417fd45673037554021c888b31247f7594ff4bd2239918c5cd6d0/setproctitle-1.3.7-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:585edf25e54e21a94ccb0fe81ad32b9196b69ebc4fc25f81da81fb8a50cca9e4", size = 37698, upload-time = "2025-09-05T12:50:35.524Z" },
-    { url = "https://files.pythonhosted.org/packages/20/92/927b7d4744aac214d149c892cb5fa6dc6f49cfa040cb2b0a844acd63dcaf/setproctitle-1.3.7-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:96c38cdeef9036eb2724c2210e8d0b93224e709af68c435d46a4733a3675fee1", size = 34201, upload-time = "2025-09-05T12:50:36.697Z" },
-    { url = "https://files.pythonhosted.org/packages/0a/0c/fd4901db5ba4b9d9013e62f61d9c18d52290497f956745cd3e91b0d80f90/setproctitle-1.3.7-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:45e3ef48350abb49cf937d0a8ba15e42cee1e5ae13ca41a77c66d1abc27a5070", size = 35801, upload-time = "2025-09-05T12:50:38.314Z" },
-    { url = "https://files.pythonhosted.org/packages/e7/e3/54b496ac724e60e61cc3447f02690105901ca6d90da0377dffe49ff99fc7/setproctitle-1.3.7-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:1fae595d032b30dab4d659bece20debd202229fce12b55abab978b7f30783d73", size = 33958, upload-time = "2025-09-05T12:50:39.841Z" },
-]
-
 [[package]]
 name = "setuptools"
 version = "79.0.1"
@@ -7058,21 +6374,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" },
 ]
 
-[[package]]
-name = "soundfile"
-version = "0.13.1"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "cffi", marker = "sys_platform == 'linux'" },
-    { name = "numpy", marker = "sys_platform == 'linux'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/e1/41/9b873a8c055582859b239be17902a85339bec6a30ad162f98c9b0288a2cc/soundfile-0.13.1.tar.gz", hash = "sha256:b2c68dab1e30297317080a5b43df57e302584c49e2942defdde0acccc53f0e5b", size = 46156, upload-time = "2025-01-25T09:17:04.831Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/64/28/e2a36573ccbcf3d57c00626a21fe51989380636e821b341d36ccca0c1c3a/soundfile-0.13.1-py2.py3-none-any.whl", hash = "sha256:a23c717560da2cf4c7b5ae1142514e0fd82d6bbd9dfc93a50423447142f2c445", size = 25751, upload-time = "2025-01-25T09:16:44.235Z" },
-    { url = "https://files.pythonhosted.org/packages/58/ae/c0e4a53d77cf6e9a04179535766b3321b0b9ced5f70522e4caf9329f0046/soundfile-0.13.1-py2.py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:9c9e855f5a4d06ce4213f31918653ab7de0c5a8d8107cd2427e44b42df547deb", size = 1235729, upload-time = "2025-01-25T09:16:53.018Z" },
-    { url = "https://files.pythonhosted.org/packages/57/5e/70bdd9579b35003a489fc850b5047beeda26328053ebadc1fb60f320f7db/soundfile-0.13.1-py2.py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:03267c4e493315294834a0870f31dbb3b28a95561b80b134f0bd3cf2d5f0e618", size = 1313646, upload-time = "2025-01-25T09:16:54.872Z" },
-]
-
 [[package]]
 name = "soupsieve"
 version = "2.8.3"
@@ -7082,21 +6383,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/46/2c/1462b1d0a634697ae9e55b3cecdcb64788e8b7d63f54d923fcd0bb140aed/soupsieve-2.8.3-py3-none-any.whl", hash = "sha256:ed64f2ba4eebeab06cc4962affce381647455978ffc1e36bb79a545b91f45a95", size = 37016, upload-time = "2026-01-20T04:27:01.012Z" },
 ]
 
-[[package]]
-name = "soxr"
-version = "1.1.0"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "numpy", marker = "sys_platform == 'linux'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/ed/11/27cebce4a108f77afea7c80545115536b45e3f11ebfb914f638fdd9ba847/soxr-1.1.0.tar.gz", hash = "sha256:9f228ae21c78fa9359ca98d8a5e8e91f30639e438e574133dace62c5b5309e44", size = 173067, upload-time = "2026-05-03T00:15:18.214Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/88/2b/2e5eba817a762a2ec589ff165b8bc5955b25a0ad140045f7cd8e45410543/soxr-1.1.0-cp312-abi3-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bf98c0d7b7d5ef5bf072fee8d3020e8b664f2d195933ea7bc5089267c2e22a06", size = 206529, upload-time = "2026-05-03T00:14:57.646Z" },
-    { url = "https://files.pythonhosted.org/packages/5c/f1/0e55195893228609c9a08c3b13b7a83a46c3a992cd00d3304f0f320cfb07/soxr-1.1.0-cp312-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3b033078e86f3c4a658e5697fac8995764fad9e799563616b630136b613167f1", size = 240413, upload-time = "2026-05-03T00:14:59.363Z" },
-    { url = "https://files.pythonhosted.org/packages/39/e4/80cd9aae0645513db1076d4384e8b2d895faf5009218b4a04348012c54fc/soxr-1.1.0-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:52c9ca84e3dc656d83acc424574770e20ea8e0704dc3842d4e27b0fe9d3ba449", size = 211405, upload-time = "2026-05-03T00:15:05.395Z" },
-    { url = "https://files.pythonhosted.org/packages/a6/d6/cc3c80ac9b2289da4cf46c5d53b05e4327e6f5560a25868d06f9e2213af1/soxr-1.1.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f4977323ef9c3aa3c2a26ff5fe0191c84b8fd759daf7afb1f25a91a55ad8b730", size = 244617, upload-time = "2026-05-03T00:15:07.134Z" },
-]
-
 [[package]]
 name = "stack-data"
 version = "0.6.3"
@@ -7286,7 +6572,7 @@ name = "thop"
 version = "0.1.1.post2209072238"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "torch", version = "2.8.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" },
+    { name = "torch", version = "2.11.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/bb/0f/72beeab4ff5221dc47127c80f8834b4bcd0cb36f6ba91c0b1d04a1233403/thop-0.1.1.post2209072238-py3-none-any.whl", hash = "sha256:01473c225231927d2ad718351f78ebf7cffe6af3bed464c4f1ba1ef0f7cdda27", size = 15443, upload-time = "2022-09-07T14:38:37.211Z" },
@@ -7304,38 +6590,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/b4/af/ce4df3ca29122d219c45d3e86e5ff9a9df03b8cf31afd76817b662c803a3/tifffile-2026.5.2-py3-none-any.whl", hash = "sha256:5129b53b826e768a5b1af26b765eeea75c2d0a227d2d12849617e0737588e105", size = 266420, upload-time = "2026-05-02T20:19:29.814Z" },
 ]
 
-[[package]]
-name = "tiktoken"
-version = "0.13.0"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "regex", marker = "sys_platform == 'linux'" },
-    { name = "requests", marker = "sys_platform == 'linux'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/e4/e5/5f3cb2159769d0f4324c0e9e87f9de3c4b1cd45848a96b2eb3566ad5ca77/tiktoken-0.13.0.tar.gz", hash = "sha256:c9435714c3a84c2319499de9a300c0e604449dd0799ff246458b3bb6a7f433c1", size = 38986, upload-time = "2026-05-15T04:51:27.153Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/74/ed/6bb8d05b9f731f749fee5c6f5ca63e981143c826a5985877330507bd13b7/tiktoken-0.13.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:3f277ebea5edd7b8bf03c6f9431e1d67d517530115572b2dc1d465326e8f88c7", size = 1115741, upload-time = "2026-05-15T04:50:34.475Z" },
-    { url = "https://files.pythonhosted.org/packages/34/de/2ca96b07a82d972b74fe4b46de055b79c904e45c7eab699354a0bfa697dc/tiktoken-0.13.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:a116178fa7e1b4065bff05214360373a65cac22f965be7b3f73d00a0dbfe7649", size = 1136523, upload-time = "2026-05-15T04:50:35.782Z" },
-    { url = "https://files.pythonhosted.org/packages/ee/dc/9dafec002c2d4424378563cf4cf5c7fb93631d2a55013c8b87554ee4012c/tiktoken-0.13.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2c397ddda233208345b01bd30f2fca79ff730e55731d0108a603f9bc57f6af3b", size = 1181954, upload-time = "2026-05-15T04:50:36.99Z" },
-    { url = "https://files.pythonhosted.org/packages/a1/d0/1f8578c45b2f24759b46f0b50d31878c63c73e6bf0f2227e10ec5c5408dc/tiktoken-0.13.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:95097e4f89b06403976e498abf61a0ee73a7497e73fb599cb211d8197a054d91", size = 1240069, upload-time = "2026-05-15T04:50:38.221Z" },
-    { url = "https://files.pythonhosted.org/packages/ef/8b/96cc178cc584e65d363134500f297790b06cd48cdeb1e8fcf7bbe60f4715/tiktoken-0.13.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:125bc05005e747f993a83dc67934249932d6e4209854452cd4c0b1d53fba3ba2", size = 1116355, upload-time = "2026-05-15T04:50:43.564Z" },
-    { url = "https://files.pythonhosted.org/packages/86/f5/bab735d2c72ea55404b295d02d092644eb5f7cc6205e34d35eb9abfb9ab2/tiktoken-0.13.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:5e6358911cab4adee6712da27d65573496a4f68cf8a2b5fca6a4ad10fc5748cf", size = 1135772, upload-time = "2026-05-15T04:50:44.782Z" },
-    { url = "https://files.pythonhosted.org/packages/4e/b9/6de04ebdf904edfaad87788011b3735087a0c9ea671b9027e1e4e965e8c8/tiktoken-0.13.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:975cbd78d085d75d26b59660e262736dcaed1e35f8f142cd6291025c01d25486", size = 1182415, upload-time = "2026-05-15T04:50:46.422Z" },
-    { url = "https://files.pythonhosted.org/packages/0d/9c/470a05f3b1caf038f44880e334d47ab674e0c80d514c66b375d14d5afa10/tiktoken-0.13.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:75ab9bc99fa020a4c283424590ecd7f3afd70c1c281cb3fa3192a6c3af9f9615", size = 1239879, upload-time = "2026-05-15T04:50:48.052Z" },
-    { url = "https://files.pythonhosted.org/packages/72/74/ca1541b053e7648254d2e4b42a253e1bb4359f2c91a0a8d49228c794e1a0/tiktoken-0.13.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:7de52e3f566d19b3b11bd37eea552c6c305ad74081f736882bd44d148ed4c48d", size = 1115518, upload-time = "2026-05-15T04:50:53.543Z" },
-    { url = "https://files.pythonhosted.org/packages/46/e3/93825eaf5a4a504795b787e5d5dea07fbeb3dabf97aa7b450be8bde59c89/tiktoken-0.13.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:51384448aa508e4df84c0f7c1dc3211c7f7b8096325660ee5fc82f3e11b381ce", size = 1136867, upload-time = "2026-05-15T04:50:55.191Z" },
-    { url = "https://files.pythonhosted.org/packages/8c/46/002b68de6827091d5ae90b048f326e8aad8d953520950e5ce1508879414f/tiktoken-0.13.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:e28157350f7ebf35008dd8e9e0fdb621f976e4230c881099c85e8cf07eaa50e2", size = 1181826, upload-time = "2026-05-15T04:50:56.296Z" },
-    { url = "https://files.pythonhosted.org/packages/db/c6/d393e3185a276505182f7abd93fe714f3c444a2be9180798fa052347504e/tiktoken-0.13.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:165cf1820ea4a354985c2490a5205d4cc74661c934aca79dd0368232fff94e0f", size = 1239489, upload-time = "2026-05-15T04:50:57.918Z" },
-    { url = "https://files.pythonhosted.org/packages/94/b0/c8ae9aff00d625c50659b4513e707a0462c4bf5d4d6cc1b802103225c02e/tiktoken-0.13.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:32e0c12305105002c047b3bb1070b0dd9a73b0cb3b2856a8972b810e7a4f5881", size = 1116036, upload-time = "2026-05-15T04:51:04.082Z" },
-    { url = "https://files.pythonhosted.org/packages/1b/ac/6a5dddd1d0a6018ecb389bd0353e6b4a515eb4d2286611bd0ace1937b9e1/tiktoken-0.13.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:5ba5fd62507a932d1241346179e3b39bc7bf7408f03c272652d93b3bedf5db24", size = 1135544, upload-time = "2026-05-15T04:51:05.229Z" },
-    { url = "https://files.pythonhosted.org/packages/f4/b8/585032b4384b2f7dcdaddcb52865c83a701a420d09e3c2b4a2be1c450c57/tiktoken-0.13.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:d108bc2d470fc53c8ecd24f2c0fd2b5f98c33e87cdb6aa2e9b8c5dced703d273", size = 1182217, upload-time = "2026-05-15T04:51:06.517Z" },
-    { url = "https://files.pythonhosted.org/packages/cd/b6/993ff1ded3958215fd341a847b8e5ffeb5de473f435296870d314fc91ac4/tiktoken-0.13.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:cb99cb5127449f58d0a2d5f5ccfb390d8dbdfd919c221246caaee29d8725ed51", size = 1239404, upload-time = "2026-05-15T04:51:07.843Z" },
-    { url = "https://files.pythonhosted.org/packages/49/b7/2ab43f62788a9266187a9bfc1d3af99ad83e5eaa25fbef168a69cd5ad14f/tiktoken-0.13.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:2b920b35805cd64585a37c3dc7ce65fba4d2d36016be01e1d7942482ca29093a", size = 1115526, upload-time = "2026-05-15T04:51:12.608Z" },
-    { url = "https://files.pythonhosted.org/packages/64/39/1494321ed323ce7a14d88e3cd6cb9058625977df1c6961ddc492bd10a9f3/tiktoken-0.13.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:493af3aa28a4aaf2e3d2600a2ee717252c9bf5ab38fff94eb5a02db5ab77e5ad", size = 1136466, upload-time = "2026-05-15T04:51:13.926Z" },
-    { url = "https://files.pythonhosted.org/packages/96/d9/dfd086aa2d918c563a140720e0ce296cada1634efd2783d5cf51e05f984e/tiktoken-0.13.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:6644c9c2b5cf3916f5a3641d7d12fdb3f006a7b3d9ff6acdaec44e29ab1ff91e", size = 1181863, upload-time = "2026-05-15T04:51:15.025Z" },
-    { url = "https://files.pythonhosted.org/packages/2f/68/a18b4f307086954fdae32714cb4f85562e34f9d34ab206e61f1816aa6018/tiktoken-0.13.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5cb65b60b9408563676d874a3a4ee573370066f0dc4e29d84e82e989c6517424", size = 1239218, upload-time = "2026-05-15T04:51:16.103Z" },
-]
-
 [[package]]
 name = "timm"
 version = "1.0.27"
@@ -7344,10 +6598,10 @@ dependencies = [
     { name = "huggingface-hub" },
     { name = "pyyaml" },
     { name = "safetensors" },
-    { name = "torch", version = "2.8.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" },
     { name = "torch", version = "2.11.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux'" },
-    { name = "torchvision", version = "0.23.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" },
+    { name = "torch", version = "2.11.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" },
     { name = "torchvision", version = "0.26.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux'" },
+    { name = "torchvision", version = "0.26.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/08/54/ece85b0eef3700c90db8271a43669b05a0ebbe2edb1962329c34374a297e/timm-1.0.27.tar.gz", hash = "sha256:315dfe63186ca9fb7ff941268941231fd5be259f2b4bb4afa28560ae1015cb9a", size = 2439861, upload-time = "2026-05-08T19:38:36.844Z" }
 wheels = [
@@ -7401,58 +6655,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/44/6f/7120676b6d73228c96e17f1f794d8ab046fc910d781c8d151120c3f1569e/toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b", size = 16588, upload-time = "2020-11-01T01:40:20.672Z" },
 ]
 
-[[package]]
-name = "torch"
-version = "2.8.0+cu128"
-source = { registry = "https://download.pytorch.org/whl/cu128" }
-resolution-markers = [
-    "(python_full_version >= '3.15' and platform_machine == 'AMD64' and sys_platform == 'linux') or (python_full_version >= '3.15' and platform_machine == 'x86_64' and sys_platform == 'linux')",
-    "(python_full_version == '3.14.*' and platform_machine == 'AMD64' and sys_platform == 'linux') or (python_full_version == '3.14.*' and platform_machine == 'x86_64' and sys_platform == 'linux')",
-    "(python_full_version == '3.13.*' and platform_machine == 'AMD64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform == 'linux')",
-    "(python_full_version < '3.13' and platform_machine == 'AMD64' and sys_platform == 'linux') or (python_full_version < '3.13' and platform_machine == 'x86_64' and sys_platform == 'linux')",
-    "(python_full_version >= '3.15' and platform_machine == 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.15' and platform_machine == 'arm64' and sys_platform == 'linux')",
-    "(python_full_version == '3.14.*' and platform_machine == 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.14.*' and platform_machine == 'arm64' and sys_platform == 'linux')",
-    "(python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and platform_machine == 'arm64' and sys_platform == 'linux')",
-    "(python_full_version < '3.13' and platform_machine == 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.13' and platform_machine == 'arm64' and sys_platform == 'linux')",
-    "python_full_version >= '3.15' and platform_machine != 'AMD64' and platform_machine != 'aarch64' and platform_machine != 'arm64' and platform_machine != 's390x' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version >= '3.15' and platform_machine == 's390x' and sys_platform == 'linux'",
-    "python_full_version == '3.14.*' and platform_machine != 'AMD64' and platform_machine != 'aarch64' and platform_machine != 'arm64' and platform_machine != 's390x' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine != 'AMD64' and platform_machine != 'aarch64' and platform_machine != 'arm64' and platform_machine != 's390x' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.14.*' and platform_machine == 's390x' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'linux'",
-    "python_full_version < '3.13' and platform_machine != 'AMD64' and platform_machine != 'aarch64' and platform_machine != 'arm64' and platform_machine != 's390x' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform == 'linux'",
-]
-dependencies = [
-    { name = "filelock", marker = "sys_platform == 'linux'" },
-    { name = "fsspec", marker = "sys_platform == 'linux'" },
-    { name = "jinja2", marker = "sys_platform == 'linux'" },
-    { name = "networkx", marker = "sys_platform == 'linux'" },
-    { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cuda-nvrtc-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cuda-runtime-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cudnn-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cufft-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cufile-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-curand-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cusolver-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cusparse-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cusparselt-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "setuptools", version = "79.0.1", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux'" },
-    { name = "sympy", marker = "sys_platform == 'linux'" },
-    { name = "triton", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "typing-extensions", marker = "sys_platform == 'linux'" },
-]
-wheels = [
-    { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.8.0%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:4354fc05bb79b208d6995a04ca1ceef6a9547b1c4334435574353d381c55087c", upload-time = "2025-10-01T23:51:02Z" },
-    { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.8.0%2Bcu128-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:3a852369a38dec343d45ecd0bc3660f79b88a23e0c878d18707f7c13bf49538f", upload-time = "2025-10-01T23:52:56Z" },
-    { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.8.0%2Bcu128-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:4295a22d69408e93d25f51e8d5d579345b6b802383e9414b0f3853ed433d53ae", upload-time = "2025-10-01T23:54:56Z" },
-]
-
 [[package]]
 name = "torch"
 version = "2.11.0"
@@ -7510,19 +6712,54 @@ wheels = [
 ]
 
 [[package]]
-name = "torchaudio"
-version = "2.8.0"
-source = { registry = "https://pypi.org/simple" }
+name = "torch"
+version = "2.11.0+cu128"
+source = { registry = "https://download.pytorch.org/whl/cu128" }
+resolution-markers = [
+    "(python_full_version >= '3.15' and platform_machine == 'AMD64' and sys_platform == 'linux') or (python_full_version >= '3.15' and platform_machine == 'x86_64' and sys_platform == 'linux')",
+    "(python_full_version == '3.14.*' and platform_machine == 'AMD64' and sys_platform == 'linux') or (python_full_version == '3.14.*' and platform_machine == 'x86_64' and sys_platform == 'linux')",
+    "(python_full_version == '3.13.*' and platform_machine == 'AMD64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform == 'linux')",
+    "(python_full_version < '3.13' and platform_machine == 'AMD64' and sys_platform == 'linux') or (python_full_version < '3.13' and platform_machine == 'x86_64' and sys_platform == 'linux')",
+    "(python_full_version >= '3.15' and platform_machine == 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.15' and platform_machine == 'arm64' and sys_platform == 'linux')",
+    "(python_full_version == '3.14.*' and platform_machine == 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.14.*' and platform_machine == 'arm64' and sys_platform == 'linux')",
+    "(python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and platform_machine == 'arm64' and sys_platform == 'linux')",
+    "(python_full_version < '3.13' and platform_machine == 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.13' and platform_machine == 'arm64' and sys_platform == 'linux')",
+    "python_full_version >= '3.15' and platform_machine != 'AMD64' and platform_machine != 'aarch64' and platform_machine != 'arm64' and platform_machine != 's390x' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version >= '3.15' and platform_machine == 's390x' and sys_platform == 'linux'",
+    "python_full_version == '3.14.*' and platform_machine != 'AMD64' and platform_machine != 'aarch64' and platform_machine != 'arm64' and platform_machine != 's390x' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version == '3.13.*' and platform_machine != 'AMD64' and platform_machine != 'aarch64' and platform_machine != 'arm64' and platform_machine != 's390x' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version == '3.14.*' and platform_machine == 's390x' and sys_platform == 'linux'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'linux'",
+    "python_full_version < '3.13' and platform_machine != 'AMD64' and platform_machine != 'aarch64' and platform_machine != 'arm64' and platform_machine != 's390x' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform == 'linux'",
+]
 dependencies = [
-    { name = "torch", version = "2.8.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" },
+    { name = "cuda-bindings", marker = "sys_platform == 'linux'" },
+    { name = "cuda-toolkit", extra = ["cublas", "cudart", "cufft", "cufile", "cupti", "curand", "cusolver", "cusparse", "nvjitlink", "nvrtc", "nvtx"], marker = "sys_platform == 'linux'" },
+    { name = "filelock", marker = "sys_platform == 'linux'" },
+    { name = "fsspec", marker = "sys_platform == 'linux'" },
+    { name = "jinja2", marker = "sys_platform == 'linux'" },
+    { name = "networkx", marker = "sys_platform == 'linux'" },
+    { name = "nvidia-cudnn-cu12", marker = "sys_platform == 'linux'" },
+    { name = "nvidia-cusparselt-cu12", marker = "sys_platform == 'linux'" },
+    { name = "nvidia-nccl-cu12", marker = "sys_platform == 'linux'" },
+    { name = "nvidia-nvshmem-cu12", marker = "sys_platform == 'linux'" },
+    { name = "setuptools", version = "79.0.1", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux'" },
+    { name = "sympy", marker = "sys_platform == 'linux'" },
+    { name = "triton", marker = "sys_platform == 'linux'" },
+    { name = "typing-extensions", marker = "sys_platform == 'linux'" },
 ]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/c7/0d/24dad878784f1edd62862f27173781669f0c71eb46368636787d1e364188/torchaudio-2.8.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:862e2e40bf09d865e5df080a84c1a39bbcef40e43140f4b1737eb3a389d3b38f", size = 1692930, upload-time = "2025-08-06T14:58:41.312Z" },
-    { url = "https://files.pythonhosted.org/packages/c2/a6/84d80f34472503e9eb82245d7df501c59602d75d7360e717fb9b84f91c5e/torchaudio-2.8.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:93a8583f280fe83ba021aa713319381ea71362cc87b67ee38e97a43cb2254aee", size = 4014607, upload-time = "2025-08-06T14:58:47.234Z" },
-    { url = "https://files.pythonhosted.org/packages/0d/a3/1c79a8ef29fe403b83bdfc033db852bc2a888b80c406325e5c6fb37a7f2d/torchaudio-2.8.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:09535a9b727c0793cd07c1ace99f3f353626281bcc3e30c2f2314e3ebc9d3f96", size = 1692755, upload-time = "2025-08-06T14:58:50.868Z" },
-    { url = "https://files.pythonhosted.org/packages/49/df/61941198e9ac6bcebfdd57e1836e4f3c23409308e3d8d7458f0198a6a366/torchaudio-2.8.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:d2a85b124494736241884372fe1c6dd8c15e9bc1931bd325838c5c00238c7378", size = 4013897, upload-time = "2025-08-06T14:59:01.66Z" },
-    { url = "https://files.pythonhosted.org/packages/71/76/40fec21b65bccfdc5c8cdb9d511033ab07a7ad4b05f0a5b07f85c68279fc/torchaudio-2.8.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:1951f10ed092f2dda57634f6a3950ef21c9d9352551aa84a9fccd51bbda18095", size = 1704199, upload-time = "2025-08-06T14:58:43.594Z" },
-    { url = "https://files.pythonhosted.org/packages/8e/53/95c3363413c2f2009f805144160b093a385f641224465fbcd717449c71fb/torchaudio-2.8.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:4f7d97494698d98854129349b12061e8c3398d33bd84c929fa9aed5fd1389f73", size = 4020596, upload-time = "2025-08-06T14:59:03.031Z" },
+    { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:9c8f38efee365cb9d334de8a83ce52fc7e5fc9e5a7b0853285efa1b69e00b0f2", upload-time = "2026-04-27T17:41:30Z" },
+    { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:d252cf975fb18c94a85336323ad425f473df56dab35a44b00399bd70c7a3b997", upload-time = "2026-04-27T17:42:06Z" },
+    { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:7db3580106bba044da5b8950f3fb8fe5f31999eaab3f6a3aa2ac5d202c3684d2", upload-time = "2026-04-27T17:45:35Z" },
+    { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:db964b33c55035a72ab3e2162287af8f1cc276039c65d015740cc88c26dcedf7", upload-time = "2026-04-27T17:46:18Z" },
+    { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:cd1cf1005c5fe419194ee294b7b584ba5ad0f2fb1778b3fe5a7b9c3f4617ddbc", upload-time = "2026-04-27T17:50:01Z" },
+    { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:74b628dbc71603977b09f4e140792c6e997081a35ef3421555f3f6e201b81210", upload-time = "2026-04-27T17:50:42Z" },
+    { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:baa52f7b8a53cab16587b10f1c27d1000ca033f97236878b685b75d5a1b92408", upload-time = "2026-04-27T17:54:24Z" },
+    { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:d389a850677f0d24dafae1573644034428d8d3b9c80b51d55ba62fed7e6c8777", upload-time = "2026-04-27T17:55:03Z" },
+    { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:06849e9311dbb0617c97557d9c26c99a9e1c4f2ac9cb8e9b6d9b420d522acb91", upload-time = "2026-04-27T17:58:48Z" },
+    { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:169a9987e1f84f0c5eee07544b3a34827a163ac9180e23abf0c3548f1335762c", upload-time = "2026-04-27T17:59:26Z" },
 ]
 
 [[package]]
@@ -7550,47 +6787,14 @@ version = "0.2.5"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "scipy" },
-    { name = "torch", version = "2.8.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" },
     { name = "torch", version = "2.11.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux'" },
+    { name = "torch", version = "2.11.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/87/ec/a40aa124660f0ee65e6760cb53df6a82ad91a1a3ef1da5e747f1336644dd/torchdiffeq-0.2.5.tar.gz", hash = "sha256:b50d3760d13fd138dcceac651f4b80396f44fefcebd037a033fecfeaa9cc12e7", size = 31197, upload-time = "2024-11-21T20:20:11.552Z" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/b9/35/537f64f2d0b3cfebaae0f903b4e3a3b239abcc99d0f73cb15b9cee9b8212/torchdiffeq-0.2.5-py3-none-any.whl", hash = "sha256:aa1db4bed13bd04952f28a53cdf4336d1ab60417c1d9698d7a239fec1cf2bcf8", size = 32902, upload-time = "2024-11-21T20:20:09.938Z" },
 ]
 
-[[package]]
-name = "torchvision"
-version = "0.23.0+cu128"
-source = { registry = "https://download.pytorch.org/whl/cu128" }
-resolution-markers = [
-    "(python_full_version >= '3.15' and platform_machine == 'AMD64' and sys_platform == 'linux') or (python_full_version >= '3.15' and platform_machine == 'x86_64' and sys_platform == 'linux')",
-    "(python_full_version == '3.14.*' and platform_machine == 'AMD64' and sys_platform == 'linux') or (python_full_version == '3.14.*' and platform_machine == 'x86_64' and sys_platform == 'linux')",
-    "(python_full_version == '3.13.*' and platform_machine == 'AMD64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform == 'linux')",
-    "(python_full_version < '3.13' and platform_machine == 'AMD64' and sys_platform == 'linux') or (python_full_version < '3.13' and platform_machine == 'x86_64' and sys_platform == 'linux')",
-    "(python_full_version >= '3.15' and platform_machine == 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.15' and platform_machine == 'arm64' and sys_platform == 'linux')",
-    "(python_full_version == '3.14.*' and platform_machine == 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.14.*' and platform_machine == 'arm64' and sys_platform == 'linux')",
-    "(python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and platform_machine == 'arm64' and sys_platform == 'linux')",
-    "(python_full_version < '3.13' and platform_machine == 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.13' and platform_machine == 'arm64' and sys_platform == 'linux')",
-    "python_full_version >= '3.15' and platform_machine != 'AMD64' and platform_machine != 'aarch64' and platform_machine != 'arm64' and platform_machine != 's390x' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version >= '3.15' and platform_machine == 's390x' and sys_platform == 'linux'",
-    "python_full_version == '3.14.*' and platform_machine != 'AMD64' and platform_machine != 'aarch64' and platform_machine != 'arm64' and platform_machine != 's390x' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine != 'AMD64' and platform_machine != 'aarch64' and platform_machine != 'arm64' and platform_machine != 's390x' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.14.*' and platform_machine == 's390x' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'linux'",
-    "python_full_version < '3.13' and platform_machine != 'AMD64' and platform_machine != 'aarch64' and platform_machine != 'arm64' and platform_machine != 's390x' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform == 'linux'",
-]
-dependencies = [
-    { name = "numpy", marker = "sys_platform == 'linux'" },
-    { name = "pillow", marker = "sys_platform == 'linux'" },
-    { name = "torch", version = "2.8.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" },
-]
-wheels = [
-    { url = "https://download-r2.pytorch.org/whl/cu128/torchvision-0.23.0%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:9cb3c13997afcb44057ca10d943c6c4cba3068afde0f370965abce9c89fcffa9", upload-time = "2025-08-05T20:11:52Z" },
-    { url = "https://download-r2.pytorch.org/whl/cu128/torchvision-0.23.0%2Bcu128-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:c63982f1973ba677b37e6663df0e07cb5381459b6f0572c2ca95eebd8dfeb742", upload-time = "2025-08-05T20:11:52Z" },
-    { url = "https://download-r2.pytorch.org/whl/cu128/torchvision-0.23.0%2Bcu128-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:0d6ff6489eb71e4c0bb08cf7cb253298c2520458b1bd67036733652acfa87f00", upload-time = "2025-08-05T20:11:52Z" },
-]
-
 [[package]]
 name = "torchvision"
 version = "0.26.0"
@@ -7643,6 +6847,46 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/d0/6a/09f3844c10643f6c0de5d95abc863420cfaf194c88c7dffd0ac523e2015f/torchvision-0.26.0-cp314-cp314t-win_amd64.whl", hash = "sha256:e9d0e022c19a78552fb055d0414d47fecb4a649309b9968573daea160ba6869c", size = 4454275, upload-time = "2026-03-23T18:12:27.487Z" },
 ]
 
+[[package]]
+name = "torchvision"
+version = "0.26.0+cu128"
+source = { registry = "https://download.pytorch.org/whl/cu128" }
+resolution-markers = [
+    "(python_full_version >= '3.15' and platform_machine == 'AMD64' and sys_platform == 'linux') or (python_full_version >= '3.15' and platform_machine == 'x86_64' and sys_platform == 'linux')",
+    "(python_full_version == '3.14.*' and platform_machine == 'AMD64' and sys_platform == 'linux') or (python_full_version == '3.14.*' and platform_machine == 'x86_64' and sys_platform == 'linux')",
+    "(python_full_version == '3.13.*' and platform_machine == 'AMD64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform == 'linux')",
+    "(python_full_version < '3.13' and platform_machine == 'AMD64' and sys_platform == 'linux') or (python_full_version < '3.13' and platform_machine == 'x86_64' and sys_platform == 'linux')",
+    "(python_full_version >= '3.15' and platform_machine == 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.15' and platform_machine == 'arm64' and sys_platform == 'linux')",
+    "(python_full_version == '3.14.*' and platform_machine == 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.14.*' and platform_machine == 'arm64' and sys_platform == 'linux')",
+    "(python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and platform_machine == 'arm64' and sys_platform == 'linux')",
+    "(python_full_version < '3.13' and platform_machine == 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.13' and platform_machine == 'arm64' and sys_platform == 'linux')",
+    "python_full_version >= '3.15' and platform_machine != 'AMD64' and platform_machine != 'aarch64' and platform_machine != 'arm64' and platform_machine != 's390x' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version >= '3.15' and platform_machine == 's390x' and sys_platform == 'linux'",
+    "python_full_version == '3.14.*' and platform_machine != 'AMD64' and platform_machine != 'aarch64' and platform_machine != 'arm64' and platform_machine != 's390x' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version == '3.13.*' and platform_machine != 'AMD64' and platform_machine != 'aarch64' and platform_machine != 'arm64' and platform_machine != 's390x' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version == '3.14.*' and platform_machine == 's390x' and sys_platform == 'linux'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'linux'",
+    "python_full_version < '3.13' and platform_machine != 'AMD64' and platform_machine != 'aarch64' and platform_machine != 'arm64' and platform_machine != 's390x' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform == 'linux'",
+]
+dependencies = [
+    { name = "numpy", marker = "sys_platform == 'linux'" },
+    { name = "pillow", marker = "sys_platform == 'linux'" },
+    { name = "torch", version = "2.11.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" },
+]
+wheels = [
+    { url = "https://download-r2.pytorch.org/whl/cu128/torchvision-0.26.0%2Bcu128-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:63e35234aed13b6edda37056f417b5c281249669db631e706811917af36b21d7", upload-time = "2026-04-09T23:21:35Z" },
+    { url = "https://download-r2.pytorch.org/whl/cu128/torchvision-0.26.0%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:ccf26b4b659cfce6f2208cb8326071d51c70219a34856dfdf468d1e19af52c0d", upload-time = "2026-03-23T15:36:22Z" },
+    { url = "https://download-r2.pytorch.org/whl/cu128/torchvision-0.26.0%2Bcu128-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:c4a9cacd521f2a4df0bcd9d8e96704771b928f478f1f3067e4085bb53a1da298", upload-time = "2026-04-09T23:21:37Z" },
+    { url = "https://download-r2.pytorch.org/whl/cu128/torchvision-0.26.0%2Bcu128-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:cb1f6184a7ba30fba40580e1a01a6604a86c55e79fdda187f40116ee680441ec", upload-time = "2026-03-23T15:36:22Z" },
+    { url = "https://download-r2.pytorch.org/whl/cu128/torchvision-0.26.0%2Bcu128-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:e594732552a8c2fee2ace9c6475c6c6904fc44ccca622ee6765a89a045416a44", upload-time = "2026-04-09T23:21:38Z" },
+    { url = "https://download-r2.pytorch.org/whl/cu128/torchvision-0.26.0%2Bcu128-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:6168abc019803ac9e97efce27eafd2fdb33db04dcc54a86039537729e5047b29", upload-time = "2026-03-23T15:36:23Z" },
+    { url = "https://download-r2.pytorch.org/whl/cu128/torchvision-0.26.0%2Bcu128-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:b3865fa227661dd75b7b28c96d3d14e739bd08bf0614132758922fe0e7206f91", upload-time = "2026-04-09T23:21:39Z" },
+    { url = "https://download-r2.pytorch.org/whl/cu128/torchvision-0.26.0%2Bcu128-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:aac647c9130f1f25f5c8f5bca3d95cfd96bdfac93ab54529690b088e64e4fa64", upload-time = "2026-03-23T15:36:23Z" },
+    { url = "https://download-r2.pytorch.org/whl/cu128/torchvision-0.26.0%2Bcu128-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:e2ee9e16ee4518292694537fcbd20d2d27044e381d92b864f637e82795796a84", upload-time = "2026-04-09T23:21:40Z" },
+    { url = "https://download-r2.pytorch.org/whl/cu128/torchvision-0.26.0%2Bcu128-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:b5772c55bfda4377df8f1930d43c4e0231ef231b0228eade4b227c8d3ba6e34e", upload-time = "2026-03-23T15:36:23Z" },
+]
+
 [[package]]
 name = "tornado"
 version = "6.5.5"
@@ -7715,15 +6959,19 @@ wheels = [
 
 [[package]]
 name = "triton"
-version = "3.4.0"
+version = "3.6.0"
 source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "setuptools", version = "79.0.1", source = { registry = "https://pypi.org/simple" }, marker = "(platform_machine == 'AMD64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
-]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/d0/66/b1eb52839f563623d185f0927eb3530ee4d5ffe9d377cdaf5346b306689e/triton-3.4.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:31c1d84a5c0ec2c0f8e8a072d7fd150cab84a9c239eaddc6706c081bfae4eb04", size = 155560068, upload-time = "2025-07-30T19:58:37.081Z" },
-    { url = "https://files.pythonhosted.org/packages/30/7b/0a685684ed5322d2af0bddefed7906674f67974aa88b0fae6e82e3b766f6/triton-3.4.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:00be2964616f4c619193cb0d1b29a99bd4b001d7dc333816073f92cf2a8ccdeb", size = 155569223, upload-time = "2025-07-30T19:58:44.017Z" },
-    { url = "https://files.pythonhosted.org/packages/20/63/8cb444ad5cdb25d999b7d647abac25af0ee37d292afc009940c05b82dda0/triton-3.4.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7936b18a3499ed62059414d7df563e6c163c5e16c3773678a3ee3d417865035d", size = 155659780, upload-time = "2025-07-30T19:58:51.171Z" },
+    { url = "https://files.pythonhosted.org/packages/17/5d/08201db32823bdf77a0e2b9039540080b2e5c23a20706ddba942924ebcd6/triton-3.6.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:374f52c11a711fd062b4bfbb201fd9ac0a5febd28a96fb41b4a0f51dde3157f4", size = 176128243, upload-time = "2026-01-20T16:16:07.857Z" },
+    { url = "https://files.pythonhosted.org/packages/ab/a8/cdf8b3e4c98132f965f88c2313a4b493266832ad47fb52f23d14d4f86bb5/triton-3.6.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:74caf5e34b66d9f3a429af689c1c7128daba1d8208df60e81106b115c00d6fca", size = 188266850, upload-time = "2026-01-20T16:00:43.041Z" },
+    { url = "https://files.pythonhosted.org/packages/3c/12/34d71b350e89a204c2c7777a9bba0dcf2f19a5bfdd70b57c4dbc5ffd7154/triton-3.6.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:448e02fe6dc898e9e5aa89cf0ee5c371e99df5aa5e8ad976a80b93334f3494fd", size = 176133521, upload-time = "2026-01-20T16:16:13.321Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/0b/37d991d8c130ce81a8728ae3c25b6e60935838e9be1b58791f5997b24a54/triton-3.6.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:10c7f76c6e72d2ef08df639e3d0d30729112f47a56b0c81672edc05ee5116ac9", size = 188289450, upload-time = "2026-01-20T16:00:49.136Z" },
+    { url = "https://files.pythonhosted.org/packages/ce/4e/41b0c8033b503fd3cfcd12392cdd256945026a91ff02452bef40ec34bee7/triton-3.6.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1722e172d34e32abc3eb7711d0025bb69d7959ebea84e3b7f7a341cd7ed694d6", size = 176276087, upload-time = "2026-01-20T16:16:18.989Z" },
+    { url = "https://files.pythonhosted.org/packages/35/f8/9c66bfc55361ec6d0e4040a0337fb5924ceb23de4648b8a81ae9d33b2b38/triton-3.6.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d002e07d7180fd65e622134fbd980c9a3d4211fb85224b56a0a0efbd422ab72f", size = 188400296, upload-time = "2026-01-20T16:00:56.042Z" },
+    { url = "https://files.pythonhosted.org/packages/49/55/5ecf0dcaa0f2fbbd4420f7ef227ee3cb172e91e5fede9d0ecaddc43363b4/triton-3.6.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ef5523241e7d1abca00f1d240949eebdd7c673b005edbbce0aca95b8191f1d43", size = 176138577, upload-time = "2026-01-20T16:16:25.426Z" },
+    { url = "https://files.pythonhosted.org/packages/df/3d/9e7eee57b37c80cec63322c0231bb6da3cfe535a91d7a4d64896fcb89357/triton-3.6.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a17a5d5985f0ac494ed8a8e54568f092f7057ef60e1b0fa09d3fd1512064e803", size = 188273063, upload-time = "2026-01-20T16:01:07.278Z" },
+    { url = "https://files.pythonhosted.org/packages/48/db/56ee649cab5eaff4757541325aca81f52d02d4a7cd3506776cad2451e060/triton-3.6.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0b3a97e8ed304dfa9bd23bb41ca04cdf6b2e617d5e782a8653d616037a5d537d", size = 176274804, upload-time = "2026-01-20T16:16:31.528Z" },
+    { url = "https://files.pythonhosted.org/packages/f6/56/6113c23ff46c00aae423333eb58b3e60bdfe9179d542781955a5e1514cb3/triton-3.6.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:46bd1c1af4b6704e554cad2eeb3b0a6513a980d470ccfa63189737340c7746a7", size = 188397994, upload-time = "2026-01-20T16:01:14.236Z" },
 ]
 
 [[package]]
@@ -7882,73 +7130,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/20/5b/885f479093f6627669d39b57bc3d4e674da532e1a4b247d473a61d8d2118/virtualenv-21.3.2-py3-none-any.whl", hash = "sha256:c58ea748fa50bb2a4367da5ba3d30b02458ed40b4ea888faad94021f3309f764", size = 7594558, upload-time = "2026-05-12T14:44:15.193Z" },
 ]
 
-[[package]]
-name = "vllm"
-version = "0.11.0"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "aiohttp", marker = "sys_platform == 'linux'" },
-    { name = "blake3", marker = "sys_platform == 'linux'" },
-    { name = "cachetools", marker = "sys_platform == 'linux'" },
-    { name = "cbor2", marker = "sys_platform == 'linux'" },
-    { name = "cloudpickle", marker = "sys_platform == 'linux'" },
-    { name = "compressed-tensors", marker = "sys_platform == 'linux'" },
-    { name = "depyf", marker = "sys_platform == 'linux'" },
-    { name = "diskcache", marker = "sys_platform == 'linux'" },
-    { name = "einops", marker = "sys_platform == 'linux'" },
-    { name = "fastapi", extra = ["standard"], marker = "sys_platform == 'linux'" },
-    { name = "filelock", marker = "sys_platform == 'linux'" },
-    { name = "gguf", marker = "sys_platform == 'linux'" },
-    { name = "lark", version = "1.2.2", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux'" },
-    { name = "llguidance", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'arm64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
-    { name = "lm-format-enforcer", marker = "sys_platform == 'linux'" },
-    { name = "mistral-common", extra = ["audio", "image"], marker = "sys_platform == 'linux'" },
-    { name = "msgspec", marker = "sys_platform == 'linux'" },
-    { name = "ninja", marker = "sys_platform == 'linux'" },
-    { name = "numba", marker = "sys_platform == 'linux'" },
-    { name = "numpy", marker = "sys_platform == 'linux'" },
-    { name = "openai", marker = "sys_platform == 'linux'" },
-    { name = "openai-harmony", marker = "sys_platform == 'linux'" },
-    { name = "opencv-python-headless", marker = "sys_platform == 'linux'" },
-    { name = "outlines-core", marker = "sys_platform == 'linux'" },
-    { name = "partial-json-parser", marker = "sys_platform == 'linux'" },
-    { name = "pillow", marker = "sys_platform == 'linux'" },
-    { name = "prometheus-client", marker = "sys_platform == 'linux'" },
-    { name = "prometheus-fastapi-instrumentator", marker = "sys_platform == 'linux'" },
-    { name = "protobuf", marker = "sys_platform == 'linux'" },
-    { name = "psutil", marker = "sys_platform == 'linux'" },
-    { name = "py-cpuinfo", marker = "sys_platform == 'linux'" },
-    { name = "pybase64", marker = "sys_platform == 'linux'" },
-    { name = "pydantic", marker = "sys_platform == 'linux'" },
-    { name = "python-json-logger", marker = "sys_platform == 'linux'" },
-    { name = "pyyaml", marker = "sys_platform == 'linux'" },
-    { name = "pyzmq", marker = "sys_platform == 'linux'" },
-    { name = "ray", extra = ["cgraph"], marker = "sys_platform == 'linux'" },
-    { name = "regex", marker = "sys_platform == 'linux'" },
-    { name = "requests", marker = "sys_platform == 'linux'" },
-    { name = "scipy", marker = "sys_platform == 'linux'" },
-    { name = "sentencepiece", marker = "sys_platform == 'linux'" },
-    { name = "setproctitle", marker = "sys_platform == 'linux'" },
-    { name = "setuptools", version = "79.0.1", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux'" },
-    { name = "six", marker = "sys_platform == 'linux'" },
-    { name = "tiktoken", marker = "sys_platform == 'linux'" },
-    { name = "tokenizers", marker = "sys_platform == 'linux'" },
-    { name = "torch", version = "2.8.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" },
-    { name = "torchaudio", marker = "sys_platform == 'linux'" },
-    { name = "torchvision", version = "0.23.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" },
-    { name = "tqdm", marker = "sys_platform == 'linux'" },
-    { name = "transformers", marker = "sys_platform == 'linux'" },
-    { name = "typing-extensions", marker = "sys_platform == 'linux'" },
-    { name = "watchfiles", marker = "sys_platform == 'linux'" },
-    { name = "xformers", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "xgrammar", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'arm64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/82/5a/36d2351206f4d8d871b10780f874d03957985e08298d430cc837723e07af/vllm-0.11.0.tar.gz", hash = "sha256:f435a64c24e9c4178d657a76f8edd8548ddc444012f7d06a9f79ac3a6392bfae", size = 10822208, upload-time = "2025-10-04T01:39:57.798Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/47/33/d19e0763c34392ec956534536fa837c060495bfff31ed83452135ea7608d/vllm-0.11.0-cp38-abi3-manylinux1_x86_64.whl", hash = "sha256:3861c75ff2b12e24f6d179ff5c084d791b42ded8675d76c8706697c79f68cd62", size = 438217982, upload-time = "2025-10-04T01:39:32.382Z" },
-    { url = "https://files.pythonhosted.org/packages/d7/bf/973444bb959fc7acbbeb3d226bd4d135dcd49b6af174b29aab1b50e2d710/vllm-0.11.0-cp38-abi3-manylinux2014_aarch64.whl", hash = "sha256:52369c9ee949944354bdc7afc88ded2d1ed02b098bf90db06cf80098a19787b7", size = 401003969, upload-time = "2025-10-04T01:39:50.251Z" },
-]
-
 [[package]]
 name = "wandb"
 version = "0.24.2"
@@ -8199,39 +7380,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/1f/f6/a933bd70f98e9cf3e08167fc5cd7aaaca49147e48411c0bd5ae701bb2194/wrapt-1.17.3-py3-none-any.whl", hash = "sha256:7171ae35d2c33d326ac19dd8facb1e82e5fd04ef8c6c0e394d7af55a55051c22", size = 23591, upload-time = "2025-08-12T05:53:20.674Z" },
 ]
 
-[[package]]
-name = "xformers"
-version = "0.0.32.post1"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "numpy", marker = "(platform_machine == 'AMD64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
-    { name = "torch", version = "2.8.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "(platform_machine == 'AMD64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/6f/33/3b9c4d3d5b2da453d27de891df4ad653ac5795324961aa3a5c15b0353fe6/xformers-0.0.32.post1.tar.gz", hash = "sha256:1de84a45c497c8d92326986508d81f4b0a8c6be4d3d62a29b8ad6048a6ab51e1", size = 12106196, upload-time = "2025-08-14T18:07:45.486Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/6b/df/6817346f1a77278315d5fe1fc9f239ba3282ba36e8ab3256babd448dde62/xformers-0.0.32.post1-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:5f245b5555188da112070d8fefb6b7ae1ae47422856521d66c837e9d2352fbe4", size = 117199943, upload-time = "2025-08-14T18:07:34.78Z" },
-]
-
-[[package]]
-name = "xgrammar"
-version = "0.1.25"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "ninja", marker = "(platform_machine == 'AMD64' and sys_platform == 'linux') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'arm64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
-    { name = "numpy", marker = "(platform_machine == 'AMD64' and sys_platform == 'linux') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'arm64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
-    { name = "pydantic", marker = "(platform_machine == 'AMD64' and sys_platform == 'linux') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'arm64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
-    { name = "torch", version = "2.8.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "(platform_machine == 'AMD64' and sys_platform == 'linux') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'arm64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
-    { name = "transformers", marker = "(platform_machine == 'AMD64' and sys_platform == 'linux') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'arm64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
-    { name = "triton", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "typing-extensions", marker = "(platform_machine == 'AMD64' and sys_platform == 'linux') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'arm64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/f2/a9/dc3c63cf7f082d183711e46ef34d10d8a135c2319dc581905d79449f52ea/xgrammar-0.1.25.tar.gz", hash = "sha256:70ce16b27e8082f20808ed759b0733304316facc421656f0f30cfce514b5b77a", size = 2297187, upload-time = "2025-09-21T05:58:58.942Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/99/9c/39bb38680be3b6d6aa11b8a46a69fb43e2537d6728710b299fa9fc231ff0/xgrammar-0.1.25-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c519518ebc65f75053123baaf23776a21bda58f64101a64c2fc4aa467c9cd480", size = 8519097, upload-time = "2025-09-21T05:58:40.831Z" },
-    { url = "https://files.pythonhosted.org/packages/c6/c2/695797afa9922c30c45aa94e087ad33a9d87843f269461b622a65a39022a/xgrammar-0.1.25-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47fdbfc6007df47de2142613220292023e88e4a570546b39591f053e4d9ec33f", size = 8712184, upload-time = "2025-09-21T05:58:43.142Z" },
-    { url = "https://files.pythonhosted.org/packages/e1/48/df8c52a22f47f1e3237d9457fd6fefe8b9bca75a13a81d1901690260c86b/xgrammar-0.1.25-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7a1a6a638167d704a22a0c9670e2176104c38e38c351286a07a77143e22f9053", size = 8710998, upload-time = "2025-09-21T05:58:47.731Z" },
-]
-
 [[package]]
 name = "xxhash"
 version = "3.7.0"

From a18d969753fc702f590c1af3b84534fbfa2ca418 Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Wed, 3 Jun 2026 16:21:17 +0200
Subject: [PATCH 31/45] tests(annotations): fix stale canned-VLM markers +
 action_record style assertion
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The annotation tests had never actually run in CI (collection failed on
the missing 'datasets' extra); now that they do, three stale assertions
surfaced against the evolved pipeline:

  * test_module1_plan_memory_subtask_smoke: the memory canned-responder
    marker 'Update the memory' no longer appears in module_1_memory.txt
    (now 'compressed semantic memory'), so the stub returned no memory
    row and the {subtask,plan,memory} subset check failed. Marker
    updated to match the current prompt.
  * test_module2_mid_episode_emits_paired_interjection_and_speech: the
    interjection marker 'Write ONE interjection' is now 'Write ONE
    compact interjection' in module_2_interjection.txt, so 0 interjections
    were emitted. Marker updated.
  * tests/datasets/test_language.py::test_style_registry_routes_columns:
    PERSISTENT_STYLES gained 'action_record' in this PR; add it to the
    expected set.

These are test/prompt-marker syncs — no production behavior change.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 tests/annotations/test_modules.py | 12 +++++-------
 tests/datasets/test_language.py   |  2 +-
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/tests/annotations/test_modules.py b/tests/annotations/test_modules.py
index 189481169..021cd207f 100644
--- a/tests/annotations/test_modules.py
+++ b/tests/annotations/test_modules.py
@@ -88,7 +88,7 @@ def test_module1_plan_memory_subtask_smoke(fixture_dataset_root: Path, tmp_path:
                     {"text": "place the sponge into the sink", "start": 0.8, "end": 1.1},
                 ]
             },
-            "Update the memory": {"memory": "wiped the counter once"},
+            "compressed semantic memory": {"memory": "wiped the counter once"},
         },
     )
     module = PlanSubtasksMemoryModule(vlm=vlm, config=PlanConfig())
@@ -151,12 +151,10 @@ def test_module2_mid_episode_emits_paired_interjection_and_speech(
         {
             "acknowledgement the robot": {"text": "OK."},
             # Marker matches the distinctive line of
-            # ``module_2_interjection.txt``. The old marker
-            # ("ONE realistic interruption") came from a previous prompt
-            # version that asked for counterfactual interjections; the
-            # current design anchors on subtask boundaries instead, so
-            # the prompt and its marker changed.
-            "Write ONE interjection": {
+            # ``module_2_interjection.txt`` ("Write ONE compact
+            # interjection ..."). Keep this in sync with that prompt's
+            # wording — the canned responder matches on substring.
+            "Write ONE compact interjection": {
                 "interjection": "now wipe the counter please",
                 "speech": "On it.",
             },
diff --git a/tests/datasets/test_language.py b/tests/datasets/test_language.py
index 52c7b3708..2846dab1d 100644
--- a/tests/datasets/test_language.py
+++ b/tests/datasets/test_language.py
@@ -64,7 +64,7 @@ def test_validate_feature_language_warns_only_on_non_empty_value(caplog):
 
 
 def test_style_registry_routes_columns():
-    assert {"subtask", "plan", "memory", "motion", "task_aug"} == PERSISTENT_STYLES
+    assert {"subtask", "plan", "memory", "motion", "task_aug", "action_record"} == PERSISTENT_STYLES
     assert {"interjection", "vqa", "trace"} == EVENT_ONLY_STYLES
     assert PERSISTENT_STYLES | EVENT_ONLY_STYLES <= STYLE_REGISTRY
 

From b9a01873354c6cc483843d36e935b3a3bb71055d Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Wed, 3 Jun 2026 16:28:40 +0200
Subject: [PATCH 32/45] =?UTF-8?q?annotate:=20drop=20local=20in-process=20V?=
 =?UTF-8?q?LM=20backends=20=E2=80=94=20HF=20Jobs=20(openai)=20only=20for?=
 =?UTF-8?q?=20now?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The shipped workflow is Hugging Face Jobs
(examples/annotations/run_hf_job.py): it serves the model with vLLM in
the vllm/vllm-openai image, and the pipeline talks to it over the
OpenAI-compatible API. The in-process vllm / transformers local backends
added support surface (and the vllm one pinned an old torch) without
being part of that path, so they are removed for now.

  * vlm_client.make_vlm_client: keep only backend='openai' (+ 'stub'
    rejected with the usual guidance). Requesting 'vllm'/'transformers'
    now raises a clear 'not supported for now — use the HF Jobs flow'
    error. Removed _make_vllm_client and _make_transformers_client.
  * config: backend comment updated (openai-only); default model_id
    bumped to Qwen/Qwen3.6-27B to match run_hf_job.
  * docs/annotation_pipeline.mdx: remove the '## Running locally'
    section; the launcher description now says one vLLM server per GPU
    over the OpenAI API, and the 'One Qwen-VL pass' note drops the
    'vLLM/transformers fallback' wording.

Tests are unaffected (they construct StubVlmClient directly; nothing
referenced the removed backends).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 docs/source/annotation_pipeline.mdx           |  43 +-----
 .../annotations/steerable_pipeline/config.py  |  10 +-
 .../steerable_pipeline/vlm_client.py          | 137 +++---------------
 3 files changed, 32 insertions(+), 158 deletions(-)

diff --git a/docs/source/annotation_pipeline.mdx b/docs/source/annotation_pipeline.mdx
index 05e4d103d..7fd27b1f2 100644
--- a/docs/source/annotation_pipeline.mdx
+++ b/docs/source/annotation_pipeline.mdx
@@ -48,38 +48,6 @@ anything already there. Implementations of those tools live under
 `src/lerobot/tools/`; one file per tool, registered via
 `TOOL_REGISTRY`. See the [Tools](./tools) doc for the authoring guide.
 
-## Running locally
-
-Install the extra and invoke the console script. Episode-level
-concurrency comes from `--executor.episode_parallelism` (default 16);
-that is the only knob the in-process executor exposes.
-
-```bash
-uv sync --extra annotations
-uv run lerobot-annotate \
-  --root=/path/to/dataset \
-  --vlm.model_id=Qwen/Qwen2.5-VL-7B-Instruct
-```
-
-The pipeline attaches actual camera footage to every `plan` /
-`interjections` / `vqa` prompt by default, decoded from the dataset's
-first `observation.images.*` stream. Override with
-`--vlm.camera_key=observation.images.<name>` to pin a specific
-viewpoint. Datasets with no video tracks fall back to text-only prompts
-automatically.
-
-**The `plan` module sees the whole episode as one video block.** Subtask
-decomposition gets a `{"type":"video", "video":[<frames>]}` block
-covering the entire demonstration; Qwen-VL pools temporally on its own
-and decides where to cut. There is no keyframe stride or count knob —
-`--plan.max_video_frames` (default 128) only caps the frames packed
-into the video block as a model-capacity bound. The `interjections`
-module attaches a short window of frames straddling the interjection
-timestamp. The `vqa` module grounds each VQA pair on a single frame —
-its `--vqa.K` knob sets how many consecutive frames each emission tick
-anchors, and every anchored frame gets its own VQA pair on that one
-frame (there is no per-pair frame window).
-
 ## Running on Hugging Face Jobs
 
 Distributed annotation is delegated to
@@ -91,10 +59,11 @@ HF_TOKEN=hf_... uv run python examples/annotations/run_hf_job.py
 ```
 
 [`examples/annotations/run_hf_job.py`](https://github.com/huggingface/lerobot/blob/main/examples/annotations/run_hf_job.py)
-spawns one `h200x2` job that:
+spawns a multi-GPU `h200` job that:
 
 1. installs the branch under test plus the annotation extras,
-2. boots two vllm servers (one per GPU) for the chosen model,
+2. boots one vLLM server per GPU (in the `vllm/vllm-openai` image) for the
+   chosen model, which the pipeline drives over the OpenAI-compatible API,
 3. runs the `plan` / `interjections` / `vqa` modules across the dataset
    via `lerobot-annotate`,
 4. uploads the annotated dataset to `--push_to_hub`.
@@ -126,9 +95,9 @@ Two things drive the scope:
    speech) only appear on the exact frame whose timestamp matches the
    emission. The pipeline writes timestamps taken straight from the
    source parquet — no floating-point recomputation.
-2. **One Qwen-VL pass.** All three modules share a single VLM client
-   (vLLM if available, transformers fallback) so the cost is one model
-   load per dataset, not three.
+2. **One Qwen-VL pass.** All three modules share a single VLM client (the
+   OpenAI-compatible client talking to the job's vLLM server) so the cost
+   is one model load per dataset, not three.
 
 ## Module independence and staged reruns
 
diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py
index cdcf38072..470dccfc1 100644
--- a/src/lerobot/annotations/steerable_pipeline/config.py
+++ b/src/lerobot/annotations/steerable_pipeline/config.py
@@ -295,11 +295,13 @@ class VqaConfig:
 class VlmConfig:
     """Shared Qwen-VL client configuration."""
 
-    # One of ``vllm``, ``transformers``, ``openai``, or ``stub`` (tests).
-    # ``openai`` talks to a local OpenAI-compatible server; the CLI
-    # auto-spawns one when ``auto_serve=True``.
+    # Only ``openai`` is supported for now (the in-process ``vllm`` /
+    # ``transformers`` local backends were removed — the shipped workflow
+    # is Hugging Face Jobs). ``openai`` talks to an OpenAI-compatible vLLM
+    # server; the CLI auto-spawns one in-job when ``auto_serve=True``.
+    # ``stub`` is for unit tests (construct ``StubVlmClient`` directly).
     backend: str = "openai"
-    model_id: str = "Qwen/Qwen3.6-35B-A3B-FP8"
+    model_id: str = "Qwen/Qwen3.6-27B"
 
     # OpenAI-compatible server endpoint; ``EMPTY`` works for local servers.
     api_base: str = "http://localhost:8000/v1"
diff --git a/src/lerobot/annotations/steerable_pipeline/vlm_client.py b/src/lerobot/annotations/steerable_pipeline/vlm_client.py
index 8aa7d01c6..d0d9e56a9 100644
--- a/src/lerobot/annotations/steerable_pipeline/vlm_client.py
+++ b/src/lerobot/annotations/steerable_pipeline/vlm_client.py
@@ -192,132 +192,35 @@ class _GenericTextClient:
 
 
 def make_vlm_client(config: VlmConfig) -> VlmClient:
-    """Build the shared VLM client per the configured backend.
+    """Build the shared VLM client.
 
-    For ``stub``, callers should construct :class:`StubVlmClient` directly with
-    a responder callable. ``stub`` here is rejected to make accidental misuse
-    obvious.
+    Only the ``openai`` backend is supported for now. The shipped workflow
+    is Hugging Face Jobs (``examples/annotations/run_hf_job.py``): it boots
+    a vLLM server inside the ``vllm/vllm-openai`` image and the pipeline
+    talks to it over the OpenAI-compatible API (``--vlm.backend=openai``,
+    optionally auto-spawning the server via ``auto_serve`` /
+    ``serve_command``). The former in-process ``vllm`` / ``transformers``
+    backends were removed to keep the support surface to the HF Jobs path.
+
+    For ``stub``, construct :class:`StubVlmClient` directly with a responder
+    callable; it is rejected here to make accidental misuse obvious.
     """
+    if config.backend == "openai":
+        return _make_openai_client(config)
     if config.backend == "stub":
         raise ValueError(
             "Use StubVlmClient(...) directly for the stub backend; make_vlm_client builds real clients."
         )
-    if config.backend == "vllm":
-        return _make_vllm_client(config)
-    if config.backend == "transformers":
-        return _make_transformers_client(config)
-    if config.backend == "openai":
-        return _make_openai_client(config)
+    if config.backend in {"vllm", "transformers"}:
+        raise ValueError(
+            f"backend={config.backend!r} (in-process local model) is not supported for now — "
+            "only backend='openai' (the Hugging Face Jobs flow) is. Run the pipeline via "
+            "examples/annotations/run_hf_job.py, which serves the model with vLLM in the "
+            "vllm/vllm-openai image and talks to it over the OpenAI-compatible API."
+        )
     raise ValueError(f"Unknown VLM backend: {config.backend!r}")
 
 
-def _make_vllm_client(config: VlmConfig) -> VlmClient:
-    try:
-        from vllm import LLM, SamplingParams  # type: ignore[import-not-found]
-    except ImportError as exc:
-        raise ImportError(
-            "vllm is required for backend='vllm'. Install it separately with "
-            "`pip install vllm` (it is not a hard dependency of the "
-            "``annotations`` extra because it pins an older torch). The HF "
-            "Jobs launcher uses the vllm/vllm-openai image + backend='openai' "
-            "instead."
-        ) from exc
-    # Workaround for cuDNN 9.x + torch 2.8 conv3d regression that surfaces
-    # as CUDNN_STATUS_NOT_INITIALIZED in Qwen-VL vision-tower patch
-    # embedders. Setting LEROBOT_DISABLE_CUDNN=1 forces native PyTorch
-    # convolution kernels — slower but functional.
-    if os.environ.get("LEROBOT_DISABLE_CUDNN", "").lower() in {"1", "true", "yes"}:
-        import torch as _torch  # noqa: PLC0415  - optional GPU dep, deferred
-
-        _torch.backends.cudnn.enabled = False
-    llm_kwargs: dict[str, Any] = {
-        "model": config.model_id,
-        "tensor_parallel_size": config.tensor_parallel_size,
-        "gpu_memory_utilization": config.gpu_memory_utilization,
-        "trust_remote_code": config.trust_remote_code,
-    }
-    if config.max_model_len is not None:
-        llm_kwargs["max_model_len"] = config.max_model_len
-    llm = LLM(**llm_kwargs)
-
-    def _gen(batch: Sequence[Sequence[dict[str, Any]]], max_tok: int, temp: float) -> list[str]:
-        # ``guided_decoding`` would speed up parsing but its API differs across
-        # vllm releases (dict vs GuidedDecodingParams). The _GenericTextClient
-        # wrapper already has a one-retry JSON-recovery path, so we skip it.
-        params = SamplingParams(max_tokens=max_tok, temperature=temp)
-        # ``llm.chat`` handles chat-template application + multimodal input
-        # extraction (image/video blocks) internally, which ``llm.generate``
-        # does not.
-        outputs = llm.chat([list(m) for m in batch], params)
-        return [o.outputs[0].text for o in outputs]
-
-    return _GenericTextClient(_gen, config)
-
-
-def _make_transformers_client(config: VlmConfig) -> VlmClient:
-    try:
-        import torch  # type: ignore[import-not-found]
-        import transformers  # type: ignore[import-not-found]
-        from transformers import AutoProcessor  # type: ignore[import-not-found]
-    except ImportError as exc:
-        raise ImportError("transformers + torch are required for backend='transformers'.") from exc
-    auto_cls = getattr(transformers, "AutoModelForImageTextToText", None) or getattr(
-        transformers, "AutoModelForVision2Seq", None
-    )
-    if auto_cls is None:
-        raise ImportError(
-            "Neither AutoModelForImageTextToText nor AutoModelForVision2Seq is available in this "
-            "transformers version. Install transformers>=4.45 (which has AutoModelForImageTextToText) "
-            "for VL models."
-        )
-    processor = AutoProcessor.from_pretrained(config.model_id, trust_remote_code=config.trust_remote_code)
-    use_accelerate = os.environ.get("LEROBOT_TRANSFORMERS_DEVICE_MAP", "manual") != "manual"
-    # ``device_map='auto'`` triggers a known std::bad_alloc on the Qwen3-VL
-    # post-load dispatch path (the alloc fails in accelerate's hook setup
-    # even with TBs of host RAM). Default to manual: load on CPU with
-    # ``low_cpu_mem_usage=True``, then ``.to("cuda")``. Set
-    # ``LEROBOT_TRANSFORMERS_DEVICE_MAP=auto`` to opt back into the old path.
-    if use_accelerate:
-        model = auto_cls.from_pretrained(
-            config.model_id,
-            torch_dtype="auto",
-            device_map="auto",
-            low_cpu_mem_usage=True,
-            trust_remote_code=config.trust_remote_code,
-        )
-    else:
-        import torch as _torch  # noqa: PLC0415  - optional GPU dep, deferred
-
-        model = auto_cls.from_pretrained(
-            config.model_id,
-            torch_dtype=_torch.bfloat16,
-            low_cpu_mem_usage=True,
-            trust_remote_code=config.trust_remote_code,
-        )
-        model = model.to("cuda")
-    model.eval()
-
-    def _gen(batch: Sequence[Sequence[dict[str, Any]]], max_tok: int, temp: float) -> list[str]:
-        outs: list[str] = []
-        for messages in batch:
-            text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
-            inputs = processor(text=[text], return_tensors="pt").to(model.device)
-            with torch.no_grad():
-                gen = model.generate(
-                    **inputs,
-                    max_new_tokens=max_tok,
-                    temperature=temp,
-                    do_sample=temp > 0.0,
-                )
-            decoded = processor.batch_decode(
-                gen[:, inputs["input_ids"].shape[-1] :], skip_special_tokens=True
-            )[0]
-            outs.append(decoded)
-        return outs
-
-    return _GenericTextClient(_gen, config)
-
-
 def _make_openai_client(config: VlmConfig) -> VlmClient:
     """Backend that talks to any OpenAI-compatible server.
 

From 3a24e426df2a54e1c71dc649743bfdc29773612c Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Wed, 3 Jun 2026 16:38:06 +0200
Subject: [PATCH 33/45] language: register action_record in CORE_STYLES so
 STYLE_REGISTRY contains it

action_record is in PERSISTENT_STYLES but was missing from CORE_STYLES,
so STYLE_REGISTRY (= CORE_STYLES | EXTENDED_STYLES) didn't contain it and
the PERSISTENT_STYLES | EVENT_ONLY_STYLES <= STYLE_REGISTRY invariant in
test_style_registry_routes_columns failed. Add it to CORE_STYLES so the
registry, the persistent-set, and column_for_style() stay consistent.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/lerobot/datasets/language.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/lerobot/datasets/language.py b/src/lerobot/datasets/language.py
index aaca34e23..f3d371545 100644
--- a/src/lerobot/datasets/language.py
+++ b/src/lerobot/datasets/language.py
@@ -36,6 +36,7 @@ CORE_STYLES = {
     "vqa",
     "trace",
     "task_aug",
+    "action_record",
 }
 # Project-local styles can be registered at import time by appending to
 # ``EXTENDED_STYLES`` before ``column_for_style`` is called. Anything added

From eba3ab37413035dc781db40b8f8dda7d15607e3d Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Wed, 3 Jun 2026 18:30:46 +0200
Subject: [PATCH 34/45] =?UTF-8?q?annotate:=20address=20review=20feedback?=
 =?UTF-8?q?=20=E2=80=94=20bug=20fixes,=20docs/code=20drift,=20naming,=20cl?=
 =?UTF-8?q?eanup?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bugs
  * validator: don't re-raise on unknown style. The second column_for_style
    lookup (used to route persistent vs event) now sits in try/except so an
    unknown style is recorded by _check_column_routing and skipped instead
    of crashing the whole validation pass.
  * general_vqa._target_cameras: when restrict_to_default_camera is set but
    the configured camera_key isn't one the provider exposes, warn and fall
    back to all cameras instead of returning a phantom key that KeyErrors
    deep in frame decode.
  * interjections: clamp the low end of the interjection window to
    frame_timestamps[0] rather than a hardcoded 0.0 (datasets can start at
    non-zero t).

Docs / code drift
  * annotation_pipeline.mdx: drop the phantom 'vocabulary discovery / phase
    0 / --vocabulary.* / canonical_vocabulary.json' section (none of it
    exists); describe the real describe->segment + coverage-stitch flow.
    Soften the src/lerobot/tools/ + TOOL_REGISTRY reference to 'not part of
    this PR' (matches tools.mdx, which already marks the runtime layer as
    not-yet-implemented). Fix the --push_to_hub/--new_repo_id wording. Note
    the default is now a single h200. Add a 'Contributing new modules'
    section inviting module / prompt / quality contributions.
  * executor docstring: six phases, no phantom phase 0.

run_hf_job.py
  * add the Apache 2.0 license header (was flagged repeatedly).
  * default to a single GPU: flavor=h200, parallel_servers=1, num_gpus=1
    (scale to h200x4 noted in the docstring).
  * pin the install to @main instead of the feature branch (won't break
    after merge).

Naming / cleanup
  * rename dest_repo_id -> new_repo_id across config / script / example /
    test to match the LeRobot dataset edit tools.
  * rename prompt templates module_N_*.txt -> descriptive (plan_*,
    interjections_*, vqa.txt) and update every load_prompt() call.
  * remove dead _messages_to_prompt (used only by the removed in-process
    backends).
  * declare _warned_decode_fail (frames) and _warned_no_camera (vqa) as
    real init=False dataclass fields instead of getattr monkey-patches.
  * scope bandit B607 to the two ffmpeg subprocess.run sites via
    '# nosec B607' and drop it from the global skip list.

Tests
  * fix stale canned-VLM markers ('ONE realistic interruption' ->
    'compact interjection', 'Update the memory' -> 'compressed semantic
    memory') and drop the dead 'concise hierarchical PLAN' plan responders
    (plan generation is deterministic now) in run_e2e_smoke,
    test_pipeline_recipe_render, test_modules.
  * run_e2e_smoke now asserts interjection + speech rows are produced so a
    stale marker can't silently pass again.
  * drop remaining 'PR 1' / 'PR 2' references from test comments / names.

Verified: tests/annotations + tests/datasets/test_language +
tests/scripts/test_lerobot_annotate (31 passed); make-style E2E smoke
(interjections=1 speech_atoms=2); pre-commit (ruff, mypy, bandit,
prettier) clean.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 docs/source/annotation_pipeline.mdx           | 52 +++++++++++--------
 examples/annotations/run_hf_job.py            | 36 +++++++++----
 pyproject.toml                                |  2 +-
 .../annotations/steerable_pipeline/config.py  | 13 ++---
 .../steerable_pipeline/executor.py            |  8 +--
 .../annotations/steerable_pipeline/frames.py  | 10 ++--
 .../steerable_pipeline/modules/general_vqa.py | 16 ++++--
 .../modules/interjections_and_speech.py       |  7 +--
 .../modules/plan_subtasks_memory.py           | 16 +++---
 ...h.txt => interjections_initial_speech.txt} |  0
 ...ion.txt => interjections_interjection.txt} |  0
 ...tion_record.txt => plan_action_record.txt} |  0
 .../{module_1_memory.txt => plan_memory.txt}  |  0
 ...describe.txt => plan_subtask_describe.txt} |  0
 ...odule_1_subtasks.txt => plan_subtasks.txt} |  0
 ...sk_aug_axes.txt => plan_task_aug_axes.txt} |  0
 ...hrasings.txt => plan_task_rephrasings.txt} |  0
 ...e_1_video_task.txt => plan_video_task.txt} |  0
 .../prompts/{module_3_vqa.txt => vqa.txt}     |  0
 .../steerable_pipeline/validator.py           |  8 ++-
 .../steerable_pipeline/vlm_client.py          | 10 ----
 src/lerobot/scripts/lerobot_annotate.py       |  8 +--
 tests/annotations/run_e2e_smoke.py            | 23 ++++++--
 tests/annotations/test_modules.py             |  7 +--
 .../test_pipeline_recipe_render.py            | 32 ++++++------
 tests/annotations/test_writer.py              |  2 +-
 tests/scripts/test_lerobot_annotate.py        |  2 +-
 27 files changed, 148 insertions(+), 104 deletions(-)
 rename src/lerobot/annotations/steerable_pipeline/prompts/{module_2_initial_speech.txt => interjections_initial_speech.txt} (100%)
 rename src/lerobot/annotations/steerable_pipeline/prompts/{module_2_interjection.txt => interjections_interjection.txt} (100%)
 rename src/lerobot/annotations/steerable_pipeline/prompts/{module_1_action_record.txt => plan_action_record.txt} (100%)
 rename src/lerobot/annotations/steerable_pipeline/prompts/{module_1_memory.txt => plan_memory.txt} (100%)
 rename src/lerobot/annotations/steerable_pipeline/prompts/{module_1_subtask_describe.txt => plan_subtask_describe.txt} (100%)
 rename src/lerobot/annotations/steerable_pipeline/prompts/{module_1_subtasks.txt => plan_subtasks.txt} (100%)
 rename src/lerobot/annotations/steerable_pipeline/prompts/{module_1_task_aug_axes.txt => plan_task_aug_axes.txt} (100%)
 rename src/lerobot/annotations/steerable_pipeline/prompts/{module_1_task_rephrasings.txt => plan_task_rephrasings.txt} (100%)
 rename src/lerobot/annotations/steerable_pipeline/prompts/{module_1_video_task.txt => plan_video_task.txt} (100%)
 rename src/lerobot/annotations/steerable_pipeline/prompts/{module_3_vqa.txt => vqa.txt} (100%)

diff --git a/docs/source/annotation_pipeline.mdx b/docs/source/annotation_pipeline.mdx
index 7fd27b1f2..98ef79fb9 100644
--- a/docs/source/annotation_pipeline.mdx
+++ b/docs/source/annotation_pipeline.mdx
@@ -7,8 +7,7 @@
 
 ## What the pipeline produces
 
-A vocabulary-discovery phase derives a small canonical wording, then three
-modules write into a per-episode staging tree, then a single writer
+Three modules write into a per-episode staging tree, then a single writer
 rewrites the data shards in place:
 
 | Style / atom                                | Column                | Module          |
@@ -21,20 +20,15 @@ rewrites the data shards in place:
 | speech tool-call atom (`style=null`, `say`) | `language_events`     | `interjections` |
 | `vqa` (user / assistant pair)               | `language_events`     | `vqa`           |
 
-The `plan` module is constrained to a **canonical vocabulary** discovered
-once per dataset by the `vocabulary` module (phase 0). It watches a few
-sample episode videos (`--vocabulary.sample_episodes`, default `3`) and
-asks the VLM to derive a small set of imperative subtask labels and
-first-person memory milestones that recur across the demos. The VLM
-picks the right number of entries itself based on what it sees in the
-clips — short pick-and-place demos get ~6 subtask labels, longer
-multi-step recipes get more. The result lands at
-`meta/canonical_vocabulary.json` (human-readable / hand-editable) and
-is reused on every subsequent run. The `plan` module then constrains
-both subtask + memory generation to those exact strings — the
-downstream low-level policy sees a small, repeatable target
-distribution instead of thousands of LLM paraphrases. Disable with
-`--vocabulary.enabled=False` to fall back to free-form generation.
+The `plan` module generates subtasks per episode with a **describe → segment**
+grounding flow: a first pass narrates only what is visible in the chosen
+camera, and its description is fed into a second pass that segments the
+episode into consecutive atomic subtasks. The resulting spans are then
+deterministically stitched into a contiguous full-episode cover so every
+frame has exactly one active subtask. See
+[`run_hf_job.py`](https://github.com/huggingface/lerobot/blob/main/examples/annotations/run_hf_job.py)
+for the production flag set (single camera, embedded frames, windowed
+subtask generation).
 
 The writer does **not** add a `tools` column to the parquet — the tool
 catalog lives at `meta/info.json["tools"]` instead (see
@@ -44,9 +38,11 @@ user pre-declared.
 
 If you want to declare additional tools for a dataset before annotation
 runs, edit `meta/info.json["tools"]` directly — the pipeline preserves
-anything already there. Implementations of those tools live under
-`src/lerobot/tools/`; one file per tool, registered via
-`TOOL_REGISTRY`. See the [Tools](./tools) doc for the authoring guide.
+anything already there. That makes the tool visible to the chat template
+so the model can learn to _generate_ the call. The runtime layer that
+_executes_ a generated call (the `Tool` protocol / `TOOL_REGISTRY` under
+`src/lerobot/tools/`) is not part of this PR — see the
+[Tools](./tools) doc, which marks those pieces as not-yet-implemented.
 
 ## Running on Hugging Face Jobs
 
@@ -59,19 +55,33 @@ HF_TOKEN=hf_... uv run python examples/annotations/run_hf_job.py
 ```
 
 [`examples/annotations/run_hf_job.py`](https://github.com/huggingface/lerobot/blob/main/examples/annotations/run_hf_job.py)
-spawns a multi-GPU `h200` job that:
+spawns a single-GPU `h200` job (scale up to `h200x4` for larger datasets) that:
 
 1. installs the branch under test plus the annotation extras,
 2. boots one vLLM server per GPU (in the `vllm/vllm-openai` image) for the
    chosen model, which the pipeline drives over the OpenAI-compatible API,
 3. runs the `plan` / `interjections` / `vqa` modules across the dataset
    via `lerobot-annotate`,
-4. uploads the annotated dataset to `--push_to_hub`.
+4. with `--push_to_hub=true`, uploads the annotated dataset to
+   `--new_repo_id` (or back to `--repo_id`, in place, when `--new_repo_id` is unset).
 
 To target a different dataset, model, or hub repo, edit the `CMD` block
 inside the script — every flag in there maps directly onto a CLI flag of
 `lerobot-annotate` (see `lerobot-annotate --help` for the full list).
 
+## Contributing new modules
+
+The pipeline is built to be extended, and **contributions are very
+welcome** — whether that's a brand-new annotation module (e.g. a
+trajectory-trace or affordance module), a new prompt template, a better
+grounding flow, or quality improvements to the existing `plan` /
+`interjections` / `vqa` modules. Each module lives under
+`src/lerobot/annotations/steerable_pipeline/modules/`, shares the VLM
+client and keyframe cache, writes its raw output to the per-episode
+staging tree, and is wired into the executor as an independent phase.
+If you have an idea for a module or an improvement, open an issue or PR
+on [the repo](https://github.com/huggingface/lerobot).
+
 ## Style-to-recipe consumer mapping
 
 The pipeline's outputs are designed to be consumed by recipes (see
diff --git a/examples/annotations/run_hf_job.py b/examples/annotations/run_hf_job.py
index 6af40a268..c335379f4 100644
--- a/examples/annotations/run_hf_job.py
+++ b/examples/annotations/run_hf_job.py
@@ -1,21 +1,37 @@
 #!/usr/bin/env python
+
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 """Launch ``lerobot-annotate`` on a Hugging Face job (vllm + Qwen3.6-27B VLM).
 
-Spawns one ``h200x4`` job that:
+Spawns one single-GPU ``h200`` job that:
 
-  1. installs this branch of ``lerobot`` plus the annotation extras,
-  2. boots four vllm servers (one per GPU) with Qwen3.6-27B (dense VLM),
+  1. installs ``lerobot`` plus the annotation extras,
+  2. boots one vllm server with Qwen3.6-27B (dense VLM),
   3. runs the plan / interjections / vqa modules across the dataset
      in free-form mode (each episode generates its own subtasks +
      memory),
-  4. uploads the annotated dataset to ``--dest_repo_id`` (when set)
+  4. uploads the annotated dataset to ``--new_repo_id`` (when set)
      or back to ``--repo_id``.
 
 Usage:
 
     HF_TOKEN=hf_... uv run python examples/annotations/run_hf_job.py
 
-Adjust ``CMD`` below to point at your own dataset / target hub repo.
+Adjust ``CMD`` (dataset, model, hub repo) and ``flavor`` below for your
+run. For larger datasets, scale to ``h200x4`` and raise
+``--vlm.parallel_servers`` / ``--vlm.num_gpus`` to match.
 """
 
 import os
@@ -29,7 +45,7 @@ if not token:
 CMD = (
     "apt-get update -qq && apt-get install -y -qq git ffmpeg && "
     "pip install --no-deps "
-    "'lerobot @ git+https://github.com/huggingface/lerobot.git@feat/language-annotation-pipeline' && "
+    "'lerobot @ git+https://github.com/huggingface/lerobot.git@main' && "
     "pip install --upgrade-strategy only-if-needed "
     "datasets pyarrow av jsonlines draccus gymnasium torchcodec mergedeep pyyaml-include toml typing-inspect "
     "openai && "
@@ -37,12 +53,12 @@ CMD = (
     "export VLLM_VIDEO_BACKEND=pyav && "
     "lerobot-annotate "
     "--repo_id=pepijn223/robocasa_pretrain_human300_v4 "
-    "--dest_repo_id=pepijn223/robocasa_pretrain_human300_v4_annotated5 "
+    "--new_repo_id=pepijn223/robocasa_pretrain_human300_v4_annotated5 "
     "--push_to_hub=true "
     "--vlm.backend=openai "
     "--vlm.model_id=Qwen/Qwen3.6-27B "
-    "--vlm.parallel_servers=4 "
-    "--vlm.num_gpus=4 "
+    "--vlm.parallel_servers=1 "
+    "--vlm.num_gpus=1 "
     '--vlm.serve_command="vllm serve Qwen/Qwen3.6-27B '
     "--tensor-parallel-size 1 --max-model-len 32768 "
     '--gpu-memory-utilization 0.8 --uvicorn-log-level warning --port {port}" '
@@ -111,7 +127,7 @@ CMD = (
 job = run_job(
     image="vllm/vllm-openai:latest",
     command=["bash", "-c", CMD],
-    flavor="h200x4",
+    flavor="h200",
     secrets={"HF_TOKEN": token},
     timeout="2h",
 )
diff --git a/pyproject.toml b/pyproject.toml
index 86599aa31..dce61758c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -417,7 +417,7 @@ exclude_dirs = [
     "benchmarks",
     "src/lerobot/datasets/push_dataset_to_hub",
 ]
-skips = ["B101", "B311", "B404", "B603", "B607", "B615"]
+skips = ["B101", "B311", "B404", "B603", "B615"]
 
 [tool.typos]
 default.extend-ignore-re = [
diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py
index 470dccfc1..10484fd3a 100644
--- a/src/lerobot/annotations/steerable_pipeline/config.py
+++ b/src/lerobot/annotations/steerable_pipeline/config.py
@@ -370,13 +370,14 @@ class AnnotationPipelineConfig:
 
     # Hub dataset id. Used as the download source when ``root`` is unset,
     # and as the destination repo when ``push_to_hub`` is enabled and
-    # ``dest_repo_id`` is unset.
+    # ``new_repo_id`` is unset.
     repo_id: str | None = None
 
-    # Optional separate Hub dataset id to push the annotated result to. When
-    # unset, ``push_to_hub`` uploads back to ``repo_id`` (annotate in place);
-    # when set, the source ``repo_id`` is left untouched.
-    dest_repo_id: str | None = None
+    # Optional separate Hub dataset id to push the annotated result to (named
+    # ``new_repo_id`` to match the LeRobot dataset edit tools). When unset,
+    # ``push_to_hub`` uploads back to ``repo_id`` (annotate in place); when
+    # set, the source ``repo_id`` is left untouched.
+    new_repo_id: str | None = None
 
     root: Path | None = None
 
@@ -404,7 +405,7 @@ class AnnotationPipelineConfig:
     video_backend: str | None = None
 
     # When True, upload the annotated dataset to the Hugging Face Hub:
-    # to ``dest_repo_id`` if set, otherwise back to ``repo_id``. One of
+    # to ``new_repo_id`` if set, otherwise back to ``repo_id``. One of
     # the two must be set for this to take effect.
     push_to_hub: bool = False
     push_private: bool = False
diff --git a/src/lerobot/annotations/steerable_pipeline/executor.py b/src/lerobot/annotations/steerable_pipeline/executor.py
index 4b7eb687d..69d10bc89 100644
--- a/src/lerobot/annotations/steerable_pipeline/executor.py
+++ b/src/lerobot/annotations/steerable_pipeline/executor.py
@@ -15,14 +15,8 @@
 # limitations under the License.
 """In-process executor that runs the annotation phases.
 
-The executor plans **seven phases** in the dependency order from the plan:
+The executor runs **six phases** in dependency order:
 
-    phase 0: vocabulary discovery — derive a small canonical vocabulary
-             from the first few sample-episode videos (subtask labels +
-             memory milestones) and persist it next to the dataset; the
-             ``plan`` module then constrains every per-episode generation
-             to those strings, so the downstream policy sees a small,
-             repeatable conditioning distribution
     phase 1: ``plan`` module (plan + subtasks + memory)
     phase 2: ``interjections`` module (interjections + speech)
     phase 3: ``plan`` plan-update pass — re-runs plan emission at every
diff --git a/src/lerobot/annotations/steerable_pipeline/frames.py b/src/lerobot/annotations/steerable_pipeline/frames.py
index 804dae109..a26245964 100644
--- a/src/lerobot/annotations/steerable_pipeline/frames.py
+++ b/src/lerobot/annotations/steerable_pipeline/frames.py
@@ -146,6 +146,7 @@ class VideoFrameProvider:
     # ``ExecutorConfig.episode_parallelism``); guard the dict cache and the
     # one-shot warn flag against concurrent updates from worker threads.
     _lock: threading.Lock = field(default_factory=threading.Lock, init=False, repr=False)
+    _warned_decode_fail: bool = field(default=False, init=False, repr=False)
 
     def __post_init__(self) -> None:
         from lerobot.datasets.dataset_metadata import LeRobotDatasetMetadata  # noqa: PLC0415
@@ -285,7 +286,9 @@ class VideoFrameProvider:
             str(out_path),
         ]
         try:
-            subprocess.run(cmd, check=True, timeout=300)
+            # ffmpeg is invoked by name via PATH lookup (the standard way to
+            # call the CLI); the arg list is fully controlled here and no shell is involved.
+            subprocess.run(cmd, check=True, timeout=300)  # nosec B607
         except (subprocess.CalledProcessError, subprocess.TimeoutExpired, FileNotFoundError):
             return None
         return out_path if out_path.exists() and out_path.stat().st_size > 0 else None
@@ -335,7 +338,7 @@ class VideoFrameProvider:
         # []) is debuggable from the job log instead of post-hoc parquet
         # inspection. Subsequent failures stay quiet.
         with self._lock:
-            already_warned = getattr(self, "_warned_decode_fail", False)
+            already_warned = self._warned_decode_fail
             if not already_warned:
                 self._warned_decode_fail = True
         if not already_warned:
@@ -382,7 +385,8 @@ def _decode_frames_ffmpeg(video_path: Path, timestamps: list[float]) -> list[Any
 
     frames: list[Any] = []
     for ts in timestamps:
-        proc = subprocess.run(
+        # ffmpeg invoked by name via PATH lookup; fully-controlled arg list, no shell.
+        proc = subprocess.run(  # nosec B607
             [
                 "ffmpeg",
                 "-nostdin",
diff --git a/src/lerobot/annotations/steerable_pipeline/modules/general_vqa.py b/src/lerobot/annotations/steerable_pipeline/modules/general_vqa.py
index 579007912..cdc87b579 100644
--- a/src/lerobot/annotations/steerable_pipeline/modules/general_vqa.py
+++ b/src/lerobot/annotations/steerable_pipeline/modules/general_vqa.py
@@ -95,6 +95,7 @@ class GeneralVqaModule:
     config: VqaConfig
     seed: int = 1729
     frame_provider: FrameProvider = field(default_factory=null_provider)
+    _warned_no_camera: bool = field(default=False, init=False, repr=False)
 
     @property
     def enabled(self) -> bool:
@@ -113,7 +114,7 @@ class GeneralVqaModule:
             # No camera available — emit nothing rather than producing
             # untagged rows that would fail validation. Surface a loud one-
             # time warning so this is never silently a no-op.
-            if not getattr(self, "_warned_no_camera", False):
+            if not self._warned_no_camera:
                 logging.getLogger(__name__).warning(
                     "vqa module found no cameras on the frame provider — "
                     "every episode will emit zero VQA rows. Check that the "
@@ -191,8 +192,17 @@ class GeneralVqaModule:
             default = getattr(self.frame_provider, "camera_key", None)
             if default and default in all_cameras:
                 return [default]
+            # ``restrict_to_default_camera`` is set but the configured default
+            # isn't one the provider exposes. Returning it anyway would make
+            # ``_decode`` raise a KeyError deep in frame extraction, so warn and
+            # fall through to every available camera instead.
             if default:
-                return [default]
+                logging.getLogger(__name__).warning(
+                    "restrict_to_default_camera is set but camera_key=%r is not in the "
+                    "provider's cameras %s; grounding VQA on all available cameras instead.",
+                    default,
+                    all_cameras,
+                )
         return all_cameras
 
     def _build_messages(
@@ -202,7 +212,7 @@ class GeneralVqaModule:
         frame_timestamp: float,
         camera_key: str,
     ) -> list[dict[str, Any]]:
-        prompt = load_prompt("module_3_vqa").format(
+        prompt = load_prompt("vqa").format(
             episode_task=record.episode_task,
             question_type=question_type,
         )
diff --git a/src/lerobot/annotations/steerable_pipeline/modules/interjections_and_speech.py b/src/lerobot/annotations/steerable_pipeline/modules/interjections_and_speech.py
index f03e3df0d..616f9ce1b 100644
--- a/src/lerobot/annotations/steerable_pipeline/modules/interjections_and_speech.py
+++ b/src/lerobot/annotations/steerable_pipeline/modules/interjections_and_speech.py
@@ -85,7 +85,7 @@ class InterjectionsAndSpeechModule:
         return current
 
     def _initial_speech(self, record: EpisodeRecord) -> str | None:
-        prompt = load_prompt("module_2_initial_speech").format(
+        prompt = load_prompt("interjections_initial_speech").format(
             episode_task=record.episode_task,
         )
         messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}]
@@ -147,7 +147,7 @@ class InterjectionsAndSpeechModule:
             # previous subtask and the start of the next one — same
             # conditioning the policy will see at training time.
             window_ts = self._window_timestamps(t_snap, record.frame_timestamps)
-            prompt = load_prompt("module_2_interjection").format(
+            prompt = load_prompt("interjections_interjection").format(
                 episode_task=record.episode_task,
                 prev_subtask=prev_subtask or "(starting from initial state)",
                 next_subtask=next_subtask,
@@ -198,11 +198,12 @@ class InterjectionsAndSpeechModule:
         # Center the window on the anchor so half lands before, half after.
         start_offset = -window / 2.0
         targets = [t_anchor + start_offset + step * i for i in range(n)]
+        first_ts = float(frame_timestamps[0])
         last_ts = float(frame_timestamps[-1])
         snapped: list[float] = []
         seen: set[float] = set()
         for tgt in targets:
-            clamped = min(last_ts, max(0.0, tgt))
+            clamped = min(last_ts, max(first_ts, tgt))
             t = snap_to_frame(clamped, frame_timestamps)
             if t not in seen:
                 seen.add(t)
diff --git a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
index fecd42d3a..ac5c76453 100644
--- a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
+++ b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
@@ -285,14 +285,14 @@ class PlanSubtasksMemoryModule:
 
     def _derive_task_from_video(self, record: EpisodeRecord) -> str | None:
         """Ask the VLM "what is this video about" with no task hint at all."""
-        text = self._vlm_field(self._video_message(record, load_prompt("module_1_video_task")), "task")
+        text = self._vlm_field(self._video_message(record, load_prompt("plan_video_task")), "task")
         return text.strip() if isinstance(text, str) and text.strip() else None
 
     def _generate_task_rephrasings(self, base_task: str, *, n: int) -> list[str]:
         """Generate ``n`` text-only paraphrases of ``base_task``."""
         if n <= 0 or not base_task:
             return []
-        prompt = load_prompt("module_1_task_rephrasings").format(base_task=base_task, n=n)
+        prompt = load_prompt("plan_task_rephrasings").format(base_task=base_task, n=n)
         raw = self._vlm_field(self._text_message(prompt), "rephrasings")
         if not isinstance(raw, list):
             return []
@@ -343,7 +343,7 @@ class PlanSubtasksMemoryModule:
             )
             return None
 
-        prompt = load_prompt("module_1_action_record").format(
+        prompt = load_prompt("plan_action_record").format(
             episode_task=episode_task,
             subtask_text=span.get("text", ""),
             start_time=start_t,
@@ -416,7 +416,7 @@ class PlanSubtasksMemoryModule:
         """
         if not base_task:
             return []
-        prompt = load_prompt("module_1_task_aug_axes").format(
+        prompt = load_prompt("plan_task_aug_axes").format(
             base_task=base_task,
             n_synonym=axes_cfg.synonym_paraphrase,
             n_omit_arm=axes_cfg.omit_arm,
@@ -596,7 +596,7 @@ class PlanSubtasksMemoryModule:
                 )
 
         # ---- Pass 2: segmentation ------------------------------------
-        prompt = load_prompt("module_1_subtasks").format(
+        prompt = load_prompt("plan_subtasks").format(
             episode_task=effective_task,
             min_subtask_seconds=self.config.min_subtask_seconds,
             max_steps=self.config.plan_max_steps,
@@ -679,7 +679,7 @@ class PlanSubtasksMemoryModule:
                     "action that is not in your description above.\n\n"
                 )
 
-        prompt = load_prompt("module_1_subtasks").format(
+        prompt = load_prompt("plan_subtasks").format(
             episode_task=task,
             min_subtask_seconds=self.config.min_subtask_seconds,
             max_steps=self.config.plan_max_steps,
@@ -778,7 +778,7 @@ class PlanSubtasksMemoryModule:
         self, record: EpisodeRecord, task: str, window: tuple[float, float] | None = None
     ) -> str:
         """Grounding pass: free-form chronological description of the (windowed) video."""
-        prompt = load_prompt("module_1_subtask_describe").format(episode_task=task)
+        prompt = load_prompt("plan_subtask_describe").format(episode_task=task)
         text = self._vlm_field(self._video_message(record, prompt, window=window), "description")
         return text.strip() if isinstance(text, str) and text.strip() else ""
 
@@ -882,7 +882,7 @@ class PlanSubtasksMemoryModule:
         *,
         task: str | None = None,
     ) -> str:
-        prompt = load_prompt("module_1_memory").format(
+        prompt = load_prompt("plan_memory").format(
             episode_task=(task if task is not None else record.episode_task),
             prior_memory=prior_memory or "(none)",
             completed_subtask=completed,
diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_2_initial_speech.txt b/src/lerobot/annotations/steerable_pipeline/prompts/interjections_initial_speech.txt
similarity index 100%
rename from src/lerobot/annotations/steerable_pipeline/prompts/module_2_initial_speech.txt
rename to src/lerobot/annotations/steerable_pipeline/prompts/interjections_initial_speech.txt
diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_2_interjection.txt b/src/lerobot/annotations/steerable_pipeline/prompts/interjections_interjection.txt
similarity index 100%
rename from src/lerobot/annotations/steerable_pipeline/prompts/module_2_interjection.txt
rename to src/lerobot/annotations/steerable_pipeline/prompts/interjections_interjection.txt
diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_action_record.txt b/src/lerobot/annotations/steerable_pipeline/prompts/plan_action_record.txt
similarity index 100%
rename from src/lerobot/annotations/steerable_pipeline/prompts/module_1_action_record.txt
rename to src/lerobot/annotations/steerable_pipeline/prompts/plan_action_record.txt
diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_memory.txt b/src/lerobot/annotations/steerable_pipeline/prompts/plan_memory.txt
similarity index 100%
rename from src/lerobot/annotations/steerable_pipeline/prompts/module_1_memory.txt
rename to src/lerobot/annotations/steerable_pipeline/prompts/plan_memory.txt
diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtask_describe.txt b/src/lerobot/annotations/steerable_pipeline/prompts/plan_subtask_describe.txt
similarity index 100%
rename from src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtask_describe.txt
rename to src/lerobot/annotations/steerable_pipeline/prompts/plan_subtask_describe.txt
diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt b/src/lerobot/annotations/steerable_pipeline/prompts/plan_subtasks.txt
similarity index 100%
rename from src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt
rename to src/lerobot/annotations/steerable_pipeline/prompts/plan_subtasks.txt
diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_task_aug_axes.txt b/src/lerobot/annotations/steerable_pipeline/prompts/plan_task_aug_axes.txt
similarity index 100%
rename from src/lerobot/annotations/steerable_pipeline/prompts/module_1_task_aug_axes.txt
rename to src/lerobot/annotations/steerable_pipeline/prompts/plan_task_aug_axes.txt
diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_task_rephrasings.txt b/src/lerobot/annotations/steerable_pipeline/prompts/plan_task_rephrasings.txt
similarity index 100%
rename from src/lerobot/annotations/steerable_pipeline/prompts/module_1_task_rephrasings.txt
rename to src/lerobot/annotations/steerable_pipeline/prompts/plan_task_rephrasings.txt
diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_video_task.txt b/src/lerobot/annotations/steerable_pipeline/prompts/plan_video_task.txt
similarity index 100%
rename from src/lerobot/annotations/steerable_pipeline/prompts/module_1_video_task.txt
rename to src/lerobot/annotations/steerable_pipeline/prompts/plan_video_task.txt
diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_3_vqa.txt b/src/lerobot/annotations/steerable_pipeline/prompts/vqa.txt
similarity index 100%
rename from src/lerobot/annotations/steerable_pipeline/prompts/module_3_vqa.txt
rename to src/lerobot/annotations/steerable_pipeline/prompts/vqa.txt
diff --git a/src/lerobot/annotations/steerable_pipeline/validator.py b/src/lerobot/annotations/steerable_pipeline/validator.py
index 203e3f157..f08074c9a 100644
--- a/src/lerobot/annotations/steerable_pipeline/validator.py
+++ b/src/lerobot/annotations/steerable_pipeline/validator.py
@@ -138,7 +138,13 @@ class StagingValidator:
         for row in all_rows:
             self._check_column_routing(row, report, record.episode_index)
             self._check_camera_field(row, report, record.episode_index, self.dataset_camera_keys)
-            if column_for_style(row.get("style")) == LANGUAGE_PERSISTENT:
+            # ``_check_column_routing`` already recorded any unknown-style error;
+            # don't let the same ``column_for_style`` lookup raise here uncaught.
+            try:
+                column = column_for_style(row.get("style"))
+            except ValueError:
+                continue
+            if column == LANGUAGE_PERSISTENT:
                 persistent.append(row)
             else:
                 events.append(row)
diff --git a/src/lerobot/annotations/steerable_pipeline/vlm_client.py b/src/lerobot/annotations/steerable_pipeline/vlm_client.py
index d0d9e56a9..d0ee10ad8 100644
--- a/src/lerobot/annotations/steerable_pipeline/vlm_client.py
+++ b/src/lerobot/annotations/steerable_pipeline/vlm_client.py
@@ -598,13 +598,3 @@ def _pil_to_data_url(image: Any) -> str:
     image.save(buf, format="PNG")
     b64 = base64.b64encode(buf.getvalue()).decode("ascii")
     return f"data:image/png;base64,{b64}"
-
-
-def _messages_to_prompt(messages: Sequence[dict[str, Any]]) -> Any:
-    """Pass-through hook used by the vllm backend.
-
-    vllm exposes its own multimodal entry points that vary by version; for the
-    base flow we simply forward the raw message list and let the caller's
-    custom backend handle templating. Real deployments override this.
-    """
-    return list(messages)
diff --git a/src/lerobot/scripts/lerobot_annotate.py b/src/lerobot/scripts/lerobot_annotate.py
index 4c18b7937..dc5e9013a 100644
--- a/src/lerobot/scripts/lerobot_annotate.py
+++ b/src/lerobot/scripts/lerobot_annotate.py
@@ -113,9 +113,9 @@ def annotate(cfg: AnnotationPipelineConfig) -> None:
             logger.warning(w)
 
     if cfg.push_to_hub:
-        if cfg.repo_id is None and cfg.dest_repo_id is None:
+        if cfg.repo_id is None and cfg.new_repo_id is None:
             raise ValueError(
-                "--push_to_hub requires --repo_id or --dest_repo_id (the dataset repo to push to)."
+                "--push_to_hub requires --repo_id or --new_repo_id (the dataset repo to push to)."
             )
         _push_to_hub(root, cfg)
 
@@ -123,11 +123,11 @@ def annotate(cfg: AnnotationPipelineConfig) -> None:
 def _push_to_hub(root: Path, cfg: AnnotationPipelineConfig) -> None:
     """Upload the annotated dataset directory to the Hub.
 
-    Pushes to ``cfg.dest_repo_id`` when set, otherwise back to ``cfg.repo_id``.
+    Pushes to ``cfg.new_repo_id`` when set, otherwise back to ``cfg.repo_id``.
     """
     from huggingface_hub import HfApi  # noqa: PLC0415
 
-    repo_id = cfg.dest_repo_id or cfg.repo_id
+    repo_id = cfg.new_repo_id or cfg.repo_id
     commit_message = cfg.push_commit_message or "Add steerable annotations (lerobot-annotate)"
     api = HfApi()
     print(f"[lerobot-annotate] creating/locating dataset repo {repo_id}...", flush=True)
diff --git a/tests/annotations/run_e2e_smoke.py b/tests/annotations/run_e2e_smoke.py
index 7974a14bd..723f49a5e 100644
--- a/tests/annotations/run_e2e_smoke.py
+++ b/tests/annotations/run_e2e_smoke.py
@@ -60,13 +60,11 @@ def _stub_responder(messages):
                 {"text": "place the bottle down", "start": 2.0, "end": 3.0},
             ]
         }
-    if "concise hierarchical PLAN" in text:
-        return {"plan": "1. grasp\n2. pour\n3. place"}
-    if "Update the memory" in text:
+    if "compressed semantic memory" in text:
         return {"memory": "poured once"}
     if "acknowledgement the robot" in text:
         return {"text": "Sure."}
-    if "ONE realistic interruption" in text:
+    if "compact interjection" in text:
         return {"interjection": "use less water", "speech": "Using less water."}
     if "frame-grounded visual question" in text:
         return {"question": "How many cups?", "answer": {"label": "cup", "count": 1}}
@@ -94,6 +92,23 @@ def main() -> int:
         print(f"phases={[(p.name, p.episodes_processed) for p in summary.phases]}")
         print(f"validation: {summary.validation_report.summary()}")
         print(f"shards rewritten: {len(summary.written_paths)}")
+
+        # Assert the interjection code path actually fired — otherwise a stale
+        # canned-VLM marker would silently produce zero interjections and this
+        # smoke run would still "pass" by only printing.
+        import pyarrow.parquet as pq  # noqa: PLC0415
+
+        events = [
+            r
+            for shard in summary.written_paths
+            for ev in pq.read_table(shard).column("language_events").to_pylist()
+            for r in ev
+        ]
+        n_interjections = sum(1 for r in events if r.get("style") == "interjection")
+        n_speech = sum(1 for r in events if r.get("style") is None and r.get("role") == "assistant")
+        print(f"interjections={n_interjections} speech_atoms={n_speech}")
+        assert n_interjections > 0, "no interjection rows produced — check the interjection prompt marker"
+        assert n_speech > 0, "no speech tool-call atoms produced — check the speech prompt marker"
     return 0
 
 
diff --git a/tests/annotations/test_modules.py b/tests/annotations/test_modules.py
index 021cd207f..125c09aa0 100644
--- a/tests/annotations/test_modules.py
+++ b/tests/annotations/test_modules.py
@@ -151,7 +151,7 @@ def test_module2_mid_episode_emits_paired_interjection_and_speech(
         {
             "acknowledgement the robot": {"text": "OK."},
             # Marker matches the distinctive line of
-            # ``module_2_interjection.txt`` ("Write ONE compact
+            # ``interjections_interjection.txt`` ("Write ONE compact
             # interjection ..."). Keep this in sync with that prompt's
             # wording — the canned responder matches on substring.
             "Write ONE compact interjection": {
@@ -245,7 +245,6 @@ def test_module1_attaches_video_block_to_subtask_prompt(fixture_dataset_root: Pa
             {"text": "wipe the counter", "start": 0.5, "end": 1.1},
         ]
     }
-    plan_payload = {"plan": "1. grasp\n2. wipe"}
     memory_payload = {"memory": "wiped once"}
 
     def responder(messages):
@@ -255,9 +254,7 @@ def test_module1_attaches_video_block_to_subtask_prompt(fixture_dataset_root: Pa
             for block in m.get("content", []):
                 if isinstance(block, dict) and block.get("type") == "text":
                     text = block.get("text", "")
-        if "concise hierarchical PLAN" in text:
-            return plan_payload
-        if "Update the memory" in text:
+        if "compressed semantic memory" in text:
             return memory_payload
         return payload
 
diff --git a/tests/annotations/test_pipeline_recipe_render.py b/tests/annotations/test_pipeline_recipe_render.py
index 43a616934..614c2e45e 100644
--- a/tests/annotations/test_pipeline_recipe_render.py
+++ b/tests/annotations/test_pipeline_recipe_render.py
@@ -13,7 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""End-to-end smoke: pipeline output → PR 1 canonical recipe rendering."""
+"""End-to-end smoke: pipeline output → canonical recipe rendering."""
 
 from __future__ import annotations
 
@@ -49,14 +49,15 @@ from lerobot.datasets.language_render import render_sample  # noqa: E402
 from ._helpers import make_canned_responder  # noqa: E402
 
 
-def _build_pr1_style_blend_recipe() -> TrainingRecipe:
+def _build_style_blend_recipe() -> TrainingRecipe:
     """Inline blend recipe that consumes every style this pipeline produces.
 
-    PR 1 used to ship ``src/lerobot/configs/recipes/pi05_hirobot.yaml`` as
-    a canonical example, but that file was dropped during PR 1 review. The
-    cross-PR contract this test guards is "the recipe DSL can render
-    non-empty messages from pipeline output", which doesn't require a
-    specific YAML — so we build the equivalent blend in code.
+    The language schema/DSL work used to ship
+    ``src/lerobot/configs/recipes/pi05_hirobot.yaml`` as a canonical
+    example, but that file was dropped during review. The contract this
+    test guards is "the recipe DSL can render non-empty messages from
+    pipeline output", which doesn't require a specific YAML — so we build
+    the equivalent blend in code.
     """
     return TrainingRecipe(
         blend={
@@ -109,10 +110,9 @@ def _build_executor() -> Executor:
                     {"text": "place the bottle down", "start": 1.0, "end": 1.5},
                 ]
             },
-            "concise hierarchical PLAN": {"plan": "1. grasp\n2. pour\n3. place"},
-            "Update the memory": {"memory": "poured once"},
+            "compressed semantic memory": {"memory": "poured once"},
             "acknowledgement the robot": {"text": "Sure."},
-            "ONE realistic interruption": {
+            "compact interjection": {
                 "interjection": "use less water",
                 "speech": "Using less water.",
             },
@@ -137,7 +137,7 @@ def _build_executor() -> Executor:
     )
 
 
-def test_pr1_canonical_recipe_renders_nonempty_from_pipeline_output(
+def test_canonical_recipe_renders_nonempty_from_pipeline_output(
     single_episode_root: Path,
 ) -> None:
     executor = _build_executor()
@@ -150,7 +150,7 @@ def test_pr1_canonical_recipe_renders_nonempty_from_pipeline_output(
     events_lists = table.column("language_events").to_pylist()
     timestamps = table.column("timestamp").to_pylist()
 
-    recipe = _build_pr1_style_blend_recipe()
+    recipe = _build_style_blend_recipe()
 
     rendered_any = False
     for ts, persistent, events in zip(timestamps, persistent_lists, events_lists, strict=True):
@@ -168,7 +168,7 @@ def test_pr1_canonical_recipe_renders_nonempty_from_pipeline_output(
             rendered_any = True
             assert result["target_message_indices"]
             break
-    assert rendered_any, "PR 1 recipe rendered no messages from pipeline output"
+    assert rendered_any, "recipe rendered no messages from pipeline output"
 
     # Sanity: speech atom appears in events column intact
     flat_events = [r for ev in events_lists for r in ev]
@@ -177,7 +177,7 @@ def test_pr1_canonical_recipe_renders_nonempty_from_pipeline_output(
     say = speech_rows[0]["tool_calls"][0]
     assert say["function"]["name"] == "say"
     assert isinstance(say["function"]["arguments"]["text"], str)
-    # PR 2 no longer writes a ``tools`` column — the say schema lives as a
-    # constant (``SAY_TOOL_SCHEMA``) so PR 1's row struct is the single
-    # source of truth for the v3.1 schema.
+    # The pipeline does not write a ``tools`` column — the say schema lives
+    # as a constant (``SAY_TOOL_SCHEMA``) so the language row struct is the
+    # single source of truth for the v3.1 schema.
     assert "tools" not in table.column_names
diff --git a/tests/annotations/test_writer.py b/tests/annotations/test_writer.py
index 22dfbcb29..0ea550327 100644
--- a/tests/annotations/test_writer.py
+++ b/tests/annotations/test_writer.py
@@ -229,7 +229,7 @@ def test_writer_drops_subtask_index_idempotent(fixture_dataset_root: Path, tmp_p
     assert "language_events" in table_a.column_names
     # The writer no longer emits a dataset-level ``tools`` column; the
     # ``say`` tool schema lives as a code constant (``SAY_TOOL_SCHEMA``)
-    # so the parquet stays small and PR 2 doesn't extend PR 1's schema.
+    # so the parquet stays small and the pipeline doesn't extend the schema.
     assert "tools" not in table_a.column_names
 
     # second pass — must produce identical bytes for the language columns
diff --git a/tests/scripts/test_lerobot_annotate.py b/tests/scripts/test_lerobot_annotate.py
index 9f80d2e8c..a32ac0660 100644
--- a/tests/scripts/test_lerobot_annotate.py
+++ b/tests/scripts/test_lerobot_annotate.py
@@ -35,7 +35,7 @@ def test_push_to_hub_tags_uploaded_dataset_revision(tmp_path, monkeypatch):
 
     cfg = SimpleNamespace(
         repo_id="source/dataset",
-        dest_repo_id="annotated/dataset",
+        new_repo_id="annotated/dataset",
         push_private=True,
         push_commit_message=None,
     )

From c6f682b3f442732583527299432f06f458eb8158 Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Thu, 4 Jun 2026 11:45:38 +0200
Subject: [PATCH 35/45] annotate docs: install lerobot from main (post-merge
 wording)

The example already pins '@main'; update the doc step and the script
docstring from 'the branch under test' to 'lerobot (from main)' now that
the pipeline is merging to main.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 docs/source/annotation_pipeline.mdx | 2 +-
 examples/annotations/run_hf_job.py  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/annotation_pipeline.mdx b/docs/source/annotation_pipeline.mdx
index 98ef79fb9..a2d38e417 100644
--- a/docs/source/annotation_pipeline.mdx
+++ b/docs/source/annotation_pipeline.mdx
@@ -57,7 +57,7 @@ HF_TOKEN=hf_... uv run python examples/annotations/run_hf_job.py
 [`examples/annotations/run_hf_job.py`](https://github.com/huggingface/lerobot/blob/main/examples/annotations/run_hf_job.py)
 spawns a single-GPU `h200` job (scale up to `h200x4` for larger datasets) that:
 
-1. installs the branch under test plus the annotation extras,
+1. installs `lerobot` (from `main`) plus the annotation extras,
 2. boots one vLLM server per GPU (in the `vllm/vllm-openai` image) for the
    chosen model, which the pipeline drives over the OpenAI-compatible API,
 3. runs the `plan` / `interjections` / `vqa` modules across the dataset
diff --git a/examples/annotations/run_hf_job.py b/examples/annotations/run_hf_job.py
index c335379f4..61bcf8401 100644
--- a/examples/annotations/run_hf_job.py
+++ b/examples/annotations/run_hf_job.py
@@ -17,7 +17,7 @@
 
 Spawns one single-GPU ``h200`` job that:
 
-  1. installs ``lerobot`` plus the annotation extras,
+  1. installs ``lerobot`` from ``main`` plus the annotation extras,
   2. boots one vllm server with Qwen3.6-27B (dense VLM),
   3. runs the plan / interjections / vqa modules across the dataset
      in free-form mode (each episode generates its own subtasks +

From 7bec991cdf0ff19e00cf9c370eab126c43cfa651 Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Thu, 4 Jun 2026 11:48:59 +0200
Subject: [PATCH 36/45] docs(annotate): friendlier rewrite + architecture
 diagram; drop reproducibility section

Rewrite annotation_pipeline.mdx in plainer, easier-to-read language
(shorter sentences, active voice, a plain-text intro), add an ASCII
'How it fits together' architecture diagram, and remove the
'Reproducibility via seed and prompt hashes' section. Content/links are
preserved; only wording and structure change.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 docs/source/annotation_pipeline.mdx | 267 +++++++++++++++-------------
 1 file changed, 148 insertions(+), 119 deletions(-)

diff --git a/docs/source/annotation_pipeline.mdx b/docs/source/annotation_pipeline.mdx
index a2d38e417..3fae61627 100644
--- a/docs/source/annotation_pipeline.mdx
+++ b/docs/source/annotation_pipeline.mdx
@@ -1,177 +1,206 @@
 # Annotation Pipeline
 
-`lerobot-annotate` populates the two language columns introduced by the
+`lerobot-annotate` watches each episode's video with a vision-language
+model (VLM) and writes natural-language annotations back into your
+dataset. It fills the two language columns from the
 [Language Columns and Recipes](./language_and_recipes) page —
-`language_persistent` and `language_events` — directly into
+`language_persistent` and `language_events` — straight into
 `data/chunk-*/file-*.parquet`.
 
+In short: point it at a LeRobot dataset, and it adds subtasks, plans,
+memory, interjections, speech, and visual Q&A that a policy can be
+trained on.
+
+## How it fits together
+
+```text
+  your dataset                lerobot-annotate
+  (LeRobot v3.1)        ┌──────────────────────────────────┐
+        │              │   read episodes                    │
+        └─────────────▶│        │                           │
+                       │        ▼                           │
+   one shared          │   ┌──────┐ ┌─────────────┐ ┌─────┐ │  each module writes
+   Qwen-VL server ────▶│   │ plan │ │interjections│ │ vqa │ │  raw JSONL into
+   (vLLM, OpenAI API)  │   └──┬───┘ └──────┬──────┘ └──┬──┘ │  .annotate_staging/
+                       │      └────────────┼───────────┘    │
+                       │                   ▼                 │
+                       │               validator             │  checks everything
+                       │                   │                 │
+                       │                   ▼                 │
+                       │                writer ──────────────┼─▶ data/chunk-*/file-*.parquet
+                       └──────────────────────────────────┘     (+ meta/info.json tools)
+```
+
+Three modules (`plan`, `interjections`, `vqa`) all talk to **one** shared
+VLM. Each module stages its output to disk, a validator checks it, and a
+single writer rewrites the dataset shards in place.
+
 ## What the pipeline produces
 
-Three modules write into a per-episode staging tree, then a single writer
-rewrites the data shards in place:
+Each module emits a few kinds of annotation ("styles"), routed to one of
+the two language columns:
 
 | Style / atom                                | Column                | Module          |
 | ------------------------------------------- | --------------------- | --------------- |
 | `subtask` (Pi0.7-style "how, not what")     | `language_persistent` | `plan`          |
 | `plan` (initial + refresh on interjection)  | `language_persistent` | `plan`          |
 | `memory` (MEM-style compression)            | `language_persistent` | `plan`          |
-| `task_aug` (rephrasings of canonical task)  | `language_persistent` | `plan`          |
+| `task_aug` (rephrasings of the task)        | `language_persistent` | `plan`          |
 | `interjection`                              | `language_events`     | `interjections` |
 | speech tool-call atom (`style=null`, `say`) | `language_events`     | `interjections` |
 | `vqa` (user / assistant pair)               | `language_events`     | `vqa`           |
 
-The `plan` module generates subtasks per episode with a **describe → segment**
-grounding flow: a first pass narrates only what is visible in the chosen
-camera, and its description is fed into a second pass that segments the
-episode into consecutive atomic subtasks. The resulting spans are then
-deterministically stitched into a contiguous full-episode cover so every
-frame has exactly one active subtask. See
+### How subtasks are generated
+
+The `plan` module doesn't ask the VLM for subtasks in one shot. Instead
+it uses a two-step **describe → segment** flow:
+
+1. **Describe** — the VLM narrates only what it actually sees in the
+   chosen camera (no guessing about the task).
+2. **Segment** — that description is fed back in, and the VLM splits the
+   episode into consecutive atomic subtasks.
+
+The resulting spans are then stitched into a gap-free, full-episode
+cover, so **every frame has exactly one active subtask**. See
 [`run_hf_job.py`](https://github.com/huggingface/lerobot/blob/main/examples/annotations/run_hf_job.py)
-for the production flag set (single camera, embedded frames, windowed
+for the production settings (single camera, embedded frames, windowed
 subtask generation).
 
-The writer does **not** add a `tools` column to the parquet — the tool
-catalog lives at `meta/info.json["tools"]` instead (see
-[Tools](./tools)). After every annotation run the pipeline ensures the
-canonical `say` schema is present in that list, preserving any tools the
-user pre-declared.
+### Tools
 
-If you want to declare additional tools for a dataset before annotation
-runs, edit `meta/info.json["tools"]` directly — the pipeline preserves
-anything already there. That makes the tool visible to the chat template
-so the model can learn to _generate_ the call. The runtime layer that
-_executes_ a generated call (the `Tool` protocol / `TOOL_REGISTRY` under
-`src/lerobot/tools/`) is not part of this PR — see the
-[Tools](./tools) doc, which marks those pieces as not-yet-implemented.
+The writer does **not** add a `tools` column to the parquet. The tool
+catalog lives in `meta/info.json["tools"]` instead (see [Tools](./tools)).
+After every run, the pipeline makes sure the canonical `say` schema is in
+that list, keeping any tools you declared beforehand.
+
+Want to add your own tool? Edit `meta/info.json["tools"]` directly — the
+pipeline preserves whatever is already there. That makes the tool visible
+to the chat template, so the model can learn to _generate_ the call. The
+runtime layer that actually _executes_ a generated call (the `Tool`
+protocol / `TOOL_REGISTRY` under `src/lerobot/tools/`) is not part of
+the annotation pipeline — the [Tools](./tools) doc marks those pieces as
+not-yet-implemented.
 
 ## Running on Hugging Face Jobs
 
-Distributed annotation is delegated to
-[Hugging Face Jobs](https://huggingface.co/docs/hub/en/jobs). The repo
-ships a launcher script you copy and edit for your dataset:
+Annotation runs on [Hugging Face Jobs](https://huggingface.co/docs/hub/en/jobs).
+The repo ships a launcher script you copy and tweak for your dataset:
 
 ```bash
 HF_TOKEN=hf_... uv run python examples/annotations/run_hf_job.py
 ```
 
-[`examples/annotations/run_hf_job.py`](https://github.com/huggingface/lerobot/blob/main/examples/annotations/run_hf_job.py)
-spawns a single-GPU `h200` job (scale up to `h200x4` for larger datasets) that:
+[`run_hf_job.py`](https://github.com/huggingface/lerobot/blob/main/examples/annotations/run_hf_job.py)
+starts a single-GPU `h200` job (bump it to `h200x4` for big datasets)
+that:
 
 1. installs `lerobot` (from `main`) plus the annotation extras,
-2. boots one vLLM server per GPU (in the `vllm/vllm-openai` image) for the
-   chosen model, which the pipeline drives over the OpenAI-compatible API,
+2. boots one vLLM server per GPU (using the `vllm/vllm-openai` image) and
+   drives it over the OpenAI-compatible API,
 3. runs the `plan` / `interjections` / `vqa` modules across the dataset
-   via `lerobot-annotate`,
-4. with `--push_to_hub=true`, uploads the annotated dataset to
-   `--new_repo_id` (or back to `--repo_id` in place when that is unset).
+   with `lerobot-annotate`,
+4. with `--push_to_hub=true`, uploads the result to `--new_repo_id` (or
+   back to `--repo_id` in place if you leave `--new_repo_id` unset).
 
-To target a different dataset, model, or hub repo, edit the `CMD` block
-inside the script — every flag in there maps directly onto a CLI flag of
-`lerobot-annotate` (see `lerobot-annotate --help` for the full list).
+To use a different dataset, model, or hub repo, edit the `CMD` block in
+the script. Every flag there maps directly to a `lerobot-annotate` flag
+(run `lerobot-annotate --help` for the full list).
 
 ## Contributing new modules
 
-The pipeline is built to be extended, and **contributions are very
-welcome** — whether that's a brand-new annotation module (e.g. a
-trajectory-trace or affordance module), a new prompt template, a better
-grounding flow, or quality improvements to the existing `plan` /
-`interjections` / `vqa` modules. Each module lives under
+The pipeline is built to grow, and **contributions are very welcome** —
+a brand-new module (say, trajectory traces or affordances), a new prompt
+template, a smarter grounding flow, or quality fixes to the existing
+`plan` / `interjections` / `vqa` modules.
+
+Every module lives under
 `src/lerobot/annotations/steerable_pipeline/modules/`, shares the VLM
-client and keyframe cache, writes its raw output to the per-episode
-staging tree, and is wired into the executor as an independent phase.
-If you have an idea for a module or an improvement, open an issue or PR
-on [the repo](https://github.com/huggingface/lerobot).
+client and the keyframe cache, writes its raw output to the staging
+tree, and plugs into the executor as its own phase. Got an idea? Open an
+issue or PR on [the repo](https://github.com/huggingface/lerobot).
 
-## Style-to-recipe consumer mapping
+## How recipes consume the output
 
-The pipeline's outputs are designed to be consumed by recipes (see
-[Language Columns and Recipes](./language_and_recipes)) — typically:
+The annotations are meant to be read by recipes (see
+[Language Columns and Recipes](./language_and_recipes)). Typically:
 
-- low-level / high-level / memory-update branches consume
-  `subtask`/`plan`/`memory` from `language_persistent`.
-- An interjection-response branch consumes `interjection` events plus
-  the paired speech atom (merged into one assistant target turn via
-  `tool_calls_from`) and the same-timestamp `plan` refresh.
-- A VQA branch consumes the `(vqa, user)` and `(vqa, assistant)` pairs
-  from `language_events`.
+- low-level / high-level / memory-update branches read
+  `subtask` / `plan` / `memory` from `language_persistent`.
+- an interjection-response branch reads `interjection` events plus the
+  paired speech atom (merged into one assistant turn via `tool_calls_from`)
+  and the matching `plan` refresh at the same timestamp.
+- a VQA branch reads the `(vqa, user)` and `(vqa, assistant)` pairs from
+  `language_events`.
 
-## Why the design splits state from events
+## Why state and events are split
 
-Two things drive the scope:
+Two ideas shape the design:
 
-1. **Persistent state vs exact-event split.** Persistent rows
-   (`subtask`, `plan`, `memory`) broadcast per episode and answer "what
-   state is in force at this frame?". Event rows (`interjection`, `vqa`,
-   speech) only appear on the exact frame whose timestamp matches the
-   emission. The pipeline writes timestamps taken straight from the
-   source parquet — no floating-point recomputation.
-2. **One Qwen-VL pass.** All three modules share a single VLM client (the
-   OpenAI-compatible client talking to the job's vLLM server) so the cost
-   is one model load per dataset, not three.
+1. **Persistent state vs. exact events.** Persistent rows (`subtask`,
+   `plan`, `memory`) apply to the whole episode and answer "what's true
+   right now?". Event rows (`interjection`, `vqa`, speech) appear only on
+   the one frame whose timestamp matches. Timestamps are copied straight
+   from the source parquet — never recomputed in floating point.
+2. **One VLM pass.** All three modules share a single VLM client (the
+   OpenAI-compatible client talking to the job's vLLM server), so you pay
+   for one model load per dataset, not three.
 
-## Module independence and staged reruns
+## Re-running a single module
 
-Each module writes its raw output to
-`<root>/.annotate_staging/episode_{N:06d}/<module>.jsonl`. That makes
-prompt iteration cheap — re-running one module overwrites only its own
-JSONL file before the writer composes the final parquet. Modules can be
-disabled via `--plan.enabled=false` (and likewise `--interjections.enabled`
-/ `--vqa.enabled`) to
-test them in isolation.
+Each module stages its raw output to
+`<root>/.annotate_staging/episode_{N:06d}/<module>.jsonl`. This makes
+prompt iteration cheap: re-running one module overwrites only its own
+JSONL, then the writer recomposes the final parquet. Disable modules you
+don't want with `--plan.enabled=false` (and likewise
+`--interjections.enabled` / `--vqa.enabled`) to test one at a time.
 
-## Validation/report checks before final write
+## What the validator checks
 
-Before the writer runs, `StagingValidator` checks:
+Before the writer runs, `StagingValidator` confirms:
 
-- exact frame-timestamp alignment for every event row;
-- no orphan speech / interjection pairs;
+- every event row lands exactly on a real frame timestamp;
+- no speech / interjection pairs are left orphaned;
 - `plan` is refreshed at every interjection timestamp;
-- `memory` rows fall on subtask boundaries (warning, not error);
-- VQA assistant `content` parses as JSON in one of the
+- `memory` rows fall on subtask boundaries (a warning, not an error);
+- each VQA assistant `content` is valid JSON in one of the
   bbox / keypoint / count / attribute / spatial shapes;
-- every row routes to the column dictated by `column_for_style(style)`.
+- every row goes to the column chosen by `column_for_style(style)`.
 
-Errors abort the writer (`--skip_validation=true` overrides for debugging).
+Any error aborts the writer. Pass `--skip_validation=true` to override
+while debugging.
 
-## Paper inspirations per module
+## Where each module's ideas come from
 
-- **`plan` module — subtasks.** Hi Robot ([Shi 2025](https://arxiv.org/abs/2502.19417))
-  atom granularity ("pick up one piece of lettuce", "place bowl to box");
-  Pi0.7 ([Physical Intelligence 2025](https://pi.website/pi07)) "how, not
-  what" detail.
-- **`plan` module — memory.** MEM ([Torne 2026](https://arxiv.org/abs/2603.03596))
-  compression directive: keep only minimal relevant information; functional
-  outcomes preserved, specific attributes dropped.
-- **`interjections` module.** Hi Robot scenario taxonomy: negative task,
+- **`plan` — subtasks.** Hi Robot ([Shi 2025](https://arxiv.org/abs/2502.19417))
+  for atom granularity ("pick up one piece of lettuce", "place bowl to
+  box"); Pi0.7 ([Physical Intelligence 2025](https://pi.website/pi07))
+  for "how, not what" detail.
+- **`plan` — memory.** MEM ([Torne 2026](https://arxiv.org/abs/2603.03596)):
+  keep only the minimal relevant information — preserve outcomes, drop
+  specific attributes.
+- **`interjections`.** Hi Robot's scenario taxonomy: negative task,
   situated correction, specific constraint, preference. Speech is a
-  tool-call-only atom (`tool_calls=[{type:function, function:{name:"say",
-arguments:{text:...}}}]`).
-- **`vqa` module.** ECoT ([Zawalski 2024](https://arxiv.org/abs/2407.08693))
-  grounded features (bounding boxes in pixel `[x_min, y_min, x_max, y_max]`,
-  keypoints) and Steerable VLA Policies ([Zhao 2025](https://arxiv.org/abs/2509.07626))
-  multi-abstraction grounding. Pi0.7 also grounds answers across
-  multiple abstraction levels.
+  tool-call-only atom
+  (`tool_calls=[{type:function, function:{name:"say", arguments:{text:...}}}]`).
+- **`vqa`.** ECoT ([Zawalski 2024](https://arxiv.org/abs/2407.08693)) for
+  grounded features (pixel bounding boxes `[x_min, y_min, x_max, y_max]`,
+  keypoints) and Steerable VLA Policies
+  ([Zhao 2025](https://arxiv.org/abs/2509.07626)) for multi-abstraction
+  grounding. Pi0.7 also grounds answers across abstraction levels.
 
-Future maintainers should adjust the prompt templates in
-`src/lerobot/annotations/steerable_pipeline/prompts/` against these
-references rather than rewriting from scratch.
+When improving a module, tweak its prompt template in
+`src/lerobot/annotations/steerable_pipeline/prompts/` rather than
+rewriting from scratch.
 
-## Compute and list-size estimates
+## Roughly how much it costs
 
-Per episode, the pipeline issues O(`max_steps`) `plan`-module calls,
-O(`max_interjections_per_episode`) `interjections`-module calls, and
-O(`vqa_emission_hz × episode_seconds`) `vqa`-module calls. With defaults
-(8 subtasks, 1 interjection, 1 Hz × 3 pairs) and 30-second episodes, that
-is ~50 VLM calls per episode. `language_persistent` per episode is ~10s of
-KB at most (parquet dictionary-encodes one entry per episode);
-`language_events` is empty on most frames and is bounded by the number of
-emissions, not `num_frames × num_emissions`.
+Per episode, the pipeline makes about `max_steps` plan calls,
+`max_interjections_per_episode` interjection calls, and
+`vqa_emission_hz × episode_seconds` VQA calls. With the defaults (8
+subtasks, 1 interjection, 1 Hz × 3 pairs) on a 30-second episode, that's
+~50 VLM calls.
 
-## Reproducibility via seed and prompt hashes
-
-`--seed` (default 1729) feeds the per-episode RNGs that select interjection
-timestamps and VQA question types. Combined with the deterministic prompt
-templates checked into `prompts/`, two runs at the same seed against the
-same dataset and the same model checkpoint produce byte-identical staging
-artifacts. Prompt edits are recorded by file hash; future tooling can pin
-expected `(seed, prompt_hash)` pairs into the dataset card.
+Storage stays small: `language_persistent` is at most tens of KB per
+episode (parquet dictionary-encodes the one entry that repeats across
+frames), and `language_events` is empty on most frames — its size scales
+with the number of emissions, not `num_frames × num_emissions`.

From 2af2402a0c155da7e7fafaac4738616a6e3b2858 Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Thu, 4 Jun 2026 11:59:31 +0200
Subject: [PATCH 37/45] docs(annotate): cleaner architecture diagram layout
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Top-down flow (read episodes → 3 modules fan out → validator → writer →
parquet) with aligned boxes, instead of the cramped bordered version.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 docs/source/annotation_pipeline.mdx | 41 ++++++++++++++++++-----------
 1 file changed, 26 insertions(+), 15 deletions(-)

diff --git a/docs/source/annotation_pipeline.mdx b/docs/source/annotation_pipeline.mdx
index 3fae61627..c45617ee6 100644
--- a/docs/source/annotation_pipeline.mdx
+++ b/docs/source/annotation_pipeline.mdx
@@ -14,21 +14,32 @@ trained on.
 ## How it fits together
 
 ```text
-  your dataset                lerobot-annotate
-  (LeRobot v3.1)        ┌──────────────────────────────────┐
-        │              │   read episodes                    │
-        └─────────────▶│        │                           │
-                       │        ▼                           │
-   one shared          │   ┌──────┐ ┌─────────────┐ ┌─────┐ │  each module writes
-   Qwen-VL server ────▶│   │ plan │ │interjections│ │ vqa │ │  raw JSONL into
-   (vLLM, OpenAI API)  │   └──┬───┘ └──────┬──────┘ └──┬──┘ │  .annotate_staging/
-                       │      └────────────┼───────────┘    │
-                       │                   ▼                 │
-                       │               validator             │  checks everything
-                       │                   │                 │
-                       │                   ▼                 │
-                       │                writer ──────────────┼─▶ data/chunk-*/file-*.parquet
-                       └──────────────────────────────────┘     (+ meta/info.json tools)
+  your dataset                  lerobot-annotate
+  (LeRobot v3.1)
+        │
+        ▼
+  ┌─────────────────────────────────────────────────────┐
+  │                    read episodes                     │
+  └──────────────────────────┬──────────────────────────┘
+                             │
+        ┌────────────────────┼────────────────────┐
+        ▼                    ▼                     ▼
+  ┌──────────┐      ┌───────────────┐        ┌──────────┐       one shared Qwen-VL
+  │   plan   │      │ interjections │        │   vqa    │  ◀──   server (vLLM, OpenAI
+  └────┬─────┘      └───────┬───────┘        └────┬─────┘        API) drives all three
+       └────────────────────┼─────────────────────┘
+                            │   each module stages raw JSONL
+                            ▼   into .annotate_staging/
+                  ┌─────────────────┐
+                  │    validator    │  ◀──  checks everything
+                  └────────┬────────┘
+                           ▼
+                  ┌─────────────────┐
+                  │     writer      │
+                  └────────┬────────┘
+                           ▼
+              data/chunk-*/file-*.parquet
+              (+ meta/info.json tools)
 ```
 
 Three modules (`plan`, `interjections`, `vqa`) all talk to **one** shared

From 56cbb5f9ecea2bf3d4730ab84fd31b0b98aaf140 Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Thu, 4 Jun 2026 13:48:55 +0200
Subject: [PATCH 38/45] annotate(example): trim run_hf_job comments to one line
 each
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Same flags and rationale, condensed — each plan-module flag now has a
short one/two-line comment instead of a paragraph.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 examples/annotations/run_hf_job.py | 60 +++++++++---------------------
 1 file changed, 18 insertions(+), 42 deletions(-)

diff --git a/examples/annotations/run_hf_job.py b/examples/annotations/run_hf_job.py
index 61bcf8401..85ac8f17c 100644
--- a/examples/annotations/run_hf_job.py
+++ b/examples/annotations/run_hf_job.py
@@ -69,58 +69,34 @@ CMD = (
     "--executor.episode_parallelism=16 "
     "--vlm.chat_template_kwargs='{\"enable_thinking\": false}' "
     "--vlm.camera_key=observation.images.robot0_agentview_right "
-    # Phase 1 — plan module (subtasks + plan + memory).
-    # Embed decoded frames directly (use_video_url=false) rather than
-    # handing the server a file:// clip. The embedded path is more
-    # reliable: if clip extraction ever fails, the video_url path would
-    # silently send NO video and the VLM would hallucinate subtasks from
-    # the task text alone.
-    #
-    # CONTEXT BUDGET: with embedded frames, each frame is ~250-320 vision
-    # tokens. The model's context is 32768 (see --max-model-len). 32
-    # frames sampled uniformly across the episode (~8-10k tokens) fits
-    # comfortably alongside the prompt and the describe pass.
-    # Do NOT raise max_video_frames toward 128 with embedded frames — that
-    # is ~33-39k tokens and overflows the context (BadRequestError 400,
-    # "Input length exceeds maximum context length").
+    # Phase 1 — plan module (subtasks + memory).
+    # Embed decoded frames (not a file:// clip): if clip extraction fails,
+    # the video_url path silently sends no video and the VLM hallucinates.
     "--plan.use_video_url=false "
     "--plan.frames_per_second=1.0 "
+    # 32 frames ≈ 8-10k vision tokens, fits the 32768 context. Don't push
+    # toward 128 — that overflows the context (BadRequestError 400).
     "--plan.max_video_frames=32 "
-    # Constant 1 fps density via windowing: episodes longer than 32s are
-    # split into 32-second windows (each 32 frames @ 1 fps, fits context),
-    # so long episodes get MORE subtasks instead of a sparser whole-episode
-    # view. describe->segment runs per window; spans are merged +
-    # stitched to a contiguous whole-episode cover. 0 disables.
+    # Window long episodes into 32s chunks (constant 1 fps density) so they
+    # get more subtasks; per-window spans are merged + stitched. 0 disables.
     "--plan.subtask_window_seconds=32 "
-    # IMPORTANT for RoboCasa: the dataset's task string ("Navigate to the
-    # stove", "Pick the mug...") is authoritative and is what eval uses.
-    # ``derive_task_from_video=off`` keeps that canonical task driving
-    # subtask generation. Do NOT use ``always`` here — it throws the real
-    # task away, asks the VLM "what is this video about?" with no hint,
-    # and the hallucinated task then poisons every subtask + plan row.
+    # RoboCasa: the dataset task string is authoritative (eval uses it), so
+    # keep it driving subtasks. ``always`` would throw it away and hallucinate.
     "--plan.derive_task_from_video=off "
-    # NO task augmentation for RoboCasa: eval conditions on the exact task
-    # strings, so synthetic rephrasings are unused at best and (when they
-    # drift, e.g. "wander around the kitchen") harmful. 0 rephrasings +
-    # axes disabled = the policy only ever sees the canonical task.
+    # No task augmentation: eval conditions on the exact task strings, so
+    # rephrasings are unused at best and harmful when they drift.
     "--plan.n_task_rephrasings=0 "
-    # action_records OFF: the structured {verb,object,arm,grasp,dest}
-    # schema is a manipulation schema; RoboCasa navigation / atomic tasks
-    # don't fit it and the VLM hallucinates. When on, records are purely
-    # additive (emitted as style="action_record" rows) and never touch
-    # the subtask text — useful only for long composite manipulation
-    # tasks. Leave off for RoboCasa atomic / navigation.
-    # Keep subtask decomposition tight for atomic tasks:
+    # Keep subtask decomposition tight for atomic tasks.
+    # (action_records left off: the {verb,object,arm,grasp,dest} schema is for
+    # long manipulation tasks, not RoboCasa atomic/navigation.)
     "--plan.plan_max_steps=10 "
-    # Only annotate subtasks + memory — skip the numbered "plan" rows
-    # (and their per-boundary VLM call). Flip to true to re-enable plan.
+    # Only subtasks + memory — skip the numbered "plan" rows. true re-enables.
     "--plan.emit_plan=false "
-    # NOTE: the grounding pass (describe -> segment, +1 VLM call/episode)
-    # is ON BY DEFAULT. Pass --plan.subtask_describe_first=false to disable
-    # on datasets you've verified are easy and want fewer calls.
+    # The describe->segment grounding pass (+1 VLM call/episode) is ON by
+    # default; pass --plan.subtask_describe_first=false to skip it.
     # Phase 2 — interjections + speech.
     "--interjections.max_interjections_per_episode=6 "
-    # Phase 4 — general VQA: DISABLED for this run.
+    # Phase 4 — general VQA: disabled for this run.
     "--vqa.enabled=false"
 )
 

From dbe02f0c4fdd73522cafaa9204f780dc551ac422 Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Thu, 4 Jun 2026 13:52:24 +0200
Subject: [PATCH 39/45] annotate(plan): condense verbose comments + docstrings

Trim the long inline comment blocks (effective_task / task_aug, action
records, plan-boundary rows, plan-update span closing, windowed +
coverage-stitch sections) and the _generate_plan / run_plan_updates
docstrings to a few lines each. No behavior change.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../modules/plan_subtasks_memory.py           | 107 ++++++------------
 1 file changed, 34 insertions(+), 73 deletions(-)

diff --git a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
index ac5c76453..d054f9eb5 100644
--- a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
+++ b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
@@ -66,19 +66,12 @@ class PlanSubtasksMemoryModule:
 
     def run_episode(self, record: EpisodeRecord, staging: EpisodeStaging) -> None:
         rows: list[dict[str, Any]] = []
-        # Resolve the task that drives every other ``plan``-module prompt.
-        # May be the canonical ``record.episode_task`` (default), or a fresh
-        # description derived from the video when the canonical task is
-        # empty / placeholder / forced-off (see PlanConfig.derive_task_*).
+        # Task driving every plan-module prompt: canonical episode_task, or a
+        # video-derived one when it's empty/placeholder (see derive_task_*).
         effective_task = self._resolve_effective_task(record)
-        # ``task_aug`` rows at t=0 (role=user), one per rephrasing — the
-        # message renderer rotates ``${task}`` deterministically through
-        # them so the policy sees diverse phrasings during training.
-        # Two paths:
-        #   * ``task_aug_axes.enabled=True`` — structured 5-axis taxonomy
-        #     (synonym / omit_arm / omit_orientation / omit_grasp_method
-        #     / combined). Replaces the free-form rephrasings flow.
-        #   * Otherwise — free-form ``n_task_rephrasings`` (original).
+        # task_aug rows at t=0: phrasings the renderer rotates ${task} through.
+        # Either the structured 5-axis taxonomy (task_aug_axes.enabled) or
+        # free-form n_task_rephrasings.
         t0 = float(record.frame_timestamps[0]) if record.frame_timestamps else 0.0
         axes_cfg = self.config.task_aug_axes
         if axes_cfg.enabled and effective_task:
@@ -101,9 +94,8 @@ class PlanSubtasksMemoryModule:
                 )
         elif self.config.n_task_rephrasings > 0 and effective_task:
             rephrasings = self._generate_task_rephrasings(effective_task, n=self.config.n_task_rephrasings)
-            # Always include the effective task itself as the first variant
-            # so the rotation is guaranteed to cover the source-of-truth
-            # phrasing, not just synthetic alternatives.
+            # Include the effective task first so the rotation always covers
+            # the source-of-truth phrasing, not just synthetic ones.
             seen = set()
             ordered = [effective_task, *rephrasings]
             for phrasing in ordered:
@@ -123,16 +115,10 @@ class PlanSubtasksMemoryModule:
 
         subtask_spans = self._generate_subtasks(record, task=effective_task)
 
-        # ----------------------------------------------------------------
-        # Phase 1a: structured per-subtask action records (additive)
-        # ----------------------------------------------------------------
-        # When enabled, for every subtask span we ask the VLM for a typed
-        # ActionRecord (verb / object / arm / grasp_type / destination /
-        # mistake) and emit it as a separate ``style="action_record"``
-        # row for downstream use. This is purely additive — it never
-        # touches the VLM's subtask text (reconstructing subtask text
-        # from these fields was too easy to hallucinate on tasks that
-        # don't fit the manipulation schema).
+        # Phase 1a: optional per-subtask action records. When enabled, emit a
+        # typed ActionRecord (verb/object/arm/grasp_type/destination/mistake)
+        # per span as a separate style="action_record" row. Purely additive —
+        # never touches the subtask text.
         records_cfg = self.config.action_records
         action_records: list[dict[str, Any] | None] = [None] * len(subtask_spans)
         if records_cfg.enabled and subtask_spans:
@@ -162,14 +148,10 @@ class PlanSubtasksMemoryModule:
                         "tool_calls": None,
                     }
                 )
-        # Plan rows at every subtask boundary — including t=0 (start of
-        # the first subtask). Because the plan is just a numbered list
-        # of *still-todo* subtasks, re-emitting at each boundary makes
-        # the active plan shrink as work progresses: at frame t the
-        # rendered ``${plan}`` is the most recent emission, which
-        # contains exactly the subtasks that started at or after the
-        # current span. Saves the runtime from having to derive
-        # "what's still left" at inference time.
+        # Plan rows at every subtask boundary (incl. t=0). The plan is a
+        # numbered list of still-todo subtasks, so re-emitting at each
+        # boundary makes it shrink as work progresses — ${plan} at frame t is
+        # exactly what's left to do.
         if self.config.emit_plan:
             for span in subtask_spans:
                 boundary_t = snap_to_frame(span["start"], record.frame_timestamps)
@@ -252,9 +234,8 @@ class PlanSubtasksMemoryModule:
         return task.lower() in self._PLACEHOLDER_TASKS
 
     # ------------------------------------------------------------------
-    # VLM call helpers (factored out: every ``plan``-module prompt below follows
-    # the same "build messages → single VLM call → pull a named field"
-    # shape, only differing in field name + post-processing).
+    # VLM call helpers — every plan-module prompt follows the same shape:
+    # build messages → single VLM call → pull a named field.
     # ------------------------------------------------------------------
 
     def _vlm_field(self, messages: list[dict[str, Any]], field: str) -> Any:
@@ -510,20 +491,15 @@ class PlanSubtasksMemoryModule:
     ) -> None:
         """Append additional ``plan`` rows at every interjection timestamp.
 
-        Plans refresh ONLY on user interjections — subtask generation
-        runs ~1 Hz at inference, but plan re-emission is event-driven.
-        Now also forwards the interjection's own text into the prompt so
-        the refreshed plan can actually reflect the user's correction
-        (the previous version told the model "an interjection happened"
-        without telling it what the user said).
+        Plans refresh ONLY on user interjections (event-driven). The
+        interjection text is forwarded into the prompt so the refreshed plan
+        reflects the user's correction.
         """
         if not self.config.emit_plan:
             return
         existing = staging.read("plan")
-        # Pass the episode's last frame timestamp so the final subtask
-        # span is closed (otherwise its ``end`` equals its ``start``,
-        # zero duration, and the "current subtask at refresh_t" lookup
-        # in ``_generate_plan`` misses any refresh that lands inside it).
+        # Pass the last frame timestamp so the final span is closed (else its
+        # end == start, zero duration, and a refresh inside it is missed).
         episode_end_t = float(record.frame_timestamps[-1]) if record.frame_timestamps else None
         spans = reconstruct_subtask_spans(existing, episode_end_t=episode_end_t)
         already_planned: set[float] = {float(r["timestamp"]) for r in existing if r.get("style") == "plan"}
@@ -571,12 +547,9 @@ class PlanSubtasksMemoryModule:
         effective_task = task if task is not None else record.episode_task
 
         # ---- Windowed path (constant temporal density) ---------------
-        # When ``subtask_window_seconds > 0`` and the episode is longer
-        # than one window, process the episode in fixed-length windows so
-        # the VLM always sees ``frames_per_second`` density (instead of a
-        # sparse 32-frame whole-episode view). Each window runs the full
-        # describe -> segment chain on its own frames; results are merged +
-        # stitched into a contiguous whole-episode cover.
+        # If subtask_window_seconds > 0 and the episode exceeds one window,
+        # process fixed-length windows so the VLM always sees
+        # frames_per_second density; results are merged + stitched.
         window_s = float(getattr(self.config, "subtask_window_seconds", 0.0) or 0.0)
         if window_s > 0.0 and episode_duration > window_s:
             return self._generate_subtasks_windowed(record, effective_task, window_s)
@@ -609,12 +582,9 @@ class PlanSubtasksMemoryModule:
             return []
 
         # ---- Full-episode coverage stitch ----------------------------
-        # The VLM can leave the first subtask starting after t0 or leave
-        # gaps between spans, so the subtask timeline no longer tiles the
-        # whole episode and frames fall through with no active subtask.
-        # Always stitch the surviving spans into a contiguous cover of
-        # [t0, t_last] — there is no scenario where a sparse, gap-ridden
-        # subtask timeline is desirable for conditioning.
+        # The VLM's first span can start after t0 and spans can have gaps, so
+        # frames fall through with no active subtask. Always stitch into a
+        # contiguous [t0, t_last] cover.
         cleaned = self._stitch_full_coverage(cleaned, record)
 
         return cleaned
@@ -841,25 +811,16 @@ class PlanSubtasksMemoryModule:
     ) -> str | None:
         """Deterministic plan = numbered list of *still-todo* subtasks.
 
-        Previously this called the VLM with a prompt that asked it to
-        compress the subtasks into a "compact hierarchical plan". That
-        produced longer-than-necessary plans, cost an extra VLM round-trip
-        per episode (plus one per interjection on refresh), and could
-        diverge from the actual subtask sequence the model is going to
-        execute. Replacing it with a plain summarisation keeps the plan
-        tightly aligned with the upcoming subtasks and removes the VLM
-        call entirely.
-
-        Layout — short imperative fragments prefixed by "N. ":
+        No VLM call: a plain numbered list keeps the plan aligned with the
+        upcoming subtasks (the old VLM "compact hierarchical plan" prompt
+        cost a round-trip per episode/refresh and could diverge). Layout:
 
             1. <subtask 1>
             2. <subtask 2>
-            ...
 
-        On a refresh at ``refresh_t`` (called from ``run_plan_updates``
-        on interjection events, and from ``run_episode`` at every subtask
-        boundary), only subtasks whose start is at or after ``refresh_t``
-        are included — the plan shrinks as work progresses, so it always
+        On a refresh at ``refresh_t`` (from ``run_plan_updates`` on
+        interjections, and ``run_episode`` at each boundary), only subtasks
+        starting at or after ``refresh_t`` are included — so the plan always
         describes what's left.
         """
         if not subtask_spans:

From 20c7a12dd56aa75016c8c96e7cc101620d701633 Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Thu, 4 Jun 2026 14:05:46 +0200
Subject: [PATCH 40/45] annotate: remove dead code, document CLI options,
 compact config
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Dead code (defined but never referenced anywhere in src/tests/examples):
  * reader.py: keyframe_indices, episode_frame_timestamps, lookup_data_path,
    and the now-orphaned gather_data_paths + episode_offsets_per_path
    (lookup_data_path was their only caller).
  * staging.py: iter_staged_episodes.
  * writer.py: normalize_rows_for_writer.
  * config.py VlmConfig: json_mode, batch_size, tensor_parallel_size,
    gpu_memory_utilization, trust_remote_code — consumed only by the
    in-process vllm/transformers backends that were removed; the openai
    auto-serve path carries those vLLM flags via serve_command instead.
    Kept max_model_len (still used as the serve-command default).
  * config.py TaskAugAxesConfig.total property.

Docs: new 'Key options' section in annotation_pipeline.mdx — grouped
tables (dataset in/out, module toggles, --vlm.*, --plan.*, interjections
+ vqa) describing the flags users actually reach for, with defaults.

config.py: compact the verbose field comments + ActionRecordsConfig /
TaskAugAxesConfig docstrings; fix two stale 'verify' references (the
verify pass was removed — it's describe -> segment now) and the stale
'renders record back to subtask text' note (that path was removed).
vlm_client docstring no longer mentions the removed json_mode field.

Verified: tests/annotations + tests/datasets/test_language +
tests/scripts/test_lerobot_annotate (40 passed); pre-commit clean.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 docs/source/annotation_pipeline.mdx           |  59 +++++++
 .../annotations/steerable_pipeline/config.py  | 156 ++++++------------
 .../annotations/steerable_pipeline/reader.py  |  58 -------
 .../annotations/steerable_pipeline/staging.py |  14 +-
 .../steerable_pipeline/vlm_client.py          |   3 +-
 .../annotations/steerable_pipeline/writer.py  |  18 +-
 6 files changed, 111 insertions(+), 197 deletions(-)

diff --git a/docs/source/annotation_pipeline.mdx b/docs/source/annotation_pipeline.mdx
index c45617ee6..c9eefeb3e 100644
--- a/docs/source/annotation_pipeline.mdx
+++ b/docs/source/annotation_pipeline.mdx
@@ -117,6 +117,65 @@ To use a different dataset, model, or hub repo, edit the `CMD` block in
 the script. Every flag there maps directly to a `lerobot-annotate` flag
 (run `lerobot-annotate --help` for the full list).
 
+## Key options
+
+These are the flags you'll reach for most often. Run
+`lerobot-annotate --help` for everything else; the defaults are tuned for
+short manipulation episodes.
+
+### Dataset in / out
+
+| Flag              | Default | What it does                                                            |
+| ----------------- | ------- | ----------------------------------------------------------------------- |
+| `--repo_id`       | —       | Hub dataset to annotate (downloaded if `--root` unset).                 |
+| `--root`          | —       | Annotate a local dataset directory instead.                             |
+| `--new_repo_id`   | —       | Push the result to a new repo (leaves the source repo untouched).       |
+| `--push_to_hub`   | `false` | Upload after annotating (to `--new_repo_id`, else back to `--repo_id`). |
+| `--only_episodes` | all     | Annotate just these episode indices (handy for a test run).             |
+| `--seed`          | `1729`  | Seeds the RNGs that pick interjection timestamps + VQA question types.  |
+
+### Which modules run
+
+Each module can be turned off independently to iterate on one at a time:
+`--plan.enabled`, `--interjections.enabled`, `--vqa.enabled` (all
+`true` by default).
+
+### The VLM (`--vlm.*`)
+
+| Flag                       | Default            | What it does                                                                        |
+| -------------------------- | ------------------ | ----------------------------------------------------------------------------------- |
+| `--vlm.model_id`           | `Qwen/Qwen3.6-27B` | The model to serve and prompt.                                                      |
+| `--vlm.camera_key`         | first `images.*`   | Which camera every prompt is grounded on.                                           |
+| `--vlm.serve_command`      | auto               | The exact `vllm serve …` command (set TP size, GPU memory, `--max-model-len` here). |
+| `--vlm.parallel_servers`   | `1`                | Independent servers for round-robin routing (one per GPU).                          |
+| `--vlm.num_gpus`           | `0`                | GPUs per server (`0` = one each).                                                   |
+| `--vlm.client_concurrency` | `16`               | In-flight requests across all servers.                                              |
+| `--vlm.max_new_tokens`     | `512`              | Generation cap per call.                                                            |
+| `--vlm.temperature`        | `0.2`              | Sampling temperature.                                                               |
+
+### Subtasks / plan / memory (`--plan.*`)
+
+| Flag                            | Default    | What it does                                                                                                              |
+| ------------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------- |
+| `--plan.frames_per_second`      | `1.0`      | How densely the episode video is sampled.                                                                                 |
+| `--plan.max_video_frames`       | `32`       | Hard cap on frames per call (context-budget guard — don't exceed ~32 for a 32k context).                                  |
+| `--plan.subtask_window_seconds` | `0`        | Split long episodes into fixed windows for constant frame density (`0` = whole episode).                                  |
+| `--plan.plan_max_steps`         | `8`        | Upper bound on subtasks per episode.                                                                                      |
+| `--plan.subtask_describe_first` | `true`     | Run the describe→segment grounding pass (best subtask quality; +1 call/episode).                                          |
+| `--plan.emit_plan`              | `true`     | Emit the numbered `plan` rows (`false` = subtasks + memory only).                                                         |
+| `--plan.n_task_rephrasings`     | `10`       | How many `task_aug` rephrasings to emit (`0` disables).                                                                   |
+| `--plan.derive_task_from_video` | `if_short` | Use the dataset task as-is (`off`), only when it's missing/short (`if_short`), or always re-derive from video (`always`). |
+| `--plan.use_video_url`          | `false`    | Send a server-side video clip instead of embedded frames.                                                                 |
+
+### Interjections, VQA + executor
+
+| Flag                                            | Default | What it does                                               |
+| ----------------------------------------------- | ------- | ---------------------------------------------------------- |
+| `--interjections.max_interjections_per_episode` | `3`     | Cap on interjection/speech pairs per episode.              |
+| `--vqa.vqa_emission_hz`                         | `1.0`   | How often VQA pairs are emitted.                           |
+| `--vqa.restrict_to_default_camera`              | `false` | Ground VQA only on `--vlm.camera_key` (else every camera). |
+| `--executor.episode_parallelism`                | `16`    | Episodes processed concurrently within each phase.         |
+
 ## Contributing new modules
 
 The pipeline is built to grow, and **contributions are very welcome** —
diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py
index 10484fd3a..744de9a29 100644
--- a/src/lerobot/annotations/steerable_pipeline/config.py
+++ b/src/lerobot/annotations/steerable_pipeline/config.py
@@ -44,78 +44,56 @@ class PlanConfig:
     derive_task_from_video: str = "if_short"
     derive_task_min_words: int = 3
 
-    # Frame sampling for the subtask-decomposition prompt. Frames are
-    # sampled uniformly across the whole episode up to ``max_video_frames``
-    # (so longer episodes are subsampled, not truncated).
-    #
-    # ``max_video_frames`` is a HARD context-budget cap. With the embedded-
-    # frame path (use_video_url=false), every frame becomes ~250-320 vision
-    # tokens, so 128 frames ≈ 33-39k tokens — over a 32k-context VLM. 32
-    # frames (~8-10k tokens) leaves ample room for the prompt + the
-    # describe / verify passes. Raise only if your serving context is
-    # larger AND your episodes need finer temporal resolution; if you hit
-    # "Input length exceeds maximum context length", lower this.
+    # Frames are sampled uniformly across the episode, capped at
+    # ``max_video_frames`` (a HARD context-budget cap, not an annotation
+    # knob). Each embedded frame is ~250-320 vision tokens, so 32 frames
+    # (~8-10k tokens) fit a 32k-context VLM; 128 would overflow it. Lower
+    # this if you hit "Input length exceeds maximum context length".
     frames_per_second: float = 1.0
     max_video_frames: int = 32
 
     # Windowed subtask generation for CONSTANT temporal density. When > 0
-    # and an episode is longer than this many seconds, the plan module
-    # processes the episode in consecutive windows of this length, each
-    # sampled at ``frames_per_second``, instead of subsampling the whole
-    # episode to ``max_video_frames`` (which makes long episodes sparse).
-    # The describe -> segment -> verify chain runs per window; results are
-    # offset to absolute time, merged, and stitched into a contiguous
-    # whole-episode cover. Cost scales with episode length (≈ chain calls
-    # × ceil(duration / window)). Set to ~max_video_frames / frames_per_
-    # second (e.g. 32s at 1 fps) so each window fills — but never exceeds —
-    # the per-call frame budget. 0 disables (single whole-episode call).
+    # and the episode is longer than this, the plan module processes it in
+    # consecutive windows of this length (each sampled at
+    # ``frames_per_second``) instead of subsampling the whole episode to a
+    # sparse ``max_video_frames``. The describe -> segment chain runs per
+    # window; spans are merged + stitched. Set to ~max_video_frames /
+    # frames_per_second (e.g. 32s at 1 fps). 0 disables.
     subtask_window_seconds: float = 0.0
 
     min_subtask_seconds: float = 1.5
     plan_max_steps: int = 8
 
-    # ``subtask_describe_first``: run a grounding pass that narrates ONLY
-    # what is visible in the video (no subtask JSON yet), then inject that
-    # description into the segmentation prompt. Forces the model to observe
-    # before committing to structured output — the strongest lever against
-    # subtasks invented from the task text. ON by default; +1 VLM call/ep.
-    # Set False to trade quality for fewer calls on easy datasets.
+    # Run a grounding pass that narrates ONLY what's visible (no subtask
+    # JSON yet), then feed that into the segmentation prompt — the strongest
+    # lever against subtasks invented from the task text. ON by default;
+    # +1 VLM call/episode. False trades quality for fewer calls.
     subtask_describe_first: bool = True
 
-    # Emit ``style="plan"`` rows (the numbered still-todo list re-emitted at
-    # every subtask boundary). Set False to keep only subtasks + memory and
-    # skip the plan rows entirely — saves one ``_generate_plan`` VLM call per
-    # subtask boundary. Subtask and memory generation are unaffected.
+    # Emit ``style="plan"`` rows (the numbered still-todo list, re-emitted at
+    # every subtask boundary). False keeps only subtasks + memory and skips
+    # the per-boundary ``_generate_plan`` call.
     emit_plan: bool = True
 
-    # NOTE: subtask spans are ALWAYS stitched into a contiguous
-    # full-episode cover (first subtask pulled back to t0, gaps closed,
-    # last span extended to t_last) as a deterministic post-step in
-    # ``_generate_subtasks._stitch_full_coverage``. This is not
-    # configurable — a sparse / gap-ridden subtask timeline is never
-    # desirable for conditioning, so it is unconditional.
+    # NOTE: subtask spans are ALWAYS stitched into a contiguous full-episode
+    # cover (see ``_stitch_full_coverage``) — not configurable, since a
+    # sparse / gap-ridden timeline is never useful for conditioning.
 
-    # When True (and backend supports it, e.g. ``openai``), the ``plan``
-    # module sends a ``video_url`` block pointing at a per-episode mp4
-    # subclip and lets the server sample frames at ``use_video_url_fps``.
+    # When True (with a backend that supports it, e.g. ``openai``), send a
+    # ``video_url`` block pointing at a per-episode mp4 subclip and let the
+    # server sample frames at ``use_video_url_fps``.
     use_video_url: bool = False
     use_video_url_fps: float = 1.0
 
-    # Structured per-subtask action records (Phase 1a + 1b, inspired by
-    # EgoMimic's annotator form). For each generated subtask span, the
-    # VLM extracts a typed record (verb / object / arm / grasp_type /
-    # destination / mistake). A deterministic Python template renders
-    # that record back to canonical subtask text — reducing the VLM's
-    # "creative" surface to just the perception step. See
-    # ``ActionRecordsConfig`` for details. Off by default (back-compat).
+    # Optional structured per-subtask action records (EgoMimic-style). When
+    # enabled, the VLM extracts a typed record per subtask span; see
+    # ``ActionRecordsConfig``. Purely additive — off by default.
     action_records: ActionRecordsConfig = field(default_factory=lambda: ActionRecordsConfig())
 
-    # Structured 5-axis augmentation taxonomy for the t=0 task variants
-    # (replaces the free-form ``n_task_rephrasings`` flow when enabled).
-    # Mirrors EgoMimic's ``augment_prompt.txt`` taxonomy: instead of N
-    # free-form rephrasings, the VLM produces variants along named
-    # axes (synonym / omit_arm / omit_orientation / omit_grasp_method /
-    # combined). Off by default (back-compat).
+    # Optional 5-axis task-augmentation taxonomy for the t=0 variants
+    # (EgoMimic-style: synonym / omit_arm / omit_orientation /
+    # omit_grasp_method / combined). Replaces the free-form
+    # ``n_task_rephrasings`` flow when enabled; see ``TaskAugAxesConfig``.
     task_aug_axes: TaskAugAxesConfig = field(default_factory=lambda: TaskAugAxesConfig())
 
 
@@ -123,9 +101,8 @@ class PlanConfig:
 class ActionRecordsConfig:
     """Structured per-subtask action record extraction.
 
-    When ``enabled=True``, after the existing subtask-span generation in
-    ``plan_subtasks_memory.py``, the module makes one extra VLM call per
-    subtask to extract a typed record::
+    When ``enabled=True``, after subtask-span generation the module makes
+    one extra VLM call per subtask to extract a typed record::
 
         {
             "verb": "pick" | "place" | "press" | ...,  # closed vocabulary
@@ -136,20 +113,13 @@ class ActionRecordsConfig:
             "mistake": "<short text>" | null,
         }
 
-    The record is emitted as a separate row with ``style="action_record"``
-    (``content=json.dumps(record)``) at the subtask's start timestamp.
-    It is PURELY ADDITIVE — it never touches the VLM's subtask text.
-    Downstream training can consume the typed schema directly (e.g.
-    auxiliary supervision on verb / arm / grasp classification heads)
-    while the subtask string the policy conditions on stays exactly what
-    the subtask module produced. (Reconstructing subtask text from these
-    fields was too easy for the VLM to hallucinate on tasks that don't
-    fit the manipulation schema — navigation tasks yielded nonsense like
-    ``move stove to stove`` — so that path was removed.)
+    Emitted as a separate ``style="action_record"`` row at the subtask's
+    start timestamp. PURELY ADDITIVE — it never touches the subtask text,
+    so downstream training can use the typed schema (e.g. auxiliary
+    verb/arm/grasp heads) while the conditioning string stays unchanged.
 
-    Cost: one extra VLM call per subtask. For an 8-subtask episode this
-    means ~8x more VLM calls in the plan module — still cheap relative
-    to the action-expert training cost, but worth knowing.
+    Cost: one extra VLM call per subtask (8 extra plan-module calls on an
+    8-subtask episode).
     """
 
     enabled: bool = False
@@ -204,26 +174,14 @@ class TaskAugAxesConfig:
     """Structured 5-axis augmentation taxonomy for t=0 task variants.
 
     When ``enabled=True``, replaces the free-form ``n_task_rephrasings``
-    flow with a structured prompt that produces variants along five
-    named axes (mirroring EgoMimic's ``augment_prompt.txt``):
+    flow with variants along five named axes (EgoMimic-style):
+    ``synonym_paraphrase`` (reword, keep all info), ``omit_arm``,
+    ``omit_orientation``, ``omit_grasp_method``, and ``combined_omissions``
+    (drop two at once).
 
-      * ``synonym_paraphrase`` — different wording / verbs, all
-        information preserved.
-      * ``omit_arm`` — drop the left/right/both arm specification.
-      * ``omit_orientation`` — drop orientation cues (upright,
-        sideways, ...).
-      * ``omit_grasp_method`` — drop grip / grasp method specification.
-      * ``combined_omissions`` — combine two of the above
-        simultaneously.
-
-    Default counts (3+3+2+2+2 = 12 variants per task) match EgoMimic.
-    Axes that have nothing to omit in the source task (e.g. ``omit_arm``
-    when the task doesn't mention an arm) emit fewer entries rather
-    than pad — the prompt instructs the VLM accordingly.
-
-    Each variant is emitted as a ``task_aug`` row at ``t=0`` (same
-    style as the free-form variants), so the rest of the pipeline /
-    training recipe doesn't need to know about the taxonomy.
+    Default counts (3+3+2+2+2 = 12) match EgoMimic. Axes with nothing to
+    omit emit fewer entries rather than pad. Each variant becomes a
+    ``task_aug`` row at ``t=0``, identical in style to the free-form ones.
     """
 
     enabled: bool = False
@@ -234,17 +192,6 @@ class TaskAugAxesConfig:
     omit_grasp_method: int = 2
     combined_omissions: int = 2
 
-    @property
-    def total(self) -> int:
-        """Sum of requested variants across all axes (upper bound)."""
-        return (
-            self.synonym_paraphrase
-            + self.omit_arm
-            + self.omit_orientation
-            + self.omit_grasp_method
-            + self.combined_omissions
-        )
-
 
 @dataclass
 class InterjectionsConfig:
@@ -326,16 +273,11 @@ class VlmConfig:
 
     max_new_tokens: int = 512
     temperature: float = 0.2
-    json_mode: bool = True
-    batch_size: int = 4
-    tensor_parallel_size: int = 1
 
-    # Fraction of GPU memory vllm allocates for weights + KV cache.
-    gpu_memory_utilization: float = 0.9
-    # Cap context length (None = model default). On 80 GB H100 a 30B BF16
-    # model often needs <= 8192 to leave KV-cache headroom.
+    # Context length for the auto-spawned vLLM server (None → 32768). vLLM
+    # tuning flags (tensor-parallel size, GPU memory fraction, ...) go in
+    # ``serve_command`` directly, not here.
     max_model_len: int | None = None
-    trust_remote_code: bool = False
 
     # Override the camera stream used for keyframe attachment. None picks
     # the first ``observation.images.*`` key the dataset declares.
diff --git a/src/lerobot/annotations/steerable_pipeline/reader.py b/src/lerobot/annotations/steerable_pipeline/reader.py
index 6310a5b5e..22fe4ac26 100644
--- a/src/lerobot/annotations/steerable_pipeline/reader.py
+++ b/src/lerobot/annotations/steerable_pipeline/reader.py
@@ -214,61 +214,3 @@ def _iter_one_path(path: Path, tasks: dict[int, str], only_set: set[int] | None)
         rec = _build(cur_ep, start_offset, len(episode_col), cur_task_idx, ts_buf, fi_buf)
         if rec is not None:
             yield rec
-
-
-def gather_data_paths(root: Path) -> list[Path]:
-    """Return every ``data/chunk-*/file-*.parquet`` path under ``root``."""
-    return sorted((root / "data").rglob("*.parquet"))
-
-
-def episode_offsets_per_path(path: Path) -> dict[int, tuple[int, int]]:
-    """Return ``{episode_index: (row_offset, row_count)}`` for one parquet."""
-    table = pq.read_table(path, columns=["episode_index"])
-    episode_col = table.column("episode_index").to_pylist()
-    out: dict[int, tuple[int, int]] = {}
-    cur_ep: int | None = None
-    start = 0
-    for i, ep in enumerate(episode_col):
-        if cur_ep is None:
-            cur_ep = ep
-            start = i
-            continue
-        if ep != cur_ep:
-            out[cur_ep] = (start, i - start)
-            cur_ep = ep
-            start = i
-    if cur_ep is not None:
-        out[cur_ep] = (start, len(episode_col) - start)
-    return out
-
-
-def keyframe_indices(record: EpisodeRecord, k: int) -> list[int]:
-    """Return ``k`` evenly spaced row indices into the episode (relative)."""
-    n = record.row_count
-    if k <= 0 or n == 0:
-        return []
-    if k >= n:
-        return list(range(n))
-    step = (n - 1) / (k - 1) if k > 1 else 0.0
-    return [int(round(i * step)) for i in range(k)] if k > 1 else [n // 2]
-
-
-def lookup_data_path(root: Path, episode_index: int) -> tuple[Path, int, int] | None:
-    """Find the parquet file containing ``episode_index`` and its slice bounds."""
-    for path in gather_data_paths(root):
-        offsets = episode_offsets_per_path(path)
-        if episode_index in offsets:
-            start, count = offsets[episode_index]
-            return path, start, count
-    return None
-
-
-def episode_frame_timestamps(root: Path, episode_index: int) -> tuple[Any, list[float]]:
-    """Return the parquet path and per-frame timestamps for ``episode_index``."""
-    found = lookup_data_path(root, episode_index)
-    if found is None:
-        raise ValueError(f"Episode {episode_index} not found under {root}/data/")
-    path, start, count = found
-    table = pq.read_table(path, columns=["timestamp"])
-    timestamps = table.column("timestamp").to_pylist()[start : start + count]
-    return path, [float(t) for t in timestamps]
diff --git a/src/lerobot/annotations/steerable_pipeline/staging.py b/src/lerobot/annotations/steerable_pipeline/staging.py
index da8f82097..0b47c4dd6 100644
--- a/src/lerobot/annotations/steerable_pipeline/staging.py
+++ b/src/lerobot/annotations/steerable_pipeline/staging.py
@@ -28,7 +28,7 @@ intermediate.
 from __future__ import annotations
 
 import json
-from collections.abc import Iterable, Iterator
+from collections.abc import Iterable
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Any
@@ -90,15 +90,3 @@ class EpisodeStaging:
 
     def has(self, module: ModuleName) -> bool:
         return self.path_for(module).exists()
-
-
-def iter_staged_episodes(root: Path) -> Iterator[int]:
-    """Yield episode indices for which any staging artifact exists."""
-    if not root.exists():
-        return
-    for child in sorted(root.iterdir()):
-        if child.is_dir() and child.name.startswith("episode_"):
-            try:
-                yield int(child.name.removeprefix("episode_"))
-            except ValueError:
-                continue
diff --git a/src/lerobot/annotations/steerable_pipeline/vlm_client.py b/src/lerobot/annotations/steerable_pipeline/vlm_client.py
index d0ee10ad8..7f5e9da3c 100644
--- a/src/lerobot/annotations/steerable_pipeline/vlm_client.py
+++ b/src/lerobot/annotations/steerable_pipeline/vlm_client.py
@@ -23,8 +23,7 @@ into a real model.
 The client speaks one method, :meth:`VlmClient.generate_json`, which:
 
 - accepts a list of OpenAI/HF-style multimodal messages,
-- requests JSON output (``json_mode=True`` enables guided decoding when the
-  backend supports it),
+- requests JSON output from the server,
 - batches requests transparently,
 - and reprompts once on a JSON parse failure with an inline correction
   message before raising.
diff --git a/src/lerobot/annotations/steerable_pipeline/writer.py b/src/lerobot/annotations/steerable_pipeline/writer.py
index 52dd7f850..6710d08bd 100644
--- a/src/lerobot/annotations/steerable_pipeline/writer.py
+++ b/src/lerobot/annotations/steerable_pipeline/writer.py
@@ -46,7 +46,7 @@ from __future__ import annotations
 
 import logging
 from collections import defaultdict
-from collections.abc import Iterable, Sequence
+from collections.abc import Sequence
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Any
@@ -338,19 +338,3 @@ def speech_atom(timestamp: float, text: str) -> dict[str, Any]:
             }
         ],
     }
-
-
-def normalize_rows_for_writer(
-    rows: Iterable[dict[str, Any]],
-) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
-    """Helper used by tests/validators to partition a flat row list into
-    (persistent_rows, event_rows) using ``column_for_style``.
-    """
-    persistent: list[dict[str, Any]] = []
-    events: list[dict[str, Any]] = []
-    for row in rows:
-        if column_for_style(row.get("style")) == LANGUAGE_PERSISTENT:
-            persistent.append(row)
-        else:
-            events.append(row)
-    return persistent, events

From 7471a6b1ed9f1d58196e4bbda693c91ae0622f43 Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Thu, 4 Jun 2026 14:12:04 +0200
Subject: [PATCH 41/45] annotate: compress conftest + pyproject comments (fix
 stale backend note)

The pyproject annotations-extra comment still described the removed
vllm/transformers in-process backends ('vllm preferred ... transformers
fallback', '_make_vllm_client'); rewrite it for the openai-only reality
and trim it. Also condense the conftest lazy-import NOTE. Comments only.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 pyproject.toml                | 25 +++++++++----------------
 tests/annotations/conftest.py | 11 ++++-------
 2 files changed, 13 insertions(+), 23 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index dce61758c..1b65aa664 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -222,26 +222,19 @@ hilserl = ["lerobot[transformers-dep]", "lerobot[dataset]", "gym-hil>=0.1.13,<0.
 async = ["lerobot[grpcio-dep]", "lerobot[matplotlib-dep]"]
 peft = ["lerobot[transformers-dep]", "lerobot[peft-dep]"]
 
-# Annotation pipeline (lerobot-annotate). vllm is the preferred backend
-# on Linux, with a transformers fallback elsewhere; openai is the default
-# backend and talks to any OpenAI-compatible server (``vllm serve`` /
-# ``transformers serve`` / hosted endpoints). Distributed execution is
-# delegated to Hugging Face Jobs (see examples/annotations/run_hf_job.py).
+# Annotation pipeline (lerobot-annotate). The only backend is ``openai``,
+# which talks to any OpenAI-compatible server (``vllm serve`` /
+# ``transformers serve`` / hosted). Distributed runs use Hugging Face Jobs
+# (see examples/annotations/run_hf_job.py).
 annotations = [
     "lerobot[dataset]",
     "lerobot[transformers-dep]",
     "openai>=1.40,<2.0",
-    # NOTE: ``vllm`` is intentionally NOT a hard dependency here. vLLM
-    # hard-pins an older torch (via xformers/xgrammar), and because uv
-    # resolves a single unified lock across all extras, including it would
-    # cap ``torch`` for every other extra too (e.g. forcing torch 2.8 while
-    # ``torchcodec`` in the ``dataset`` extra needs torch 2.11 -> ABI break
-    # in CI). vLLM is also not needed by the shipped workflow: the HF Jobs
-    # launcher (``examples/annotations/run_hf_job.py``) gets it from the
-    # ``vllm/vllm-openai`` image and talks to it over the OpenAI-compatible
-    # API (``--vlm.backend=openai``), and ``vlm_client._make_vllm_client``
-    # imports vllm lazily with an actionable error. To use the in-process
-    # ``--vlm.backend=vllm`` locally, ``pip install vllm`` separately.
+    # ``vllm`` is intentionally NOT a hard dep: it pins an older torch, and
+    # uv's single unified lock would then cap ``torch`` for every extra
+    # (e.g. forcing 2.8 while ``torchcodec`` in [dataset] needs 2.11 -> ABI
+    # break in CI). The HF Jobs image (``vllm/vllm-openai``) provides vLLM;
+    # install it locally only if you run your own ``vllm serve``.
 ]
 
 # Development
diff --git a/tests/annotations/conftest.py b/tests/annotations/conftest.py
index 198e90319..69e0d595e 100644
--- a/tests/annotations/conftest.py
+++ b/tests/annotations/conftest.py
@@ -26,13 +26,10 @@ from pathlib import Path
 
 import pytest
 
-# NOTE: ``build_annotation_dataset`` pulls in ``lerobot.datasets`` (-> the HF
-# ``datasets`` library + ``pandas``), which only ship under the ``dataset``
-# extra. It is imported LAZILY inside the fixtures below so this conftest
-# imports cleanly in dependency tiers without that extra (e.g. the base
-# ``--extra test`` fast-test tier). The annotation test modules guard
-# themselves with a module-level ``pytest.importorskip("datasets")`` so
-# their collection is skipped — never erroring — when the extra is absent.
+# ``build_annotation_dataset`` pulls in ``lerobot.datasets`` (HF ``datasets``
+# + ``pandas``, only in the ``dataset`` extra), so it's imported lazily inside
+# each fixture — this conftest stays importable without that extra. The test
+# modules call ``pytest.importorskip("datasets")`` so they skip rather than error.
 
 
 @pytest.fixture

From 973318ef65dab6371034ab53fbdc3eb01732db79 Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Thu, 4 Jun 2026 14:18:36 +0200
Subject: [PATCH 42/45] annotate: dedup task_aug + row-normalization; docs
 module on/off table

Two behavior-preserving simplifications:
  * plan_subtasks_memory.run_episode: the task_aug 'axes' and free-form
    branches built identical deduped rows via copy-pasted seen/append
    loops. Collapse to one branch that picks the variant source, then a
    shared _task_aug_rows() helper does the dedup + row build (~25 fewer LOC).
  * writer: _normalize_persistent_row / _normalize_event_row shared the
    same camera-validate + struct construction. Extract _normalize_row(),
    keeping the exact key order (the parquet struct schema is inferred
    from insertion order, so timestamp must stay between style and camera).

docs: 'Which modules run' is now a table giving each module's on/off flag
(--plan.enabled / --interjections.enabled / --vqa.enabled) and what it
turns off.

Verified: 40 tests pass (incl. test_writer struct round-trip); pre-commit clean.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 docs/source/annotation_pipeline.mdx           | 11 +++-
 .../modules/plan_subtasks_memory.py           | 62 +++++++------------
 .../annotations/steerable_pipeline/writer.py  | 49 ++++++++-------
 3 files changed, 56 insertions(+), 66 deletions(-)

diff --git a/docs/source/annotation_pipeline.mdx b/docs/source/annotation_pipeline.mdx
index c9eefeb3e..cb2012249 100644
--- a/docs/source/annotation_pipeline.mdx
+++ b/docs/source/annotation_pipeline.mdx
@@ -136,9 +136,14 @@ short manipulation episodes.
 
 ### Which modules run
 
-Each module can be turned off independently to iterate on one at a time:
-`--plan.enabled`, `--interjections.enabled`, `--vqa.enabled` (all
-`true` by default).
+Every module is on by default and can be toggled independently (set to
+`false` to skip it, e.g. to iterate on one module at a time):
+
+| Flag                      | Default | Turns off                           |
+| ------------------------- | ------- | ----------------------------------- |
+| `--plan.enabled`          | `true`  | subtasks + plan + memory + task_aug |
+| `--interjections.enabled` | `true`  | interjections + speech atoms        |
+| `--vqa.enabled`           | `true`  | the VQA pairs                       |
 
 ### The VLM (`--vlm.*`)
 
diff --git a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
index d054f9eb5..c76a6acad 100644
--- a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
+++ b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
@@ -71,47 +71,16 @@ class PlanSubtasksMemoryModule:
         effective_task = self._resolve_effective_task(record)
         # task_aug rows at t=0: phrasings the renderer rotates ${task} through.
         # Either the structured 5-axis taxonomy (task_aug_axes.enabled) or
-        # free-form n_task_rephrasings.
+        # free-form n_task_rephrasings; the effective task is always emitted
+        # first so the rotation covers the source-of-truth phrasing.
         t0 = float(record.frame_timestamps[0]) if record.frame_timestamps else 0.0
-        axes_cfg = self.config.task_aug_axes
-        if axes_cfg.enabled and effective_task:
-            variants = self._generate_task_aug_by_axes(effective_task, axes_cfg)
-            seen: set[str] = set()
-            ordered = [effective_task, *variants]
-            for phrasing in ordered:
-                key = phrasing.strip()
-                if not key or key in seen:
-                    continue
-                seen.add(key)
-                rows.append(
-                    {
-                        "role": "user",
-                        "content": key,
-                        "style": "task_aug",
-                        "timestamp": t0,
-                        "tool_calls": None,
-                    }
-                )
+        variants: list[str] | None = None
+        if self.config.task_aug_axes.enabled and effective_task:
+            variants = self._generate_task_aug_by_axes(effective_task, self.config.task_aug_axes)
         elif self.config.n_task_rephrasings > 0 and effective_task:
-            rephrasings = self._generate_task_rephrasings(effective_task, n=self.config.n_task_rephrasings)
-            # Include the effective task first so the rotation always covers
-            # the source-of-truth phrasing, not just synthetic ones.
-            seen = set()
-            ordered = [effective_task, *rephrasings]
-            for phrasing in ordered:
-                key = phrasing.strip()
-                if not key or key in seen:
-                    continue
-                seen.add(key)
-                rows.append(
-                    {
-                        "role": "user",
-                        "content": key,
-                        "style": "task_aug",
-                        "timestamp": t0,
-                        "tool_calls": None,
-                    }
-                )
+            variants = self._generate_task_rephrasings(effective_task, n=self.config.n_task_rephrasings)
+        if variants is not None:
+            rows.extend(self._task_aug_rows([effective_task, *variants], t0))
 
         subtask_spans = self._generate_subtasks(record, task=effective_task)
 
@@ -233,6 +202,21 @@ class PlanSubtasksMemoryModule:
             return True
         return task.lower() in self._PLACEHOLDER_TASKS
 
+    @staticmethod
+    def _task_aug_rows(phrasings: Sequence[str], t0: float) -> list[dict[str, Any]]:
+        """Build deduplicated ``task_aug`` rows (role=user) at ``t0``."""
+        seen: set[str] = set()
+        rows: list[dict[str, Any]] = []
+        for phrasing in phrasings:
+            key = phrasing.strip()
+            if not key or key in seen:
+                continue
+            seen.add(key)
+            rows.append(
+                {"role": "user", "content": key, "style": "task_aug", "timestamp": t0, "tool_calls": None}
+            )
+        return rows
+
     # ------------------------------------------------------------------
     # VLM call helpers — every plan-module prompt follows the same shape:
     # build messages → single VLM call → pull a named field.
diff --git a/src/lerobot/annotations/steerable_pipeline/writer.py b/src/lerobot/annotations/steerable_pipeline/writer.py
index 6710d08bd..e1a544c80 100644
--- a/src/lerobot/annotations/steerable_pipeline/writer.py
+++ b/src/lerobot/annotations/steerable_pipeline/writer.py
@@ -89,6 +89,27 @@ def _row_event_sort_key(row: dict[str, Any]) -> tuple:
     )
 
 
+def _normalize_row(row: dict[str, Any], style: str | None, *, with_timestamp: bool) -> dict[str, Any]:
+    """Coerce a staged row into the language-column struct shape.
+
+    Key order matches ``PERSISTENT_ROW_FIELDS`` / ``EVENT_ROW_FIELDS`` — the
+    writer infers the parquet struct schema from insertion order, so
+    ``timestamp`` (persistent rows only) sits between ``style`` and ``camera``.
+    """
+    camera = row.get("camera")
+    validate_camera_field(style, camera)
+    out: dict[str, Any] = {
+        "role": str(row["role"]),
+        "content": None if row.get("content") is None else str(row["content"]),
+        "style": style,
+    }
+    if with_timestamp:
+        out["timestamp"] = float(row["timestamp"])
+    out["camera"] = None if camera is None else str(camera)
+    out["tool_calls"] = _normalize_tool_calls(row.get("tool_calls"))
+    return out
+
+
 def _normalize_persistent_row(row: dict[str, Any]) -> dict[str, Any]:
     """Coerce a staged row into the persistent column's struct shape."""
     style = row.get("style")
@@ -100,22 +121,10 @@ def _normalize_persistent_row(row: dict[str, Any]) -> dict[str, Any]:
     if "timestamp" not in row:
         raise ValueError(f"persistent row missing timestamp: {row!r}")
     if "role" not in row:
-        # Surface a friendly error from the writer rather than letting
-        # the raw KeyError bubble out of the dict access below — modules
-        # are expected to always emit ``role``, but the validator
-        # currently doesn't check this so a future bug would otherwise
-        # be hard to triage.
+        # Friendly error from the writer instead of a raw KeyError below;
+        # the validator doesn't check ``role`` yet.
         raise ValueError(f"persistent row missing role: {row!r}")
-    camera = row.get("camera")
-    validate_camera_field(style, camera)
-    return {
-        "role": str(row["role"]),
-        "content": None if row.get("content") is None else str(row["content"]),
-        "style": style,
-        "timestamp": float(row["timestamp"]),
-        "camera": None if camera is None else str(camera),
-        "tool_calls": _normalize_tool_calls(row.get("tool_calls")),
-    }
+    return _normalize_row(row, style, with_timestamp=True)
 
 
 def _normalize_event_row(row: dict[str, Any]) -> dict[str, Any]:
@@ -129,15 +138,7 @@ def _normalize_event_row(row: dict[str, Any]) -> dict[str, Any]:
         raise ValueError(f"event row with style {style!r} would not route to language_events")
     if "role" not in row:
         raise ValueError(f"event row missing role: {row!r}")
-    camera = row.get("camera")
-    validate_camera_field(style, camera)
-    return {
-        "role": str(row["role"]),
-        "content": None if row.get("content") is None else str(row["content"]),
-        "style": style,
-        "camera": None if camera is None else str(camera),
-        "tool_calls": _normalize_tool_calls(row.get("tool_calls")),
-    }
+    return _normalize_row(row, style, with_timestamp=False)
 
 
 def _normalize_tool_calls(value: Any) -> list[Any] | None:

From 99baae012f484a67010f49a4a8e92038ae2d5e5f Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Thu, 4 Jun 2026 14:36:02 +0200
Subject: [PATCH 43/45] annotate(config): further compact field comments
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

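Tighten the remaining multi-line comment and docstring blocks in config.py
(derive_task, frames/window, describe_first, action-record/vqa/vlm fields,
video_backend, repo ids, executor) to 1-3 lines each. Also drop the stale
'examples/annotation/run_hf_job.py' reference from the ExecutorConfig
docstring (the real directory is 'examples/annotations'); the docstring now
just says distributed execution is delegated to Hugging Face Jobs. Comments
and docstrings only — no field or behavior change.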

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../annotations/steerable_pipeline/config.py  | 143 +++++++-----------
 1 file changed, 52 insertions(+), 91 deletions(-)

diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py
index 744de9a29..1b16a927b 100644
--- a/src/lerobot/annotations/steerable_pipeline/config.py
+++ b/src/lerobot/annotations/steerable_pipeline/config.py
@@ -36,38 +36,30 @@ class PlanConfig:
     # ``${task}`` binding rotates among them per ``sample_idx``. ``0`` disables.
     n_task_rephrasings: int = 10
 
-    # When to derive the task from the video instead of using
-    # ``record.episode_task``: ``off``, ``if_short`` (short / placeholder /
-    # missing canonical task), or ``always``. The derived task replaces the
-    # canonical one for every ``plan``-module prompt; ``meta/tasks.parquet``
-    # is never modified.
+    # Derive the task from video instead of ``record.episode_task``: ``off``,
+    # ``if_short`` (canonical task short/placeholder/missing), or ``always``.
+    # Affects prompts only; ``meta/tasks.parquet`` is untouched.
     derive_task_from_video: str = "if_short"
     derive_task_min_words: int = 3
 
-    # Frames are sampled uniformly across the episode, capped at
-    # ``max_video_frames`` (a HARD context-budget cap, not an annotation
-    # knob). Each embedded frame is ~250-320 vision tokens, so 32 frames
-    # (~8-10k tokens) fit a 32k-context VLM; 128 would overflow it. Lower
-    # this if you hit "Input length exceeds maximum context length".
+    # Frames sampled uniformly, capped at ``max_video_frames`` — a HARD context
+    # cap (~250-320 tokens/frame, so 32 fit a 32k VLM; 128 overflow). Lower it
+    # if you hit "Input length exceeds maximum context length".
     frames_per_second: float = 1.0
     max_video_frames: int = 32
 
-    # Windowed subtask generation for CONSTANT temporal density. When > 0
-    # and the episode is longer than this, the plan module processes it in
-    # consecutive windows of this length (each sampled at
-    # ``frames_per_second``) instead of subsampling the whole episode to a
-    # sparse ``max_video_frames``. The describe -> segment chain runs per
-    # window; spans are merged + stitched. Set to ~max_video_frames /
-    # frames_per_second (e.g. 32s at 1 fps). 0 disables.
+    # Windowed subtask generation for constant temporal density: when > 0 and
+    # the episode is longer, process it in windows of this length (each at
+    # ``frames_per_second``) instead of subsampling the whole episode; spans are
+    # merged + stitched. ~max_video_frames / frames_per_second. 0 disables.
     subtask_window_seconds: float = 0.0
 
     min_subtask_seconds: float = 1.5
     plan_max_steps: int = 8
 
-    # Run a grounding pass that narrates ONLY what's visible (no subtask
-    # JSON yet), then feed that into the segmentation prompt — the strongest
-    # lever against subtasks invented from the task text. ON by default;
-    # +1 VLM call/episode. False trades quality for fewer calls.
+    # Grounding pass that narrates ONLY what's visible before segmenting — the
+    # strongest lever against subtasks invented from the task text. ON by
+    # default (+1 VLM call/episode); False trades quality for fewer calls.
     subtask_describe_first: bool = True
 
     # Emit ``style="plan"`` rows (the numbered still-todo list, re-emitted at
@@ -76,12 +68,10 @@ class PlanConfig:
     emit_plan: bool = True
 
     # NOTE: subtask spans are ALWAYS stitched into a contiguous full-episode
-    # cover (see ``_stitch_full_coverage``) — not configurable, since a
-    # sparse / gap-ridden timeline is never useful for conditioning.
+    # cover (see ``_stitch_full_coverage``) — not configurable.
 
-    # When True (with a backend that supports it, e.g. ``openai``), send a
-    # ``video_url`` block pointing at a per-episode mp4 subclip and let the
-    # server sample frames at ``use_video_url_fps``.
+    # When True, send a server-side ``video_url`` clip (sampled at
+    # ``use_video_url_fps``) instead of embedded frames.
     use_video_url: bool = False
     use_video_url_fps: float = 1.0
 
@@ -124,19 +114,15 @@ class ActionRecordsConfig:
 
     enabled: bool = False
 
-    # When True (default), emit a separate row with ``style="action_record"``
-    # and ``content=json.dumps(record)`` at the subtask's start timestamp.
-    # This is the only output of the feature — set ``enabled=False`` to
-    # skip the extra VLM calls entirely.
+    # Emit the ``style="action_record"`` row (JSON content) at the subtask
+    # start — the only output of the feature. ``enabled=False`` skips it.
     emit_record_row: bool = True
 
-    # Frame sampling for the per-subtask VLM call (similar to the
-    # interjection module's window). Anchored to the subtask span.
+    # Frames sampled from the subtask span for the per-subtask VLM call.
     frames_per_subtask: int = 4
 
-    # Closed verb vocabulary. The prompt instructs the VLM to pick
-    # exactly one. Override per-dataset (e.g. ``["pick", "place", "open",
-    # "close"]`` for door-only manipulation) for tighter constraint.
+    # Closed verb vocabulary; the prompt picks exactly one. Override
+    # per-dataset (e.g. door-only manipulation) for a tighter constraint.
     verb_vocabulary: tuple[str, ...] = (
         "pick",
         "place",
@@ -157,9 +143,8 @@ class ActionRecordsConfig:
         "dump",
     )
 
-    # Closed grasp-type vocabulary. ``null`` is always allowed (no
-    # contact / unclear). Adjust per-hardware (e.g. drop ``hook`` /
-    # ``key`` for parallel-jaw grippers).
+    # Closed grasp-type vocabulary (``null`` always allowed). Adjust
+    # per-hardware (e.g. drop ``hook`` / ``key`` for parallel-jaw grippers).
     grasp_vocabulary: tuple[str, ...] = (
         "pinch",
         "wrap",
@@ -199,15 +184,13 @@ class InterjectionsConfig:
 
     enabled: bool = True
 
-    # Each interjection emits a paired ``(interjection, speech)`` event row
-    # and triggers a ``plan`` refresh at the same timestamp via the
-    # ``plan`` module.
+    # Each interjection emits a paired (interjection, speech) event row and
+    # triggers a ``plan`` refresh at the same timestamp.
     max_interjections_per_episode: int = 3
     interjection_min_t: float = 2.0
 
-    # Visual context attached to the interjection prompt: a short window
-    # of frames centered on the chosen timestamp so the VLM sees the
-    # ongoing motion rather than a single frozen frame.
+    # A short frame window centered on the timestamp so the VLM sees the
+    # motion, not one frozen frame.
     interjection_window_seconds: float = 2.0
     interjection_window_frames: int = 4
 
@@ -219,22 +202,14 @@ class VqaConfig:
     enabled: bool = True
     vqa_emission_hz: float = 1.0
     K: int = 1
-    """How many *consecutive* frames each emission tick anchors a VQA pair
-    to. The VLM grounds its answer (bbox / keypoint coordinates, count, …)
-    against the *first* anchored frame's image, so anchoring K>1 frames
-    copies that same answer onto later frames where the scene has already
-    moved — stale labels. Default ``1``: a VQA pair lands on exactly its
-    emission frame, no temporal smear. Raise it only to trade label
-    precision for more (noisier) VQA frames."""
+    """Consecutive frames each emission tick anchors a VQA pair to. The VLM
+    grounds its answer on the FIRST anchored frame, so K>1 copies that answer
+    onto later (moved) frames — stale labels. Default 1 (no smear)."""
     question_types: tuple[str, ...] = ("bbox", "keypoint", "count", "attribute", "spatial")
 
-    # Camera restriction. By default VQA iterates EVERY camera the
-    # dataset declares (one VQA pair per camera per emission tick). Set
-    # ``restrict_to_default_camera=True`` to ground VQA on only the
-    # single ``--vlm.camera_key`` stream — the same camera the plan /
-    # interjection modules use — so the whole pipeline focuses on one
-    # view. Use this when you want every annotation grounded on, e.g.,
-    # ``observation.images.base`` and nothing else.
+    # By default VQA iterates every camera (one pair per camera per tick). Set
+    # True to ground VQA only on ``--vlm.camera_key`` — the single view the
+    # plan / interjection modules use.
     restrict_to_default_camera: bool = False
 
 
@@ -242,11 +217,9 @@ class VqaConfig:
 class VlmConfig:
     """Shared Qwen-VL client configuration."""
 
-    # Only ``openai`` is supported for now (the in-process ``vllm`` /
-    # ``transformers`` local backends were removed — the shipped workflow
-    # is Hugging Face Jobs). ``openai`` talks to an OpenAI-compatible vLLM
-    # server; the CLI auto-spawns one in-job when ``auto_serve=True``.
-    # ``stub`` is for unit tests (construct ``StubVlmClient`` directly).
+    # Only ``openai`` is supported (in-process vllm/transformers were removed;
+    # the shipped workflow is HF Jobs). Talks to an OpenAI-compatible vLLM
+    # server, auto-spawned in-job when ``auto_serve=True``. ``stub`` is for tests.
     backend: str = "openai"
     model_id: str = "Qwen/Qwen3.6-27B"
 
@@ -263,9 +236,8 @@ class VlmConfig:
     # when ``parallel_servers > 1``.
     serve_command: str | None = None
 
-    # Run multiple independent inference servers for round-robin client
-    # routing (each pinned to a GPU via ``CUDA_VISIBLE_DEVICES`` and bound
-    # to ``serve_port + i``). ``num_gpus=0`` means one GPU per replica.
+    # Independent servers for round-robin routing (each pinned to a GPU,
+    # bound to ``serve_port + i``). ``num_gpus=0`` = one GPU per replica.
     parallel_servers: int = 1
     num_gpus: int = 0
     client_concurrency: int = 16
@@ -289,16 +261,12 @@ class VlmConfig:
 
 @dataclass
 class ExecutorConfig:
-    """Executor settings.
+    """Executor settings — intra-process episode concurrency only
+    (distributed execution is delegated to Hugging Face Jobs)."""
 
-    Distributed execution is provided by Hugging Face Jobs (see
-    ``examples/annotation/run_hf_job.py``); this config only controls
-    intra-process episode concurrency.
-    """
-
-    # Episodes processed concurrently within each module phase. Each
-    # in-flight episode dispatches 3-5 dependent VLM calls, so this is the
-    # main knob for saturating ``parallel_servers`` and ``client_concurrency``.
+    # Episodes processed concurrently per module phase. Each dispatches 3-5 VLM
+    # calls, so this is the main knob for saturating ``parallel_servers`` /
+    # ``client_concurrency``.
     episode_parallelism: int = 16
 
 
@@ -310,15 +278,12 @@ class AnnotationPipelineConfig:
     revisions of the same dataset live in separate copies.
     """
 
-    # Hub dataset id. Used as the download source when ``root`` is unset,
-    # and as the destination repo when ``push_to_hub`` is enabled and
-    # ``new_repo_id`` is unset.
+    # Hub dataset id: download source when ``root`` is unset, and push target
+    # when ``push_to_hub`` is on and ``new_repo_id`` is unset.
     repo_id: str | None = None
 
-    # Optional separate Hub dataset id to push the annotated result to (named
-    # ``new_repo_id`` to match the LeRobot dataset edit tools). When unset,
-    # ``push_to_hub`` uploads back to ``repo_id`` (annotate in place); when
-    # set, the source ``repo_id`` is left untouched.
+    # Optional separate push target (named to match the LeRobot dataset edit
+    # tools). Unset → push back to ``repo_id`` in place; set → source untouched.
     new_repo_id: str | None = None
 
     root: Path | None = None
@@ -338,17 +303,13 @@ class AnnotationPipelineConfig:
     skip_validation: bool = False
     only_episodes: tuple[int, ...] | None = None
 
-    # Keyframe decode backend. When unset, the pipeline decodes with the
-    # ffmpeg CLI: it decodes AV1 and runs each decode as an isolated child
-    # process, which is both crash-safe and safe under the concurrent
-    # decode the executor performs (torchcodec is not thread-safe and
-    # SIGSEGVs there). Set to ``"torchcodec"`` or ``"pyav"`` to pin an
-    # in-process decoder when its build is known thread-safe.
+    # Keyframe decode backend. Unset → ffmpeg CLI: decodes AV1 in an isolated
+    # child process, so it's crash- and thread-safe under concurrent decode
+    # (torchcodec SIGSEGVs there). Set ``"torchcodec"`` / ``"pyav"`` to pin one.
     video_backend: str | None = None
 
-    # When True, upload the annotated dataset to the Hugging Face Hub:
-    # to ``new_repo_id`` if set, otherwise back to ``repo_id``. One of
-    # the two must be set for this to take effect.
+    # Upload the annotated dataset to the Hub (to ``new_repo_id`` if set, else
+    # back to ``repo_id`` — one of the two must be set).
     push_to_hub: bool = False
     push_private: bool = False
     push_commit_message: str | None = None

From cd59c8b312ec9832648d90184334f94684725037 Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Thu, 4 Jun 2026 14:40:34 +0200
Subject: [PATCH 44/45] annotate: remove the action_record style/feature
 entirely
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Drop the optional structured per-subtask action records — not a feature
we want to ship.

  * language.py: remove 'action_record' from CORE_STYLES + PERSISTENT_STYLES
    (and the matching assertion in tests/datasets/test_language.py).
  * config.py: delete ActionRecordsConfig (verb/grasp vocabularies,
    frames_per_subtask, emit_record_row) and the PlanConfig.action_records
    field.
  * plan_subtasks_memory.py: delete _extract_action_record and the
    run_episode block that emitted style='action_record' rows; drop the
    now-unused json / to_image_blocks imports.
  * remove the plan_action_record.txt prompt.
  * run_hf_job.py: drop the action_records comment.

Verified: 40 tests pass; pre-commit (ruff, mypy, bandit) clean.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 examples/annotations/run_hf_job.py            |   2 -
 .../annotations/steerable_pipeline/config.py  |  72 ----------
 .../modules/plan_subtasks_memory.py           | 127 +-----------------
 .../prompts/plan_action_record.txt            |  64 ---------
 src/lerobot/datasets/language.py              |   3 +-
 tests/datasets/test_language.py               |   2 +-
 6 files changed, 3 insertions(+), 267 deletions(-)
 delete mode 100644 src/lerobot/annotations/steerable_pipeline/prompts/plan_action_record.txt

diff --git a/examples/annotations/run_hf_job.py b/examples/annotations/run_hf_job.py
index 85ac8f17c..3fb730d4a 100644
--- a/examples/annotations/run_hf_job.py
+++ b/examples/annotations/run_hf_job.py
@@ -87,8 +87,6 @@ CMD = (
     # rephrasings are unused at best and harmful when they drift.
     "--plan.n_task_rephrasings=0 "
     # Keep subtask decomposition tight for atomic tasks.
-    # (action_records left off: the {verb,object,arm,grasp,dest} schema is for
-    # long manipulation tasks, not RoboCasa atomic/navigation.)
     "--plan.plan_max_steps=10 "
     # Only subtasks + memory — skip the numbered "plan" rows. true re-enables.
     "--plan.emit_plan=false "
diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py
index 1b16a927b..439201993 100644
--- a/src/lerobot/annotations/steerable_pipeline/config.py
+++ b/src/lerobot/annotations/steerable_pipeline/config.py
@@ -75,11 +75,6 @@ class PlanConfig:
     use_video_url: bool = False
     use_video_url_fps: float = 1.0
 
-    # Optional structured per-subtask action records (EgoMimic-style). When
-    # enabled, the VLM extracts a typed record per subtask span; see
-    # ``ActionRecordsConfig``. Purely additive — off by default.
-    action_records: ActionRecordsConfig = field(default_factory=lambda: ActionRecordsConfig())
-
     # Optional 5-axis task-augmentation taxonomy for the t=0 variants
     # (EgoMimic-style: synonym / omit_arm / omit_orientation /
     # omit_grasp_method / combined). Replaces the free-form
@@ -87,73 +82,6 @@ class PlanConfig:
     task_aug_axes: TaskAugAxesConfig = field(default_factory=lambda: TaskAugAxesConfig())
 
 
-@dataclass
-class ActionRecordsConfig:
-    """Structured per-subtask action record extraction.
-
-    When ``enabled=True``, after subtask-span generation the module makes
-    one extra VLM call per subtask to extract a typed record::
-
-        {
-            "verb": "pick" | "place" | "press" | ...,  # closed vocabulary
-            "object": "<canonical_object_name>",
-            "arm": "left" | "right" | "both" | null,
-            "grasp_type": "pinch" | "wrap" | "hook" | ... | null,
-            "destination": "<canonical_destination>" | null,
-            "mistake": "<short text>" | null,
-        }
-
-    Emitted as a separate ``style="action_record"`` row at the subtask's
-    start timestamp. PURELY ADDITIVE — it never touches the subtask text,
-    so downstream training can use the typed schema (e.g. auxiliary
-    verb/arm/grasp heads) while the conditioning string stays unchanged.
-
-    Cost: one extra VLM call per subtask (~8x plan-module calls on an
-    8-subtask episode).
-    """
-
-    enabled: bool = False
-
-    # Emit the ``style="action_record"`` row (JSON content) at the subtask
-    # start — the only output of the feature. ``enabled=False`` skips it.
-    emit_record_row: bool = True
-
-    # Frames sampled from the subtask span for the per-subtask VLM call.
-    frames_per_subtask: int = 4
-
-    # Closed verb vocabulary; the prompt picks exactly one. Override
-    # per-dataset (e.g. door-only manipulation) for a tighter constraint.
-    verb_vocabulary: tuple[str, ...] = (
-        "pick",
-        "place",
-        "push",
-        "pull",
-        "open",
-        "close",
-        "turn",
-        "press",
-        "lift",
-        "insert",
-        "pour",
-        "move",
-        "reach",
-        "grasp",
-        "release",
-        "wipe",
-        "dump",
-    )
-
-    # Closed grasp-type vocabulary (``null`` always allowed). Adjust
-    # per-hardware (e.g. drop ``hook`` / ``key`` for parallel-jaw grippers).
-    grasp_vocabulary: tuple[str, ...] = (
-        "pinch",
-        "wrap",
-        "hook",
-        "key",
-        "lateral",
-    )
-
-
 @dataclass
 class TaskAugAxesConfig:
     """Structured 5-axis augmentation taxonomy for t=0 task variants.
diff --git a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
index c76a6acad..8f25fcfba 100644
--- a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
+++ b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
@@ -17,7 +17,6 @@
 
 from __future__ import annotations
 
-import json
 import logging
 from collections.abc import Sequence
 from dataclasses import dataclass, field
@@ -29,7 +28,6 @@ from ..frames import (
     FrameProvider,
     VideoFrameProvider,
     null_provider,
-    to_image_blocks,
     to_video_block,
     to_video_url_block,
 )
@@ -84,20 +82,8 @@ class PlanSubtasksMemoryModule:
 
         subtask_spans = self._generate_subtasks(record, task=effective_task)
 
-        # Phase 1a: optional per-subtask action records. When enabled, emit a
-        # typed ActionRecord (verb/object/arm/grasp_type/destination/mistake)
-        # per span as a separate style="action_record" row. Purely additive —
-        # never touches the subtask text.
-        records_cfg = self.config.action_records
-        action_records: list[dict[str, Any] | None] = [None] * len(subtask_spans)
-        if records_cfg.enabled and subtask_spans:
-            for i, span in enumerate(subtask_spans):
-                rec = self._extract_action_record(record, span, effective_task)
-                if rec is not None:
-                    action_records[i] = rec
-
         # subtask rows
-        for i, span in enumerate(subtask_spans):
+        for span in subtask_spans:
             rows.append(
                 {
                     "role": "assistant",
@@ -107,16 +93,6 @@ class PlanSubtasksMemoryModule:
                     "tool_calls": None,
                 }
             )
-            if records_cfg.enabled and records_cfg.emit_record_row and action_records[i] is not None:
-                rows.append(
-                    {
-                        "role": "assistant",
-                        "content": json.dumps(action_records[i], sort_keys=True),
-                        "style": "action_record",
-                        "timestamp": snap_to_frame(span["start"], record.frame_timestamps),
-                        "tool_calls": None,
-                    }
-                )
         # Plan rows at every subtask boundary (incl. t=0). The plan is a
         # numbered list of still-todo subtasks, so re-emitting at each
         # boundary makes it shrink as work progresses — ${plan} at frame t is
@@ -264,107 +240,6 @@ class PlanSubtasksMemoryModule:
         out = [item.strip().strip('"').strip("'") for item in raw if isinstance(item, str)]
         return [s for s in out if s][:n]
 
-    # ------------------------------------------------------------------
-    # Phase 1a + 1b: structured per-subtask action records
-    # ------------------------------------------------------------------
-
-    def _extract_action_record(
-        self,
-        record: EpisodeRecord,
-        span: dict[str, Any],
-        episode_task: str,
-    ) -> dict[str, Any] | None:
-        """Ask the VLM to extract a typed ``ActionRecord`` from a subtask span.
-
-        Sends ``frames_per_subtask`` frames uniformly sampled from
-        ``[span.start, span.end]`` plus the canonical subtask text. The
-        VLM is constrained to verb + grasp vocabularies from the config
-        — invalid values are silently dropped at this layer (the
-        validator catches structural problems pre-write).
-
-        Returns ``None`` when the call fails or the VLM returns something
-        unrecognizable; callers fall back to the free-form subtask text.
-        """
-        cfg = self.config.action_records
-        start_t = float(span.get("start", 0.0))
-        end_t = float(span.get("end", start_t))
-        duration = max(0.0, end_t - start_t)
-
-        # Uniform timestamps within the span; fall back to a single
-        # center frame for very short spans.
-        n = max(1, int(cfg.frames_per_subtask))
-        if n == 1 or duration <= 0.0:
-            timestamps = [0.5 * (start_t + end_t)]
-        else:
-            step = duration / (n - 1)
-            timestamps = [start_t + i * step for i in range(n)]
-        frames = self.frame_provider.frames_at(record, timestamps)
-        if not frames:
-            logger.debug(
-                "action_record: no frames at span %.2f-%.2f for ep %s; skipping",
-                start_t,
-                end_t,
-                record.episode_index,
-            )
-            return None
-
-        prompt = load_prompt("plan_action_record").format(
-            episode_task=episode_task,
-            subtask_text=span.get("text", ""),
-            start_time=start_t,
-            end_time=end_t,
-            duration=duration,
-            n_frames=len(frames),
-            verb_vocabulary=", ".join(cfg.verb_vocabulary),
-            grasp_vocabulary=" | ".join(f'"{g}"' for g in cfg.grasp_vocabulary),
-        )
-        message = [
-            {
-                "role": "user",
-                "content": [*to_image_blocks(frames), {"type": "text", "text": prompt}],
-            }
-        ]
-        result = self.vlm.generate_json([message])[0]
-        if not isinstance(result, dict):
-            return None
-
-        # Light validation + normalisation. Verb is required; everything
-        # else may be null. Verb / grasp_type are clamped to the
-        # vocabularies (out-of-vocab → reject or null).
-        verb = (result.get("verb") or "").strip().lower()
-        if not verb or verb not in {v.lower() for v in cfg.verb_vocabulary}:
-            return None
-        obj = (result.get("object") or "").strip()
-        if not obj:
-            return None
-        grasp = result.get("grasp_type")
-        if isinstance(grasp, str):
-            grasp = grasp.strip().lower()
-            if grasp not in {g.lower() for g in cfg.grasp_vocabulary}:
-                grasp = None
-        else:
-            grasp = None
-        arm = result.get("arm")
-        if isinstance(arm, str):
-            arm = arm.strip().lower()
-            if arm not in {"left", "right", "both"}:
-                arm = None
-        else:
-            arm = None
-        destination = result.get("destination")
-        destination = destination.strip() if isinstance(destination, str) and destination.strip() else None
-        mistake = result.get("mistake")
-        mistake = mistake.strip() if isinstance(mistake, str) and mistake.strip() else None
-
-        return {
-            "verb": verb,
-            "object": obj,
-            "arm": arm,
-            "grasp_type": grasp,
-            "destination": destination,
-            "mistake": mistake,
-        }
-
     # ------------------------------------------------------------------
     # Structured 5-axis task augmentation (EgoMimic-style taxonomy)
     # ------------------------------------------------------------------
diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/plan_action_record.txt b/src/lerobot/annotations/steerable_pipeline/prompts/plan_action_record.txt
deleted file mode 100644
index 1bd127048..000000000
--- a/src/lerobot/annotations/steerable_pipeline/prompts/plan_action_record.txt
+++ /dev/null
@@ -1,64 +0,0 @@
-You are extracting a structured action record from a subtask span of a
-teleoperated robot demonstration. This is Phase 1a of a two-step
-process: you extract a typed record; a deterministic template then
-renders it back to canonical subtask text. Your job is the PERCEPTION
-step — not the language step.
-
-The user originally asked: "{episode_task}"
-The subtask span is:        "{subtask_text}"
-Span time window:           [{start_time:.2f}s, {end_time:.2f}s]
-                            ({duration:.2f}s of robot activity)
-
-You are shown {n_frames} frames sampled uniformly from the subtask
-window. Fill in a structured record describing the action that takes
-place between the first and last frame.
-
-Hard rules:
-- Use ONLY information visible in the frames. Do not infer details from
-  outside the span. Do not extrapolate from the original task wording.
-- Use canonical object names from the original task VERBATIM. Never
-  introduce synonyms: if the task says "cube", the record says "cube",
-  never "block" / "object" / "item".
-- For non-applicable fields, use ``null`` (not "n/a", not "none", not
-  an empty string).
-- For ``verb`` and ``grasp_type``, pick EXACTLY one value from the
-  vocabulary below. Never invent a new one.
-
-Field schema:
-
-  verb (required) — the imperative verb of the action. Vocabulary:
-    {verb_vocabulary}
-
-  object (required) — the manipulated object. Use the canonical noun
-    from the original task above.
-
-  arm — which arm performs the action. One of:
-    "left" | "right" | "both" | null
-    Use ``null`` when the source robot is single-arm or when the arm
-    is genuinely not visible in the frames.
-
-  grasp_type — which grip the gripper uses on contact. One of:
-    {grasp_vocabulary} | null
-    Use ``null`` when there is no contact in this span (e.g. a pure
-    ``move`` / ``reach`` subtask) or the grip is genuinely unclear.
-
-  destination — the target location for actions like ``place``,
-    ``move``, ``insert``, ``pour``. Use canonical names from the
-    original task. Use ``null`` for in-place actions (``press``,
-    ``turn``, ``grasp``, ``release``).
-
-  mistake — a brief one-clause description of any visible failure or
-    recovery during the span (e.g. "dropped the cube and re-grasped",
-    "missed the target on first attempt"). Use ``null`` when the span
-    completes cleanly with no visible recovery.
-
-Output strictly valid JSON of shape:
-
-  {{
-    "verb": "<one of vocabulary>",
-    "object": "<canonical noun>",
-    "arm": "left" | "right" | "both" | null,
-    "grasp_type": "<one of vocabulary>" | null,
-    "destination": "<canonical noun>" | null,
-    "mistake": "<short description>" | null
-  }}
diff --git a/src/lerobot/datasets/language.py b/src/lerobot/datasets/language.py
index f3d371545..124c25221 100644
--- a/src/lerobot/datasets/language.py
+++ b/src/lerobot/datasets/language.py
@@ -36,7 +36,6 @@ CORE_STYLES = {
     "vqa",
     "trace",
     "task_aug",
-    "action_record",
 }
 # Project-local styles can be registered at import time by appending to
 # ``EXTENDED_STYLES`` before ``column_for_style`` is called. Anything added
@@ -47,7 +46,7 @@ CORE_STYLES = {
 EXTENDED_STYLES: set[str] = set()
 STYLE_REGISTRY = CORE_STYLES | EXTENDED_STYLES
 
-PERSISTENT_STYLES = {"subtask", "plan", "memory", "motion", "task_aug", "action_record"}
+PERSISTENT_STYLES = {"subtask", "plan", "memory", "motion", "task_aug"}
 EVENT_ONLY_STYLES = {"interjection", "vqa", "trace"}
 
 # Styles whose ``content`` is grounded in a specific camera view. Rows of these
diff --git a/tests/datasets/test_language.py b/tests/datasets/test_language.py
index 2846dab1d..52c7b3708 100644
--- a/tests/datasets/test_language.py
+++ b/tests/datasets/test_language.py
@@ -64,7 +64,7 @@ def test_validate_feature_language_warns_only_on_non_empty_value(caplog):
 
 
 def test_style_registry_routes_columns():
-    assert {"subtask", "plan", "memory", "motion", "task_aug", "action_record"} == PERSISTENT_STYLES
+    assert {"subtask", "plan", "memory", "motion", "task_aug"} == PERSISTENT_STYLES
     assert {"interjection", "vqa", "trace"} == EVENT_ONLY_STYLES
     assert PERSISTENT_STYLES | EVENT_ONLY_STYLES <= STYLE_REGISTRY
 

From cdd94a703f1226132e1014e9df7709e498f55c54 Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Thu, 4 Jun 2026 15:12:31 +0200
Subject: [PATCH 45/45] annotate(config): tighten field comments to one line
 each
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Collapse the remaining multi-line field comments / docstrings in config.py
to single lines (or two where a knob genuinely needs it), keeping the
essential rationale. Comments only — no field or behavior change.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../annotations/steerable_pipeline/config.py  | 132 ++++++------------
 1 file changed, 41 insertions(+), 91 deletions(-)

diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py
index 439201993..d349694c6 100644
--- a/src/lerobot/annotations/steerable_pipeline/config.py
+++ b/src/lerobot/annotations/steerable_pipeline/config.py
@@ -23,79 +23,53 @@ from typing import Any
 
 @dataclass
 class PlanConfig:
-    """``plan`` module: plan + subtasks + memory + task augmentation.
-
-    The ``plan`` module attaches the whole episode as one Qwen-VL video
-    block; ``max_video_frames`` only caps the frames packed in (a
-    model-capacity bound, not an annotation-logic knob).
-    """
+    """``plan`` module: subtasks + plan + memory + task augmentation."""
 
     enabled: bool = True
 
-    # Number of ``task_aug`` rephrasings emitted at ``t=0``. The renderer's
-    # ``${task}`` binding rotates among them per ``sample_idx``. ``0`` disables.
+    # ``task_aug`` rephrasings at t=0 (renderer rotates ${task} among them); 0 disables.
     n_task_rephrasings: int = 10
 
-    # Derive the task from video instead of ``record.episode_task``: ``off``,
-    # ``if_short`` (canonical task short/placeholder/missing), or ``always``.
+    # Derive the task from video instead of episode_task: off / if_short / always.
     # Affects prompts only; ``meta/tasks.parquet`` is untouched.
     derive_task_from_video: str = "if_short"
     derive_task_min_words: int = 3
 
-    # Frames sampled uniformly, capped at ``max_video_frames`` — a HARD context
-    # cap (~250-320 tokens/frame, so 32 fit a 32k VLM; 128 overflow). Lower it
-    # if you hit "Input length exceeds maximum context length".
+    # Frames sampled uniformly, capped at max_video_frames — a hard context cap
+    # (~300 tokens/frame, so 32 fit a 32k VLM; 128 overflow).
     frames_per_second: float = 1.0
     max_video_frames: int = 32
 
-    # Windowed subtask generation for constant temporal density: when > 0 and
-    # the episode is longer, process it in windows of this length (each at
-    # ``frames_per_second``) instead of subsampling the whole episode; spans are
-    # merged + stitched. ~max_video_frames / frames_per_second. 0 disables.
+    # >0: split long episodes into windows of this length (constant fps density)
+    # instead of subsampling the whole episode; spans merged + stitched. 0 disables.
     subtask_window_seconds: float = 0.0
 
     min_subtask_seconds: float = 1.5
     plan_max_steps: int = 8
 
-    # Grounding pass that narrates ONLY what's visible before segmenting — the
-    # strongest lever against subtasks invented from the task text. ON by
-    # default (+1 VLM call/episode); False trades quality for fewer calls.
+    # Narrate-only grounding pass before segmenting — best defense against subtasks
+    # invented from the task text (+1 VLM call/episode).
     subtask_describe_first: bool = True
 
-    # Emit ``style="plan"`` rows (the numbered still-todo list, re-emitted at
-    # every subtask boundary). False keeps only subtasks + memory and skips
-    # the per-boundary ``_generate_plan`` call.
+    # Emit ``style="plan"`` rows at each boundary; False = subtasks + memory only.
     emit_plan: bool = True
 
-    # NOTE: subtask spans are ALWAYS stitched into a contiguous full-episode
-    # cover (see ``_stitch_full_coverage``) — not configurable.
+    # (subtask spans are always stitched to a contiguous full-episode cover; not configurable.)
 
-    # When True, send a server-side ``video_url`` clip (sampled at
-    # ``use_video_url_fps``) instead of embedded frames.
+    # Send a server-side ``video_url`` clip (at use_video_url_fps) instead of embedded frames.
     use_video_url: bool = False
     use_video_url_fps: float = 1.0
 
-    # Optional 5-axis task-augmentation taxonomy for the t=0 variants
-    # (EgoMimic-style: synonym / omit_arm / omit_orientation /
-    # omit_grasp_method / combined). Replaces the free-form
-    # ``n_task_rephrasings`` flow when enabled; see ``TaskAugAxesConfig``.
+    # Optional EgoMimic-style 5-axis task augmentation; replaces n_task_rephrasings.
     task_aug_axes: TaskAugAxesConfig = field(default_factory=lambda: TaskAugAxesConfig())
 
 
 @dataclass
 class TaskAugAxesConfig:
-    """Structured 5-axis augmentation taxonomy for t=0 task variants.
-
-    When ``enabled=True``, replaces the free-form ``n_task_rephrasings``
-    flow with variants along five named axes (EgoMimic-style):
-    ``synonym_paraphrase`` (reword, keep all info), ``omit_arm``,
-    ``omit_orientation``, ``omit_grasp_method``, and ``combined_omissions``
-    (drop two at once).
-
-    Default counts (3+3+2+2+2 = 12) match EgoMimic. Axes with nothing to
-    omit emit fewer entries rather than pad. Each variant becomes a
-    ``task_aug`` row at ``t=0``, identical in style to the free-form ones.
-    """
+    """5-axis t=0 task augmentation (EgoMimic-style): synonym / omit_arm /
+    omit_orientation / omit_grasp_method / combined. Replaces n_task_rephrasings
+    when enabled; each variant becomes a ``task_aug`` row. Axes with nothing to
+    omit emit fewer entries. Defaults (3+3+2+2+2) match EgoMimic."""
 
     enabled: bool = False
 
@@ -112,13 +86,11 @@ class InterjectionsConfig:
 
     enabled: bool = True
 
-    # Each interjection emits a paired (interjection, speech) event row and
-    # triggers a ``plan`` refresh at the same timestamp.
+    # Each interjection emits a paired (interjection, speech) row + a plan refresh at that timestamp.
     max_interjections_per_episode: int = 3
     interjection_min_t: float = 2.0
 
-    # A short frame window centered on the timestamp so the VLM sees the
-    # motion, not one frozen frame.
+    # Frame window centered on the timestamp so the VLM sees motion, not one frame.
     interjection_window_seconds: float = 2.0
     interjection_window_frames: int = 4
 
@@ -130,14 +102,11 @@ class VqaConfig:
     enabled: bool = True
     vqa_emission_hz: float = 1.0
     K: int = 1
-    """Consecutive frames each emission tick anchors a VQA pair to. The VLM
-    grounds its answer on the FIRST anchored frame, so K>1 copies that answer
-    onto later (moved) frames — stale labels. Default 1 (no smear)."""
+    """Consecutive frames per emission tick. The VLM grounds on the FIRST frame,
+    so K>1 smears stale labels onto moved frames. Default 1 (no smear)."""
     question_types: tuple[str, ...] = ("bbox", "keypoint", "count", "attribute", "spatial")
 
-    # By default VQA iterates every camera (one pair per camera per tick). Set
-    # True to ground VQA only on ``--vlm.camera_key`` — the single view the
-    # plan / interjection modules use.
+    # True: ground VQA only on --vlm.camera_key (default: every camera).
     restrict_to_default_camera: bool = False
 
 
@@ -145,27 +114,22 @@ class VqaConfig:
 class VlmConfig:
     """Shared Qwen-VL client configuration."""
 
-    # Only ``openai`` is supported (in-process vllm/transformers were removed;
-    # the shipped workflow is HF Jobs). Talks to an OpenAI-compatible vLLM
-    # server, auto-spawned in-job when ``auto_serve=True``. ``stub`` is for tests.
+    # Only ``openai`` is supported (OpenAI-compatible vLLM server, auto-spawned
+    # when auto_serve=True); ``stub`` is for tests.
     backend: str = "openai"
     model_id: str = "Qwen/Qwen3.6-27B"
 
-    # OpenAI-compatible server endpoint; ``EMPTY`` works for local servers.
+    # OpenAI-compatible endpoint; ``EMPTY`` key works for local servers.
     api_base: str = "http://localhost:8000/v1"
     api_key: str = "EMPTY"
 
-    # When True with ``backend=openai``, the CLI probes ``api_base`` and
-    # spawns a server if none answers (default: ``transformers serve``).
-    # Set to False to fail fast when pointing at a remote endpoint.
+    # Spawn a server if none answers api_base; False = fail fast on a remote.
     auto_serve: bool = True
     serve_port: int = 8000
-    # Override the auto-serve command. ``{port}`` is substituted per replica
-    # when ``parallel_servers > 1``.
+    # Override the auto-serve command; ``{port}`` substituted per replica.
     serve_command: str | None = None
 
-    # Independent servers for round-robin routing (each pinned to a GPU,
-    # bound to ``serve_port + i``). ``num_gpus=0`` = one GPU per replica.
+    # Independent round-robin servers, each pinned to a GPU; num_gpus=0 = one GPU per replica.
     parallel_servers: int = 1
     num_gpus: int = 0
     client_concurrency: int = 16
@@ -174,49 +138,37 @@ class VlmConfig:
     max_new_tokens: int = 512
     temperature: float = 0.2
 
-    # Context length for the auto-spawned vLLM server (None → 32768). vLLM
-    # tuning flags (tensor-parallel size, GPU memory fraction, ...) go in
-    # ``serve_command`` directly, not here.
+    # Auto-serve context length (None → 32768); other vLLM flags go in serve_command.
     max_model_len: int | None = None
 
-    # Override the camera stream used for keyframe attachment. None picks
-    # the first ``observation.images.*`` key the dataset declares.
+    # Camera for keyframes; None → first ``observation.images.*`` key.
     camera_key: str | None = None
-    # Forwarded as ``extra_body.chat_template_kwargs`` on every chat call;
-    # use to pass model-specific flags such as ``{"enable_thinking": false}``.
+    # Forwarded as extra_body.chat_template_kwargs (e.g. {"enable_thinking": false}).
     chat_template_kwargs: dict[str, Any] | None = None
 
 
 @dataclass
 class ExecutorConfig:
-    """Executor settings — intra-process episode concurrency only
-    (distributed execution is delegated to Hugging Face Jobs)."""
+    """Executor settings (intra-process episode concurrency; distribution via HF Jobs)."""
 
-    # Episodes processed concurrently per module phase. Each dispatches 3-5 VLM
-    # calls, so this is the main knob for saturating ``parallel_servers`` /
-    # ``client_concurrency``.
+    # Episodes processed concurrently per phase; main knob for saturating the servers.
     episode_parallelism: int = 16
 
 
 @dataclass
 class AnnotationPipelineConfig:
-    """Top-level config for ``lerobot-annotate``.
+    """Top-level config for ``lerobot-annotate`` (rewrites data shards in place)."""
 
-    The writer rewrites ``data/chunk-*/file-*.parquet`` in place. Multiple
-    revisions of the same dataset live in separate copies.
-    """
-
-    # Hub dataset id: download source when ``root`` is unset, and push target
-    # when ``push_to_hub`` is on and ``new_repo_id`` is unset.
+    # Hub dataset: download source when ``root`` unset; push target when push_to_hub
+    # is on and ``new_repo_id`` unset.
     repo_id: str | None = None
 
-    # Optional separate push target (named to match the LeRobot dataset edit
-    # tools). Unset → push back to ``repo_id`` in place; set → source untouched.
+    # Separate push target (named to match the LeRobot dataset edit tools). Unset → push in place.
     new_repo_id: str | None = None
 
     root: Path | None = None
 
-    # Defaults to ``<root>/.annotate_staging/`` when unset.
+    # Defaults to ``<root>/.annotate_staging/``.
     staging_dir: Path | None = None
 
     seed: int = 1729
@@ -231,13 +183,11 @@ class AnnotationPipelineConfig:
     skip_validation: bool = False
     only_episodes: tuple[int, ...] | None = None
 
-    # Keyframe decode backend. Unset → ffmpeg CLI: decodes AV1 in an isolated
-    # child process, so it's crash- and thread-safe under concurrent decode
-    # (torchcodec SIGSEGVs there). Set ``"torchcodec"`` / ``"pyav"`` to pin one.
+    # Keyframe decode backend. None → ffmpeg CLI (crash-/thread-safe; torchcodec
+    # SIGSEGVs under concurrent decode). Or ``"torchcodec"`` / ``"pyav"``.
     video_backend: str | None = None
 
-    # Upload the annotated dataset to the Hub (to ``new_repo_id`` if set, else
-    # back to ``repo_id`` — one of the two must be set).
+    # Upload to the Hub (new_repo_id if set, else repo_id; one must be set).
     push_to_hub: bool = False
     push_private: bool = False
     push_commit_message: str | None = None