From f161e27e9637cdf7298a30793d2adb7b2ee4c113 Mon Sep 17 00:00:00 2001 From: Pepijn Date: Fri, 15 May 2026 13:40:15 +0200 Subject: [PATCH] recipe+runtime: condition the action expert on the task, not the subtask MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On the real robot, the arm shook and failed the task despite a low flow loss. Root cause: train/inference conditioning mismatch — not a flow-loss bug (``_compute_fused_loss``'s flow path is byte-identical to ``SmolVLAModel.forward``). At training, ``low_level_execution`` conditioned the action expert on ``${subtask}``, and every frame's subtask was the correct one for that frame. At inference the runtime has no high-level subtask generator (VQA-only pipeline), so ``current_subtask`` was frozen — the action expert got "move towards the blue cube" for the entire episode. Once the arm reached the cube, that (image, subtask) pair never occurred in training → out-of-distribution (OOD) conditioning → incoherent flow output → shaking. Fix: ``low_level_execution`` now renders ``user(${task})``. The task is stable for the whole episode and always available, so the action expert's conditioning is identical at training and inference with no high-level loop required. ``LowLevelForward`` updated to build the same ``[user(task)]`` prompt. ``high_level_subtask`` still trains the text head to predict subtasks (kept for when a reliable subtask loop is reintroduced) — it's just no longer on the action expert's critical path. Requires re-training for the recipe change to take effect. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/lerobot/configs/recipes/hirobot.yaml | 21 +++++++++++-------- .../policies/smolvla2/inference/steps.py | 15 ++++++------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/src/lerobot/configs/recipes/hirobot.yaml b/src/lerobot/configs/recipes/hirobot.yaml index 3cf5bc573..3ee88690a 100644 --- a/src/lerobot/configs/recipes/hirobot.yaml +++ b/src/lerobot/configs/recipes/hirobot.yaml @@ -3,11 +3,11 @@ # # Trains two things only: subtasks and VQA. Plan and memory are # intentionally left out for now — keeps the prompt short and the -# training surface small while the core subtask + action loop is -# validated. +# training surface small while the core action loop is validated. # -# high_level_subtask — predict the subtask from the task. -# low_level_execution — flow loss with [images, subtask, state]. +# high_level_subtask — predict the subtask from the task (text +# head only; not on the inference path yet). +# low_level_execution — flow loss with [images, task, state]. # ask_vqa_{top,wrist} — camera-grounded VQA. # # Each backbone's text tokenizer renders these messages differently @@ -25,12 +25,15 @@ blend: low_level_execution: weight: 0.40 messages: - # π0.5-style action conditioning. The action expert sees only - # [images, this user turn (= bare subtask), state]. No text-CE - # target — subtask prediction is owned by ``high_level_subtask``. + # The action expert is conditioned on the TASK (not the subtask). + # The task is always available at inference with no high-level + # generation loop, so this removes the train/inference mismatch + # that a subtask-conditioned action head would have while there + # is no reliable runtime subtask source. ``high_level_subtask`` + # still trains the text head to predict subtasks for later use. # ``stream: low_level`` flips ``predict_actions=True`` so the - # flow loss fires. 
- - {role: user, content: "${subtask}", stream: low_level, if_present: subtask} + # flow loss fires; no text-CE target here. + - {role: user, content: "${task}", stream: low_level} ask_vqa_top: weight: 0.10 diff --git a/src/lerobot/policies/smolvla2/inference/steps.py b/src/lerobot/policies/smolvla2/inference/steps.py index 3bd34074d..b255e7417 100644 --- a/src/lerobot/policies/smolvla2/inference/steps.py +++ b/src/lerobot/policies/smolvla2/inference/steps.py @@ -111,15 +111,12 @@ class LowLevelForward(InferenceStep): if observation is None: return None - # π0.5-style: the action expert is conditioned on just the - # subtask (+ images + state). No task / plan / memory in the - # low-level prompt — those are only used by the high-level - # loop to *generate* the subtask. Matches the training-time - # ``low_level_execution`` recipe shape (single user turn, - # no assistant target since text-CE is owned by the - # high-level recipe). - subtask = state.get("current_subtask") or state.get("task") or "" - ctx = [{"role": "user", "content": subtask}] + # The action expert is conditioned on the TASK string — the + # ``low_level_execution`` recipe renders ``user(${task})``. + # The task is stable for the whole episode and always present, + # so there is no train/inference mismatch and no dependency on + # a (currently unreliable) high-level subtask generator. + ctx = [{"role": "user", "content": state.get("task") or ""}] # ``add_generation_prompt=False`` to match the training-time # prefix shape: at training the action expert sees the rendered # user turn ending at ``<|im_end|>`` (no trailing