From 6ce1f3600229946dfc0acaef50cc1ee7bcf1055b Mon Sep 17 00:00:00 2001 From: Pepijn Date: Tue, 12 May 2026 18:42:59 +0200 Subject: [PATCH] fix(smolvla2): supervise high-level head with *current* subtask at every frame MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The high_level_subtask recipe targeted ``nth_next(style=subtask, offset=1)``, which on the last span of any episode resolves to None. The recipe had no ``if_present`` guard on the target, so the renderer emitted an empty assistant turn and cross-entropy supervised the model on the chat template's structural newlines (``\n``). Across the dataset this trained the LM head's argmax at position 0 to collapse to ``\n`` whenever no transition was imminent (i.e. most frames). Visible failure mode at inference: the head emits 40+ newlines + an end-of-sequence token at every chunk boundary while the action expert keeps working — confirmed by running the dry-run on dataset frame 0 with the dataset's own image and seeing the same ``\n × 44`` collapse. Switch to the Pi 0.5 / Pi 0.7 supervision pattern: at every frame, the assistant target is the *current* active subtask span text (via ``${subtask}`` → ``active_at(t, style=subtask)``). Always non-empty, always scene-grounded; ``if_present: subtask`` skips frames with no active span instead of emitting a degenerate empty turn. Runtime callsite update: ``_msgs_for_subtask`` no longer feeds a "Current subtask: X" user message into the prompt (that would be circular — we'd be telling the model the answer). Transition detection moves into the runtime — when the predicted subtask differs from ``state['current_subtask']``, the existing ``set_if_changed`` path fires ``subtask_change`` and downstream memory updates. Same event surface; the supervision target is now always meaningful. Requires re-annotating the dataset and retraining for the fix to land in the checkpoint, but the recipe + runtime change is what enables it. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../configs/recipes/smolvla2_hirobot.yaml | 27 ++++++++++++++++--- .../policies/smolvla2/inference/steps.py | 24 ++++++++++------- 2 files changed, 38 insertions(+), 13 deletions(-) diff --git a/src/lerobot/configs/recipes/smolvla2_hirobot.yaml b/src/lerobot/configs/recipes/smolvla2_hirobot.yaml index 2586d9529..ce5cdd3a9 100644 --- a/src/lerobot/configs/recipes/smolvla2_hirobot.yaml +++ b/src/lerobot/configs/recipes/smolvla2_hirobot.yaml @@ -41,14 +41,33 @@ blend: - {role: user, content: "${interjection}", stream: high_level, if_present: interjection} - {role: assistant, content: "${current_plan}", stream: high_level, target: true, if_present: current_plan, tool_calls_from: speech} + # PR3 Hi-Robot v2: supervise the high-level head with the *current* + # active subtask, not the *next*. Pi 0.5 / Pi 0.7 both do this: at every + # frame the assistant target is "what is the robot doing right now" + # grounded in the current image + state + context, so the supervision + # target is always a non-empty span string. + # + # The original target was ``nth_next(style=subtask, offset=1)`` — at + # most frames within a single span this resolves to the next-span + # string (fine), but on the LAST span of an episode it resolves to + # empty/None. The recipe had no ``if_present`` guard on the target, + # so the renderer emitted an empty assistant turn and cross-entropy + # ended up supervising the chat-template's structural newlines. + # Across a dataset annotated this way, the LM head's argmax at + # position 0 collapses to ``\n`` whenever no transition is happening + # (which is most of the time). At inference: head silently emits + # newlines every chunk boundary while the action expert keeps working. + # + # With ``${subtask}`` (binds to ``active_at(t, style=subtask)``) the + # target is the current span's text — always non-empty, scene- + # grounded. 
The runtime detects subtask transitions by comparing the + # predicted subtask string to the last known one, the same way Pi 0.5 + # does. No information loss. high_level_subtask: weight: 0.15 - bindings: - next_subtask: "nth_next(style=subtask, offset=1)" messages: - {role: user, content: "${task}\nPlan: ${plan}\nMemory: ${memory}", stream: high_level} - - {role: user, content: "Current subtask: ${subtask}", stream: high_level, if_present: subtask} - - {role: assistant, content: "${next_subtask}", stream: high_level, target: true} + - {role: assistant, content: "${subtask}", stream: high_level, target: true, if_present: subtask} low_level_execution: weight: 0.35 diff --git a/src/lerobot/policies/smolvla2/inference/steps.py b/src/lerobot/policies/smolvla2/inference/steps.py index 5fc04bcdb..b35897552 100644 --- a/src/lerobot/policies/smolvla2/inference/steps.py +++ b/src/lerobot/policies/smolvla2/inference/steps.py @@ -717,20 +717,26 @@ def _control_context_messages( def _msgs_for_subtask(state: dict[str, Any]) -> list[dict[str, Any]]: - """``high_level_subtask`` recipe layout.""" + """``high_level_subtask`` recipe layout (v2 — predict current subtask). + + The training-time recipe was changed to supervise the model on the + *current* active subtask span at every frame, not the next-span text + only at transitions. So the inference-time prompt no longer feeds a + "Current subtask: X" user message — that would be circular (we'd be + telling the model the answer). The model now decides the subtask + purely from the task + plan + memory context plus the visual prefix. + + Transition detection moves into the runtime: when the predicted + subtask differs from ``state['current_subtask']``, fire the + ``subtask_change`` event so memory updates. Same downstream signal + as before, just produced by an always-non-empty supervision target. 
+ """ head_parts = [state.get("task") or ""] if state.get("current_plan"): head_parts.append(f"Plan: {state['current_plan']}") if state.get("current_memory"): head_parts.append(f"Memory: {state['current_memory']}") - msgs: list[dict[str, Any]] = [ - {"role": "user", "content": "\n".join(head_parts)} - ] - if state.get("current_subtask"): - msgs.append( - {"role": "user", "content": f"Current subtask: {state['current_subtask']}"} - ) - return msgs + return [{"role": "user", "content": "\n".join(head_parts)}] def _msgs_for_memory(state: dict[str, Any]) -> list[dict[str, Any]]: