From 6ce1f3600229946dfc0acaef50cc1ee7bcf1055b Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Tue, 12 May 2026 18:42:59 +0200
Subject: [PATCH] fix(smolvla2): supervise high-level head with *current*
 subtask at every frame
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The high_level_subtask recipe targeted ``nth_next(style=subtask, offset=1)``,
which on the last span of any episode resolves to None. The recipe had no
``if_present`` guard on the target, so the renderer emitted an empty
assistant turn and cross-entropy supervised the model on the chat
template's structural newlines (``\n``). Across the dataset this trained
the LM head's argmax at position 0 to collapse to ``\n`` whenever no
transition was imminent (i.e. most frames). Visible failure mode at
inference: the head emits 40+ newlines followed by
``<end_of_utterance>`` at every chunk boundary while the action expert
keeps working. This was confirmed by running the dry-run on dataset
frame 0 with the dataset's own image and seeing the same ``\n × 44``
collapse.

Switch to the Pi 0.5 / Pi 0.7 supervision pattern: at every frame, the
assistant target is the *current* active subtask span text (via
``${subtask}`` → ``active_at(t, style=subtask)``). The target is
scene-grounded and non-empty whenever a span is active, and
``if_present: subtask`` skips frames with no active span instead of
emitting a degenerate empty turn.

Runtime callsite update: ``_msgs_for_subtask`` no longer feeds a
"Current subtask: X" user message into the prompt (that would be
circular — we'd be telling the model the answer). Transition
detection moves into the runtime: when the predicted subtask differs
from ``state['current_subtask']``, the existing ``set_if_changed``
path fires ``subtask_change`` and triggers the downstream memory
updates. The event surface is unchanged; only the supervision target
differs, and it is now always meaningful.

The fix only lands in a checkpoint after re-annotating the dataset and
retraining, but this recipe + runtime change is what enables it.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../configs/recipes/smolvla2_hirobot.yaml     | 27 ++++++++++++++++---
 .../policies/smolvla2/inference/steps.py      | 24 ++++++++++-------
 2 files changed, 38 insertions(+), 13 deletions(-)

diff --git a/src/lerobot/configs/recipes/smolvla2_hirobot.yaml b/src/lerobot/configs/recipes/smolvla2_hirobot.yaml
index 2586d9529..ce5cdd3a9 100644
--- a/src/lerobot/configs/recipes/smolvla2_hirobot.yaml
+++ b/src/lerobot/configs/recipes/smolvla2_hirobot.yaml
@@ -41,14 +41,33 @@ blend:
       - {role: user, content: "${interjection}", stream: high_level, if_present: interjection}
       - {role: assistant, content: "${current_plan}", stream: high_level, target: true, if_present: current_plan, tool_calls_from: speech}
 
+  # PR3 Hi-Robot v2: supervise the high-level head with the *current*
+  # active subtask, not the *next*. Pi 0.5 / Pi 0.7 both do this: at every
+  # frame the assistant target is "what is the robot doing right now"
+  # grounded in the current image + state + context, so the supervision
+  # target is always a non-empty span string.
+  #
+  # The original target was ``nth_next(style=subtask, offset=1)``. On
+  # most frames this resolves to the next span's text (fine), but on
+  # the LAST span of an episode it resolves to empty/None. The recipe
+  # had no ``if_present`` guard on the target,
+  # so the renderer emitted an empty assistant turn and cross-entropy
+  # ended up supervising the chat-template's structural newlines.
+  # Across a dataset annotated this way, the LM head's argmax at
+  # position 0 collapses to ``\n`` whenever no transition is happening
+  # (which is most of the time). At inference: head silently emits
+  # newlines every chunk boundary while the action expert keeps working.
+  #
+  # With ``${subtask}`` (binds to ``active_at(t, style=subtask)``) the
+  # target is the current span's text: scene-grounded and, given the
+  # ``if_present: subtask`` guard, never empty. The runtime detects
+  # transitions by comparing the predicted subtask string to the last
+  # known one, the same way Pi 0.5 does. No information loss.
   high_level_subtask:
     weight: 0.15
-    bindings:
-      next_subtask: "nth_next(style=subtask, offset=1)"
     messages:
       - {role: user, content: "${task}\nPlan: ${plan}\nMemory: ${memory}", stream: high_level}
-      - {role: user, content: "Current subtask: ${subtask}", stream: high_level, if_present: subtask}
-      - {role: assistant, content: "${next_subtask}", stream: high_level, target: true}
+      - {role: assistant, content: "${subtask}", stream: high_level, target: true, if_present: subtask}
 
   low_level_execution:
     weight: 0.35
diff --git a/src/lerobot/policies/smolvla2/inference/steps.py b/src/lerobot/policies/smolvla2/inference/steps.py
index 5fc04bcdb..b35897552 100644
--- a/src/lerobot/policies/smolvla2/inference/steps.py
+++ b/src/lerobot/policies/smolvla2/inference/steps.py
@@ -717,20 +717,26 @@ def _control_context_messages(
 
 
 def _msgs_for_subtask(state: dict[str, Any]) -> list[dict[str, Any]]:
-    """``high_level_subtask`` recipe layout."""
+    """``high_level_subtask`` recipe layout (v2 — predict current subtask).
+
+    The training-time recipe was changed to supervise the model on the
+    *current* active subtask span at every frame, not the next-span text
+    only at transitions. So the inference-time prompt no longer feeds a
+    "Current subtask: X" user message — that would be circular (we'd be
+    telling the model the answer). The model now decides the subtask
+    purely from the task + plan + memory context plus the visual prefix.
+
+    Transition detection moves into the runtime: when the predicted
+    subtask differs from ``state['current_subtask']``, the runtime fires
+    the ``subtask_change`` event so memory updates. Same downstream
+    signal as before, now backed by a non-empty supervision target.
+    """
     head_parts = [state.get("task") or ""]
     if state.get("current_plan"):
         head_parts.append(f"Plan: {state['current_plan']}")
     if state.get("current_memory"):
         head_parts.append(f"Memory: {state['current_memory']}")
-    msgs: list[dict[str, Any]] = [
-        {"role": "user", "content": "\n".join(head_parts)}
-    ]
-    if state.get("current_subtask"):
-        msgs.append(
-            {"role": "user", "content": f"Current subtask: {state['current_subtask']}"}
-        )
-    return msgs
+    return [{"role": "user", "content": "\n".join(head_parts)}]
 
 
 def _msgs_for_memory(state: dict[str, Any]) -> list[dict[str, Any]]: