From 83d7250a22ea6c1566db476cef7aa02914d25c73 Mon Sep 17 00:00:00 2001 From: Pepijn Date: Wed, 13 May 2026 12:22:45 +0200 Subject: [PATCH] fix(recipes): low_level_execution needs if_present:subtask guard too MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Same bug we fixed for high_level_subtask, just on the other subtask-supervised sub-recipe. ``low_level_execution`` targets ``${subtask}`` (the current active span) but had no ``if_present`` guard. When ``active_at(t, style=subtask)`` returned None at a frame (gaps in the annotation, or the very first/last frames of an episode if the annotator's spans don't fully tile it), the assistant message rendered with empty content. The chat tokenizer still included it in ``target_message_indices``, so text-CE supervised whatever the chat-template's empty assistant turn decoded to (usually a single ``\n``). That trains the LM head's prior at the first generation position toward ``\n`` — the same collapse we observed with the original ``${next_subtask}`` target. Fix: ``if_present: subtask`` on the assistant target in ``low_level_execution`` for both ``smolvla2_hirobot.yaml`` and ``pi052_hirobot.yaml``. Side effect: frames without an active subtask span no longer contribute to the flow loss either (the only ``low_level`` target is skipped, so ``predict_actions = bool(targets_by_stream.get("low_level"))`` becomes False). For a well-annotated dataset where subtask spans tile the whole episode this is a no-op. For datasets with gaps, those gap frames lose flow supervision — strictly better than the degenerate text-CE alternative. 
Sub-recipe audit summary (no other changes needed): * memory_update — all if_present guards present, OK * user_interjection_response — all if_present guards present, OK * high_level_subtask — fixed earlier, OK * low_level_execution — fixed by this commit * ask_vqa_top / ask_vqa_wrist — query+answer both guarded, OK Co-Authored-By: Claude Opus 4.7 (1M context) --- .../configs/recipes/pi052_hirobot.yaml | 6 +++++- .../configs/recipes/smolvla2_hirobot.yaml | 19 ++++++++++++++++++- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/src/lerobot/configs/recipes/pi052_hirobot.yaml b/src/lerobot/configs/recipes/pi052_hirobot.yaml index f0c8982d6..b5a410712 100644 --- a/src/lerobot/configs/recipes/pi052_hirobot.yaml +++ b/src/lerobot/configs/recipes/pi052_hirobot.yaml @@ -57,11 +57,15 @@ blend: - {role: user, content: "${task}\nPlan: ${plan}\nMemory: ${memory}", stream: high_level} - {role: assistant, content: "${subtask}", stream: high_level, target: true, if_present: subtask} + # Same ``if_present: subtask`` guard as high_level_subtask above — + # see smolvla2_hirobot.yaml for the full rationale. Skips the + # action-loss supervision on frames without an active subtask span + # rather than emitting a degenerate empty target. 
low_level_execution: weight: 0.35 messages: - {role: user, content: "${task}\nPlan: ${plan}\nMemory: ${memory}", stream: high_level} - - {role: assistant, content: "${subtask}", stream: low_level, target: true} + - {role: assistant, content: "${subtask}", stream: low_level, target: true, if_present: subtask} ask_vqa_top: weight: 0.10 diff --git a/src/lerobot/configs/recipes/smolvla2_hirobot.yaml b/src/lerobot/configs/recipes/smolvla2_hirobot.yaml index ce5cdd3a9..ad7eef465 100644 --- a/src/lerobot/configs/recipes/smolvla2_hirobot.yaml +++ b/src/lerobot/configs/recipes/smolvla2_hirobot.yaml @@ -69,11 +69,28 @@ blend: - {role: user, content: "${task}\nPlan: ${plan}\nMemory: ${memory}", stream: high_level} - {role: assistant, content: "${subtask}", stream: high_level, target: true, if_present: subtask} + # PR3 fix: same ``if_present: subtask`` guard as high_level_subtask + # above. Without it, frames where ``active_at(t, style=subtask)`` + # returns None render the assistant turn with empty content, which + # the chat tokenizer still includes in target_message_indices → + # text-CE supervises predicting ``\n`` (the chat template's + # structural newline) and the LM head collapses to that prior. + # The same bug we fixed for high_level_subtask, just on a + # different sub-recipe. + # + # Trade-off of adding the guard: frames without an active subtask + # span no longer contribute to the flow loss either (because + # ``predict_actions = bool(targets_by_stream.get("low_level"))`` + # and the only low_level target message is now skipped). For a + # well-annotated dataset where subtask spans tile the whole + # episode this is a no-op. For datasets with gaps, those gap + # frames lose flow supervision — which is strictly better than + # the degenerate alternative. 
low_level_execution: weight: 0.35 messages: - {role: user, content: "${task}\nPlan: ${plan}\nMemory: ${memory}", stream: high_level} - - {role: assistant, content: "${subtask}", stream: low_level, target: true} + - {role: assistant, content: "${subtask}", stream: low_level, target: true, if_present: subtask} # Per-camera VQA sub-recipes (PR 1's view-dependent style routing). # Adjust the camera keys (and add more sub-recipes) to match the