From 2c920ab1781f644a4d8cc6da5444e311ea097b4a Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Wed, 13 May 2026 15:16:28 +0200
Subject: [PATCH] refactor(recipes): consolidate to shared hirobot.yaml + audit
 fixes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The smolvla2 and pi052 recipe blends had drifted to identical content
twice in a row; collapse them to a single ``recipes/hirobot.yaml``
both policies point at. Each backbone's text tokenizer (chat-template
for SmolVLA2, plain ``Role: content`` for PI052) handles the
rendering differences downstream — the recipe spec is shared.

Audit fixes folded into the same commit:

* **Train/inference prefix mismatch on the action expert**
  ``_build_text_batch`` always passed ``add_generation_prompt=True``,
  appending ``<|im_start|>assistant\n`` tokens that the action
  expert never saw at training (the chat tokenizer renders with
  ``add_generation_prompt=False``). Parameterized the helper and
  pass ``False`` from ``LowLevelForward``; ``select_message`` paths
  still default to ``True`` for AR text generation.

* **PI052 fallthrough could silently train flow on text-only frames**
  When text training was disabled (``text_loss_weight<=0`` or no
  ``text_labels``) AND every sample was high-level
  (``predict_actions.any()==False``), the previous heuristic
  delegated to ``PI05Policy.forward``, which ignores
  ``predict_actions`` and runs flow on every sample. Reverted to
  delegating only on fully unannotated batches (no ``text_labels``
  and no ``predict_actions``).

* **SmolVLA2 silent zero-loss training**
  ``forward`` returned ``loss=0`` (no error) when neither flow nor
  text path fired. Now raises ``RuntimeError`` with the weights and
  routing flags — fails loud like PI052 already does.

* **PI052 dropout-seed key**
  Was reading ``complementary["dataset_index"]`` (only set by
  ``MultiDataset`` and means "which sub-dataset", not row index)
  with fallback to ``frame_index`` (never set) — every sample got
  seed=0, so the per-component dropout pattern was identical for
  every sample across the epoch. Switched to
  ``complementary["index"]`` to match SmolVLA2 and the canonical
  ``BatchProcessor`` convention.

* **Dead ``DEFAULT_TOOLS`` import**
  Removed from ``chat_processor_smolvla2.py`` — unused since the
  default-tools list was switched to ``[]`` in the prior commit.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../{smolvla2_hirobot.yaml => hirobot.yaml}   | 39 ++++++-----
 .../configs/recipes/pi052_hirobot.yaml        | 65 -------------------
 src/lerobot/policies/pi052/__init__.py        |  2 +-
 .../policies/pi052/configuration_pi052.py     |  2 +-
 src/lerobot/policies/pi052/modeling_pi052.py  | 25 +++----
 .../policies/pi052/text_processor_pi052.py    |  8 ++-
 .../smolvla2/chat_processor_smolvla2.py       |  3 +-
 .../smolvla2/configuration_smolvla2.py        |  2 +-
 .../policies/smolvla2/inference/steps.py      | 23 +++++--
 .../policies/smolvla2/modeling_smolvla2.py    | 15 +++++
 .../policies/smolvla2/processor_smolvla2.py   |  2 +-
 11 files changed, 71 insertions(+), 115 deletions(-)
 rename src/lerobot/configs/recipes/{smolvla2_hirobot.yaml => hirobot.yaml} (64%)
 delete mode 100644 src/lerobot/configs/recipes/pi052_hirobot.yaml

diff --git a/src/lerobot/configs/recipes/smolvla2_hirobot.yaml b/src/lerobot/configs/recipes/hirobot.yaml
similarity index 64%
rename from src/lerobot/configs/recipes/smolvla2_hirobot.yaml
rename to src/lerobot/configs/recipes/hirobot.yaml
index ffbb6b92b..8eb21cc3c 100644
--- a/src/lerobot/configs/recipes/smolvla2_hirobot.yaml
+++ b/src/lerobot/configs/recipes/hirobot.yaml
@@ -1,18 +1,19 @@
-# SmolVLA2 Hi-Robot blend — π0.5-style split:
+# Hi-Robot blend — shared between SmolVLA2 (SmolVLM2 backbone) and
+# PI052 (PaliGemma backbone). π0.5-style split:
 #
-#   The action expert is conditioned on (images, state, subtask)
-#   only — NOT on task / plan / memory. We achieve this by splitting
-#   the work across two main sub-recipes:
+#   The action expert is conditioned on (images, state, subtask) only.
+#   Hierarchical context (task + plan + memory) only flows into the
+#   high-level text head.
 #
-#   1. high_level_subtask  — text-only. Trains the LM head to predict
-#      the current subtask from (task + plan + memory). At a memory
-#      boundary, also predicts the new memory in the same forward.
-#   2. low_level_execution — action. Renders just the subtask as the
-#      language conditioning so the action expert's prefix is
-#      [images, subtask, state]. Flow loss + (redundant) text CE on
-#      the subtask itself.
-#   3. plan_generation     — text only. task → plan.
-#   4. ask_vqa_{top,wrist} — text only. camera-grounded VQA.
+#     high_level_subtask  — predict subtask from (task+plan+memory),
+#                           and the new memory at boundary frames.
+#     low_level_execution — flow loss with [images, subtask, state].
+#     plan_generation     — task → plan.
+#     ask_vqa_{top,wrist} — camera-grounded VQA.
+#
+# Each backbone's text tokenizer renders these messages differently
+# (SmolVLA2 uses the chat template; PI052 concatenates as plain
+# ``Role: content`` text), but the recipe spec is identical.
 
 blend:
 
@@ -32,13 +33,11 @@ blend:
   low_level_execution:
     weight: 0.30
     messages:
-      # π0.5-style action conditioning: the action expert sees just
-      # the subtask (plus images + state). No text-CE target here —
-      # ``high_level_subtask`` (w=0.50) already trains subtask
-      # prediction from real context; supervising it again as a
-      # copy-from-user turn would dilute the LM head. ``stream:
-      # low_level`` on either turn is enough to flip
-      # ``predict_actions=True`` so the flow loss fires.
+      # π0.5-style action conditioning. The action expert sees only
+      # [images, this user turn (= bare subtask), state]. No text-CE
+      # target — subtask prediction is owned by ``high_level_subtask``.
+      # ``stream: low_level`` flips ``predict_actions=True`` so the
+      # flow loss fires.
       - {role: user, content: "${subtask}", stream: low_level, if_present: subtask}
 
   plan_generation:
diff --git a/src/lerobot/configs/recipes/pi052_hirobot.yaml b/src/lerobot/configs/recipes/pi052_hirobot.yaml
deleted file mode 100644
index 0aa19c72f..000000000
--- a/src/lerobot/configs/recipes/pi052_hirobot.yaml
+++ /dev/null
@@ -1,65 +0,0 @@
-# π0.5 v2 (pi052) Hi-Robot blend.
-#
-# Same shape as ``smolvla2_hirobot.yaml`` — see that file for the
-# flavor breakdown. The only difference here is the backbone:
-# PaliGemma isn't chat-pretrained, so ``PI052TextTokenizerStep``
-# concatenates messages as ``Role: content`` plain text instead
-# of calling ``apply_chat_template``.
-
-blend:
-
-  high_level_subtask:
-    weight: 0.50
-    bindings:
-      new_memory: "emitted_at(t, style=memory)"
-    messages:
-      - role: user
-        stream: high_level
-        content: "${task}\nPlan: ${plan}\nMemory: ${memory}"
-      - {role: assistant, content: "${subtask}", stream: high_level, target: true, if_present: subtask}
-      - {role: assistant, content: "${new_memory}", stream: high_level, target: true, if_present: new_memory}
-
-  low_level_execution:
-    weight: 0.30
-    messages:
-      # Action expert prefix = [images, subtask, state] only — π0.5 style.
-      # No text-CE target: ``high_level_subtask`` already supervises
-      # subtask prediction from real context. ``stream: low_level``
-      # flips ``predict_actions=True`` so the flow loss fires.
-      - {role: user, content: "${subtask}", stream: low_level, if_present: subtask}
-
-  plan_generation:
-    weight: 0.10
-    bindings:
-      current_plan: "active_at(t, style=plan)"
-    messages:
-      - {role: user, content: "${task}", stream: high_level}
-      - {role: assistant, content: "${current_plan}", stream: high_level, target: true, if_present: current_plan}
-
-  ask_vqa_top:
-    weight: 0.05
-    bindings:
-      vqa_query: "emitted_at(t, style=vqa, role=user, camera=observation.images.front)"
-      vqa: "emitted_at(t, style=vqa, role=assistant, camera=observation.images.front)"
-    messages:
-      - role: user
-        stream: high_level
-        if_present: vqa_query
-        content:
-          - {type: image, feature: observation.images.front}
-          - {type: text, text: "${vqa_query}"}
-      - {role: assistant, content: "${vqa}", stream: high_level, target: true, if_present: vqa}
-
-  ask_vqa_wrist:
-    weight: 0.05
-    bindings:
-      vqa_query: "emitted_at(t, style=vqa, role=user, camera=observation.images.wrist)"
-      vqa: "emitted_at(t, style=vqa, role=assistant, camera=observation.images.wrist)"
-    messages:
-      - role: user
-        stream: high_level
-        if_present: vqa_query
-        content:
-          - {type: image, feature: observation.images.wrist}
-          - {type: text, text: "${vqa_query}"}
-      - {role: assistant, content: "${vqa}", stream: high_level, target: true, if_present: vqa}
diff --git a/src/lerobot/policies/pi052/__init__.py b/src/lerobot/policies/pi052/__init__.py
index 3e4c42f1c..d94e17007 100644
--- a/src/lerobot/policies/pi052/__init__.py
+++ b/src/lerobot/policies/pi052/__init__.py
@@ -24,7 +24,7 @@ Extends :class:`lerobot.policies.pi05.PI05Policy` with:
 * per-component prompt dropout (Pi 0.7 §V.E) for regularising the
   text head against missing context at inference.
 
-See ``src/lerobot/configs/recipes/pi052_hirobot.yaml`` for the
+See ``src/lerobot/configs/recipes/hirobot.yaml`` for the
 canonical training recipe and
 ``examples/training/pi052_hirobot.slurm`` for the launcher.
 """
diff --git a/src/lerobot/policies/pi052/configuration_pi052.py b/src/lerobot/policies/pi052/configuration_pi052.py
index 3c4e73897..32bb46810 100644
--- a/src/lerobot/policies/pi052/configuration_pi052.py
+++ b/src/lerobot/policies/pi052/configuration_pi052.py
@@ -57,7 +57,7 @@ class PI052Config(PI05Config):
     """
 
     # Recipe / language stack ---------------------------------------------
-    recipe_path: str | None = "recipes/pi052_hirobot.yaml"
+    recipe_path: str | None = "recipes/hirobot.yaml"
     """Path (absolute or relative to ``src/lerobot/configs/``) to a
     ``TrainingRecipe`` YAML. Defaults to the canonical Hi-Robot blend
     shipped alongside this policy. Set to ``None`` to disable recipe
diff --git a/src/lerobot/policies/pi052/modeling_pi052.py b/src/lerobot/policies/pi052/modeling_pi052.py
index 9553fd89a..34b07168a 100644
--- a/src/lerobot/policies/pi052/modeling_pi052.py
+++ b/src/lerobot/policies/pi052/modeling_pi052.py
@@ -366,26 +366,17 @@ class PI052Policy(PI05Policy):
         text_labels = batch.get("text_labels")
         predict_actions_t = batch.get("predict_actions")
 
-        # Unannotated datasets / batches with nothing to train: fall
-        # through to PI05Policy so the plain flow-only training surface
-        # keeps working. Triggers when:
-        #   * the recipe wasn't applied (no text_labels, no
-        #     predict_actions), OR
-        #   * every sample's recipe is text-only AND text is disabled
-        #     (would otherwise hit the "nothing to train" raise below).
-        text_disabled = (
-            self.config.text_loss_weight <= 0 or text_labels is None
-        )
-        fast_disabled = not getattr(self.config, "enable_fast_action_loss", False)
-        no_flow_samples = (
-            predict_actions_t is not None
-            and not bool(predict_actions_t.any().item())
-        )
+        # Fall through to PI05Policy only on fully unannotated batches
+        # (no recipe applied → no routing fields). For recipe-applied
+        # batches we keep control of the loss dispatch even if all
+        # samples are text-only — delegating would silently train flow
+        # on text-only frames (PI05Policy.forward ignores
+        # ``predict_actions``).
         if (
             text_labels is None
             and predict_actions_t is None
-            and fast_disabled
-        ) or (text_disabled and no_flow_samples and fast_disabled):
+            and not getattr(self.config, "enable_fast_action_loss", False)
+        ):
             return super().forward(batch, reduction=reduction)
 
         run_flow = (
diff --git a/src/lerobot/policies/pi052/text_processor_pi052.py b/src/lerobot/policies/pi052/text_processor_pi052.py
index 649e67b90..1dcedbfc1 100644
--- a/src/lerobot/policies/pi052/text_processor_pi052.py
+++ b/src/lerobot/policies/pi052/text_processor_pi052.py
@@ -252,8 +252,14 @@ class PI052TextTokenizerStep(ProcessorStep):
 
         seed = self.dropout_seed
         if seed is None:
-            seed_src = complementary.get("dataset_index") or complementary.get("frame_index") or 0
+            # Canonical row-index key set by ``BatchProcessor`` /
+            # ``render_messages_processor``. Falling back to other
+            # keys silently gave every sample seed=0 → identical
+            # dropout pattern across the whole epoch.
+            seed_src = complementary.get("index", 0)
             try:
+                if hasattr(seed_src, "item"):
+                    seed_src = seed_src.item()
                 seed = int(seed_src)
             except (TypeError, ValueError):
                 seed = 0
diff --git a/src/lerobot/policies/smolvla2/chat_processor_smolvla2.py b/src/lerobot/policies/smolvla2/chat_processor_smolvla2.py
index 454a1c2d8..1cf88b0fd 100644
--- a/src/lerobot/policies/smolvla2/chat_processor_smolvla2.py
+++ b/src/lerobot/policies/smolvla2/chat_processor_smolvla2.py
@@ -45,7 +45,6 @@ from typing import Any
 import torch
 
 from lerobot.configs import PipelineFeatureType, PolicyFeature
-from lerobot.datasets.language import DEFAULT_TOOLS
 from lerobot.processor.pipeline import ProcessorStep, ProcessorStepRegistry
 from lerobot.types import EnvTransition, TransitionKey
 from lerobot.utils.constants import OBS_LANGUAGE_ATTENTION_MASK, OBS_LANGUAGE_TOKENS
@@ -283,7 +282,7 @@ class SmolVLA2ChatTokenizerStep(ProcessorStep):
         """Probabilistically drop non-target context messages.
 
         Heuristic content sniffing — matches the prefix strings that
-        ``smolvla2_hirobot.yaml``'s recipes use when injecting plan /
+        ``hirobot.yaml``'s recipes use when injecting plan /
         memory / subtask / interjection content. Anything else is
         kept unchanged. Target messages are never dropped (we still
         need their tokens for supervision).
diff --git a/src/lerobot/policies/smolvla2/configuration_smolvla2.py b/src/lerobot/policies/smolvla2/configuration_smolvla2.py
index bc24139fd..8b7b1e5e8 100644
--- a/src/lerobot/policies/smolvla2/configuration_smolvla2.py
+++ b/src/lerobot/policies/smolvla2/configuration_smolvla2.py
@@ -56,7 +56,7 @@ class SmolVLA2Config(SmolVLAConfig):
     """
 
     # Recipe / language stack ---------------------------------------------
-    recipe_path: str | None = "recipes/smolvla2_hirobot.yaml"
+    recipe_path: str | None = "recipes/hirobot.yaml"
     """Path (absolute or relative to ``src/lerobot/configs/``) to a
     ``TrainingRecipe`` YAML. The default points at the canonical Hi Robot
     blend shipped alongside SmolVLA2. Set to ``None`` to disable recipe
diff --git a/src/lerobot/policies/smolvla2/inference/steps.py b/src/lerobot/policies/smolvla2/inference/steps.py
index e638ca636..1d7a28853 100644
--- a/src/lerobot/policies/smolvla2/inference/steps.py
+++ b/src/lerobot/policies/smolvla2/inference/steps.py
@@ -17,7 +17,7 @@ Each step is a tiny class with a ``trigger`` and an ``__call__(state)``;
 the runtime applies them in order each tick. When a step's trigger
 doesn't fire, the step is a no-op and the runtime moves on.
 
-Stream-to-step mapping mirrors the ``smolvla2_hirobot.yaml`` recipe:
+Stream-to-step mapping mirrors the ``hirobot.yaml`` recipe:
 
 * ``LowLevelForward``        — calls ``policy.select_action`` for the
                                 action chunk; trained by
@@ -120,7 +120,13 @@ class LowLevelForward(InferenceStep):
         # high-level recipe).
         subtask = state.get("current_subtask") or state.get("task") or ""
         ctx = [{"role": "user", "content": subtask}]
-        text_batch = _build_text_batch(self.policy, ctx)
+        # ``add_generation_prompt=False`` to match the training-time
+        # prefix shape: at training the action expert sees the rendered
+        # user turn ending at ``<|im_end|>`` (no trailing
+        # ``<|im_start|>assistant\n``). Passing True here would append
+        # extra role-marker tokens the action expert never saw during
+        # training.
+        text_batch = _build_text_batch(self.policy, ctx, add_generation_prompt=False)
         from lerobot.utils.constants import (  # noqa: PLC0415
             OBS_LANGUAGE_ATTENTION_MASK,
             OBS_LANGUAGE_TOKENS,
@@ -232,7 +238,12 @@ class DispatchAction(InferenceStep):
 # ---------------------------------------------------------------------------
 
 
-def _build_text_batch(policy: Any, prompt_messages: list[dict[str, Any]]) -> dict[str, Any]:
+def _build_text_batch(
+    policy: Any,
+    prompt_messages: list[dict[str, Any]],
+    *,
+    add_generation_prompt: bool = True,
+) -> dict[str, Any]:
     """Tokenize a list of chat messages into the batch shape
     ``select_message`` expects.
 
@@ -263,7 +274,7 @@ def _build_text_batch(policy: Any, prompt_messages: list[dict[str, Any]]) -> dic
     text_messages = [_strip_lerobot_blocks(m) for m in prompt_messages]
     encoded = tokenizer.apply_chat_template(
         text_messages,
-        add_generation_prompt=True,
+        add_generation_prompt=add_generation_prompt,
         tokenize=True,
         return_tensors="pt",
     )
@@ -690,7 +701,7 @@ def _control_context_messages(
 ) -> list[dict[str, Any]]:
     """Build a chat-template-ready prompt from current runtime state.
 
-    Mirrors what ``smolvla2_hirobot.yaml`` renders into ``${task}\nPlan:
+    Mirrors what ``hirobot.yaml`` renders into ``${task}\nPlan:
     ${plan}\nMemory: ${memory}`` for the high-level branches.
     """
     parts: list[str] = []
@@ -711,7 +722,7 @@ def _control_context_messages(
 
 # ---------------------------------------------------------------------------
 # Per-recipe prompt builders. Each one mirrors a single sub-recipe's
-# message layout in ``smolvla2_hirobot.yaml`` so the chat-templated
+# message layout in ``hirobot.yaml`` so the chat-templated
 # prompt at inference matches what the model saw during training.
 # Generic ``_control_context_messages`` is kept around as a fallback
 # for ad-hoc callers but the four high-level steps now use these.
diff --git a/src/lerobot/policies/smolvla2/modeling_smolvla2.py b/src/lerobot/policies/smolvla2/modeling_smolvla2.py
index 7c1bcc9b9..557f1f72b 100644
--- a/src/lerobot/policies/smolvla2/modeling_smolvla2.py
+++ b/src/lerobot/policies/smolvla2/modeling_smolvla2.py
@@ -246,6 +246,21 @@ class SmolVLA2Policy(SmolVLAPolicy):
             text_loss = self._compute_text_loss(batch, text_labels)
             total = total + self.config.text_loss_weight * text_loss
             loss_dict["text_loss"] = float(text_loss.detach().item())
+        else:
+            # No path fired — happens when both loss weights are 0 or
+            # the batch has neither action samples nor supervised text.
+            # Fail loud rather than train silently on a zero loss.
+            raise RuntimeError(
+                "SmolVLA2Policy.forward: nothing to train — "
+                "flow_loss_weight=%s, text_loss_weight=%s, "
+                "predict_actions.any()=%s, has_text_data=%s"
+                % (
+                    self.config.flow_loss_weight,
+                    self.config.text_loss_weight,
+                    bool(predict_actions_t.any().item()) if has_per_sample_routing else None,
+                    has_text_data,
+                )
+            )
 
         loss_dict["loss"] = float(total.detach().item())
 
diff --git a/src/lerobot/policies/smolvla2/processor_smolvla2.py b/src/lerobot/policies/smolvla2/processor_smolvla2.py
index 93cbd0252..a76608502 100644
--- a/src/lerobot/policies/smolvla2/processor_smolvla2.py
+++ b/src/lerobot/policies/smolvla2/processor_smolvla2.py
@@ -121,7 +121,7 @@ def _load_recipe(path_str: str) -> TrainingRecipe:
 
     Accepts an absolute path or a path relative to
     ``src/lerobot/configs/`` so recipe authors can write
-    ``--policy.recipe_path=recipes/smolvla2_hirobot.yaml``.
+    ``--policy.recipe_path=recipes/hirobot.yaml``.
     """
     p = Path(path_str)
     if not p.is_absolute() and not p.exists():