From 2c920ab1781f644a4d8cc6da5444e311ea097b4a Mon Sep 17 00:00:00 2001 From: Pepijn Date: Wed, 13 May 2026 15:16:28 +0200 Subject: [PATCH] refactor(recipes): consolidate to shared hirobot.yaml + audit fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The smolvla2 and pi052 recipe blends had drifted to identical content twice in a row; collapse them to a single ``recipes/hirobot.yaml`` both policies point at. Each backbone's text tokenizer (chat-template for SmolVLA2, plain ``Role: content`` for PI052) handles the rendering differences downstream — the recipe spec is shared. Audit fixes folded into the same commit: * **Train/inference prefix mismatch on the action expert** ``_build_text_batch`` always passed ``add_generation_prompt=True``, appending ``<|im_start|>assistant\\n`` tokens that the action expert never saw at training (the chat tokenizer renders with ``add_generation_prompt=False``). Parameterized the helper and pass ``False`` from ``LowLevelForward``; ``select_message`` paths still default to ``True`` for AR text generation. * **PI052 fallthrough could silently train flow on text-only frames** When ``text_loss_weight=0`` AND every sample was high-level (``predict_actions.any()==False``), the previous heuristic delegated to ``PI05Policy.forward``, which ignores ``predict_actions`` and runs flow on every sample. Reverted to delegating only on fully unannotated batches. * **SmolVLA2 silent zero-loss training** ``forward`` returned ``loss=0`` (no error) when neither flow nor text path fired. Now raises ``RuntimeError`` with the weights and routing flags — fails loud like PI052 already does. * **PI052 dropout-seed key** Was reading ``complementary["dataset_index"]`` (only set by ``MultiDataset`` and means "which sub-dataset", not row index) with fallback to ``frame_index`` (never set) — every sample got seed=0, so per-component dropout was deterministic across the epoch. Switched to ``complementary["index"]`` to match SmolVLA2 and the canonical ``BatchProcessor`` convention. * **Dead ``DEFAULT_TOOLS`` import** Removed from ``chat_processor_smolvla2.py`` — unused since the default-tools list was switched to ``[]`` in the prior commit. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../{smolvla2_hirobot.yaml => hirobot.yaml} | 39 ++++++----- .../configs/recipes/pi052_hirobot.yaml | 65 ------------------- src/lerobot/policies/pi052/__init__.py | 2 +- .../policies/pi052/configuration_pi052.py | 2 +- src/lerobot/policies/pi052/modeling_pi052.py | 25 +++---- .../policies/pi052/text_processor_pi052.py | 8 ++- .../smolvla2/chat_processor_smolvla2.py | 3 +- .../smolvla2/configuration_smolvla2.py | 2 +- .../policies/smolvla2/inference/steps.py | 23 +++++-- .../policies/smolvla2/modeling_smolvla2.py | 15 +++++ .../policies/smolvla2/processor_smolvla2.py | 2 +- 11 files changed, 71 insertions(+), 115 deletions(-) rename src/lerobot/configs/recipes/{smolvla2_hirobot.yaml => hirobot.yaml} (64%) delete mode 100644 src/lerobot/configs/recipes/pi052_hirobot.yaml diff --git a/src/lerobot/configs/recipes/smolvla2_hirobot.yaml b/src/lerobot/configs/recipes/hirobot.yaml similarity index 64% rename from src/lerobot/configs/recipes/smolvla2_hirobot.yaml rename to src/lerobot/configs/recipes/hirobot.yaml index ffbb6b92b..8eb21cc3c 100644 --- a/src/lerobot/configs/recipes/smolvla2_hirobot.yaml +++ b/src/lerobot/configs/recipes/hirobot.yaml @@ -1,18 +1,19 @@ -# SmolVLA2 Hi-Robot blend — π0.5-style split: +# Hi-Robot blend — shared between SmolVLA2 (SmolVLM2 backbone) and +# PI052 (PaliGemma backbone). π0.5-style split: # -# The action expert is conditioned on (images, state, subtask) -# only — NOT on task / plan / memory. We achieve this by splitting -# the work across two main sub-recipes: +# The action expert is conditioned on (images, state, subtask) only. +# Hierarchical context (task + plan + memory) only flows into the +# high-level text head. # -# 1. high_level_subtask — text-only. Trains the LM head to predict -# the current subtask from (task + plan + memory). At a memory -# boundary, also predicts the new memory in the same forward. -# 2. low_level_execution — action. Renders just the subtask as the -# language conditioning so the action expert's prefix is -# [images, subtask, state]. Flow loss + (redundant) text CE on -# the subtask itself. -# 3. plan_generation — text only. task → plan. -# 4. ask_vqa_{top,wrist} — text only. camera-grounded VQA. +# high_level_subtask — predict subtask from (task+plan+memory), +# and the new memory at boundary frames. +# low_level_execution — flow loss with [images, subtask, state]. +# plan_generation — task → plan. +# ask_vqa_{top,wrist} — camera-grounded VQA. +# +# Each backbone's text tokenizer renders these messages differently +# (SmolVLA2 uses the chat template; PI052 concatenates as plain +# ``Role: content`` text), but the recipe spec is identical. blend: @@ -32,13 +33,11 @@ blend: low_level_execution: weight: 0.30 messages: - # π0.5-style action conditioning: the action expert sees just - # the subtask (plus images + state). No text-CE target here — - # ``high_level_subtask`` (w=0.50) already trains subtask - # prediction from real context; supervising it again as a - # copy-from-user turn would dilute the LM head. ``stream: - # low_level`` on either turn is enough to flip - # ``predict_actions=True`` so the flow loss fires. + # π0.5-style action conditioning. The action expert sees only + # [images, this user turn (= bare subtask), state]. No text-CE + # target — subtask prediction is owned by ``high_level_subtask``. + # ``stream: low_level`` flips ``predict_actions=True`` so the + # flow loss fires. - {role: user, content: "${subtask}", stream: low_level, if_present: subtask} plan_generation: diff --git a/src/lerobot/configs/recipes/pi052_hirobot.yaml b/src/lerobot/configs/recipes/pi052_hirobot.yaml deleted file mode 100644 index 0aa19c72f..000000000 --- a/src/lerobot/configs/recipes/pi052_hirobot.yaml +++ /dev/null @@ -1,65 +0,0 @@ -# π0.5 v2 (pi052) Hi-Robot blend. -# -# Same shape as ``smolvla2_hirobot.yaml`` — see that file for the -# flavor breakdown. The only difference here is the backbone: -# PaliGemma isn't chat-pretrained, so ``PI052TextTokenizerStep`` -# concatenates messages as ``Role: content`` plain text instead -# of calling ``apply_chat_template``. - -blend: - - high_level_subtask: - weight: 0.50 - bindings: - new_memory: "emitted_at(t, style=memory)" - messages: - - role: user - stream: high_level - content: "${task}\nPlan: ${plan}\nMemory: ${memory}" - - {role: assistant, content: "${subtask}", stream: high_level, target: true, if_present: subtask} - - {role: assistant, content: "${new_memory}", stream: high_level, target: true, if_present: new_memory} - - low_level_execution: - weight: 0.30 - messages: - # Action expert prefix = [images, subtask, state] only — π0.5 style. - # No text-CE target: ``high_level_subtask`` already supervises - # subtask prediction from real context. ``stream: low_level`` - # flips ``predict_actions=True`` so the flow loss fires. - - {role: user, content: "${subtask}", stream: low_level, if_present: subtask} - - plan_generation: - weight: 0.10 - bindings: - current_plan: "active_at(t, style=plan)" - messages: - - {role: user, content: "${task}", stream: high_level} - - {role: assistant, content: "${current_plan}", stream: high_level, target: true, if_present: current_plan} - - ask_vqa_top: - weight: 0.05 - bindings: - vqa_query: "emitted_at(t, style=vqa, role=user, camera=observation.images.front)" - vqa: "emitted_at(t, style=vqa, role=assistant, camera=observation.images.front)" - messages: - - role: user - stream: high_level - if_present: vqa_query - content: - - {type: image, feature: observation.images.front} - - {type: text, text: "${vqa_query}"} - - {role: assistant, content: "${vqa}", stream: high_level, target: true, if_present: vqa} - - ask_vqa_wrist: - weight: 0.05 - bindings: - vqa_query: "emitted_at(t, style=vqa, role=user, camera=observation.images.wrist)" - vqa: "emitted_at(t, style=vqa, role=assistant, camera=observation.images.wrist)" - messages: - - role: user - stream: high_level - if_present: vqa_query - content: - - {type: image, feature: observation.images.wrist} - - {type: text, text: "${vqa_query}"} - - {role: assistant, content: "${vqa}", stream: high_level, target: true, if_present: vqa} diff --git a/src/lerobot/policies/pi052/__init__.py b/src/lerobot/policies/pi052/__init__.py index 3e4c42f1c..d94e17007 100644 --- a/src/lerobot/policies/pi052/__init__.py +++ b/src/lerobot/policies/pi052/__init__.py @@ -24,7 +24,7 @@ Extends :class:`lerobot.policies.pi05.PI05Policy` with: * per-component prompt dropout (Pi 0.7 §V.E) for regularising the text head against missing context at inference. -See ``src/lerobot/configs/recipes/pi052_hirobot.yaml`` for the +See ``src/lerobot/configs/recipes/hirobot.yaml`` for the canonical training recipe and ``examples/training/pi052_hirobot.slurm`` for the launcher. """ diff --git a/src/lerobot/policies/pi052/configuration_pi052.py b/src/lerobot/policies/pi052/configuration_pi052.py index 3c4e73897..32bb46810 100644 --- a/src/lerobot/policies/pi052/configuration_pi052.py +++ b/src/lerobot/policies/pi052/configuration_pi052.py @@ -57,7 +57,7 @@ class PI052Config(PI05Config): """ # Recipe / language stack --------------------------------------------- - recipe_path: str | None = "recipes/pi052_hirobot.yaml" + recipe_path: str | None = "recipes/hirobot.yaml" """Path (absolute or relative to ``src/lerobot/configs/``) to a ``TrainingRecipe`` YAML. Defaults to the canonical Hi-Robot blend shipped alongside this policy. Set to ``None`` to disable recipe diff --git a/src/lerobot/policies/pi052/modeling_pi052.py b/src/lerobot/policies/pi052/modeling_pi052.py index 9553fd89a..34b07168a 100644 --- a/src/lerobot/policies/pi052/modeling_pi052.py +++ b/src/lerobot/policies/pi052/modeling_pi052.py @@ -366,26 +366,17 @@ class PI052Policy(PI05Policy): text_labels = batch.get("text_labels") predict_actions_t = batch.get("predict_actions") - # Unannotated datasets / batches with nothing to train: fall - # through to PI05Policy so the plain flow-only training surface - # keeps working. Triggers when: - # * the recipe wasn't applied (no text_labels, no - # predict_actions), OR - # * every sample's recipe is text-only AND text is disabled - # (would otherwise hit the "nothing to train" raise below). - text_disabled = ( - self.config.text_loss_weight <= 0 or text_labels is None - ) - fast_disabled = not getattr(self.config, "enable_fast_action_loss", False) - no_flow_samples = ( - predict_actions_t is not None - and not bool(predict_actions_t.any().item()) - ) + # Fall through to PI05Policy only on fully unannotated batches + # (no recipe applied → no routing fields). For recipe-applied + # batches we keep control of the loss dispatch even if all + # samples are text-only — delegating would silently train flow + # on text-only frames (PI05Policy.forward ignores + # ``predict_actions``). if ( text_labels is None and predict_actions_t is None - and fast_disabled - ) or (text_disabled and no_flow_samples and fast_disabled): + and not getattr(self.config, "enable_fast_action_loss", False) + ): return super().forward(batch, reduction=reduction) run_flow = ( diff --git a/src/lerobot/policies/pi052/text_processor_pi052.py b/src/lerobot/policies/pi052/text_processor_pi052.py index 649e67b90..1dcedbfc1 100644 --- a/src/lerobot/policies/pi052/text_processor_pi052.py +++ b/src/lerobot/policies/pi052/text_processor_pi052.py @@ -252,8 +252,14 @@ class PI052TextTokenizerStep(ProcessorStep): seed = self.dropout_seed if seed is None: - seed_src = complementary.get("dataset_index") or complementary.get("frame_index") or 0 + # Canonical row-index key set by ``BatchProcessor`` / + # ``render_messages_processor``. Falling back to other + # keys silently gave every sample seed=0 → identical + # dropout pattern across the whole epoch. + seed_src = complementary.get("index", 0) try: + if hasattr(seed_src, "item"): + seed_src = seed_src.item() seed = int(seed_src) except (TypeError, ValueError): seed = 0 diff --git a/src/lerobot/policies/smolvla2/chat_processor_smolvla2.py b/src/lerobot/policies/smolvla2/chat_processor_smolvla2.py index 454a1c2d8..1cf88b0fd 100644 --- a/src/lerobot/policies/smolvla2/chat_processor_smolvla2.py +++ b/src/lerobot/policies/smolvla2/chat_processor_smolvla2.py @@ -45,7 +45,6 @@ from typing import Any import torch from lerobot.configs import PipelineFeatureType, PolicyFeature -from lerobot.datasets.language import DEFAULT_TOOLS from lerobot.processor.pipeline import ProcessorStep, ProcessorStepRegistry from lerobot.types import EnvTransition, TransitionKey from lerobot.utils.constants import OBS_LANGUAGE_ATTENTION_MASK, OBS_LANGUAGE_TOKENS @@ -283,7 +282,7 @@ class SmolVLA2ChatTokenizerStep(ProcessorStep): """Probabilistically drop non-target context messages. Heuristic content sniffing — matches the prefix strings that - ``smolvla2_hirobot.yaml``'s recipes use when injecting plan / + ``hirobot.yaml``'s recipes use when injecting plan / memory / subtask / interjection content. Anything else is kept unchanged. Target messages are never dropped (we still need their tokens for supervision). diff --git a/src/lerobot/policies/smolvla2/configuration_smolvla2.py b/src/lerobot/policies/smolvla2/configuration_smolvla2.py index bc24139fd..8b7b1e5e8 100644 --- a/src/lerobot/policies/smolvla2/configuration_smolvla2.py +++ b/src/lerobot/policies/smolvla2/configuration_smolvla2.py @@ -56,7 +56,7 @@ class SmolVLA2Config(SmolVLAConfig): """ # Recipe / language stack --------------------------------------------- - recipe_path: str | None = "recipes/smolvla2_hirobot.yaml" + recipe_path: str | None = "recipes/hirobot.yaml" """Path (absolute or relative to ``src/lerobot/configs/``) to a ``TrainingRecipe`` YAML. The default points at the canonical Hi Robot blend shipped alongside SmolVLA2. Set to ``None`` to disable recipe diff --git a/src/lerobot/policies/smolvla2/inference/steps.py b/src/lerobot/policies/smolvla2/inference/steps.py index e638ca636..1d7a28853 100644 --- a/src/lerobot/policies/smolvla2/inference/steps.py +++ b/src/lerobot/policies/smolvla2/inference/steps.py @@ -17,7 +17,7 @@ Each step is a tiny class with a ``trigger`` and an ``__call__(state)``; the runtime applies them in order each tick. When a step's trigger doesn't fire, the step is a no-op and the runtime moves on. -Stream-to-step mapping mirrors the ``smolvla2_hirobot.yaml`` recipe: +Stream-to-step mapping mirrors the ``hirobot.yaml`` recipe: * ``LowLevelForward`` — calls ``policy.select_action`` for the action chunk; trained by @@ -120,7 +120,13 @@ class LowLevelForward(InferenceStep): # high-level recipe). subtask = state.get("current_subtask") or state.get("task") or "" ctx = [{"role": "user", "content": subtask}] - text_batch = _build_text_batch(self.policy, ctx) + # ``add_generation_prompt=False`` to match the training-time + # prefix shape: at training the action expert sees the rendered + # user turn ending at ``<|im_end|>`` (no trailing + # ``<|im_start|>assistant\n``). Passing True here would append + # extra role-marker tokens the action expert never saw during + # training. + text_batch = _build_text_batch(self.policy, ctx, add_generation_prompt=False) from lerobot.utils.constants import ( # noqa: PLC0415 OBS_LANGUAGE_ATTENTION_MASK, OBS_LANGUAGE_TOKENS, @@ -232,7 +238,12 @@ class DispatchAction(InferenceStep): # --------------------------------------------------------------------------- -def _build_text_batch(policy: Any, prompt_messages: list[dict[str, Any]]) -> dict[str, Any]: +def _build_text_batch( + policy: Any, + prompt_messages: list[dict[str, Any]], + *, + add_generation_prompt: bool = True, +) -> dict[str, Any]: """Tokenize a list of chat messages into the batch shape ``select_message`` expects. @@ -263,7 +274,7 @@ def _build_text_batch(policy: Any, prompt_messages: list[dict[str, Any]]) -> dic text_messages = [_strip_lerobot_blocks(m) for m in prompt_messages] encoded = tokenizer.apply_chat_template( text_messages, - add_generation_prompt=True, + add_generation_prompt=add_generation_prompt, tokenize=True, return_tensors="pt", ) @@ -690,7 +701,7 @@ def _control_context_messages( ) -> list[dict[str, Any]]: """Build a chat-template-ready prompt from current runtime state. - Mirrors what ``smolvla2_hirobot.yaml`` renders into ``${task}\nPlan: + Mirrors what ``hirobot.yaml`` renders into ``${task}\nPlan: ${plan}\nMemory: ${memory}`` for the high-level branches. """ parts: list[str] = [] @@ -711,7 +722,7 @@ def _control_context_messages( # --------------------------------------------------------------------------- # Per-recipe prompt builders. Each one mirrors a single sub-recipe's -# message layout in ``smolvla2_hirobot.yaml`` so the chat-templated +# message layout in ``hirobot.yaml`` so the chat-templated # prompt at inference matches what the model saw during training. # Generic ``_control_context_messages`` is kept around as a fallback # for ad-hoc callers but the four high-level steps now use these. diff --git a/src/lerobot/policies/smolvla2/modeling_smolvla2.py b/src/lerobot/policies/smolvla2/modeling_smolvla2.py index 7c1bcc9b9..557f1f72b 100644 --- a/src/lerobot/policies/smolvla2/modeling_smolvla2.py +++ b/src/lerobot/policies/smolvla2/modeling_smolvla2.py @@ -246,6 +246,21 @@ class SmolVLA2Policy(SmolVLAPolicy): text_loss = self._compute_text_loss(batch, text_labels) total = total + self.config.text_loss_weight * text_loss loss_dict["text_loss"] = float(text_loss.detach().item()) + else: + # No path fired — happens when both loss weights are 0 or + # the batch has neither action samples nor supervised text. + # Fail loud rather than train silently on a zero loss. + raise RuntimeError( + "SmolVLA2Policy.forward: nothing to train — " + "flow_loss_weight=%s, text_loss_weight=%s, " + "predict_actions.any()=%s, has_text_data=%s" + % ( + self.config.flow_loss_weight, + self.config.text_loss_weight, + bool(predict_actions_t.any().item()) if has_per_sample_routing else None, + has_text_data, + ) + ) loss_dict["loss"] = float(total.detach().item()) diff --git a/src/lerobot/policies/smolvla2/processor_smolvla2.py b/src/lerobot/policies/smolvla2/processor_smolvla2.py index 93cbd0252..a76608502 100644 --- a/src/lerobot/policies/smolvla2/processor_smolvla2.py +++ b/src/lerobot/policies/smolvla2/processor_smolvla2.py @@ -121,7 +121,7 @@ def _load_recipe(path_str: str) -> TrainingRecipe: Accepts an absolute path or a path relative to ``src/lerobot/configs/`` so recipe authors can write - ``--policy.recipe_path=recipes/smolvla2_hirobot.yaml``. + ``--policy.recipe_path=recipes/hirobot.yaml``. """ p = Path(path_str) if not p.is_absolute() and not p.exists():